1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * FIXME: add checking of cram_external_type to return NULL on unsupported
33 * {codec,type} tuples.
34 */
35
36 #include <config.h>
37
38 #include <stdlib.h>
39 #include <string.h>
40 #include <assert.h>
41 #include <limits.h>
42 #include <stdint.h>
43 #include <errno.h>
44
45 #include "cram/cram.h"
46
47 /*
48 * ---------------------------------------------------------------------------
49 * Block bit-level I/O functions.
50 * All defined static here to promote easy inlining by the compiler.
51 */
52
53 #if 0
54 /* Get a single bit, MSB first */
55 static signed int get_bit_MSB(cram_block *block) {
56 unsigned int val;
57
58 if (block->byte > block->alloc)
59 return -1;
60
61 val = block->data[block->byte] >> block->bit;
62 if (--block->bit == -1) {
63 block->bit = 7;
64 block->byte++;
65 //printf("(%02X)", block->data[block->byte]);
66 }
67
68 //printf("-B%d-", val&1);
69
70 return val & 1;
71 }
72 #endif
73
74 /*
75 * Count number of successive 0 and 1 bits
76 */
get_one_bits_MSB(cram_block * block)77 static int get_one_bits_MSB(cram_block *block) {
78 int n = 0, b;
79 if (block->byte >= block->uncomp_size)
80 return -1;
81 do {
82 b = block->data[block->byte] >> block->bit;
83 if (--block->bit == -1) {
84 block->bit = 7;
85 block->byte++;
86 if (block->byte == block->uncomp_size && (b&1))
87 return -1;
88 }
89 n++;
90 } while (b&1);
91
92 return n-1;
93 }
94
get_zero_bits_MSB(cram_block * block)95 static int get_zero_bits_MSB(cram_block *block) {
96 int n = 0, b;
97 if (block->byte >= block->uncomp_size)
98 return -1;
99 do {
100 b = block->data[block->byte] >> block->bit;
101 if (--block->bit == -1) {
102 block->bit = 7;
103 block->byte++;
104 if (block->byte == block->uncomp_size && !(b&1))
105 return -1;
106 }
107 n++;
108 } while (!(b&1));
109
110 return n-1;
111 }
112
113 #if 0
114 /* Stores a single bit */
115 static void store_bit_MSB(cram_block *block, unsigned int bit) {
116 if (block->byte >= block->alloc) {
117 block->alloc = block->alloc ? block->alloc*2 : 1024;
118 block->data = realloc(block->data, block->alloc);
119 }
120
121 if (bit)
122 block->data[block->byte] |= (1 << block->bit);
123
124 if (--block->bit == -1) {
125 block->bit = 7;
126 block->byte++;
127 block->data[block->byte] = 0;
128 }
129 }
130 #endif
131
132 #if 0
133 /* Rounds to the next whole byte boundary first */
134 static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
135 if (block->bit != 7) {
136 block->bit = 7;
137 block->byte++;
138 }
139
140 while (block->byte + len >= block->alloc) {
141 block->alloc = block->alloc ? block->alloc*2 : 1024;
142 block->data = realloc(block->data, block->alloc);
143 }
144
145 memcpy(&block->data[block->byte], bytes, len);
146 block->byte += len;
147 }
148 #endif
149
150 /* Local optimised copy for inlining */
get_bits_MSB(cram_block * block,int nbits)151 static inline unsigned int get_bits_MSB(cram_block *block, int nbits) {
152 unsigned int val = 0;
153 int i;
154
155 #if 0
156 // Fits within the current byte */
157 if (nbits <= block->bit+1) {
158 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
159 if ((block->bit -= nbits) == -1) {
160 block->bit = 7;
161 block->byte++;
162 }
163 return val;
164 }
165
166 // partial first byte
167 val = block->data[block->byte] & ((1<<(block->bit+1))-1);
168 nbits -= block->bit+1;
169 block->bit = 7;
170 block->byte++;
171
172 // whole middle bytes
173 while (nbits >= 8) {
174 val = (val << 8) | block->data[block->byte++];
175 nbits -= 8;
176 }
177
178 val <<= nbits;
179 val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
180 block->bit -= nbits;
181 return val;
182 #endif
183
184 #if 0
185 /* Inefficient implementation! */
186 //printf("{");
187 for (i = 0; i < nbits; i++)
188 //val = (val << 1) | get_bit_MSB(block);
189 GET_BIT_MSB(block, val);
190 #endif
191
192 #if 1
193 /* Combination of 1st two methods */
194 if (nbits <= block->bit+1) {
195 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
196 if ((block->bit -= nbits) == -1) {
197 block->bit = 7;
198 block->byte++;
199 }
200 return val;
201 }
202
203 switch(nbits) {
204 // case 15: GET_BIT_MSB(block, val);
205 // case 14: GET_BIT_MSB(block, val);
206 // case 13: GET_BIT_MSB(block, val);
207 // case 12: GET_BIT_MSB(block, val);
208 // case 11: GET_BIT_MSB(block, val);
209 // case 10: GET_BIT_MSB(block, val);
210 // case 9: GET_BIT_MSB(block, val);
211 case 8: GET_BIT_MSB(block, val);
212 case 7: GET_BIT_MSB(block, val);
213 case 6: GET_BIT_MSB(block, val);
214 case 5: GET_BIT_MSB(block, val);
215 case 4: GET_BIT_MSB(block, val);
216 case 3: GET_BIT_MSB(block, val);
217 case 2: GET_BIT_MSB(block, val);
218 case 1: GET_BIT_MSB(block, val);
219 break;
220
221 default:
222 for (i = 0; i < nbits; i++)
223 //val = (val << 1) | get_bit_MSB(block);
224 GET_BIT_MSB(block, val);
225 }
226 #endif
227
228 //printf("=0x%x}", val);
229
230 return val;
231 }
232
233 /*
234 * Can store up to 24-bits worth of data encoded in an integer value
235 * Possibly we'd want to have a less optimal store_bits function when dealing
236 * with nbits > 24, but for now we assume the codes generated are never
237 * that big. (Given this is only possible with 121392 or more
238 * characters with exactly the correct frequency distribution we check
239 * for it elsewhere.)
240 */
store_bits_MSB(cram_block * block,unsigned int val,int nbits)241 static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
242 //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
243
244 /*
245 * Use slow mode until we tweak the huffman generator to never generate
246 * codes longer than 24-bits.
247 */
248 unsigned int mask;
249
250 if (block->byte+4 >= block->alloc) {
251 if (block->byte) {
252 block->alloc *= 2;
253 block->data = realloc(block->data, block->alloc + 4);
254 if (!block->data)
255 return -1;
256 } else {
257 block->alloc = 1024;
258 block->data = realloc(block->data, block->alloc + 4);
259 if (!block->data)
260 return -1;
261 block->data[0] = 0; // initialise first byte of buffer
262 }
263 }
264
265 /* fits in current bit-field */
266 if (nbits <= block->bit+1) {
267 block->data[block->byte] |= (val << (block->bit+1-nbits));
268 if ((block->bit-=nbits) == -1) {
269 block->bit = 7;
270 block->byte++;
271 block->data[block->byte] = 0;
272 }
273 return 0;
274 }
275
276 block->data[block->byte] |= (val >> (nbits -= block->bit+1));
277 block->bit = 7;
278 block->byte++;
279 block->data[block->byte] = 0;
280
281 mask = 1<<(nbits-1);
282 do {
283 if (val & mask)
284 block->data[block->byte] |= (1 << block->bit);
285 if (--block->bit == -1) {
286 block->bit = 7;
287 block->byte++;
288 block->data[block->byte] = 0;
289 }
290 mask >>= 1;
291 } while(--nbits);
292
293 return 0;
294 }
295
296 /*
297 * Returns the next 'size' bytes from a block, or NULL if insufficient
298 * data left.This is just a pointer into the block data and not an
299 * allocated object, so do not free the result.
300 */
cram_extract_block(cram_block * b,int size)301 static char *cram_extract_block(cram_block *b, int size) {
302 char *cp = (char *)b->data + b->idx;
303 b->idx += size;
304 if (b->idx > b->uncomp_size)
305 return NULL;
306
307 return cp;
308 }
309
310 /*
311 * ---------------------------------------------------------------------------
312 * EXTERNAL
313 */
cram_external_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)314 int cram_external_decode_int(cram_slice *slice, cram_codec *c,
315 cram_block *in, char *out, int *out_size) {
316 int l;
317 char *cp;
318 cram_block *b;
319
320 /* Find the external block */
321 b = cram_get_block_by_id(slice, c->external.content_id);
322 if (!b)
323 return *out_size?-1:0;
324
325 cp = (char *)b->data + b->idx;
326 // E_INT and E_LONG are guaranteed single item queries
327 l = safe_itf8_get(cp, (char *)b->data + b->uncomp_size, (int32_t *)out);
328 b->idx += l;
329 *out_size = 1;
330
331 return l > 0 ? 0 : -1;
332 }
333
cram_external_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)334 int cram_external_decode_char(cram_slice *slice, cram_codec *c,
335 cram_block *in, char *out,
336 int *out_size) {
337 char *cp;
338 cram_block *b;
339
340 /* Find the external block */
341 b = cram_get_block_by_id(slice, c->external.content_id);
342 if (!b)
343 return *out_size?-1:0;
344
345 cp = cram_extract_block(b, *out_size);
346 if (!cp)
347 return -1;
348
349 if (out)
350 memcpy(out, cp, *out_size);
351 return 0;
352 }
353
cram_external_decode_block(cram_slice * slice,cram_codec * c,cram_block * in,char * out_,int * out_size)354 static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
355 cram_block *in, char *out_,
356 int *out_size) {
357 char *cp;
358 cram_block *out = (cram_block *)out_;
359 cram_block *b = NULL;
360
361 /* Find the external block */
362 b = cram_get_block_by_id(slice, c->external.content_id);
363 if (!b)
364 return *out_size?-1:0;
365
366 cp = cram_extract_block(b, *out_size);
367 if (!cp)
368 return -1;
369
370 BLOCK_APPEND(out, cp, *out_size);
371 return 0;
372 }
373
cram_external_decode_free(cram_codec * c)374 void cram_external_decode_free(cram_codec *c) {
375 if (c)
376 free(c);
377 }
378
cram_external_decode_init(char * data,int size,enum cram_external_type option,int version)379 cram_codec *cram_external_decode_init(char *data, int size,
380 enum cram_external_type option,
381 int version) {
382 cram_codec *c = NULL;
383 char *cp = data;
384
385 if (size < 1)
386 goto malformed;
387
388 if (!(c = malloc(sizeof(*c))))
389 return NULL;
390
391 c->codec = E_EXTERNAL;
392 if (option == E_INT || option == E_LONG)
393 c->decode = cram_external_decode_int;
394 else if (option == E_BYTE_ARRAY || option == E_BYTE)
395 c->decode = cram_external_decode_char;
396 else
397 c->decode = cram_external_decode_block;
398 c->free = cram_external_decode_free;
399
400 cp += safe_itf8_get(cp, data + size, &c->external.content_id);
401
402 if (cp - data != size)
403 goto malformed;
404
405 c->external.type = option;
406
407 return c;
408
409 malformed:
410 hts_log_error("Malformed external header stream");
411 free(c);
412 return NULL;
413 }
414
cram_external_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)415 int cram_external_encode_int(cram_slice *slice, cram_codec *c,
416 char *in, int in_size) {
417 uint32_t *i32 = (uint32_t *)in;
418
419 itf8_put_blk(c->out, *i32);
420 return 0;
421 }
422
cram_external_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)423 int cram_external_encode_char(cram_slice *slice, cram_codec *c,
424 char *in, int in_size) {
425 BLOCK_APPEND(c->out, in, in_size);
426 return 0;
427 }
428
cram_external_encode_free(cram_codec * c)429 void cram_external_encode_free(cram_codec *c) {
430 if (!c)
431 return;
432 free(c);
433 }
434
cram_external_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)435 int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
436 int version) {
437 char tmp[99], *tp = tmp;
438 int len = 0;
439
440 if (prefix) {
441 size_t l = strlen(prefix);
442 BLOCK_APPEND(b, prefix, l);
443 len += l;
444 }
445
446 tp += itf8_put(tp, c->e_external.content_id);
447 len += itf8_put_blk(b, c->codec);
448 len += itf8_put_blk(b, tp-tmp);
449 BLOCK_APPEND(b, tmp, tp-tmp);
450 len += tp-tmp;
451
452 return len;
453 }
454
cram_external_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)455 cram_codec *cram_external_encode_init(cram_stats *st,
456 enum cram_external_type option,
457 void *dat,
458 int version) {
459 cram_codec *c;
460
461 c = malloc(sizeof(*c));
462 if (!c)
463 return NULL;
464 c->codec = E_EXTERNAL;
465 c->free = cram_external_encode_free;
466 if (option == E_INT || option == E_LONG)
467 c->encode = cram_external_encode_int;
468 else if (option == E_BYTE_ARRAY || option == E_BYTE)
469 c->encode = cram_external_encode_char;
470 else
471 abort();
472 c->store = cram_external_encode_store;
473
474 c->e_external.content_id = (size_t)dat;
475
476 return c;
477 }
478
479 /*
480 * ---------------------------------------------------------------------------
481 * BETA
482 */
cram_beta_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)483 int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
484 int32_t *out_i = (int32_t *)out;
485 int i, n = *out_size;
486
487 if (c->beta.nbits) {
488 if (cram_not_enough_bits(in, c->beta.nbits * n))
489 return -1;
490
491 for (i = 0; i < n; i++)
492 out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
493 } else {
494 for (i = 0; i < n; i++)
495 out_i[i] = -c->beta.offset;
496 }
497
498 return 0;
499 }
500
cram_beta_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)501 int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
502 int i, n = *out_size;
503
504
505 if (c->beta.nbits) {
506 if (cram_not_enough_bits(in, c->beta.nbits * n))
507 return -1;
508
509 if (out)
510 for (i = 0; i < n; i++)
511 out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
512 else
513 for (i = 0; i < n; i++)
514 get_bits_MSB(in, c->beta.nbits);
515 } else {
516 if (out)
517 for (i = 0; i < n; i++)
518 out[i] = -c->beta.offset;
519 }
520
521 return 0;
522 }
523
cram_beta_decode_free(cram_codec * c)524 void cram_beta_decode_free(cram_codec *c) {
525 if (c)
526 free(c);
527 }
528
cram_beta_decode_init(char * data,int size,enum cram_external_type option,int version)529 cram_codec *cram_beta_decode_init(char *data, int size,
530 enum cram_external_type option,
531 int version) {
532 cram_codec *c;
533 char *cp = data;
534
535 if (!(c = malloc(sizeof(*c))))
536 return NULL;
537
538 c->codec = E_BETA;
539 if (option == E_INT || option == E_LONG)
540 c->decode = cram_beta_decode_int;
541 else if (option == E_BYTE_ARRAY || option == E_BYTE)
542 c->decode = cram_beta_decode_char;
543 else {
544 hts_log_error("BYTE_ARRAYs not supported by this codec");
545 return NULL;
546 }
547 c->free = cram_beta_decode_free;
548
549 c->beta.nbits = -1;
550 cp += safe_itf8_get(cp, data + size, &c->beta.offset);
551 if (cp < data + size) // Ensure test below works
552 cp += safe_itf8_get(cp, data + size, &c->beta.nbits);
553
554 if (cp - data != size
555 || c->beta.nbits < 0 || c->beta.nbits > 8 * sizeof(int)) {
556 hts_log_error("Malformed beta header stream");
557 free(c);
558 return NULL;
559 }
560
561 return c;
562 }
563
cram_beta_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)564 int cram_beta_encode_store(cram_codec *c, cram_block *b,
565 char *prefix, int version) {
566 int len = 0;
567
568 if (prefix) {
569 size_t l = strlen(prefix);
570 BLOCK_APPEND(b, prefix, l);
571 len += l;
572 }
573
574 len += itf8_put_blk(b, c->codec);
575 len += itf8_put_blk(b, itf8_size(c->e_beta.offset)
576 + itf8_size(c->e_beta.nbits)); // codec length
577 len += itf8_put_blk(b, c->e_beta.offset);
578 len += itf8_put_blk(b, c->e_beta.nbits);
579
580 return len;
581 }
582
cram_beta_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)583 int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
584 char *in, int in_size) {
585 int *syms = (int *)in;
586 int i, r = 0;
587
588 for (i = 0; i < in_size; i++)
589 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
590 c->e_beta.nbits);
591
592 return r;
593 }
594
cram_beta_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)595 int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
596 char *in, int in_size) {
597 unsigned char *syms = (unsigned char *)in;
598 int i, r = 0;
599
600 for (i = 0; i < in_size; i++)
601 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
602 c->e_beta.nbits);
603
604 return r;
605 }
606
cram_beta_encode_free(cram_codec * c)607 void cram_beta_encode_free(cram_codec *c) {
608 if (c) free(c);
609 }
610
cram_beta_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)611 cram_codec *cram_beta_encode_init(cram_stats *st,
612 enum cram_external_type option,
613 void *dat,
614 int version) {
615 cram_codec *c;
616 int min_val, max_val, len = 0;
617 int64_t range;
618
619 c = malloc(sizeof(*c));
620 if (!c)
621 return NULL;
622 c->codec = E_BETA;
623 c->free = cram_beta_encode_free;
624 if (option == E_INT)
625 c->encode = cram_beta_encode_int;
626 else
627 c->encode = cram_beta_encode_char;
628 c->store = cram_beta_encode_store;
629
630 if (dat) {
631 min_val = ((int *)dat)[0];
632 max_val = ((int *)dat)[1];
633 } else {
634 min_val = INT_MAX;
635 max_val = INT_MIN;
636 int i;
637 for (i = 0; i < MAX_STAT_VAL; i++) {
638 if (!st->freqs[i])
639 continue;
640 if (min_val > i)
641 min_val = i;
642 max_val = i;
643 }
644 if (st->h) {
645 khint_t k;
646
647 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
648 if (!kh_exist(st->h, k))
649 continue;
650
651 i = kh_key(st->h, k);
652 if (min_val > i)
653 min_val = i;
654 if (max_val < i)
655 max_val = i;
656 }
657 }
658 }
659
660 assert(max_val >= min_val);
661 c->e_beta.offset = -min_val;
662 range = (int64_t) max_val - min_val;
663 while (range) {
664 len++;
665 range >>= 1;
666 }
667 c->e_beta.nbits = len;
668
669 return c;
670 }
671
672 /*
673 * ---------------------------------------------------------------------------
674 * SUBEXP
675 */
cram_subexp_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)676 int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
677 int32_t *out_i = (int32_t *)out;
678 int n, count;
679 int k = c->subexp.k;
680
681 for (count = 0, n = *out_size; count < n; count++) {
682 int i = 0, tail;
683 int val;
684
685 /* Get number of 1s */
686 //while (get_bit_MSB(in) == 1) i++;
687 i = get_one_bits_MSB(in);
688 if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
689 return -1;
690 /*
691 * Val is
692 * i > 0: 2^(k+i-1) + k+i-1 bits
693 * i = 0: k bits
694 */
695 if (i) {
696 tail = i + k-1;
697 val = 0;
698 while (tail) {
699 //val = val<<1; val |= get_bit_MSB(in);
700 GET_BIT_MSB(in, val);
701 tail--;
702 }
703 val += 1 << (i + k-1);
704 } else {
705 tail = k;
706 val = 0;
707 while (tail) {
708 //val = val<<1; val |= get_bit_MSB(in);
709 GET_BIT_MSB(in, val);
710 tail--;
711 }
712 }
713
714 out_i[count] = val - c->subexp.offset;
715 }
716
717 return 0;
718 }
719
cram_subexp_decode_free(cram_codec * c)720 void cram_subexp_decode_free(cram_codec *c) {
721 if (c)
722 free(c);
723 }
724
cram_subexp_decode_init(char * data,int size,enum cram_external_type option,int version)725 cram_codec *cram_subexp_decode_init(char *data, int size,
726 enum cram_external_type option,
727 int version) {
728 cram_codec *c;
729 char *cp = data;
730
731 if (option != E_INT) {
732 hts_log_error("This codec only supports INT encodings");
733 return NULL;
734 }
735
736 if (!(c = malloc(sizeof(*c))))
737 return NULL;
738
739 c->codec = E_SUBEXP;
740 c->decode = cram_subexp_decode;
741 c->free = cram_subexp_decode_free;
742 c->subexp.k = -1;
743
744 cp += safe_itf8_get(cp, data + size, &c->subexp.offset);
745 cp += safe_itf8_get(cp, data + size, &c->subexp.k);
746
747 if (cp - data != size || c->subexp.k < 0) {
748 hts_log_error("Malformed subexp header stream");
749 free(c);
750 return NULL;
751 }
752
753 return c;
754 }
755
756 /*
757 * ---------------------------------------------------------------------------
758 * GAMMA
759 */
cram_gamma_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)760 int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
761 int32_t *out_i = (int32_t *)out;
762 int i, n;
763
764 for (i = 0, n = *out_size; i < n; i++) {
765 int nz = 0;
766 int val;
767 //while (get_bit_MSB(in) == 0) nz++;
768 nz = get_zero_bits_MSB(in);
769 if (cram_not_enough_bits(in, nz))
770 return -1;
771 val = 1;
772 while (nz > 0) {
773 //val <<= 1; val |= get_bit_MSB(in);
774 GET_BIT_MSB(in, val);
775 nz--;
776 }
777
778 out_i[i] = val - c->gamma.offset;
779 }
780
781 return 0;
782 }
783
cram_gamma_decode_free(cram_codec * c)784 void cram_gamma_decode_free(cram_codec *c) {
785 if (c)
786 free(c);
787 }
788
cram_gamma_decode_init(char * data,int size,enum cram_external_type option,int version)789 cram_codec *cram_gamma_decode_init(char *data, int size,
790 enum cram_external_type option,
791 int version) {
792 cram_codec *c = NULL;
793 char *cp = data;
794
795 if (option != E_INT) {
796 hts_log_error("This codec only supports INT encodings");
797 return NULL;
798 }
799
800 if (size < 1)
801 goto malformed;
802
803 if (!(c = malloc(sizeof(*c))))
804 return NULL;
805
806 c->codec = E_GAMMA;
807 c->decode = cram_gamma_decode;
808 c->free = cram_gamma_decode_free;
809
810 cp += safe_itf8_get(cp, data + size, &c->gamma.offset);
811
812 if (cp - data != size)
813 goto malformed;
814
815 return c;
816
817 malformed:
818 hts_log_error("Malformed gamma header stream");
819 free(c);
820 return NULL;
821 }
822
823 /*
824 * ---------------------------------------------------------------------------
825 * HUFFMAN
826 */
827
code_sort(const void * vp1,const void * vp2)828 static int code_sort(const void *vp1, const void *vp2) {
829 const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
830 const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
831
832 if (c1->len != c2->len)
833 return c1->len - c2->len;
834 else
835 return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
836 }
837
cram_huffman_decode_free(cram_codec * c)838 void cram_huffman_decode_free(cram_codec *c) {
839 if (!c)
840 return;
841
842 if (c->huffman.codes)
843 free(c->huffman.codes);
844 free(c);
845 }
846
cram_huffman_decode_null(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)847 int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
848 cram_block *in, char *out, int *out_size) {
849 return -1;
850 }
851
cram_huffman_decode_char0(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)852 int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
853 cram_block *in, char *out, int *out_size) {
854 int i, n;
855
856 if (!out)
857 return 0;
858
859 /* Special case of 0 length codes */
860 for (i = 0, n = *out_size; i < n; i++) {
861 out[i] = c->huffman.codes[0].symbol;
862 }
863 return 0;
864 }
865
cram_huffman_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)866 int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
867 cram_block *in, char *out, int *out_size) {
868 int i, n, ncodes = c->huffman.ncodes;
869 const cram_huffman_code * const codes = c->huffman.codes;
870
871 for (i = 0, n = *out_size; i < n; i++) {
872 int idx = 0;
873 int val = 0, len = 0, last_len = 0;
874
875 for (;;) {
876 int dlen = codes[idx].len - last_len;
877 if (cram_not_enough_bits(in, dlen))
878 return -1;
879
880 //val <<= dlen;
881 //val |= get_bits_MSB(in, dlen);
882 //last_len = (len += dlen);
883
884 last_len = (len += dlen);
885 for (; dlen; dlen--) GET_BIT_MSB(in, val);
886
887 idx = val - codes[idx].p;
888 if (idx >= ncodes || idx < 0)
889 return -1;
890
891 if (codes[idx].code == val && codes[idx].len == len) {
892 if (out) out[i] = codes[idx].symbol;
893 break;
894 }
895 }
896 }
897
898 return 0;
899 }
900
cram_huffman_decode_int0(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)901 int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
902 cram_block *in, char *out, int *out_size) {
903 int32_t *out_i = (int32_t *)out;
904 int i, n;
905 const cram_huffman_code * const codes = c->huffman.codes;
906
907 /* Special case of 0 length codes */
908 for (i = 0, n = *out_size; i < n; i++) {
909 out_i[i] = codes[0].symbol;
910 }
911 return 0;
912 }
913
cram_huffman_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)914 int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
915 cram_block *in, char *out, int *out_size) {
916 int32_t *out_i = (int32_t *)out;
917 int i, n, ncodes = c->huffman.ncodes;
918 const cram_huffman_code * const codes = c->huffman.codes;
919
920 for (i = 0, n = *out_size; i < n; i++) {
921 int idx = 0;
922 int val = 0, len = 0, last_len = 0;
923
924 // Now one bit at a time for remaining checks
925 for (;;) {
926 int dlen = codes[idx].len - last_len;
927 if (cram_not_enough_bits(in, dlen))
928 return -1;
929
930 //val <<= dlen;
931 //val |= get_bits_MSB(in, dlen);
932 //last_len = (len += dlen);
933
934 last_len = (len += dlen);
935 for (; dlen; dlen--) GET_BIT_MSB(in, val);
936
937 idx = val - codes[idx].p;
938 if (idx >= ncodes || idx < 0)
939 return -1;
940
941 if (codes[idx].code == val && codes[idx].len == len) {
942 out_i[i] = codes[idx].symbol;
943 break;
944 }
945 }
946 }
947
948 return 0;
949 }
950
951 /*
952 * Initialises a huffman decoder from an encoding data stream.
953 */
cram_huffman_decode_init(char * data,int size,enum cram_external_type option,int version)954 cram_codec *cram_huffman_decode_init(char *data, int size,
955 enum cram_external_type option,
956 int version) {
957 int32_t ncodes = 0, i, j;
958 char *cp = data, *data_end = &data[size];
959 cram_codec *h;
960 cram_huffman_code *codes = NULL;
961 int32_t val, last_len, max_len = 0;
962 uint32_t max_val; // needs one more bit than val
963 const int max_code_bits = sizeof(val) * 8 - 1;
964 int l;
965
966 if (option == E_BYTE_ARRAY_BLOCK) {
967 hts_log_error("BYTE_ARRAYs not supported by this codec");
968 return NULL;
969 }
970
971 cp += safe_itf8_get(cp, data_end, &ncodes);
972 if (ncodes < 0) {
973 hts_log_error("Invalid number of symbols in huffman stream");
974 return NULL;
975 }
976 if (ncodes >= SIZE_MAX / sizeof(*codes)) {
977 errno = ENOMEM;
978 return NULL;
979 }
980
981 h = calloc(1, sizeof(*h));
982 if (!h)
983 return NULL;
984
985 h->codec = E_HUFFMAN;
986 h->free = cram_huffman_decode_free;
987
988 h->huffman.ncodes = ncodes;
989 if (ncodes) {
990 codes = h->huffman.codes = malloc(ncodes * sizeof(*codes));
991 if (!codes) {
992 free(h);
993 return NULL;
994 }
995 } else {
996 codes = h->huffman.codes = NULL;
997 }
998
999 /* Read symbols and bit-lengths */
1000 for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) {
1001 l = safe_itf8_get(cp, data_end, &codes[i].symbol);
1002 }
1003
1004 if (l < 1)
1005 goto malformed;
1006
1007 cp += safe_itf8_get(cp, data_end, &i);
1008 if (i != ncodes)
1009 goto malformed;
1010
1011 if (ncodes == 0) {
1012 /* NULL huffman stream. Ensure it returns an error if
1013 anything tries to use it. */
1014 h->decode = cram_huffman_decode_null;
1015 return h;
1016 }
1017
1018 for (i = 0, l = 1; i < ncodes; i++, cp += l) {
1019 l = safe_itf8_get(cp, data_end, &codes[i].len);
1020 if (l < 1)
1021 break;
1022 if (max_len < codes[i].len)
1023 max_len = codes[i].len;
1024 }
1025 if (l < 1 || cp - data != size || max_len >= ncodes)
1026 goto malformed;
1027
1028 /* 31 is max. bits available in val */
1029 if (max_len > max_code_bits) {
1030 hts_log_error("Huffman code length (%d) is greater "
1031 "than maximum supported (%d)", max_len, max_code_bits);
1032 free(h);
1033 free(codes);
1034 return NULL;
1035 }
1036
1037 /* Sort by bit length and then by symbol value */
1038 qsort(codes, ncodes, sizeof(*codes), code_sort);
1039
1040 /* Assign canonical codes */
1041 val = -1, last_len = 0, max_val = 0;
1042 for (i = 0; i < ncodes; i++) {
1043 val++;
1044 if (val > max_val)
1045 goto malformed;
1046
1047 if (codes[i].len > last_len) {
1048 val <<= (codes[i].len - last_len);
1049 last_len = codes[i].len;
1050 max_val = (1U << codes[i].len) - 1;
1051 }
1052 codes[i].code = val;
1053 }
1054
1055 /*
1056 * Compute the next starting point, offset by the i'th value.
1057 * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
1058 * codes[10..13].p = 30 - 10.
1059 */
1060 last_len = 0;
1061 for (i = j = 0; i < ncodes; i++) {
1062 if (codes[i].len > last_len) {
1063 j = codes[i].code - i;
1064 last_len = codes[i].len;
1065 }
1066 codes[i].p = j;
1067 }
1068
1069 // puts("==HUFF LEN==");
1070 // for (i = 0; i <= last_len+1; i++) {
1071 // printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]);
1072 // }
1073 // puts("===HUFFMAN CODES===");
1074 // for (i = 0; i < ncodes; i++) {
1075 // int j;
1076 // printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
1077 // j = codes[i].len;
1078 // while (j) {
1079 // putchar(codes[i].code & (1 << --j) ? '1' : '0');
1080 // }
1081 // printf(" %d\n", codes[i].code);
1082 // }
1083
1084 if (option == E_BYTE || option == E_BYTE_ARRAY) {
1085 if (h->huffman.codes[0].len == 0)
1086 h->decode = cram_huffman_decode_char0;
1087 else
1088 h->decode = cram_huffman_decode_char;
1089 } else if (option == E_BYTE_ARRAY_BLOCK) {
1090 abort();
1091 } else {
1092 if (h->huffman.codes[0].len == 0)
1093 h->decode = cram_huffman_decode_int0;
1094 else
1095 h->decode = cram_huffman_decode_int;
1096 }
1097
1098 return (cram_codec *)h;
1099
1100 malformed:
1101 hts_log_error("Malformed huffman header stream");
1102 free(codes);
1103 free(h);
1104 return NULL;
1105 }
1106
cram_huffman_encode_char0(cram_slice * slice,cram_codec * c,char * in,int in_size)1107 int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
1108 char *in, int in_size) {
1109 return 0;
1110 }
1111
cram_huffman_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)1112 int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
1113 char *in, int in_size) {
1114 int i, code, len, r = 0;
1115 unsigned char *syms = (unsigned char *)in;
1116
1117 while (in_size--) {
1118 int sym = *syms++;
1119 if (sym >= -1 && sym < MAX_HUFF) {
1120 i = c->e_huffman.val2code[sym+1];
1121 assert(c->e_huffman.codes[i].symbol == sym);
1122 code = c->e_huffman.codes[i].code;
1123 len = c->e_huffman.codes[i].len;
1124 } else {
1125 /* Slow - use a lookup table for when sym < MAX_HUFF? */
1126 for (i = 0; i < c->e_huffman.nvals; i++) {
1127 if (c->e_huffman.codes[i].symbol == sym)
1128 break;
1129 }
1130 if (i == c->e_huffman.nvals)
1131 return -1;
1132
1133 code = c->e_huffman.codes[i].code;
1134 len = c->e_huffman.codes[i].len;
1135 }
1136
1137 r |= store_bits_MSB(c->out, code, len);
1138 }
1139
1140 return r;
1141 }
1142
cram_huffman_encode_int0(cram_slice * slice,cram_codec * c,char * in,int in_size)1143 int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
1144 char *in, int in_size) {
1145 return 0;
1146 }
1147
cram_huffman_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)1148 int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
1149 char *in, int in_size) {
1150 int i, code, len, r = 0;
1151 int *syms = (int *)in;
1152
1153 while (in_size--) {
1154 int sym = *syms++;
1155
1156 if (sym >= -1 && sym < MAX_HUFF) {
1157 i = c->e_huffman.val2code[sym+1];
1158 assert(c->e_huffman.codes[i].symbol == sym);
1159 code = c->e_huffman.codes[i].code;
1160 len = c->e_huffman.codes[i].len;
1161 } else {
1162 /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
1163 for (i = 0; i < c->e_huffman.nvals; i++) {
1164 if (c->e_huffman.codes[i].symbol == sym)
1165 break;
1166 }
1167 if (i == c->e_huffman.nvals)
1168 return -1;
1169
1170 code = c->e_huffman.codes[i].code;
1171 len = c->e_huffman.codes[i].len;
1172 }
1173
1174 r |= store_bits_MSB(c->out, code, len);
1175 }
1176
1177 return r;
1178 }
1179
cram_huffman_encode_free(cram_codec * c)1180 void cram_huffman_encode_free(cram_codec *c) {
1181 if (!c)
1182 return;
1183
1184 if (c->e_huffman.codes)
1185 free(c->e_huffman.codes);
1186 free(c);
1187 }
1188
1189 /*
1190 * Encodes a huffman tree.
1191 * Returns number of bytes written.
1192 */
cram_huffman_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1193 int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
1194 int version) {
1195 int i, len = 0;
1196 cram_huffman_code *codes = c->e_huffman.codes;
1197 /*
1198 * Up to code length 127 means 2.5e+26 bytes of data required (worst
1199 * case huffman tree needs symbols with freqs matching the Fibonacci
1200 * series). So guaranteed 1 byte per code.
1201 *
1202 * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
1203 *
1204 * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
1205 */
1206 char *tmp = malloc(6*c->e_huffman.nvals+16);
1207 char *tp = tmp;
1208
1209 if (!tmp)
1210 return -1;
1211
1212 if (prefix) {
1213 size_t l = strlen(prefix);
1214 BLOCK_APPEND(b, prefix, l);
1215 len += l;
1216 }
1217
1218 tp += itf8_put(tp, c->e_huffman.nvals);
1219 for (i = 0; i < c->e_huffman.nvals; i++) {
1220 tp += itf8_put(tp, codes[i].symbol);
1221 }
1222
1223 tp += itf8_put(tp, c->e_huffman.nvals);
1224 for (i = 0; i < c->e_huffman.nvals; i++) {
1225 tp += itf8_put(tp, codes[i].len);
1226 }
1227
1228 len += itf8_put_blk(b, c->codec);
1229 len += itf8_put_blk(b, tp-tmp);
1230 BLOCK_APPEND(b, tmp, tp-tmp);
1231 len += tp-tmp;
1232
1233 free(tmp);
1234
1235 return len;
1236 }
1237
cram_huffman_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1238 cram_codec *cram_huffman_encode_init(cram_stats *st,
1239 enum cram_external_type option,
1240 void *dat,
1241 int version) {
1242 int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len;
1243 int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
1244 cram_codec *c;
1245 cram_huffman_code *codes;
1246
1247 c = malloc(sizeof(*c));
1248 if (!c)
1249 return NULL;
1250 c->codec = E_HUFFMAN;
1251
1252 /* Count number of unique symbols */
1253 for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
1254 if (!st->freqs[i])
1255 continue;
1256 if (nvals >= vals_alloc) {
1257 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1258 vals = realloc(vals, vals_alloc * sizeof(int));
1259 freqs = realloc(freqs, vals_alloc * sizeof(int));
1260 if (!vals || !freqs) {
1261 if (vals) free(vals);
1262 if (freqs) free(freqs);
1263 free(c);
1264 return NULL;
1265 }
1266 }
1267 vals[nvals] = i;
1268 freqs[nvals] = st->freqs[i];
1269 assert(st->freqs[i] > 0);
1270 ntot += freqs[nvals];
1271 if (max_val < i) max_val = i;
1272 if (min_val > i) min_val = i;
1273 nvals++;
1274 }
1275 if (st->h) {
1276 khint_t k;
1277
1278 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1279 if (!kh_exist(st->h, k))
1280 continue;
1281 if (nvals >= vals_alloc) {
1282 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1283 vals = realloc(vals, vals_alloc * sizeof(int));
1284 freqs = realloc(freqs, vals_alloc * sizeof(int));
1285 if (!vals || !freqs)
1286 return NULL;
1287 }
1288 vals[nvals]= kh_key(st->h, k);
1289 freqs[nvals] = kh_val(st->h, k);
1290 assert(freqs[nvals] > 0);
1291 ntot += freqs[nvals];
1292 if (max_val < i) max_val = i;
1293 if (min_val > i) min_val = i;
1294 nvals++;
1295 }
1296 }
1297
1298 assert(nvals > 0);
1299
1300 freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
1301 lens = calloc(2*nvals, sizeof(*lens));
1302 if (!lens || !freqs)
1303 return NULL;
1304
1305 /* Inefficient, use pointers to form chain so we can insert and maintain
1306 * a sorted list? This is currently O(nvals^2) complexity.
1307 */
1308 for (;;) {
1309 int low1 = INT_MAX, low2 = INT_MAX;
1310 int ind1 = 0, ind2 = 0;
1311 for (i = 0; i < nvals; i++) {
1312 if (freqs[i] < 0)
1313 continue;
1314 if (low1 > freqs[i])
1315 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
1316 else if (low2 > freqs[i])
1317 low2 = freqs[i], ind2 = i;
1318 }
1319 if (low2 == INT_MAX)
1320 break;
1321
1322 freqs[nvals] = low1 + low2;
1323 lens[ind1] = nvals;
1324 lens[ind2] = nvals;
1325 freqs[ind1] *= -1;
1326 freqs[ind2] *= -1;
1327 nvals++;
1328 }
1329 nvals = nvals/2+1;
1330
1331 /* Assign lengths */
1332 for (i = 0; i < nvals; i++) {
1333 int code_len = 0;
1334 for (k = lens[i]; k; k = lens[k])
1335 code_len++;
1336 lens[i] = code_len;
1337 freqs[i] *= -1;
1338 //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
1339 }
1340
1341
1342 /* Sort, need in a struct */
1343 if (!(codes = malloc(nvals * sizeof(*codes))))
1344 return NULL;
1345 for (i = 0; i < nvals; i++) {
1346 codes[i].symbol = vals[i];
1347 codes[i].len = lens[i];
1348 }
1349 qsort(codes, nvals, sizeof(*codes), code_sort);
1350
1351 /*
1352 * Generate canonical codes from lengths.
1353 * Sort by length.
1354 * Start with 0.
1355 * Every new code of same length is +1.
1356 * Every new code of new length is +1 then <<1 per extra length.
1357 *
1358 * /\
1359 * a/\
1360 * /\/\
1361 * bcd/\
1362 * ef
1363 *
1364 * a 1 0
1365 * b 3 4 (0+1)<<2
1366 * c 3 5
1367 * d 3 6
1368 * e 4 14 (6+1)<<1
1369 * f 5 15
1370 */
1371 code = 0; len = codes[0].len;
1372 for (i = 0; i < nvals; i++) {
1373 while (len != codes[i].len) {
1374 code<<=1;
1375 len++;
1376 }
1377 codes[i].code = code++;
1378
1379 if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
1380 c->e_huffman.val2code[codes[i].symbol+1] = i;
1381
1382 //fprintf(stderr, "sym %d, code %d, len %d\n",
1383 // codes[i].symbol, codes[i].code, codes[i].len);
1384 }
1385
1386 free(lens);
1387 free(vals);
1388 free(freqs);
1389
1390 c->e_huffman.codes = codes;
1391 c->e_huffman.nvals = nvals;
1392
1393 c->free = cram_huffman_encode_free;
1394 if (option == E_BYTE || option == E_BYTE_ARRAY) {
1395 if (c->e_huffman.codes[0].len == 0)
1396 c->encode = cram_huffman_encode_char0;
1397 else
1398 c->encode = cram_huffman_encode_char;
1399 } else {
1400 if (c->e_huffman.codes[0].len == 0)
1401 c->encode = cram_huffman_encode_int0;
1402 else
1403 c->encode = cram_huffman_encode_int;
1404 }
1405 c->store = cram_huffman_encode_store;
1406
1407 return c;
1408 }
1409
1410 /*
1411 * ---------------------------------------------------------------------------
1412 * BYTE_ARRAY_LEN
1413 */
cram_byte_array_len_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)1414 int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
1415 cram_block *in, char *out,
1416 int *out_size) {
1417 /* Fetch length */
1418 int32_t len = 0, one = 1;
1419 int r;
1420
1421 r = c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec,
1422 in, (char *)&len, &one);
1423 //printf("ByteArray Len=%d\n", len);
1424
1425 if (!r && c->byte_array_len.val_codec && len >= 0) {
1426 r = c->byte_array_len.val_codec->decode(slice,
1427 c->byte_array_len.val_codec,
1428 in, out, &len);
1429 } else {
1430 return -1;
1431 }
1432
1433 *out_size = len;
1434
1435 return r;
1436 }
1437
cram_byte_array_len_decode_free(cram_codec * c)1438 void cram_byte_array_len_decode_free(cram_codec *c) {
1439 if (!c) return;
1440
1441 if (c->byte_array_len.len_codec)
1442 c->byte_array_len.len_codec->free(c->byte_array_len.len_codec);
1443
1444 if (c->byte_array_len.val_codec)
1445 c->byte_array_len.val_codec->free(c->byte_array_len.val_codec);
1446
1447 free(c);
1448 }
1449
cram_byte_array_len_decode_init(char * data,int size,enum cram_external_type option,int version)1450 cram_codec *cram_byte_array_len_decode_init(char *data, int size,
1451 enum cram_external_type option,
1452 int version) {
1453 cram_codec *c;
1454 char *cp = data;
1455 char *endp = data + size;
1456 int32_t encoding = 0;
1457 int32_t sub_size = -1;
1458
1459 if (!(c = malloc(sizeof(*c))))
1460 return NULL;
1461
1462 c->codec = E_BYTE_ARRAY_LEN;
1463 c->decode = cram_byte_array_len_decode;
1464 c->free = cram_byte_array_len_decode_free;
1465
1466 cp += safe_itf8_get(cp, endp, &encoding);
1467 cp += safe_itf8_get(cp, endp, &sub_size);
1468 if (sub_size < 0 || endp - cp < sub_size)
1469 goto malformed;
1470 c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size,
1471 E_INT, version);
1472 if (c->byte_array_len.len_codec == NULL)
1473 goto no_codec;
1474 cp += sub_size;
1475
1476 sub_size = -1;
1477 cp += safe_itf8_get(cp, endp, &encoding);
1478 cp += safe_itf8_get(cp, endp, &sub_size);
1479 if (sub_size < 0 || endp - cp < sub_size)
1480 goto malformed;
1481 c->byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size,
1482 option, version);
1483 if (c->byte_array_len.val_codec == NULL)
1484 goto no_codec;
1485 cp += sub_size;
1486
1487 if (cp - data != size)
1488 goto malformed;
1489
1490 return c;
1491
1492 malformed:
1493 hts_log_error("Malformed byte_array_len header stream");
1494 no_codec:
1495 free(c);
1496 return NULL;
1497 }
1498
cram_byte_array_len_encode(cram_slice * slice,cram_codec * c,char * in,int in_size)1499 int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
1500 char *in, int in_size) {
1501 int32_t i32 = in_size;
1502 int r = 0;
1503
1504 r |= c->e_byte_array_len.len_codec->encode(slice,
1505 c->e_byte_array_len.len_codec,
1506 (char *)&i32, 1);
1507 r |= c->e_byte_array_len.val_codec->encode(slice,
1508 c->e_byte_array_len.val_codec,
1509 in, in_size);
1510 return r;
1511 }
1512
cram_byte_array_len_encode_free(cram_codec * c)1513 void cram_byte_array_len_encode_free(cram_codec *c) {
1514 if (!c)
1515 return;
1516
1517 if (c->e_byte_array_len.len_codec)
1518 c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec);
1519
1520 if (c->e_byte_array_len.val_codec)
1521 c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec);
1522
1523 free(c);
1524 }
1525
cram_byte_array_len_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1526 int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
1527 char *prefix, int version) {
1528 int len = 0, len2, len3;
1529 cram_codec *tc;
1530 cram_block *b_len, *b_val;
1531
1532 if (prefix) {
1533 size_t l = strlen(prefix);
1534 BLOCK_APPEND(b, prefix, l);
1535 len += l;
1536 }
1537
1538 tc = c->e_byte_array_len.len_codec;
1539 b_len = cram_new_block(0, 0);
1540 len2 = tc->store(tc, b_len, NULL, version);
1541
1542 tc = c->e_byte_array_len.val_codec;
1543 b_val = cram_new_block(0, 0);
1544 len3 = tc->store(tc, b_val, NULL, version);
1545
1546 len += itf8_put_blk(b, c->codec);
1547 len += itf8_put_blk(b, len2+len3);
1548 BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
1549 BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
1550
1551 cram_free_block(b_len);
1552 cram_free_block(b_val);
1553
1554 return len + len2 + len3;
1555 }
1556
cram_byte_array_len_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1557 cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
1558 enum cram_external_type option,
1559 void *dat,
1560 int version) {
1561 cram_codec *c;
1562 cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
1563
1564 c = malloc(sizeof(*c));
1565 if (!c)
1566 return NULL;
1567 c->codec = E_BYTE_ARRAY_LEN;
1568 c->free = cram_byte_array_len_encode_free;
1569 c->encode = cram_byte_array_len_encode;
1570 c->store = cram_byte_array_len_encode_store;
1571
1572 c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
1573 st, E_INT,
1574 e->len_dat,
1575 version);
1576 c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
1577 NULL, E_BYTE_ARRAY,
1578 e->val_dat,
1579 version);
1580
1581 return c;
1582 }
1583
1584 /*
1585 * ---------------------------------------------------------------------------
1586 * BYTE_ARRAY_STOP
1587 */
cram_byte_array_stop_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)1588 static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
1589 cram_block *in, char *out,
1590 int *out_size) {
1591 char *cp, ch;
1592 cram_block *b = NULL;
1593
1594 b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
1595 if (!b)
1596 return *out_size?-1:0;
1597
1598 if (b->idx >= b->uncomp_size)
1599 return -1;
1600
1601 cp = (char *)b->data + b->idx;
1602 if (out) {
1603 while ((ch = *cp) != (char)c->byte_array_stop.stop) {
1604 if (cp - (char *)b->data >= b->uncomp_size)
1605 return -1;
1606 *out++ = ch;
1607 cp++;
1608 }
1609 } else {
1610 // Consume input, but produce no output
1611 while ((ch = *cp) != (char)c->byte_array_stop.stop) {
1612 if (cp - (char *)b->data >= b->uncomp_size)
1613 return -1;
1614 cp++;
1615 }
1616 }
1617
1618 *out_size = cp - (char *)(b->data + b->idx);
1619 b->idx = cp - (char *)b->data + 1;
1620
1621 return 0;
1622 }
1623
cram_byte_array_stop_decode_block(cram_slice * slice,cram_codec * c,cram_block * in,char * out_,int * out_size)1624 int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
1625 cram_block *in, char *out_,
1626 int *out_size) {
1627 cram_block *b;
1628 cram_block *out = (cram_block *)out_;
1629 char *cp, *out_cp, *cp_end;
1630 char stop;
1631
1632 b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
1633 if (!b)
1634 return *out_size?-1:0;
1635
1636 if (b->idx >= b->uncomp_size)
1637 return -1;
1638 cp = (char *)b->data + b->idx;
1639 cp_end = (char *)b->data + b->uncomp_size;
1640 out_cp = (char *)BLOCK_END(out);
1641
1642 stop = c->byte_array_stop.stop;
1643 if (cp_end - cp < out->alloc - out->byte) {
1644 while (cp != cp_end && *cp != stop)
1645 *out_cp++ = *cp++;
1646 BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
1647 } else {
1648 char *cp_start;
1649 for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
1650 ;
1651 BLOCK_APPEND(out, cp_start, cp - cp_start);
1652 BLOCK_GROW(out, cp - cp_start);
1653 }
1654
1655 *out_size = cp - (char *)(b->data + b->idx);
1656 b->idx = cp - (char *)b->data + 1;
1657
1658 return 0;
1659 }
1660
cram_byte_array_stop_decode_free(cram_codec * c)1661 void cram_byte_array_stop_decode_free(cram_codec *c) {
1662 if (!c) return;
1663
1664 free(c);
1665 }
1666
cram_byte_array_stop_decode_init(char * data,int size,enum cram_external_type option,int version)1667 cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
1668 enum cram_external_type option,
1669 int version) {
1670 cram_codec *c = NULL;
1671 unsigned char *cp = (unsigned char *)data;
1672
1673 if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
1674 goto malformed;
1675
1676 if (!(c = malloc(sizeof(*c))))
1677 return NULL;
1678
1679 c->codec = E_BYTE_ARRAY_STOP;
1680 switch (option) {
1681 case E_BYTE_ARRAY_BLOCK:
1682 c->decode = cram_byte_array_stop_decode_block;
1683 break;
1684 case E_BYTE_ARRAY:
1685 c->decode = cram_byte_array_stop_decode_char;
1686 break;
1687 default:
1688 hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
1689 free(c);
1690 return NULL;
1691 }
1692 c->free = cram_byte_array_stop_decode_free;
1693
1694 c->byte_array_stop.stop = *cp++;
1695 if (CRAM_MAJOR_VERS(version) == 1) {
1696 c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
1697 + (cp[3]<<24);
1698 cp += 4;
1699 } else {
1700 cp += safe_itf8_get((char *) cp, data + size,
1701 &c->byte_array_stop.content_id);
1702 }
1703
1704 if ((char *)cp - data != size)
1705 goto malformed;
1706
1707 return c;
1708
1709 malformed:
1710 hts_log_error("Malformed byte_array_stop header stream");
1711 free(c);
1712 return NULL;
1713 }
1714
cram_byte_array_stop_encode(cram_slice * slice,cram_codec * c,char * in,int in_size)1715 int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
1716 char *in, int in_size) {
1717 BLOCK_APPEND(c->out, in, in_size);
1718 BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop);
1719 return 0;
1720 }
1721
cram_byte_array_stop_encode_free(cram_codec * c)1722 void cram_byte_array_stop_encode_free(cram_codec *c) {
1723 if (!c)
1724 return;
1725 free(c);
1726 }
1727
cram_byte_array_stop_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1728 int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
1729 char *prefix, int version) {
1730 int len = 0;
1731 char buf[20], *cp = buf;
1732
1733 if (prefix) {
1734 size_t l = strlen(prefix);
1735 BLOCK_APPEND(b, prefix, l);
1736 len += l;
1737 }
1738
1739 cp += itf8_put(cp, c->codec);
1740
1741 if (CRAM_MAJOR_VERS(version) == 1) {
1742 cp += itf8_put(cp, 5);
1743 *cp++ = c->e_byte_array_stop.stop;
1744 *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff;
1745 *cp++ = (c->e_byte_array_stop.content_id >> 8) & 0xff;
1746 *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff;
1747 *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff;
1748 } else {
1749 cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id));
1750 *cp++ = c->e_byte_array_stop.stop;
1751 cp += itf8_put(cp, c->e_byte_array_stop.content_id);
1752 }
1753
1754 BLOCK_APPEND(b, buf, cp-buf);
1755 len += cp-buf;
1756
1757 return len;
1758 }
1759
cram_byte_array_stop_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1760 cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
1761 enum cram_external_type option,
1762 void *dat,
1763 int version) {
1764 cram_codec *c;
1765
1766 c = malloc(sizeof(*c));
1767 if (!c)
1768 return NULL;
1769 c->codec = E_BYTE_ARRAY_STOP;
1770 c->free = cram_byte_array_stop_encode_free;
1771 c->encode = cram_byte_array_stop_encode;
1772 c->store = cram_byte_array_stop_encode_store;
1773
1774 c->e_byte_array_stop.stop = ((int *)dat)[0];
1775 c->e_byte_array_stop.content_id = ((int *)dat)[1];
1776
1777 return c;
1778 }
1779
1780 /*
1781 * ---------------------------------------------------------------------------
1782 */
1783
cram_encoding2str(enum cram_encoding t)1784 const char *cram_encoding2str(enum cram_encoding t) {
1785 switch (t) {
1786 case E_NULL: return "NULL";
1787 case E_EXTERNAL: return "EXTERNAL";
1788 case E_GOLOMB: return "GOLOMB";
1789 case E_HUFFMAN: return "HUFFMAN";
1790 case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN";
1791 case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
1792 case E_BETA: return "BETA";
1793 case E_SUBEXP: return "SUBEXP";
1794 case E_GOLOMB_RICE: return "GOLOMB_RICE";
1795 case E_GAMMA: return "GAMMA";
1796 case E_NUM_CODECS:
1797 default: return "?";
1798 }
1799 }
1800
1801 static cram_codec *(*decode_init[])(char *data,
1802 int size,
1803 enum cram_external_type option,
1804 int version) = {
1805 NULL,
1806 cram_external_decode_init,
1807 NULL,
1808 cram_huffman_decode_init,
1809 cram_byte_array_len_decode_init,
1810 cram_byte_array_stop_decode_init,
1811 cram_beta_decode_init,
1812 cram_subexp_decode_init,
1813 NULL,
1814 cram_gamma_decode_init,
1815 };
1816
cram_decoder_init(enum cram_encoding codec,char * data,int size,enum cram_external_type option,int version)1817 cram_codec *cram_decoder_init(enum cram_encoding codec,
1818 char *data, int size,
1819 enum cram_external_type option,
1820 int version) {
1821 if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
1822 return decode_init[codec](data, size, option, version);
1823 } else {
1824 hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
1825 return NULL;
1826 }
1827 }
1828
1829 static cram_codec *(*encode_init[])(cram_stats *stx,
1830 enum cram_external_type option,
1831 void *opt,
1832 int version) = {
1833 NULL,
1834 cram_external_encode_init,
1835 NULL,
1836 cram_huffman_encode_init,
1837 cram_byte_array_len_encode_init,
1838 cram_byte_array_stop_encode_init,
1839 cram_beta_encode_init,
1840 NULL, //cram_subexp_encode_init,
1841 NULL,
1842 NULL, //cram_gamma_encode_init,
1843 };
1844
cram_encoder_init(enum cram_encoding codec,cram_stats * st,enum cram_external_type option,void * dat,int version)1845 cram_codec *cram_encoder_init(enum cram_encoding codec,
1846 cram_stats *st,
1847 enum cram_external_type option,
1848 void *dat,
1849 int version) {
1850 if (st && !st->nvals)
1851 return NULL;
1852
1853 if (encode_init[codec]) {
1854 cram_codec *r;
1855 if ((r = encode_init[codec](st, option, dat, version)))
1856 r->out = NULL;
1857 return r;
1858 } else {
1859 hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
1860 abort();
1861 }
1862 }
1863
1864 /*
1865 * Returns the content_id used by this codec, also in id2 if byte_array_len.
1866 * Returns -1 for the CORE block and -2 for unneeded.
1867 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
1868 */
cram_codec_to_id(cram_codec * c,int * id2)1869 int cram_codec_to_id(cram_codec *c, int *id2) {
1870 int bnum1, bnum2 = -2;
1871
1872 switch (c->codec) {
1873 case E_HUFFMAN:
1874 bnum1 = c->huffman.ncodes == 1 ? -2 : -1;
1875 break;
1876 case E_GOLOMB:
1877 case E_BETA:
1878 case E_SUBEXP:
1879 case E_GOLOMB_RICE:
1880 case E_GAMMA:
1881 bnum1 = -1;
1882 break;
1883 case E_EXTERNAL:
1884 bnum1 = c->external.content_id;
1885 break;
1886 case E_BYTE_ARRAY_LEN:
1887 bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
1888 bnum2 = cram_codec_to_id(c->byte_array_len.val_codec, NULL);
1889 break;
1890 case E_BYTE_ARRAY_STOP:
1891 bnum1 = c->byte_array_stop.content_id;
1892 break;
1893 case E_NULL:
1894 bnum1 = -2;
1895 break;
1896 default:
1897 hts_log_error("Unknown codec type %d", c->codec);
1898 bnum1 = -1;
1899 }
1900
1901 if (id2)
1902 *id2 = bnum2;
1903 return bnum1;
1904 }
1905
1906
1907 /*
1908 * cram_codec structures are specialised for decoding or encoding.
1909 * Unfortunately this makes turning a decoder into an encoder (such as
1910 * when transcoding files) problematic.
1911 *
1912 * This function converts a cram decoder codec into an encoder version
1913 * in-place (ie it modifiers the codec itself).
1914 *
1915 * Returns 0 on success;
1916 * -1 on failure.
1917 */
cram_codec_decoder2encoder(cram_fd * fd,cram_codec * c)1918 int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
1919 int j;
1920
1921 switch (c->codec) {
1922 case E_EXTERNAL:
1923 // shares struct with decode
1924 c->free = cram_external_encode_free;
1925 c->store = cram_external_encode_store;
1926 if (c->decode == cram_external_decode_int)
1927 c->encode = cram_external_encode_int;
1928 else if (c->decode == cram_external_decode_char)
1929 c->encode = cram_external_encode_char;
1930 else
1931 return -1;
1932 break;
1933
1934 case E_HUFFMAN: {
1935 // New structure, so switch.
1936 // FIXME: we huffman and e_huffman structs amended, we could
1937 // unify this.
1938 cram_codec *t = malloc(sizeof(*t));
1939 t->codec = E_HUFFMAN;
1940 t->free = cram_huffman_encode_free;
1941 t->store = cram_huffman_encode_store;
1942 t->e_huffman.codes = c->huffman.codes;
1943 t->e_huffman.nvals = c->huffman.ncodes;
1944 for (j = 0; j < t->e_huffman.nvals; j++) {
1945 int32_t sym = t->e_huffman.codes[j].symbol;
1946 if (sym >= -1 && sym < MAX_HUFF)
1947 t->e_huffman.val2code[sym+1] = j;
1948 }
1949
1950 if (c->decode == cram_huffman_decode_char0)
1951 t->encode = cram_huffman_encode_char0;
1952 else if (c->decode == cram_huffman_decode_char)
1953 t->encode = cram_huffman_encode_char;
1954 else if (c->decode == cram_huffman_decode_int0)
1955 t->encode = cram_huffman_encode_int0;
1956 else if (c->decode == cram_huffman_decode_int)
1957 t->encode = cram_huffman_encode_int;
1958 else {
1959 free(t);
1960 return -1;
1961 }
1962 *c = *t;
1963 free(t);
1964 break;
1965 }
1966
1967 case E_BETA:
1968 // shares struct with decode
1969 c->free = cram_beta_encode_free;
1970 c->store = cram_beta_encode_store;
1971 if (c->decode == cram_beta_decode_int)
1972 c->encode = cram_beta_encode_int;
1973 else if (c->decode == cram_beta_decode_char)
1974 c->encode = cram_beta_encode_char;
1975 else
1976 return -1;
1977 break;
1978
1979 case E_BYTE_ARRAY_LEN: {
1980 cram_codec *t = malloc(sizeof(*t));
1981 t->codec = E_BYTE_ARRAY_LEN;
1982 t->free = cram_byte_array_len_encode_free;
1983 t->store = cram_byte_array_len_encode_store;
1984 t->encode = cram_byte_array_len_encode;
1985 t->e_byte_array_len.len_codec = c->byte_array_len.len_codec;
1986 t->e_byte_array_len.val_codec = c->byte_array_len.val_codec;
1987 if (cram_codec_decoder2encoder(fd, t->e_byte_array_len.len_codec) == -1 ||
1988 cram_codec_decoder2encoder(fd, t->e_byte_array_len.val_codec) == -1) {
1989 t->free(t);
1990 return -1;
1991 }
1992
1993 // {len,val}_{encoding,dat} are undefined, but unused.
1994 // Leaving them unset here means we can test that assertion.
1995 *c = *t;
1996 free(t);
1997 break;
1998 }
1999
2000 case E_BYTE_ARRAY_STOP:
2001 // shares struct with decode
2002 c->free = cram_byte_array_stop_encode_free;
2003 c->store = cram_byte_array_stop_encode_store;
2004 c->encode = cram_byte_array_stop_encode;
2005 break;
2006
2007 default:
2008 return -1;
2009 }
2010
2011 return 0;
2012 }
2013