1 /*
2 Copyright (c) 2012-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*! \file
32  * Include cram.h instead.
33  *
34  * This is an internal part of the CRAM system and is automatically included
35  * when you #include cram.h.
36  *
37  * Implements the low level CRAM I/O primitives.
38  * This includes basic data types such as byte, int, ITF-8,
39  * maps, bitwise I/O, etc.
40  */
41 
42 #ifndef CRAM_IO_H
43 #define CRAM_IO_H
44 
45 #include <stdint.h>
46 
47 #include "misc.h"
48 
49 #ifdef __cplusplus
50 extern "C" {
51 #endif
52 
53 /**@{ ----------------------------------------------------------------------
54  * ITF8 encoding and decoding.
55  *
56  * Also see the itf8_get and itf8_put macros.
57  */
58 
59 /*! INTERNAL: Converts two characters into an integer for use in switch{} */
60 #define CRAM_KEY(a,b) ((((unsigned char) a)<<8)|(((unsigned char) b)))
61 
62 /*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
63  * *val.
64  *
65  * @return
66  * Returns the number of bytes read on success;
67  *        -1 on failure
68  */
69 int itf8_decode(cram_fd *fd, int32_t *val);
70 
itf8_get(char * cp,int32_t * val_p)71 static inline int itf8_get(char *cp, int32_t *val_p) {
72     unsigned char *up = (unsigned char *)cp;
73 
74     if (up[0] < 0x80) {
75         *val_p =   up[0];
76         return 1;
77     } else if (up[0] < 0xc0) {
78         *val_p = ((up[0] <<8) |  up[1])                           & 0x3fff;
79         return 2;
80     } else if (up[0] < 0xe0) {
81         *val_p = ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
82         return 3;
83     } else if (up[0] < 0xf0) {
84         *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
85         return 4;
86     } else {
87         *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
88         return 5;
89     }
90 }
91 
92 /*
93  * Stores a value to memory in ITF-8 format.
94  *
95  * Returns the number of bytes required to store the number.
96  * This is a maximum of 5 bytes.
97  */
itf8_put(char * cp,int32_t val)98 static inline int itf8_put(char *cp, int32_t val) {
99     unsigned char *up = (unsigned char *)cp;
100     if        (!(val & ~0x00000007f)) { // 1 byte
101         *up = val;
102         return 1;
103     } else if (!(val & ~0x00003fff)) { // 2 byte
104         *up++ = (val >> 8 ) | 0x80;
105         *up   = val & 0xff;
106         return 2;
107     } else if (!(val & ~0x01fffff)) { // 3 byte
108         *up++ = (val >> 16) | 0xc0;
109         *up++ = (val >> 8 ) & 0xff;
110         *up   = val & 0xff;
111         return 3;
112     } else if (!(val & ~0x0fffffff)) { // 4 byte
113         *up++ = (val >> 24) | 0xe0;
114         *up++ = (val >> 16) & 0xff;
115         *up++ = (val >> 8 ) & 0xff;
116         *up   = val & 0xff;
117         return 4;
118     } else {                           // 5 byte
119         *up++ = 0xf0 | ((val>>28) & 0xff);
120         *up++ = (val >> 20) & 0xff;
121         *up++ = (val >> 12) & 0xff;
122         *up++ = (val >> 4 ) & 0xff;
123         *up = val & 0x0f;
124         return 5;
125     }
126 }
127 
128 
129 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)130 static inline int ltf8_put(char *cp, int64_t val) {
131     unsigned char *up = (unsigned char *)cp;
132     if        (!(val & ~((1LL<<7)-1))) {
133         *up = val;
134         return 1;
135     } else if (!(val & ~((1LL<<(6+8))-1))) {
136         *up++ = (val >> 8 ) | 0x80;
137         *up   = val & 0xff;
138         return 2;
139     } else if (!(val & ~((1LL<<(5+2*8))-1))) {
140         *up++ = (val >> 16) | 0xc0;
141         *up++ = (val >> 8 ) & 0xff;
142         *up   = val & 0xff;
143         return 3;
144     } else if (!(val & ~((1LL<<(4+3*8))-1))) {
145         *up++ = (val >> 24) | 0xe0;
146         *up++ = (val >> 16) & 0xff;
147         *up++ = (val >> 8 ) & 0xff;
148         *up   = val & 0xff;
149         return 4;
150     } else if (!(val & ~((1LL<<(3+4*8))-1))) {
151         *up++ = (val >> 32) | 0xf0;
152         *up++ = (val >> 24) & 0xff;
153         *up++ = (val >> 16) & 0xff;
154         *up++ = (val >> 8 ) & 0xff;
155         *up   = val & 0xff;
156         return 5;
157     } else if (!(val & ~((1LL<<(2+5*8))-1))) {
158         *up++ = (val >> 40) | 0xf8;
159         *up++ = (val >> 32) & 0xff;
160         *up++ = (val >> 24) & 0xff;
161         *up++ = (val >> 16) & 0xff;
162         *up++ = (val >> 8 ) & 0xff;
163         *up   = val & 0xff;
164         return 6;
165     } else if (!(val & ~((1LL<<(1+6*8))-1))) {
166         *up++ = (val >> 48) | 0xfc;
167         *up++ = (val >> 40) & 0xff;
168         *up++ = (val >> 32) & 0xff;
169         *up++ = (val >> 24) & 0xff;
170         *up++ = (val >> 16) & 0xff;
171         *up++ = (val >> 8 ) & 0xff;
172         *up   = val & 0xff;
173         return 7;
174     } else if (!(val & ~((1LL<<(7*8))-1))) {
175         *up++ = (val >> 56) | 0xfe;
176         *up++ = (val >> 48) & 0xff;
177         *up++ = (val >> 40) & 0xff;
178         *up++ = (val >> 32) & 0xff;
179         *up++ = (val >> 24) & 0xff;
180         *up++ = (val >> 16) & 0xff;
181         *up++ = (val >> 8 ) & 0xff;
182         *up   = val & 0xff;
183         return 8;
184     } else {
185         *up++ = 0xff;
186         *up++ = (val >> 56) & 0xff;
187         *up++ = (val >> 48) & 0xff;
188         *up++ = (val >> 40) & 0xff;
189         *up++ = (val >> 32) & 0xff;
190         *up++ = (val >> 24) & 0xff;
191         *up++ = (val >> 16) & 0xff;
192         *up++ = (val >> 8 ) & 0xff;
193         *up   = val & 0xff;
194         return 9;
195     }
196 }
197 
ltf8_get(char * cp,int64_t * val_p)198 static inline int ltf8_get(char *cp, int64_t *val_p) {
199     unsigned char *up = (unsigned char *)cp;
200 
201     if (up[0] < 0x80) {
202         *val_p =   up[0];
203         return 1;
204     } else if (up[0] < 0xc0) {
205         *val_p = (((uint64_t)up[0]<< 8) |
206                    (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
207         return 2;
208     } else if (up[0] < 0xe0) {
209         *val_p = (((uint64_t)up[0]<<16) |
210                   ((uint64_t)up[1]<< 8) |
211                    (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
212         return 3;
213     } else if (up[0] < 0xf0) {
214         *val_p = (((uint64_t)up[0]<<24) |
215                   ((uint64_t)up[1]<<16) |
216                   ((uint64_t)up[2]<< 8) |
217                    (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
218         return 4;
219     } else if (up[0] < 0xf8) {
220         *val_p = (((uint64_t)up[0]<<32) |
221                   ((uint64_t)up[1]<<24) |
222                   ((uint64_t)up[2]<<16) |
223                   ((uint64_t)up[3]<< 8) |
224                    (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
225         return 5;
226     } else if (up[0] < 0xfc) {
227         *val_p = (((uint64_t)up[0]<<40) |
228                   ((uint64_t)up[1]<<32) |
229                   ((uint64_t)up[2]<<24) |
230                   ((uint64_t)up[3]<<16) |
231                   ((uint64_t)up[4]<< 8) |
232                    (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
233         return 6;
234     } else if (up[0] < 0xfe) {
235         *val_p = (((uint64_t)up[0]<<48) |
236                   ((uint64_t)up[1]<<40) |
237                   ((uint64_t)up[2]<<32) |
238                   ((uint64_t)up[3]<<24) |
239                   ((uint64_t)up[4]<<16) |
240                   ((uint64_t)up[5]<< 8) |
241                    (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
242         return 7;
243     } else if (up[0] < 0xff) {
244         *val_p = (((uint64_t)up[1]<<48) |
245                   ((uint64_t)up[2]<<40) |
246                   ((uint64_t)up[3]<<32) |
247                   ((uint64_t)up[4]<<24) |
248                   ((uint64_t)up[5]<<16) |
249                   ((uint64_t)up[6]<< 8) |
250                    (uint64_t)up[7]) & ((1LL<<(7*8))-1);
251         return 8;
252     } else {
253         *val_p = (((uint64_t)up[1]<<56) |
254                   ((uint64_t)up[2]<<48) |
255                   ((uint64_t)up[3]<<40) |
256                   ((uint64_t)up[4]<<32) |
257                   ((uint64_t)up[5]<<24) |
258                   ((uint64_t)up[6]<<16) |
259                   ((uint64_t)up[7]<< 8) |
260                    (uint64_t)up[8]);
261         return 9;
262     }
263 }
264 
265 #define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
266 
267 
268 /* Version of itf8_get that checks it hasn't run out of input */
269 
270 extern const int itf8_bytes[16];
271 extern const int ltf8_bytes[256];
272 
safe_itf8_get(const char * cp,const char * endp,int32_t * val_p)273 static inline int safe_itf8_get(const char *cp, const char *endp,
274                                 int32_t *val_p) {
275     const unsigned char *up = (unsigned char *)cp;
276 
277     if (endp - cp < 5 &&
278         (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) {
279         *val_p = 0;
280         return 0;
281     }
282 
283     if (up[0] < 0x80) {
284         *val_p =   up[0];
285         return 1;
286     } else if (up[0] < 0xc0) {
287         *val_p = ((up[0] <<8) |  up[1])                           & 0x3fff;
288         return 2;
289     } else if (up[0] < 0xe0) {
290         *val_p = ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
291         return 3;
292     } else if (up[0] < 0xf0) {
293         *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
294         return 4;
295     } else {
296         uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
297         *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
298         return 5;
299     }
300 }
301 
safe_ltf8_get(const char * cp,const char * endp,int64_t * val_p)302 static inline int safe_ltf8_get(const char *cp, const char *endp,
303                                 int64_t *val_p) {
304     unsigned char *up = (unsigned char *)cp;
305 
306     if (endp - cp < 9 &&
307         (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0;
308 
309     if (up[0] < 0x80) {
310         *val_p =   up[0];
311         return 1;
312     } else if (up[0] < 0xc0) {
313         *val_p = (((uint64_t)up[0]<< 8) |
314                    (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
315         return 2;
316     } else if (up[0] < 0xe0) {
317         *val_p = (((uint64_t)up[0]<<16) |
318                   ((uint64_t)up[1]<< 8) |
319                    (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
320         return 3;
321     } else if (up[0] < 0xf0) {
322         *val_p = (((uint64_t)up[0]<<24) |
323                   ((uint64_t)up[1]<<16) |
324                   ((uint64_t)up[2]<< 8) |
325                    (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
326         return 4;
327     } else if (up[0] < 0xf8) {
328         *val_p = (((uint64_t)up[0]<<32) |
329                   ((uint64_t)up[1]<<24) |
330                   ((uint64_t)up[2]<<16) |
331                   ((uint64_t)up[3]<< 8) |
332                    (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
333         return 5;
334     } else if (up[0] < 0xfc) {
335         *val_p = (((uint64_t)up[0]<<40) |
336                   ((uint64_t)up[1]<<32) |
337                   ((uint64_t)up[2]<<24) |
338                   ((uint64_t)up[3]<<16) |
339                   ((uint64_t)up[4]<< 8) |
340                    (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
341         return 6;
342     } else if (up[0] < 0xfe) {
343         *val_p = (((uint64_t)up[0]<<48) |
344                   ((uint64_t)up[1]<<40) |
345                   ((uint64_t)up[2]<<32) |
346                   ((uint64_t)up[3]<<24) |
347                   ((uint64_t)up[4]<<16) |
348                   ((uint64_t)up[5]<< 8) |
349                    (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
350         return 7;
351     } else if (up[0] < 0xff) {
352         *val_p = (((uint64_t)up[1]<<48) |
353                   ((uint64_t)up[2]<<40) |
354                   ((uint64_t)up[3]<<32) |
355                   ((uint64_t)up[4]<<24) |
356                   ((uint64_t)up[5]<<16) |
357                   ((uint64_t)up[6]<< 8) |
358                    (uint64_t)up[7]) & ((1LL<<(7*8))-1);
359         return 8;
360     } else {
361         *val_p = (((uint64_t)up[1]<<56) |
362                   ((uint64_t)up[2]<<48) |
363                   ((uint64_t)up[3]<<40) |
364                   ((uint64_t)up[4]<<32) |
365                   ((uint64_t)up[5]<<24) |
366                   ((uint64_t)up[6]<<16) |
367                   ((uint64_t)up[7]<< 8) |
368                    (uint64_t)up[8]);
369         return 9;
370     }
371 }
372 
373 /*! Pushes a value in ITF8 format onto the end of a block.
374  *
375  * This shouldn't be used for high-volume data as it is not the fastest
376  * method.
377  *
378  * @return
379  * Returns the number of bytes written
380  */
381 int itf8_put_blk(cram_block *blk, int32_t val);
382 int ltf8_put_blk(cram_block *blk, int64_t val);
383 
384 /*! Pulls a literal 32-bit value from a block.
385  *
386  * @returns the number of bytes decoded;
387  *         -1 on failure.
388  */
389 int int32_get_blk(cram_block *b, int32_t *val);
390 
391 /*! Pushes a literal 32-bit value onto the end of a block.
392  *
393  * @return
394  * Returns 0 on success;
395  *        -1 on failure.
396  */
397 int int32_put_blk(cram_block *blk, int32_t val);
398 
399 
400 /**@}*/
401 /**@{ ----------------------------------------------------------------------
402  * CRAM blocks - the dynamically growable data block. We have code to
403  * create, update, (un)compress and read/write.
404  *
405  * These are derived from the deflate_interlaced.c blocks, but with the
406  * CRAM extension of content types and IDs.
407  */
408 
409 /*! Allocates a new cram_block structure with a specified content_type and
410  * id.
411  *
412  * @return
413  * Returns block pointer on success;
414  *         NULL on failure
415  */
416 cram_block *cram_new_block(enum cram_content_type content_type,
417                            int content_id);
418 
419 /*! Reads a block from a cram file.
420  *
421  * @return
422  * Returns cram_block pointer on success;
423  *         NULL on failure
424  */
425 cram_block *cram_read_block(cram_fd *fd);
426 
427 /*! Writes a CRAM block.
428  *
429  * @return
430  * Returns 0 on success;
431  *        -1 on failure
432  */
433 int cram_write_block(cram_fd *fd, cram_block *b);
434 
435 /*! Frees a CRAM block, deallocating internal data too.
436  */
437 void cram_free_block(cram_block *b);
438 
439 /*! Uncompress a memory block using Zlib.
440  *
441  * @return
442  * Returns 0 on success;
443  *        -1 on failure
444  */
445 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
446 
447 /*! Uncompresses a CRAM block, if compressed.
448  *
449  * @return
450  * Returns 0 on success;
451  *        -1 on failure
452  */
453 int cram_uncompress_block(cram_block *b);
454 
455 /*! Compresses a block.
456  *
457  * Compresses a block using one of two different zlib strategies. If we only
458  * want one choice set strat2 to be -1.
459  *
460  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
461  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
462  * significantly faster.
463  *
464  * @return
465  * Returns 0 on success;
466  *        -1 on failure
467  */
468 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
469                         int method, int level);
470 
471 cram_metrics *cram_new_metrics(void);
472 char *cram_block_method2str(enum cram_block_method m);
473 char *cram_content_type2str(enum cram_content_type t);
474 
475 /*
476  * Find an external block by its content_id
477  */
478 
cram_get_block_by_id(cram_slice * slice,int id)479 static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) {
480   //fprintf(stderr, "%d\t%p\n", id, slice->block_by_id);
481     uint32_t v = id;
482     if (slice->block_by_id && v < 256) {
483         return slice->block_by_id[v];
484     } else {
485         v = 256 + v % 251;
486         if (slice->block_by_id &&
487             slice->block_by_id[v] &&
488             slice->block_by_id[v]->content_id == id)
489             return slice->block_by_id[v];
490 
491         // Otherwise a linear search in case of collision
492         int i;
493         for (i = 0; i < slice->hdr->num_blocks; i++) {
494             cram_block *b = slice->block[i];
495             if (b && b->content_type == EXTERNAL && b->content_id == id)
496                 return b;
497         }
498     }
499     return NULL;
500 }
501 
502 /* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
503 
504 /* Block size and data pointer. */
505 #define BLOCK_SIZE(b) ((b)->byte)
506 #define BLOCK_DATA(b) ((b)->data)
507 
508 /* Returns the address one past the end of the block */
509 #define BLOCK_END(b) (&(b)->data[(b)->byte])
510 
511 /* Make block exactly 'l' bytes long */
block_resize_exact(cram_block * b,size_t len)512 static inline int block_resize_exact(cram_block *b, size_t len) {
513     unsigned char *tmp = realloc(b->data, len);
514     if (!tmp)
515         return -1;
516     b->alloc = len;
517     b->data = tmp;
518     return 0;
519 }
520 
521 /* Request block to be at least 'l' bytes long */
block_resize(cram_block * b,size_t len)522 static inline int block_resize(cram_block *b, size_t len) {
523     if (b->alloc > len)
524         return 0;
525 
526     size_t alloc = b->alloc;
527     while (alloc <= len)
528         alloc = alloc ? alloc*1.5 : 1024;
529 
530     return block_resize_exact(b, alloc);
531 }
532 
533 
534 /* Ensure the block can hold at least another 'l' bytes */
block_grow(cram_block * b,size_t len)535 static inline int block_grow(cram_block *b, size_t len) {
536     return block_resize(b, BLOCK_SIZE(b) + len);
537 }
538 
539 /* Append string 's' of length 'l'. */
block_append(cram_block * b,const void * s,size_t len)540 static inline int block_append(cram_block *b, const void *s, size_t len) {
541     if (block_grow(b, len) < 0)
542         return -1;
543 
544     memcpy(BLOCK_END(b), s, len);
545     BLOCK_SIZE(b) += len;
546 
547     return 0;
548 }
549 
550 /* Append as single character 'c' */
block_append_char(cram_block * b,char c)551 static inline int block_append_char(cram_block *b, char c) {
552     if (block_grow(b, 1) < 0)
553         return -1;
554 
555     b->data[b->byte++] = c;
556     return 0;
557 }
558 
559 /* Append a single unsigned integer */
560 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i);
block_append_uint(cram_block * b,unsigned int i)561 static inline int block_append_uint(cram_block *b, unsigned int i) {
562     if (block_grow(b, 11) < 0)
563         return -1;
564 
565     unsigned char *cp = &b->data[b->byte];
566     b->byte += append_uint32(cp, i) - cp;
567     return 0;
568 }
569 
570 // Versions of above with built in goto block_err calls.
571 #define BLOCK_RESIZE_EXACT(b,l) if (block_resize_exact((b),(l))<0) goto block_err
572 #define BLOCK_RESIZE(b,l)       if (block_resize((b),(l))      <0) goto block_err
573 #define BLOCK_GROW(b,l)         if (block_grow((b),(l))        <0) goto block_err
574 #define BLOCK_APPEND(b,s,l)     if (block_append((b),(s),(l))  <0) goto block_err
575 #define BLOCK_APPEND_CHAR(b,c)  if (block_append_char((b),(c)) <0) goto block_err
576 #define BLOCK_APPEND_UINT(b,i)  if (block_append_uint((b),(i)) <0) goto block_err
577 
append_uint32(unsigned char * cp,uint32_t i)578 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
579     uint32_t j;
580 
581     if (i == 0) {
582         *cp++ = '0';
583         return cp;
584     }
585 
586     if (i < 100)        goto b1;
587     if (i < 10000)      goto b3;
588     if (i < 1000000)    goto b5;
589     if (i < 100000000)  goto b7;
590 
591     if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
592     if ((j = i / 100000000))  {*cp++ = j + '0'; i -= j*100000000;  goto x7;}
593  b7:if ((j = i / 10000000))   {*cp++ = j + '0'; i -= j*10000000;   goto x6;}
594     if ((j = i / 1000000))    {*cp++ = j + '0', i -= j*1000000;    goto x5;}
595  b5:if ((j = i / 100000))     {*cp++ = j + '0', i -= j*100000;     goto x4;}
596     if ((j = i / 10000))      {*cp++ = j + '0', i -= j*10000;      goto x3;}
597  b3:if ((j = i / 1000))       {*cp++ = j + '0', i -= j*1000;       goto x2;}
598     if ((j = i / 100))        {*cp++ = j + '0', i -= j*100;        goto x1;}
599  b1:if ((j = i / 10))         {*cp++ = j + '0', i -= j*10;         goto x0;}
600     if (i)                     *cp++ = i + '0';
601     return cp;
602 
603  x8: *cp++ = i / 100000000 + '0', i %= 100000000;
604  x7: *cp++ = i / 10000000  + '0', i %= 10000000;
605  x6: *cp++ = i / 1000000   + '0', i %= 1000000;
606  x5: *cp++ = i / 100000    + '0', i %= 100000;
607  x4: *cp++ = i / 10000     + '0', i %= 10000;
608  x3: *cp++ = i / 1000      + '0', i %= 1000;
609  x2: *cp++ = i / 100       + '0', i %= 100;
610  x1: *cp++ = i / 10        + '0', i %= 10;
611  x0: *cp++ = i             + '0';
612 
613     return cp;
614 }
615 
append_sub32(unsigned char * cp,uint32_t i)616 static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
617     *cp++ = i / 100000000 + '0', i %= 100000000;
618     *cp++ = i / 10000000  + '0', i %= 10000000;
619     *cp++ = i / 1000000   + '0', i %= 1000000;
620     *cp++ = i / 100000    + '0', i %= 100000;
621     *cp++ = i / 10000     + '0', i %= 10000;
622     *cp++ = i / 1000      + '0', i %= 1000;
623     *cp++ = i / 100       + '0', i %= 100;
624     *cp++ = i / 10        + '0', i %= 10;
625     *cp++ = i             + '0';
626 
627     return cp;
628 }
629 
append_uint64(unsigned char * cp,uint64_t i)630 static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
631     uint64_t j;
632 
633     if (i <= 0xffffffff)
634         return append_uint32(cp, i);
635 
636     if ((j = i/1000000000) > 1000000000) {
637         cp = append_uint32(cp, j/1000000000);
638         j %= 1000000000;
639         cp = append_sub32(cp, j);
640     } else {
641         cp = append_uint32(cp, i / 1000000000);
642     }
643     cp = append_sub32(cp, i % 1000000000);
644 
645     return cp;
646 }
647 
648 #define BLOCK_UPLEN(b) \
649     (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
650 
651 /**@}*/
652 /**@{ ----------------------------------------------------------------------
653  * Reference sequence handling
654  */
655 
656 /*! Loads a reference set from fn and stores in the cram_fd.
657  *
658  * @return
659  * Returns 0 on success;
660  *        -1 on failure
661  */
662 int cram_load_reference(cram_fd *fd, char *fn);
663 
664 /*! Generates a lookup table in refs based on the SQ headers in sam_hdr_t.
665  *
666  * Indexes references by the order they appear in a BAM file. This may not
667  * necessarily be the same order they appear in the fasta reference file.
668  *
669  * @return
670  * Returns 0 on success;
671  *        -1 on failure
672  */
673 int refs2id(refs_t *r, sam_hdr_t *hdr);
674 
675 void refs_free(refs_t *r);
676 
677 /*! Returns a portion of a reference sequence from start to end inclusive.
678  *
679  * The returned pointer is owned by the cram_file fd and should not be freed
680  * by the caller. It is valid only until the next cram_get_ref is called
681  * with the same fd parameter (so is thread-safe if given multiple files).
682  *
683  * To return the entire reference sequence, specify start as 1 and end
684  * as 0.
685  *
686  * @return
687  * Returns reference on success;
688  *         NULL on failure
689  */
690 char *cram_get_ref(cram_fd *fd, int id, int start, int end);
691 void cram_ref_incr(refs_t *r, int id);
692 void cram_ref_decr(refs_t *r, int id);
693 /**@}*/
694 /**@{ ----------------------------------------------------------------------
695  * Containers
696  */
697 
698 /*! Creates a new container, specifying the maximum number of slices
699  * and records permitted.
700  *
701  * @return
702  * Returns cram_container ptr on success;
703  *         NULL on failure
704  */
705 cram_container *cram_new_container(int nrec, int nslice);
706 void cram_free_container(cram_container *c);
707 
708 /*! Reads a container header.
709  *
710  * @return
711  * Returns cram_container on success;
712  *         NULL on failure or no container left (fd->err == 0).
713  */
714 cram_container *cram_read_container(cram_fd *fd);
715 
716 /*! Writes a container structure.
717  *
718  * @return
719  * Returns 0 on success;
720  *        -1 on failure
721  */
722 int cram_write_container(cram_fd *fd, cram_container *h);
723 
724 /*! Flushes a container to disk.
725  *
726  * Flushes a completely or partially full container to disk, writing
727  * container structure, header and blocks. This also calls the encoder
728  * functions.
729  *
730  * @return
731  * Returns 0 on success;
732  *        -1 on failure
733  */
734 int cram_flush_container(cram_fd *fd, cram_container *c);
735 int cram_flush_container_mt(cram_fd *fd, cram_container *c);
736 
737 
738 /**@}*/
739 /**@{ ----------------------------------------------------------------------
740  * Compression headers; the first part of the container
741  */
742 
743 /*! Creates a new blank container compression header
744  *
745  * @return
746  * Returns header ptr on success;
747  *         NULL on failure
748  */
749 cram_block_compression_hdr *cram_new_compression_header(void);
750 
751 /*! Frees a cram_block_compression_hdr */
752 void cram_free_compression_header(cram_block_compression_hdr *hdr);
753 
754 
755 /**@}*/
756 /**@{ ----------------------------------------------------------------------
757  * Slices and slice headers
758  */
759 
760 /*! Frees a slice header */
761 void cram_free_slice_header(cram_block_slice_hdr *hdr);
762 
763 /*! Frees a slice */
764 void cram_free_slice(cram_slice *s);
765 
766 /*! Creates a new empty slice in memory, for subsequent writing to
767  * disk.
768  *
769  * @return
770  * Returns cram_slice ptr on success;
771  *         NULL on failure
772  */
773 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
774 
775 /*! Loads an entire slice.
776  *
777  * FIXME: In 1.0 the native unit of slices within CRAM is broken
778  * as slices contain references to objects in other slices.
779  * To work around this while keeping the slice oriented outer loop
780  * we read all slices and stitch them together into a fake large
781  * slice instead.
782  *
783  * @return
784  * Returns cram_slice ptr on success;
785  *         NULL on failure
786  */
787 cram_slice *cram_read_slice(cram_fd *fd);
788 
789 
790 
791 /**@}*/
792 /**@{ ----------------------------------------------------------------------
793  * CRAM file definition (header)
794  */
795 
796 /*! Reads a CRAM file definition structure.
797  *
798  * @return
799  * Returns file_def ptr on success;
800  *         NULL on failure
801  */
802 cram_file_def *cram_read_file_def(cram_fd *fd);
803 
804 /*! Writes a cram_file_def structure to cram_fd.
805  *
806  * @return
807  * Returns 0 on success;
808  *        -1 on failure
809  */
810 int cram_write_file_def(cram_fd *fd, cram_file_def *def);
811 
812 /*! Frees a cram_file_def structure. */
813 void cram_free_file_def(cram_file_def *def);
814 
815 
816 /**@}*/
817 /**@{ ----------------------------------------------------------------------
818  * SAM header I/O
819  */
820 
821 /*! Reads the SAM header from the first CRAM data block.
822  *
823  * Also performs minimal parsing to extract read-group
824  * and sample information.
825  *
826  * @return
827  * Returns SAM hdr ptr on success;
828  *         NULL on failure
829  */
830 sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd);
831 
832 /*! Writes a CRAM SAM header.
833  *
834  * @return
835  * Returns 0 on success;
836  *        -1 on failure
837  */
838 int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr);
839 
840 
841 /**@}*/
842 /**@{ ----------------------------------------------------------------------
843  * The top-level cram opening, closing and option handling
844  */
845 
846 /*! Opens a CRAM file for read (mode "rb") or write ("wb").
847  *
848  * The filename may be "-" to indicate stdin or stdout.
849  *
850  * @return
851  * Returns file handle on success;
852  *         NULL on failure.
853  */
854 cram_fd *cram_open(const char *filename, const char *mode);
855 
856 /*! Opens an existing stream for reading or writing.
857  *
858  * @return
859  * Returns file handle on success;
860  *         NULL on failure.
861  */
862 cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
863 
864 /*! Closes a CRAM file.
865  *
866  * @return
867  * Returns 0 on success;
868  *        -1 on failure
869  */
870 int cram_close(cram_fd *fd);
871 
872 /*
873  * Seek within a CRAM file.
874  *
875  * Returns 0 on success
876  *        -1 on failure
877  */
878 int cram_seek(cram_fd *fd, off_t offset, int whence);
879 
880 /*
881  * Flushes a CRAM file.
882  * Useful for when writing to stdout without wishing to close the stream.
883  *
884  * Returns 0 on success
885  *        -1 on failure
886  */
887 int cram_flush(cram_fd *fd);
888 
889 /*! Checks for end of file on a cram_fd stream.
890  *
891  * @return
892  * Returns 0 if not at end of file
893  *         1 if we hit an expected EOF (end of range or EOF block)
894  *         2 for other EOF (end of stream without EOF block)
895  */
896 int cram_eof(cram_fd *fd);
897 
898 /*! Sets options on the cram_fd.
899  *
900  * See CRAM_OPT_* definitions in cram_structs.h.
901  * Use this immediately after opening.
902  *
903  * @return
904  * Returns 0 on success;
905  *        -1 on failure
906  */
907 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...);
908 
909 /*! Sets options on the cram_fd.
910  *
911  * See CRAM_OPT_* definitions in cram_structs.h.
912  * Use this immediately after opening.
913  *
914  * @return
915  * Returns 0 on success;
916  *        -1 on failure
917  */
918 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args);
919 
920 /*!
921  * Attaches a header to a cram_fd.
922  *
923  * This should be used when creating a new cram_fd for writing where
924  * we have an sam_hdr_t already constructed (eg from a file we've read
925  * in).
926  *
927  * @return
928  * Returns 0 on success;
929  *        -1 on failure
930  */
931 int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr);
932 
933 /*!
934  * Returns the hFILE connected to a cram_fd.
935  */
cram_hfile(cram_fd * fd)936 static inline struct hFILE *cram_hfile(cram_fd *fd) {
937     return fd->fp;
938 }
939 
940 #ifdef __cplusplus
941 }
942 #endif
943 
944 #endif /* CRAM_IO_H */
945