1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*! \file
32  * Include cram.h instead.
33  *
34  * This is an internal part of the CRAM system and is automatically included
35  * when you #include cram.h.
36  *
37  * Implements the low level CRAM I/O primitives.
38  * This includes basic data types such as byte, int, ITF-8,
39  * maps, bitwise I/O, etc.
40  */
41 
42 #ifndef _CRAM_IO_H_
43 #define _CRAM_IO_H_
44 
45 #include <stdint.h>
46 #include <cram/misc.h>
47 
48 #ifdef __cplusplus
49 extern "C" {
50 #endif
51 
52 /**@{ ----------------------------------------------------------------------
53  * ITF8 encoding and decoding.
54  *
55  * Also see the itf8_get and itf8_put macros.
56  */
57 
58 /*! INTERNAL: Converts two characters into an integer for use in switch{} */
59 #define CRAM_KEY(a,b) (((a)<<8)|((b)))
60 
61 /*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
62  * *val.
63  *
64  * @return
65  * Returns the number of bytes read on success;
66  *        -1 on failure
67  */
68 int itf8_decode(cram_fd *fd, int32_t *val);
69 
itf8_get(char * cp,int32_t * val_p)70 static inline int itf8_get(char *cp, int32_t *val_p) {
71     unsigned char *up = (unsigned char *)cp;
72 
73     if (up[0] < 0x80) {
74         *val_p =   up[0];
75         return 1;
76     } else if (up[0] < 0xc0) {
77         *val_p = ((up[0] <<8) |  up[1])                           & 0x3fff;
78         return 2;
79     } else if (up[0] < 0xe0) {
80         *val_p = ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
81         return 3;
82     } else if (up[0] < 0xf0) {
83         *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
84         return 4;
85     } else {
86         *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
87         return 5;
88     }
89 }
90 
91 /*
92  * Stores a value to memory in ITF-8 format.
93  *
94  * Returns the number of bytes required to store the number.
95  * This is a maximum of 5 bytes.
96  */
itf8_put(char * cp,int32_t val)97 static inline int itf8_put(char *cp, int32_t val) {
98     unsigned char *up = (unsigned char *)cp;
99     if        (!(val & ~0x00000007f)) { // 1 byte
100         *up = val;
101         return 1;
102     } else if (!(val & ~0x00003fff)) { // 2 byte
103         *up++ = (val >> 8 ) | 0x80;
104         *up   = val & 0xff;
105         return 2;
106     } else if (!(val & ~0x01fffff)) { // 3 byte
107         *up++ = (val >> 16) | 0xc0;
108         *up++ = (val >> 8 ) & 0xff;
109         *up   = val & 0xff;
110         return 3;
111     } else if (!(val & ~0x0fffffff)) { // 4 byte
112         *up++ = (val >> 24) | 0xe0;
113         *up++ = (val >> 16) & 0xff;
114         *up++ = (val >> 8 ) & 0xff;
115         *up   = val & 0xff;
116         return 4;
117     } else {                           // 5 byte
118         *up++ = 0xf0 | ((val>>28) & 0xff);
119         *up++ = (val >> 20) & 0xff;
120         *up++ = (val >> 12) & 0xff;
121         *up++ = (val >> 4 ) & 0xff;
122         *up = val & 0x0f;
123         return 5;
124     }
125 }
126 
127 
128 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)129 static inline int ltf8_put(char *cp, int64_t val) {
130     unsigned char *up = (unsigned char *)cp;
131     if        (!(val & ~((1LL<<7)-1))) {
132         *up = val;
133         return 1;
134     } else if (!(val & ~((1LL<<(6+8))-1))) {
135         *up++ = (val >> 8 ) | 0x80;
136         *up   = val & 0xff;
137         return 2;
138     } else if (!(val & ~((1LL<<(5+2*8))-1))) {
139         *up++ = (val >> 16) | 0xc0;
140         *up++ = (val >> 8 ) & 0xff;
141         *up   = val & 0xff;
142         return 3;
143     } else if (!(val & ~((1LL<<(4+3*8))-1))) {
144         *up++ = (val >> 24) | 0xe0;
145         *up++ = (val >> 16) & 0xff;
146         *up++ = (val >> 8 ) & 0xff;
147         *up   = val & 0xff;
148         return 4;
149     } else if (!(val & ~((1LL<<(3+4*8))-1))) {
150         *up++ = (val >> 32) | 0xf0;
151         *up++ = (val >> 24) & 0xff;
152         *up++ = (val >> 16) & 0xff;
153         *up++ = (val >> 8 ) & 0xff;
154         *up   = val & 0xff;
155         return 5;
156     } else if (!(val & ~((1LL<<(2+5*8))-1))) {
157         *up++ = (val >> 40) | 0xf8;
158         *up++ = (val >> 32) & 0xff;
159         *up++ = (val >> 24) & 0xff;
160         *up++ = (val >> 16) & 0xff;
161         *up++ = (val >> 8 ) & 0xff;
162         *up   = val & 0xff;
163         return 6;
164     } else if (!(val & ~((1LL<<(1+6*8))-1))) {
165         *up++ = (val >> 48) | 0xfc;
166         *up++ = (val >> 40) & 0xff;
167         *up++ = (val >> 32) & 0xff;
168         *up++ = (val >> 24) & 0xff;
169         *up++ = (val >> 16) & 0xff;
170         *up++ = (val >> 8 ) & 0xff;
171         *up   = val & 0xff;
172         return 7;
173     } else if (!(val & ~((1LL<<(7*8))-1))) {
174         *up++ = (val >> 56) | 0xfe;
175         *up++ = (val >> 48) & 0xff;
176         *up++ = (val >> 40) & 0xff;
177         *up++ = (val >> 32) & 0xff;
178         *up++ = (val >> 24) & 0xff;
179         *up++ = (val >> 16) & 0xff;
180         *up++ = (val >> 8 ) & 0xff;
181         *up   = val & 0xff;
182         return 8;
183     } else {
184         *up++ = 0xff;
185         *up++ = (val >> 56) & 0xff;
186         *up++ = (val >> 48) & 0xff;
187         *up++ = (val >> 40) & 0xff;
188         *up++ = (val >> 32) & 0xff;
189         *up++ = (val >> 24) & 0xff;
190         *up++ = (val >> 16) & 0xff;
191         *up++ = (val >> 8 ) & 0xff;
192         *up   = val & 0xff;
193         return 9;
194     }
195 }
196 
ltf8_get(char * cp,int64_t * val_p)197 static inline int ltf8_get(char *cp, int64_t *val_p) {
198     unsigned char *up = (unsigned char *)cp;
199 
200     if (up[0] < 0x80) {
201         *val_p =   up[0];
202         return 1;
203     } else if (up[0] < 0xc0) {
204         *val_p = (((uint64_t)up[0]<< 8) |
205                    (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
206         return 2;
207     } else if (up[0] < 0xe0) {
208         *val_p = (((uint64_t)up[0]<<16) |
209                   ((uint64_t)up[1]<< 8) |
210                    (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
211         return 3;
212     } else if (up[0] < 0xf0) {
213         *val_p = (((uint64_t)up[0]<<24) |
214                   ((uint64_t)up[1]<<16) |
215                   ((uint64_t)up[2]<< 8) |
216                    (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
217         return 4;
218     } else if (up[0] < 0xf8) {
219         *val_p = (((uint64_t)up[0]<<32) |
220                   ((uint64_t)up[1]<<24) |
221                   ((uint64_t)up[2]<<16) |
222                   ((uint64_t)up[3]<< 8) |
223                    (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
224         return 5;
225     } else if (up[0] < 0xfc) {
226         *val_p = (((uint64_t)up[0]<<40) |
227                   ((uint64_t)up[1]<<32) |
228                   ((uint64_t)up[2]<<24) |
229                   ((uint64_t)up[3]<<16) |
230                   ((uint64_t)up[4]<< 8) |
231                    (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
232         return 6;
233     } else if (up[0] < 0xfe) {
234         *val_p = (((uint64_t)up[0]<<48) |
235                   ((uint64_t)up[1]<<40) |
236                   ((uint64_t)up[2]<<32) |
237                   ((uint64_t)up[3]<<24) |
238                   ((uint64_t)up[4]<<16) |
239                   ((uint64_t)up[5]<< 8) |
240                    (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
241         return 7;
242     } else if (up[0] < 0xff) {
243         *val_p = (((uint64_t)up[1]<<48) |
244                   ((uint64_t)up[2]<<40) |
245                   ((uint64_t)up[3]<<32) |
246                   ((uint64_t)up[4]<<24) |
247                   ((uint64_t)up[5]<<16) |
248                   ((uint64_t)up[6]<< 8) |
249                    (uint64_t)up[7]) & ((1LL<<(7*8))-1);
250         return 8;
251     } else {
252         *val_p = (((uint64_t)up[1]<<56) |
253                   ((uint64_t)up[2]<<48) |
254                   ((uint64_t)up[3]<<40) |
255                   ((uint64_t)up[4]<<32) |
256                   ((uint64_t)up[5]<<24) |
257                   ((uint64_t)up[6]<<16) |
258                   ((uint64_t)up[7]<< 8) |
259                    (uint64_t)up[8]);
260         return 9;
261     }
262 }
263 
264 #define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
265 
266 
267 /* Version of itf8_get that checks it hasn't run out of input */
268 
269 extern const int itf8_bytes[16];
270 extern const int ltf8_bytes[256];
271 
safe_itf8_get(const char * cp,const char * endp,int32_t * val_p)272 static inline int safe_itf8_get(const char *cp, const char *endp,
273                                 int32_t *val_p) {
274     const unsigned char *up = (unsigned char *)cp;
275 
276     if (endp - cp < 5 &&
277         (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) {
278         *val_p = 0;
279         return 0;
280     }
281 
282     if (up[0] < 0x80) {
283         *val_p =   up[0];
284         return 1;
285     } else if (up[0] < 0xc0) {
286         *val_p = ((up[0] <<8) |  up[1])                           & 0x3fff;
287         return 2;
288     } else if (up[0] < 0xe0) {
289         *val_p = ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
290         return 3;
291     } else if (up[0] < 0xf0) {
292         *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
293         return 4;
294     } else {
295         uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
296         *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
297         return 5;
298     }
299 }
300 
safe_ltf8_get(const char * cp,const char * endp,int64_t * val_p)301 static inline int safe_ltf8_get(const char *cp, const char *endp,
302                                 int64_t *val_p) {
303     unsigned char *up = (unsigned char *)cp;
304 
305     if (endp - cp < 9 &&
306         (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0;
307 
308     if (up[0] < 0x80) {
309         *val_p =   up[0];
310         return 1;
311     } else if (up[0] < 0xc0) {
312         *val_p = (((uint64_t)up[0]<< 8) |
313                    (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
314         return 2;
315     } else if (up[0] < 0xe0) {
316         *val_p = (((uint64_t)up[0]<<16) |
317                   ((uint64_t)up[1]<< 8) |
318                    (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
319         return 3;
320     } else if (up[0] < 0xf0) {
321         *val_p = (((uint64_t)up[0]<<24) |
322                   ((uint64_t)up[1]<<16) |
323                   ((uint64_t)up[2]<< 8) |
324                    (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
325         return 4;
326     } else if (up[0] < 0xf8) {
327         *val_p = (((uint64_t)up[0]<<32) |
328                   ((uint64_t)up[1]<<24) |
329                   ((uint64_t)up[2]<<16) |
330                   ((uint64_t)up[3]<< 8) |
331                    (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
332         return 5;
333     } else if (up[0] < 0xfc) {
334         *val_p = (((uint64_t)up[0]<<40) |
335                   ((uint64_t)up[1]<<32) |
336                   ((uint64_t)up[2]<<24) |
337                   ((uint64_t)up[3]<<16) |
338                   ((uint64_t)up[4]<< 8) |
339                    (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
340         return 6;
341     } else if (up[0] < 0xfe) {
342         *val_p = (((uint64_t)up[0]<<48) |
343                   ((uint64_t)up[1]<<40) |
344                   ((uint64_t)up[2]<<32) |
345                   ((uint64_t)up[3]<<24) |
346                   ((uint64_t)up[4]<<16) |
347                   ((uint64_t)up[5]<< 8) |
348                    (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
349         return 7;
350     } else if (up[0] < 0xff) {
351         *val_p = (((uint64_t)up[1]<<48) |
352                   ((uint64_t)up[2]<<40) |
353                   ((uint64_t)up[3]<<32) |
354                   ((uint64_t)up[4]<<24) |
355                   ((uint64_t)up[5]<<16) |
356                   ((uint64_t)up[6]<< 8) |
357                    (uint64_t)up[7]) & ((1LL<<(7*8))-1);
358         return 8;
359     } else {
360         *val_p = (((uint64_t)up[1]<<56) |
361                   ((uint64_t)up[2]<<48) |
362                   ((uint64_t)up[3]<<40) |
363                   ((uint64_t)up[4]<<32) |
364                   ((uint64_t)up[5]<<24) |
365                   ((uint64_t)up[6]<<16) |
366                   ((uint64_t)up[7]<< 8) |
367                    (uint64_t)up[8]);
368         return 9;
369     }
370 }
371 
372 /*! Pushes a value in ITF8 format onto the end of a block.
373  *
374  * This shouldn't be used for high-volume data as it is not the fastest
375  * method.
376  *
377  * @return
378  * Returns the number of bytes written
379  */
380 int itf8_put_blk(cram_block *blk, int val);
381 
382 /*! Pulls a literal 32-bit value from a block.
383  *
384  * @returns the number of bytes decoded;
385  *         -1 on failure.
386  */
387 int int32_get_blk(cram_block *b, int32_t *val);
388 
389 /*! Pushes a literal 32-bit value onto the end of a block.
390  *
391  * @return
392  * Returns 0 on success;
393  *        -1 on failure.
394  */
395 int int32_put_blk(cram_block *blk, int32_t val);
396 
397 
398 /**@}*/
399 /**@{ ----------------------------------------------------------------------
400  * CRAM blocks - the dynamically growable data block. We have code to
401  * create, update, (un)compress and read/write.
402  *
403  * These are derived from the deflate_interlaced.c blocks, but with the
404  * CRAM extension of content types and IDs.
405  */
406 
407 /*! Allocates a new cram_block structure with a specified content_type and
408  * id.
409  *
410  * @return
411  * Returns block pointer on success;
412  *         NULL on failure
413  */
414 cram_block *cram_new_block(enum cram_content_type content_type,
415                            int content_id);
416 
417 /*! Reads a block from a cram file.
418  *
419  * @return
420  * Returns cram_block pointer on success;
421  *         NULL on failure
422  */
423 cram_block *cram_read_block(cram_fd *fd);
424 
425 /*! Writes a CRAM block.
426  *
427  * @return
428  * Returns 0 on success;
429  *        -1 on failure
430  */
431 int cram_write_block(cram_fd *fd, cram_block *b);
432 
433 /*! Frees a CRAM block, deallocating internal data too.
434  */
435 void cram_free_block(cram_block *b);
436 
437 /*! Uncompress a memory block using Zlib.
438  *
439  * @return
440  * Returns 0 on success;
441  *        -1 on failure
442  */
443 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
444 
445 /*! Uncompresses a CRAM block, if compressed.
446  *
447  * @return
448  * Returns 0 on success;
449  *        -1 on failure
450  */
451 int cram_uncompress_block(cram_block *b);
452 
453 /*! Compresses a block.
454  *
455  * Compresses a block using one of two different zlib strategies. If we only
456  * want one choice set strat2 to be -1.
457  *
458  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
459  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
460  * significantly faster.
461  *
462  * @return
463  * Returns 0 on success;
464  *        -1 on failure
465  */
466 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
467                         int method, int level);
468 
469 cram_metrics *cram_new_metrics(void);
470 char *cram_block_method2str(enum cram_block_method m);
471 char *cram_content_type2str(enum cram_content_type t);
472 
473 /*
474  * Find an external block by its content_id
475  */
476 
cram_get_block_by_id(cram_slice * slice,int id)477 static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) {
478   //fprintf(stderr, "%d\t%p\n", id, slice->block_by_id);
479     if (slice->block_by_id && id >= 0 && id < 256) {
480         return slice->block_by_id[id];
481     } else {
482         int v = 256 + (id > 0 ? id % 251 : (-id) % 251);
483         if (slice->block_by_id &&
484             slice->block_by_id[v] &&
485             slice->block_by_id[v]->content_id == id)
486             return slice->block_by_id[v];
487 
488         // Otherwise a linear search in case of collision
489         int i;
490         for (i = 0; i < slice->hdr->num_blocks; i++) {
491             cram_block *b = slice->block[i];
492             if (b && b->content_type == EXTERNAL && b->content_id == id)
493                 return b;
494         }
495     }
496     return NULL;
497 }
498 
499 /* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
500 
501 /* Block size and data pointer. */
502 #define BLOCK_SIZE(b) ((b)->byte)
503 #define BLOCK_DATA(b) ((b)->data)
504 
505 /* Returns the address one past the end of the block */
506 #define BLOCK_END(b) (&(b)->data[(b)->byte])
507 
508 /* Request block to be at least 'l' bytes long */
509 #define BLOCK_RESIZE(b,l)                                       \
510     do {                                                        \
511         while((b)->alloc <= (l)) {                              \
512             (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024;    \
513             (b)->data = realloc((b)->data, (b)->alloc);         \
514         }                                                       \
515      } while(0)
516 
517 /* Make block exactly 'l' bytes long */
518 #define BLOCK_RESIZE_EXACT(b,l)                                 \
519     do {                                                        \
520         (b)->alloc = (l);                                       \
521         (b)->data = realloc((b)->data, (b)->alloc);             \
522      } while(0)
523 
524 /* Ensure the block can hold at least another 'l' bytes */
525 #define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
526 
527 /* Append string 's' of length 'l' */
528 #define BLOCK_APPEND(b,s,l)               \
529     do {                                  \
530         BLOCK_GROW((b),(l));              \
531         memcpy(BLOCK_END((b)), (s), (l)); \
532         BLOCK_SIZE((b)) += (l);           \
533     } while (0)
534 
535 /* Append as single character 'c' */
536 #define BLOCK_APPEND_CHAR(b,c)            \
537     do {                                  \
538         BLOCK_GROW((b),1);                \
539         (b)->data[(b)->byte++] = (c);     \
540     } while (0)
541 
542 /* Append a single unsigned integer */
543 #define BLOCK_APPEND_UINT(b,i)                       \
544     do {                                             \
545         unsigned char *cp;                           \
546         BLOCK_GROW((b),11);                          \
547         cp = &(b)->data[(b)->byte];                  \
548         (b)->byte += append_uint32(cp, (i)) - cp;       \
549     } while (0)
550 
append_uint32(unsigned char * cp,uint32_t i)551 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
552     uint32_t j;
553 
554     if (i == 0) {
555         *cp++ = '0';
556         return cp;
557     }
558 
559     if (i < 100)        goto b1;
560     if (i < 10000)      goto b3;
561     if (i < 1000000)    goto b5;
562     if (i < 100000000)  goto b7;
563 
564     if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
565     if ((j = i / 100000000))  {*cp++ = j + '0'; i -= j*100000000;  goto x7;}
566  b7:if ((j = i / 10000000))   {*cp++ = j + '0'; i -= j*10000000;   goto x6;}
567     if ((j = i / 1000000))    {*cp++ = j + '0', i -= j*1000000;    goto x5;}
568  b5:if ((j = i / 100000))     {*cp++ = j + '0', i -= j*100000;     goto x4;}
569     if ((j = i / 10000))      {*cp++ = j + '0', i -= j*10000;      goto x3;}
570  b3:if ((j = i / 1000))       {*cp++ = j + '0', i -= j*1000;       goto x2;}
571     if ((j = i / 100))        {*cp++ = j + '0', i -= j*100;        goto x1;}
572  b1:if ((j = i / 10))         {*cp++ = j + '0', i -= j*10;         goto x0;}
573     if (i)                     *cp++ = i + '0';
574     return cp;
575 
576  x8: *cp++ = i / 100000000 + '0', i %= 100000000;
577  x7: *cp++ = i / 10000000  + '0', i %= 10000000;
578  x6: *cp++ = i / 1000000   + '0', i %= 1000000;
579  x5: *cp++ = i / 100000    + '0', i %= 100000;
580  x4: *cp++ = i / 10000     + '0', i %= 10000;
581  x3: *cp++ = i / 1000      + '0', i %= 1000;
582  x2: *cp++ = i / 100       + '0', i %= 100;
583  x1: *cp++ = i / 10        + '0', i %= 10;
584  x0: *cp++ = i             + '0';
585 
586     return cp;
587 }
588 
append_sub32(unsigned char * cp,uint32_t i)589 static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
590     *cp++ = i / 100000000 + '0', i %= 100000000;
591     *cp++ = i / 10000000  + '0', i %= 10000000;
592     *cp++ = i / 1000000   + '0', i %= 1000000;
593     *cp++ = i / 100000    + '0', i %= 100000;
594     *cp++ = i / 10000     + '0', i %= 10000;
595     *cp++ = i / 1000      + '0', i %= 1000;
596     *cp++ = i / 100       + '0', i %= 100;
597     *cp++ = i / 10        + '0', i %= 10;
598     *cp++ = i             + '0';
599 
600     return cp;
601 }
602 
append_uint64(unsigned char * cp,uint64_t i)603 static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
604     uint64_t j;
605 
606     if (i <= 0xffffffff)
607         return append_uint32(cp, i);
608 
609     if ((j = i/1000000000) > 1000000000) {
610         cp = append_uint32(cp, j/1000000000);
611         j %= 1000000000;
612         cp = append_sub32(cp, j);
613     } else {
614         cp = append_uint32(cp, i / 1000000000);
615     }
616     cp = append_sub32(cp, i % 1000000000);
617 
618     return cp;
619 }
620 
621 #define BLOCK_UPLEN(b) \
622     (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
623 
624 /**@}*/
625 /**@{ ----------------------------------------------------------------------
626  * Reference sequence handling
627  */
628 
629 /*! Loads a reference set from fn and stores in the cram_fd.
630  *
631  * @return
632  * Returns 0 on success;
633  *        -1 on failure
634  */
635 int cram_load_reference(cram_fd *fd, char *fn);
636 
637 /*! Generates a lookup table in refs based on the SQ headers in SAM_hdr.
638  *
639  * Indexes references by the order they appear in a BAM file. This may not
640  * necessarily be the same order they appear in the fasta reference file.
641  *
642  * @return
643  * Returns 0 on success;
644  *        -1 on failure
645  */
646 int refs2id(refs_t *r, SAM_hdr *bfd);
647 
648 void refs_free(refs_t *r);
649 
650 /*! Returns a portion of a reference sequence from start to end inclusive.
651  *
652  * The returned pointer is owned by the cram_file fd and should not be freed
653  * by the caller. It is valid only until the next cram_get_ref is called
654  * with the same fd parameter (so is thread-safe if given multiple files).
655  *
656  * To return the entire reference sequence, specify start as 1 and end
657  * as 0.
658  *
659  * @return
660  * Returns reference on success;
661  *         NULL on failure
662  */
663 char *cram_get_ref(cram_fd *fd, int id, int start, int end);
664 void cram_ref_incr(refs_t *r, int id);
665 void cram_ref_decr(refs_t *r, int id);
666 /**@}*/
667 /**@{ ----------------------------------------------------------------------
668  * Containers
669  */
670 
671 /*! Creates a new container, specifying the maximum number of slices
672  * and records permitted.
673  *
674  * @return
675  * Returns cram_container ptr on success;
676  *         NULL on failure
677  */
678 cram_container *cram_new_container(int nrec, int nslice);
679 void cram_free_container(cram_container *c);
680 
681 /*! Reads a container header.
682  *
683  * @return
684  * Returns cram_container on success;
685  *         NULL on failure or no container left (fd->err == 0).
686  */
687 cram_container *cram_read_container(cram_fd *fd);
688 
689 /*! Writes a container structure.
690  *
691  * @return
692  * Returns 0 on success;
693  *        -1 on failure
694  */
695 int cram_write_container(cram_fd *fd, cram_container *h);
696 
697 /*! Flushes a container to disk.
698  *
699  * Flushes a completely or partially full container to disk, writing
700  * container structure, header and blocks. This also calls the encoder
701  * functions.
702  *
703  * @return
704  * Returns 0 on success;
705  *        -1 on failure
706  */
707 int cram_flush_container(cram_fd *fd, cram_container *c);
708 int cram_flush_container_mt(cram_fd *fd, cram_container *c);
709 
710 
711 /**@}*/
712 /**@{ ----------------------------------------------------------------------
713  * Compression headers; the first part of the container
714  */
715 
716 /*! Creates a new blank container compression header
717  *
718  * @return
719  * Returns header ptr on success;
720  *         NULL on failure
721  */
722 cram_block_compression_hdr *cram_new_compression_header(void);
723 
724 /*! Frees a cram_block_compression_hdr */
725 void cram_free_compression_header(cram_block_compression_hdr *hdr);
726 
727 
728 /**@}*/
729 /**@{ ----------------------------------------------------------------------
730  * Slices and slice headers
731  */
732 
733 /*! Frees a slice header */
734 void cram_free_slice_header(cram_block_slice_hdr *hdr);
735 
736 /*! Frees a slice */
737 void cram_free_slice(cram_slice *s);
738 
739 /*! Creates a new empty slice in memory, for subsequent writing to
740  * disk.
741  *
742  * @return
743  * Returns cram_slice ptr on success;
744  *         NULL on failure
745  */
746 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
747 
748 /*! Loads an entire slice.
749  *
750  * FIXME: In 1.0 the native unit of slices within CRAM is broken
751  * as slices contain references to objects in other slices.
752  * To work around this while keeping the slice oriented outer loop
753  * we read all slices and stitch them together into a fake large
754  * slice instead.
755  *
756  * @return
757  * Returns cram_slice ptr on success;
758  *         NULL on failure
759  */
760 cram_slice *cram_read_slice(cram_fd *fd);
761 
762 
763 
764 /**@}*/
765 /**@{ ----------------------------------------------------------------------
766  * CRAM file definition (header)
767  */
768 
769 /*! Reads a CRAM file definition structure.
770  *
771  * @return
772  * Returns file_def ptr on success;
773  *         NULL on failure
774  */
775 cram_file_def *cram_read_file_def(cram_fd *fd);
776 
777 /*! Writes a cram_file_def structure to cram_fd.
778  *
779  * @return
780  * Returns 0 on success;
781  *        -1 on failure
782  */
783 int cram_write_file_def(cram_fd *fd, cram_file_def *def);
784 
785 /*! Frees a cram_file_def structure. */
786 void cram_free_file_def(cram_file_def *def);
787 
788 
789 /**@}*/
790 /**@{ ----------------------------------------------------------------------
791  * SAM header I/O
792  */
793 
794 /*! Reads the SAM header from the first CRAM data block.
795  *
796  * Also performs minimal parsing to extract read-group
797  * and sample information.
798  *
799  * @return
800  * Returns SAM hdr ptr on success;
801  *         NULL on failure
802  */
803 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd);
804 
805 /*! Writes a CRAM SAM header.
806  *
807  * @return
808  * Returns 0 on success;
809  *        -1 on failure
810  */
811 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr);
812 
813 
814 /**@}*/
815 /**@{ ----------------------------------------------------------------------
816  * The top-level cram opening, closing and option handling
817  */
818 
819 /*! Opens a CRAM file for read (mode "rb") or write ("wb").
820  *
821  * The filename may be "-" to indicate stdin or stdout.
822  *
823  * @return
824  * Returns file handle on success;
825  *         NULL on failure.
826  */
827 cram_fd *cram_open(const char *filename, const char *mode);
828 
829 /*! Opens an existing stream for reading or writing.
830  *
831  * @return
832  * Returns file handle on success;
833  *         NULL on failure.
834  */
835 cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
836 
837 /*! Closes a CRAM file.
838  *
839  * @return
840  * Returns 0 on success;
841  *        -1 on failure
842  */
843 int cram_close(cram_fd *fd);
844 
845 /*
846  * Seek within a CRAM file.
847  *
848  * Returns 0 on success
849  *        -1 on failure
850  */
851 int cram_seek(cram_fd *fd, off_t offset, int whence);
852 
853 int64_t cram_tell(cram_fd *fd);
854 
855 /*
856  * Flushes a CRAM file.
857  * Useful for when writing to stdout without wishing to close the stream.
858  *
859  * Returns 0 on success
860  *        -1 on failure
861  */
862 int cram_flush(cram_fd *fd);
863 
864 /*! Checks for end of file on a cram_fd stream.
865  *
866  * @return
867  * Returns 0 if not at end of file
868  *         1 if we hit an expected EOF (end of range or EOF block)
869  *         2 for other EOF (end of stream without EOF block)
870  */
871 int cram_eof(cram_fd *fd);
872 
873 /*! Sets options on the cram_fd.
874  *
875  * See CRAM_OPT_* definitions in cram_structs.h.
876  * Use this immediately after opening.
877  *
878  * @return
879  * Returns 0 on success;
880  *        -1 on failure
881  */
882 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...);
883 
884 /*! Sets options on the cram_fd.
885  *
886  * See CRAM_OPT_* definitions in cram_structs.h.
887  * Use this immediately after opening.
888  *
889  * @return
890  * Returns 0 on success;
891  *        -1 on failure
892  */
893 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args);
894 
895 /*!
896  * Attaches a header to a cram_fd.
897  *
898  * This should be used when creating a new cram_fd for writing where
899  * we have an SAM_hdr already constructed (eg from a file we've read
900  * in).
901  *
902  * @return
903  * Returns 0 on success;
904  *        -1 on failure
905  */
906 int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
907 
908 /*!
909  * Returns the hFILE connected to a cram_fd.
910  */
cram_hfile(cram_fd * fd)911 static inline struct hFILE *cram_hfile(cram_fd *fd) {
912     return fd->fp;
913 }
914 
915 #ifdef __cplusplus
916 }
917 #endif
918 
919 #endif /* _CRAM_IO_H_ */
920