1 /*
2 Copyright (c) 2012-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*! \file
32 * Include cram.h instead.
33 *
34 * This is an internal part of the CRAM system and is automatically included
35 * when you #include cram.h.
36 *
37 * Implements the low level CRAM I/O primitives.
38 * This includes basic data types such as byte, int, ITF-8,
39 * maps, bitwise I/O, etc.
40 */
41
42 #ifndef CRAM_IO_H
43 #define CRAM_IO_H
44
45 #include <stdint.h>
46
47 #include "misc.h"
48
49 #ifdef __cplusplus
50 extern "C" {
51 #endif
52
53 /**@{ ----------------------------------------------------------------------
54 * ITF8 encoding and decoding.
55 *
56 * Also see the itf8_get and itf8_put macros.
57 */
58
59 /*! INTERNAL: Converts two characters into an integer for use in switch{} */
60 #define CRAM_KEY(a,b) ((((unsigned char) a)<<8)|(((unsigned char) b)))
61
62 /*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
63 * *val.
64 *
65 * @return
66 * Returns the number of bytes read on success;
67 * -1 on failure
68 */
69 int itf8_decode(cram_fd *fd, int32_t *val);
70
itf8_get(char * cp,int32_t * val_p)71 static inline int itf8_get(char *cp, int32_t *val_p) {
72 unsigned char *up = (unsigned char *)cp;
73
74 if (up[0] < 0x80) {
75 *val_p = up[0];
76 return 1;
77 } else if (up[0] < 0xc0) {
78 *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
79 return 2;
80 } else if (up[0] < 0xe0) {
81 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
82 return 3;
83 } else if (up[0] < 0xf0) {
84 *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
85 return 4;
86 } else {
87 *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
88 return 5;
89 }
90 }
91
92 /*
93 * Stores a value to memory in ITF-8 format.
94 *
95 * Returns the number of bytes required to store the number.
96 * This is a maximum of 5 bytes.
97 */
itf8_put(char * cp,int32_t val)98 static inline int itf8_put(char *cp, int32_t val) {
99 unsigned char *up = (unsigned char *)cp;
100 if (!(val & ~0x00000007f)) { // 1 byte
101 *up = val;
102 return 1;
103 } else if (!(val & ~0x00003fff)) { // 2 byte
104 *up++ = (val >> 8 ) | 0x80;
105 *up = val & 0xff;
106 return 2;
107 } else if (!(val & ~0x01fffff)) { // 3 byte
108 *up++ = (val >> 16) | 0xc0;
109 *up++ = (val >> 8 ) & 0xff;
110 *up = val & 0xff;
111 return 3;
112 } else if (!(val & ~0x0fffffff)) { // 4 byte
113 *up++ = (val >> 24) | 0xe0;
114 *up++ = (val >> 16) & 0xff;
115 *up++ = (val >> 8 ) & 0xff;
116 *up = val & 0xff;
117 return 4;
118 } else { // 5 byte
119 *up++ = 0xf0 | ((val>>28) & 0xff);
120 *up++ = (val >> 20) & 0xff;
121 *up++ = (val >> 12) & 0xff;
122 *up++ = (val >> 4 ) & 0xff;
123 *up = val & 0x0f;
124 return 5;
125 }
126 }
127
128
129 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)130 static inline int ltf8_put(char *cp, int64_t val) {
131 unsigned char *up = (unsigned char *)cp;
132 if (!(val & ~((1LL<<7)-1))) {
133 *up = val;
134 return 1;
135 } else if (!(val & ~((1LL<<(6+8))-1))) {
136 *up++ = (val >> 8 ) | 0x80;
137 *up = val & 0xff;
138 return 2;
139 } else if (!(val & ~((1LL<<(5+2*8))-1))) {
140 *up++ = (val >> 16) | 0xc0;
141 *up++ = (val >> 8 ) & 0xff;
142 *up = val & 0xff;
143 return 3;
144 } else if (!(val & ~((1LL<<(4+3*8))-1))) {
145 *up++ = (val >> 24) | 0xe0;
146 *up++ = (val >> 16) & 0xff;
147 *up++ = (val >> 8 ) & 0xff;
148 *up = val & 0xff;
149 return 4;
150 } else if (!(val & ~((1LL<<(3+4*8))-1))) {
151 *up++ = (val >> 32) | 0xf0;
152 *up++ = (val >> 24) & 0xff;
153 *up++ = (val >> 16) & 0xff;
154 *up++ = (val >> 8 ) & 0xff;
155 *up = val & 0xff;
156 return 5;
157 } else if (!(val & ~((1LL<<(2+5*8))-1))) {
158 *up++ = (val >> 40) | 0xf8;
159 *up++ = (val >> 32) & 0xff;
160 *up++ = (val >> 24) & 0xff;
161 *up++ = (val >> 16) & 0xff;
162 *up++ = (val >> 8 ) & 0xff;
163 *up = val & 0xff;
164 return 6;
165 } else if (!(val & ~((1LL<<(1+6*8))-1))) {
166 *up++ = (val >> 48) | 0xfc;
167 *up++ = (val >> 40) & 0xff;
168 *up++ = (val >> 32) & 0xff;
169 *up++ = (val >> 24) & 0xff;
170 *up++ = (val >> 16) & 0xff;
171 *up++ = (val >> 8 ) & 0xff;
172 *up = val & 0xff;
173 return 7;
174 } else if (!(val & ~((1LL<<(7*8))-1))) {
175 *up++ = (val >> 56) | 0xfe;
176 *up++ = (val >> 48) & 0xff;
177 *up++ = (val >> 40) & 0xff;
178 *up++ = (val >> 32) & 0xff;
179 *up++ = (val >> 24) & 0xff;
180 *up++ = (val >> 16) & 0xff;
181 *up++ = (val >> 8 ) & 0xff;
182 *up = val & 0xff;
183 return 8;
184 } else {
185 *up++ = 0xff;
186 *up++ = (val >> 56) & 0xff;
187 *up++ = (val >> 48) & 0xff;
188 *up++ = (val >> 40) & 0xff;
189 *up++ = (val >> 32) & 0xff;
190 *up++ = (val >> 24) & 0xff;
191 *up++ = (val >> 16) & 0xff;
192 *up++ = (val >> 8 ) & 0xff;
193 *up = val & 0xff;
194 return 9;
195 }
196 }
197
ltf8_get(char * cp,int64_t * val_p)198 static inline int ltf8_get(char *cp, int64_t *val_p) {
199 unsigned char *up = (unsigned char *)cp;
200
201 if (up[0] < 0x80) {
202 *val_p = up[0];
203 return 1;
204 } else if (up[0] < 0xc0) {
205 *val_p = (((uint64_t)up[0]<< 8) |
206 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
207 return 2;
208 } else if (up[0] < 0xe0) {
209 *val_p = (((uint64_t)up[0]<<16) |
210 ((uint64_t)up[1]<< 8) |
211 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
212 return 3;
213 } else if (up[0] < 0xf0) {
214 *val_p = (((uint64_t)up[0]<<24) |
215 ((uint64_t)up[1]<<16) |
216 ((uint64_t)up[2]<< 8) |
217 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
218 return 4;
219 } else if (up[0] < 0xf8) {
220 *val_p = (((uint64_t)up[0]<<32) |
221 ((uint64_t)up[1]<<24) |
222 ((uint64_t)up[2]<<16) |
223 ((uint64_t)up[3]<< 8) |
224 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
225 return 5;
226 } else if (up[0] < 0xfc) {
227 *val_p = (((uint64_t)up[0]<<40) |
228 ((uint64_t)up[1]<<32) |
229 ((uint64_t)up[2]<<24) |
230 ((uint64_t)up[3]<<16) |
231 ((uint64_t)up[4]<< 8) |
232 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
233 return 6;
234 } else if (up[0] < 0xfe) {
235 *val_p = (((uint64_t)up[0]<<48) |
236 ((uint64_t)up[1]<<40) |
237 ((uint64_t)up[2]<<32) |
238 ((uint64_t)up[3]<<24) |
239 ((uint64_t)up[4]<<16) |
240 ((uint64_t)up[5]<< 8) |
241 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
242 return 7;
243 } else if (up[0] < 0xff) {
244 *val_p = (((uint64_t)up[1]<<48) |
245 ((uint64_t)up[2]<<40) |
246 ((uint64_t)up[3]<<32) |
247 ((uint64_t)up[4]<<24) |
248 ((uint64_t)up[5]<<16) |
249 ((uint64_t)up[6]<< 8) |
250 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
251 return 8;
252 } else {
253 *val_p = (((uint64_t)up[1]<<56) |
254 ((uint64_t)up[2]<<48) |
255 ((uint64_t)up[3]<<40) |
256 ((uint64_t)up[4]<<32) |
257 ((uint64_t)up[5]<<24) |
258 ((uint64_t)up[6]<<16) |
259 ((uint64_t)up[7]<< 8) |
260 (uint64_t)up[8]);
261 return 9;
262 }
263 }
264
265 #define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
266
267
268 /* Version of itf8_get that checks it hasn't run out of input */
269
270 extern const int itf8_bytes[16];
271 extern const int ltf8_bytes[256];
272
safe_itf8_get(const char * cp,const char * endp,int32_t * val_p)273 static inline int safe_itf8_get(const char *cp, const char *endp,
274 int32_t *val_p) {
275 const unsigned char *up = (unsigned char *)cp;
276
277 if (endp - cp < 5 &&
278 (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) {
279 *val_p = 0;
280 return 0;
281 }
282
283 if (up[0] < 0x80) {
284 *val_p = up[0];
285 return 1;
286 } else if (up[0] < 0xc0) {
287 *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
288 return 2;
289 } else if (up[0] < 0xe0) {
290 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
291 return 3;
292 } else if (up[0] < 0xf0) {
293 *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
294 return 4;
295 } else {
296 uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
297 *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
298 return 5;
299 }
300 }
301
safe_ltf8_get(const char * cp,const char * endp,int64_t * val_p)302 static inline int safe_ltf8_get(const char *cp, const char *endp,
303 int64_t *val_p) {
304 unsigned char *up = (unsigned char *)cp;
305
306 if (endp - cp < 9 &&
307 (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0;
308
309 if (up[0] < 0x80) {
310 *val_p = up[0];
311 return 1;
312 } else if (up[0] < 0xc0) {
313 *val_p = (((uint64_t)up[0]<< 8) |
314 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
315 return 2;
316 } else if (up[0] < 0xe0) {
317 *val_p = (((uint64_t)up[0]<<16) |
318 ((uint64_t)up[1]<< 8) |
319 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
320 return 3;
321 } else if (up[0] < 0xf0) {
322 *val_p = (((uint64_t)up[0]<<24) |
323 ((uint64_t)up[1]<<16) |
324 ((uint64_t)up[2]<< 8) |
325 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
326 return 4;
327 } else if (up[0] < 0xf8) {
328 *val_p = (((uint64_t)up[0]<<32) |
329 ((uint64_t)up[1]<<24) |
330 ((uint64_t)up[2]<<16) |
331 ((uint64_t)up[3]<< 8) |
332 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
333 return 5;
334 } else if (up[0] < 0xfc) {
335 *val_p = (((uint64_t)up[0]<<40) |
336 ((uint64_t)up[1]<<32) |
337 ((uint64_t)up[2]<<24) |
338 ((uint64_t)up[3]<<16) |
339 ((uint64_t)up[4]<< 8) |
340 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
341 return 6;
342 } else if (up[0] < 0xfe) {
343 *val_p = (((uint64_t)up[0]<<48) |
344 ((uint64_t)up[1]<<40) |
345 ((uint64_t)up[2]<<32) |
346 ((uint64_t)up[3]<<24) |
347 ((uint64_t)up[4]<<16) |
348 ((uint64_t)up[5]<< 8) |
349 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
350 return 7;
351 } else if (up[0] < 0xff) {
352 *val_p = (((uint64_t)up[1]<<48) |
353 ((uint64_t)up[2]<<40) |
354 ((uint64_t)up[3]<<32) |
355 ((uint64_t)up[4]<<24) |
356 ((uint64_t)up[5]<<16) |
357 ((uint64_t)up[6]<< 8) |
358 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
359 return 8;
360 } else {
361 *val_p = (((uint64_t)up[1]<<56) |
362 ((uint64_t)up[2]<<48) |
363 ((uint64_t)up[3]<<40) |
364 ((uint64_t)up[4]<<32) |
365 ((uint64_t)up[5]<<24) |
366 ((uint64_t)up[6]<<16) |
367 ((uint64_t)up[7]<< 8) |
368 (uint64_t)up[8]);
369 return 9;
370 }
371 }
372
373 /*! Pushes a value in ITF8 format onto the end of a block.
374 *
375 * This shouldn't be used for high-volume data as it is not the fastest
376 * method.
377 *
378 * @return
379 * Returns the number of bytes written
380 */
381 int itf8_put_blk(cram_block *blk, int32_t val);
382 int ltf8_put_blk(cram_block *blk, int64_t val);
383
384 /*! Pulls a literal 32-bit value from a block.
385 *
386 * @returns the number of bytes decoded;
387 * -1 on failure.
388 */
389 int int32_get_blk(cram_block *b, int32_t *val);
390
391 /*! Pushes a literal 32-bit value onto the end of a block.
392 *
393 * @return
394 * Returns 0 on success;
395 * -1 on failure.
396 */
397 int int32_put_blk(cram_block *blk, int32_t val);
398
399
400 /**@}*/
401 /**@{ ----------------------------------------------------------------------
402 * CRAM blocks - the dynamically growable data block. We have code to
403 * create, update, (un)compress and read/write.
404 *
405 * These are derived from the deflate_interlaced.c blocks, but with the
406 * CRAM extension of content types and IDs.
407 */
408
409 /*! Allocates a new cram_block structure with a specified content_type and
410 * id.
411 *
412 * @return
413 * Returns block pointer on success;
414 * NULL on failure
415 */
416 cram_block *cram_new_block(enum cram_content_type content_type,
417 int content_id);
418
419 /*! Reads a block from a cram file.
420 *
421 * @return
422 * Returns cram_block pointer on success;
423 * NULL on failure
424 */
425 cram_block *cram_read_block(cram_fd *fd);
426
427 /*! Writes a CRAM block.
428 *
429 * @return
430 * Returns 0 on success;
431 * -1 on failure
432 */
433 int cram_write_block(cram_fd *fd, cram_block *b);
434
435 /*! Frees a CRAM block, deallocating internal data too.
436 */
437 void cram_free_block(cram_block *b);
438
439 /*! Uncompress a memory block using Zlib.
440 *
441 * @return
442 * Returns 0 on success;
443 * -1 on failure
444 */
445 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
446
447 /*! Uncompresses a CRAM block, if compressed.
448 *
449 * @return
450 * Returns 0 on success;
451 * -1 on failure
452 */
453 int cram_uncompress_block(cram_block *b);
454
455 /*! Compresses a block.
456 *
457 * Compresses a block using one of two different zlib strategies. If we only
458 * want one choice set strat2 to be -1.
459 *
460 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
461 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
462 * significantly faster.
463 *
464 * @return
465 * Returns 0 on success;
466 * -1 on failure
467 */
468 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
469 int method, int level);
470
471 cram_metrics *cram_new_metrics(void);
472 char *cram_block_method2str(enum cram_block_method m);
473 char *cram_content_type2str(enum cram_content_type t);
474
475 /*
476 * Find an external block by its content_id
477 */
478
cram_get_block_by_id(cram_slice * slice,int id)479 static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) {
480 //fprintf(stderr, "%d\t%p\n", id, slice->block_by_id);
481 uint32_t v = id;
482 if (slice->block_by_id && v < 256) {
483 return slice->block_by_id[v];
484 } else {
485 v = 256 + v % 251;
486 if (slice->block_by_id &&
487 slice->block_by_id[v] &&
488 slice->block_by_id[v]->content_id == id)
489 return slice->block_by_id[v];
490
491 // Otherwise a linear search in case of collision
492 int i;
493 for (i = 0; i < slice->hdr->num_blocks; i++) {
494 cram_block *b = slice->block[i];
495 if (b && b->content_type == EXTERNAL && b->content_id == id)
496 return b;
497 }
498 }
499 return NULL;
500 }
501
502 /* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
503
504 /* Block size and data pointer. */
505 #define BLOCK_SIZE(b) ((b)->byte)
506 #define BLOCK_DATA(b) ((b)->data)
507
508 /* Returns the address one past the end of the block */
509 #define BLOCK_END(b) (&(b)->data[(b)->byte])
510
511 /* Make block exactly 'l' bytes long */
block_resize_exact(cram_block * b,size_t len)512 static inline int block_resize_exact(cram_block *b, size_t len) {
513 unsigned char *tmp = realloc(b->data, len);
514 if (!tmp)
515 return -1;
516 b->alloc = len;
517 b->data = tmp;
518 return 0;
519 }
520
521 /* Request block to be at least 'l' bytes long */
block_resize(cram_block * b,size_t len)522 static inline int block_resize(cram_block *b, size_t len) {
523 if (b->alloc > len)
524 return 0;
525
526 size_t alloc = b->alloc;
527 while (alloc <= len)
528 alloc = alloc ? alloc*1.5 : 1024;
529
530 return block_resize_exact(b, alloc);
531 }
532
533
534 /* Ensure the block can hold at least another 'l' bytes */
block_grow(cram_block * b,size_t len)535 static inline int block_grow(cram_block *b, size_t len) {
536 return block_resize(b, BLOCK_SIZE(b) + len);
537 }
538
539 /* Append string 's' of length 'l'. */
block_append(cram_block * b,const void * s,size_t len)540 static inline int block_append(cram_block *b, const void *s, size_t len) {
541 if (block_grow(b, len) < 0)
542 return -1;
543
544 memcpy(BLOCK_END(b), s, len);
545 BLOCK_SIZE(b) += len;
546
547 return 0;
548 }
549
550 /* Append as single character 'c' */
block_append_char(cram_block * b,char c)551 static inline int block_append_char(cram_block *b, char c) {
552 if (block_grow(b, 1) < 0)
553 return -1;
554
555 b->data[b->byte++] = c;
556 return 0;
557 }
558
559 /* Append a single unsigned integer */
560 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i);
block_append_uint(cram_block * b,unsigned int i)561 static inline int block_append_uint(cram_block *b, unsigned int i) {
562 if (block_grow(b, 11) < 0)
563 return -1;
564
565 unsigned char *cp = &b->data[b->byte];
566 b->byte += append_uint32(cp, i) - cp;
567 return 0;
568 }
569
570 // Versions of above with built in goto block_err calls.
571 #define BLOCK_RESIZE_EXACT(b,l) if (block_resize_exact((b),(l))<0) goto block_err
572 #define BLOCK_RESIZE(b,l) if (block_resize((b),(l)) <0) goto block_err
573 #define BLOCK_GROW(b,l) if (block_grow((b),(l)) <0) goto block_err
574 #define BLOCK_APPEND(b,s,l) if (block_append((b),(s),(l)) <0) goto block_err
575 #define BLOCK_APPEND_CHAR(b,c) if (block_append_char((b),(c)) <0) goto block_err
576 #define BLOCK_APPEND_UINT(b,i) if (block_append_uint((b),(i)) <0) goto block_err
577
append_uint32(unsigned char * cp,uint32_t i)578 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
579 uint32_t j;
580
581 if (i == 0) {
582 *cp++ = '0';
583 return cp;
584 }
585
586 if (i < 100) goto b1;
587 if (i < 10000) goto b3;
588 if (i < 1000000) goto b5;
589 if (i < 100000000) goto b7;
590
591 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
592 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
593 b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
594 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
595 b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
596 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
597 b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
598 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
599 b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
600 if (i) *cp++ = i + '0';
601 return cp;
602
603 x8: *cp++ = i / 100000000 + '0', i %= 100000000;
604 x7: *cp++ = i / 10000000 + '0', i %= 10000000;
605 x6: *cp++ = i / 1000000 + '0', i %= 1000000;
606 x5: *cp++ = i / 100000 + '0', i %= 100000;
607 x4: *cp++ = i / 10000 + '0', i %= 10000;
608 x3: *cp++ = i / 1000 + '0', i %= 1000;
609 x2: *cp++ = i / 100 + '0', i %= 100;
610 x1: *cp++ = i / 10 + '0', i %= 10;
611 x0: *cp++ = i + '0';
612
613 return cp;
614 }
615
append_sub32(unsigned char * cp,uint32_t i)616 static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
617 *cp++ = i / 100000000 + '0', i %= 100000000;
618 *cp++ = i / 10000000 + '0', i %= 10000000;
619 *cp++ = i / 1000000 + '0', i %= 1000000;
620 *cp++ = i / 100000 + '0', i %= 100000;
621 *cp++ = i / 10000 + '0', i %= 10000;
622 *cp++ = i / 1000 + '0', i %= 1000;
623 *cp++ = i / 100 + '0', i %= 100;
624 *cp++ = i / 10 + '0', i %= 10;
625 *cp++ = i + '0';
626
627 return cp;
628 }
629
append_uint64(unsigned char * cp,uint64_t i)630 static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
631 uint64_t j;
632
633 if (i <= 0xffffffff)
634 return append_uint32(cp, i);
635
636 if ((j = i/1000000000) > 1000000000) {
637 cp = append_uint32(cp, j/1000000000);
638 j %= 1000000000;
639 cp = append_sub32(cp, j);
640 } else {
641 cp = append_uint32(cp, i / 1000000000);
642 }
643 cp = append_sub32(cp, i % 1000000000);
644
645 return cp;
646 }
647
648 #define BLOCK_UPLEN(b) \
649 (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
650
651 /**@}*/
652 /**@{ ----------------------------------------------------------------------
653 * Reference sequence handling
654 */
655
656 /*! Loads a reference set from fn and stores in the cram_fd.
657 *
658 * @return
659 * Returns 0 on success;
660 * -1 on failure
661 */
662 int cram_load_reference(cram_fd *fd, char *fn);
663
664 /*! Generates a lookup table in refs based on the SQ headers in sam_hdr_t.
665 *
666 * Indexes references by the order they appear in a BAM file. This may not
667 * necessarily be the same order they appear in the fasta reference file.
668 *
669 * @return
670 * Returns 0 on success;
671 * -1 on failure
672 */
673 int refs2id(refs_t *r, sam_hdr_t *hdr);
674
675 void refs_free(refs_t *r);
676
677 /*! Returns a portion of a reference sequence from start to end inclusive.
678 *
679 * The returned pointer is owned by the cram_file fd and should not be freed
680 * by the caller. It is valid only until the next cram_get_ref is called
681 * with the same fd parameter (so is thread-safe if given multiple files).
682 *
683 * To return the entire reference sequence, specify start as 1 and end
684 * as 0.
685 *
686 * @return
687 * Returns reference on success;
688 * NULL on failure
689 */
690 char *cram_get_ref(cram_fd *fd, int id, int start, int end);
691 void cram_ref_incr(refs_t *r, int id);
692 void cram_ref_decr(refs_t *r, int id);
693 /**@}*/
694 /**@{ ----------------------------------------------------------------------
695 * Containers
696 */
697
698 /*! Creates a new container, specifying the maximum number of slices
699 * and records permitted.
700 *
701 * @return
702 * Returns cram_container ptr on success;
703 * NULL on failure
704 */
705 cram_container *cram_new_container(int nrec, int nslice);
706 void cram_free_container(cram_container *c);
707
708 /*! Reads a container header.
709 *
710 * @return
711 * Returns cram_container on success;
712 * NULL on failure or no container left (fd->err == 0).
713 */
714 cram_container *cram_read_container(cram_fd *fd);
715
716 /*! Writes a container structure.
717 *
718 * @return
719 * Returns 0 on success;
720 * -1 on failure
721 */
722 int cram_write_container(cram_fd *fd, cram_container *h);
723
724 /*! Flushes a container to disk.
725 *
726 * Flushes a completely or partially full container to disk, writing
727 * container structure, header and blocks. This also calls the encoder
728 * functions.
729 *
730 * @return
731 * Returns 0 on success;
732 * -1 on failure
733 */
734 int cram_flush_container(cram_fd *fd, cram_container *c);
735 int cram_flush_container_mt(cram_fd *fd, cram_container *c);
736
737
738 /**@}*/
739 /**@{ ----------------------------------------------------------------------
740 * Compression headers; the first part of the container
741 */
742
743 /*! Creates a new blank container compression header
744 *
745 * @return
746 * Returns header ptr on success;
747 * NULL on failure
748 */
749 cram_block_compression_hdr *cram_new_compression_header(void);
750
751 /*! Frees a cram_block_compression_hdr */
752 void cram_free_compression_header(cram_block_compression_hdr *hdr);
753
754
755 /**@}*/
756 /**@{ ----------------------------------------------------------------------
757 * Slices and slice headers
758 */
759
760 /*! Frees a slice header */
761 void cram_free_slice_header(cram_block_slice_hdr *hdr);
762
763 /*! Frees a slice */
764 void cram_free_slice(cram_slice *s);
765
766 /*! Creates a new empty slice in memory, for subsequent writing to
767 * disk.
768 *
769 * @return
770 * Returns cram_slice ptr on success;
771 * NULL on failure
772 */
773 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
774
775 /*! Loads an entire slice.
776 *
777 * FIXME: In 1.0 the native unit of slices within CRAM is broken
778 * as slices contain references to objects in other slices.
779 * To work around this while keeping the slice oriented outer loop
780 * we read all slices and stitch them together into a fake large
781 * slice instead.
782 *
783 * @return
784 * Returns cram_slice ptr on success;
785 * NULL on failure
786 */
787 cram_slice *cram_read_slice(cram_fd *fd);
788
789
790
791 /**@}*/
792 /**@{ ----------------------------------------------------------------------
793 * CRAM file definition (header)
794 */
795
796 /*! Reads a CRAM file definition structure.
797 *
798 * @return
799 * Returns file_def ptr on success;
800 * NULL on failure
801 */
802 cram_file_def *cram_read_file_def(cram_fd *fd);
803
804 /*! Writes a cram_file_def structure to cram_fd.
805 *
806 * @return
807 * Returns 0 on success;
808 * -1 on failure
809 */
810 int cram_write_file_def(cram_fd *fd, cram_file_def *def);
811
812 /*! Frees a cram_file_def structure. */
813 void cram_free_file_def(cram_file_def *def);
814
815
816 /**@}*/
817 /**@{ ----------------------------------------------------------------------
818 * SAM header I/O
819 */
820
821 /*! Reads the SAM header from the first CRAM data block.
822 *
823 * Also performs minimal parsing to extract read-group
824 * and sample information.
825 *
826 * @return
827 * Returns SAM hdr ptr on success;
828 * NULL on failure
829 */
830 sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd);
831
832 /*! Writes a CRAM SAM header.
833 *
834 * @return
835 * Returns 0 on success;
836 * -1 on failure
837 */
838 int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr);
839
840
841 /**@}*/
842 /**@{ ----------------------------------------------------------------------
843 * The top-level cram opening, closing and option handling
844 */
845
846 /*! Opens a CRAM file for read (mode "rb") or write ("wb").
847 *
848 * The filename may be "-" to indicate stdin or stdout.
849 *
850 * @return
851 * Returns file handle on success;
852 * NULL on failure.
853 */
854 cram_fd *cram_open(const char *filename, const char *mode);
855
856 /*! Opens an existing stream for reading or writing.
857 *
858 * @return
859 * Returns file handle on success;
860 * NULL on failure.
861 */
862 cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
863
864 /*! Closes a CRAM file.
865 *
866 * @return
867 * Returns 0 on success;
868 * -1 on failure
869 */
870 int cram_close(cram_fd *fd);
871
872 /*
873 * Seek within a CRAM file.
874 *
875 * Returns 0 on success
876 * -1 on failure
877 */
878 int cram_seek(cram_fd *fd, off_t offset, int whence);
879
880 /*
881 * Flushes a CRAM file.
882 * Useful for when writing to stdout without wishing to close the stream.
883 *
884 * Returns 0 on success
885 * -1 on failure
886 */
887 int cram_flush(cram_fd *fd);
888
889 /*! Checks for end of file on a cram_fd stream.
890 *
891 * @return
892 * Returns 0 if not at end of file
893 * 1 if we hit an expected EOF (end of range or EOF block)
894 * 2 for other EOF (end of stream without EOF block)
895 */
896 int cram_eof(cram_fd *fd);
897
898 /*! Sets options on the cram_fd.
899 *
900 * See CRAM_OPT_* definitions in cram_structs.h.
901 * Use this immediately after opening.
902 *
903 * @return
904 * Returns 0 on success;
905 * -1 on failure
906 */
907 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...);
908
909 /*! Sets options on the cram_fd.
910 *
911 * See CRAM_OPT_* definitions in cram_structs.h.
912 * Use this immediately after opening.
913 *
914 * @return
915 * Returns 0 on success;
916 * -1 on failure
917 */
918 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args);
919
920 /*!
921 * Attaches a header to a cram_fd.
922 *
923 * This should be used when creating a new cram_fd for writing where
924 * we have an sam_hdr_t already constructed (eg from a file we've read
925 * in).
926 *
927 * @return
928 * Returns 0 on success;
929 * -1 on failure
930 */
931 int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr);
932
933 /*!
934 * Returns the hFILE connected to a cram_fd.
935 */
cram_hfile(cram_fd * fd)936 static inline struct hFILE *cram_hfile(cram_fd *fd) {
937 return fd->fp;
938 }
939
940 #ifdef __cplusplus
941 }
942 #endif
943
944 #endif /* CRAM_IO_H */
945