1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield, Rob Davies
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2010-3
36 */
37
38 #ifdef HAVE_CONFIG_H
39 #include "io_lib_config.h"
40 #endif
41
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <inttypes.h>
46 #include <fcntl.h>
47 #include <zlib.h>
48 #ifdef HAVE_LIBDEFLATE
49 #include <libdeflate.h>
50 #endif
51 #include <assert.h>
52 #include <ctype.h>
53 #include <errno.h>
54 #include <stdarg.h>
55 #include <stddef.h>
56
57 #include <pthread.h>
58
59 #include "io_lib/bam.h"
60 #include "io_lib/os.h"
61 #include "io_lib/thread_pool.h"
62 #include "io_lib/crc32.h"
63 #include "io_lib/bgzip.h"
64
65 // On later gcc releases the ALLOW_UAC code causes the vectorizor to
66 // use aligned SIMD instructions on unaligned memory access. This is due
67 // to our own abuse of char to int aliasing, but doing things the legal
68 // way is still slower overall (5-14% depending on system and compiler).
69 // However by explicitly altering the alignment of the integer types
70 // we can persuade the compiler to generate the unaligned SIMD
71 // instructions instead.
72 #if defined(__GNUC__) && !(defined(__clang__) || defined(__ICC))
73 typedef int16_t int16_u __attribute__ ((aligned (1)));
74 typedef uint16_t uint16_u __attribute__ ((aligned (1)));
75 typedef int32_t int32_u __attribute__ ((aligned (1)));
76 typedef uint32_t uint32_u __attribute__ ((aligned (1)));
77 #else
78 typedef int16_t int16_u;
79 typedef uint16_t uint16_u;
80 typedef int32_t int32_u;
81 typedef uint32_t uint32_u;
82 #endif
83
84 // If using Cloudflare's zlib, consider just switching to the zlib crc.
85 // However this doesn't work on older machines.
86
87 //#define iolib_crc32 crc32
88
89 #define USE_MT
90
91 #ifdef USE_MT
92 # define BGZF_WRITE bgzf_write_mt
93 # define BGZF_FLUSH bgzf_flush_mt
94 #else
95 # define BGZF_WRITE bgzf_write
96 # define BGZF_FLUSH bgzf_flush
97 #endif
98
99 #ifndef MIN
100 # define MIN(a,b) ((a)<(b)?(a):(b))
101 #endif
102
103 #define EOF_BLOCK "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"
104
105 /* Macros to store integers of various sizes in little endian byte order.
106 * The value is put in the location pointed to by ucp, which should be
107 * an unsigned char pointer. ucp is incremented by the size of the
108 * stored value. */
109
110 #define STORE_UINT16(ucp, val) \
111 *(ucp)++ = ((uint16_t) val) & 0xff; \
112 *(ucp)++ = ((uint16_t) val >> 8) & 0xff;
113
114 #define STORE_UINT32(ucp, val) \
115 *(ucp)++ = ((uint32_t) (val)) & 0xff; \
116 *(ucp)++ = ((uint32_t) (val) >> 8) & 0xff; \
117 *(ucp)++ = ((uint32_t) (val) >> 16) & 0xff; \
118 *(ucp)++ = ((uint32_t) (val) >> 24) & 0xff;
119
120 #define STORE_UINT64(ucp, val) \
121 *(ucp)++ = ((uint64_t) (val)) & 0xff; \
122 *(ucp)++ = ((uint64_t) (val) >> 8) & 0xff; \
123 *(ucp)++ = ((uint64_t) (val) >> 16) & 0xff; \
124 *(ucp)++ = ((uint64_t) (val) >> 24) & 0xff; \
125 *(ucp)++ = ((uint64_t) (val) >> 32) & 0xff; \
126 *(ucp)++ = ((uint64_t) (val) >> 40) & 0xff; \
127 *(ucp)++ = ((uint64_t) (val) >> 48) & 0xff; \
128 *(ucp)++ = ((uint64_t) (val) >> 56) & 0xff;
129
130 static int bam_more_input(bam_file_t *b);
131 static int bam_uncompress_input(bam_file_t *b);
132 static int reg2bin(int start, int end);
133 static int bgzf_block_write(bam_file_t *bf, int level, const void *buf, size_t count);
134 static int bgzf_write(bam_file_t *bf, int level, const void *buf, size_t count);
135 static int bgzf_write_mt(bam_file_t *bf, int level, const void *buf, size_t count);
136 #ifdef USE_MT
137 static int bgzf_flush_mt(bam_file_t *bf);
138 #else
139 static int bgzf_flush(bam_file_t *bf);
140 #endif
141
142 /*
143 * Reads len bytes from fp into data.
144 *
145 * Returns the number of bytes read.
146 * 0 for eof.
147 * -1 for failure.
148 */
bam_read(bam_file_t * b,void * data,size_t len)149 static int bam_read(bam_file_t *b, void *data, size_t len) {
150 int nb = 0, n;
151 unsigned char *cdata = data;
152
153 while (len) {
154 /* Consume any available uncompressed output */
155 if (b->uncomp_sz) {
156 size_t l = MIN(b->uncomp_sz, len);
157 memcpy(cdata, b->uncomp_p, l);
158 b->uncomp_p += l;
159 b->uncomp_sz -= l;
160 cdata += l;
161 len -= l;
162 nb += l;
163
164 if (!len)
165 return nb;
166 }
167
168 if (!b->gzip) {
169 /* Already uncompressed, so easy to deal with */
170 if (!b->comp_sz)
171 if (-1 == bam_more_input(b))
172 return nb ? nb : 0;
173
174 b->uncomp_p = b->comp_p;
175 b->uncomp_sz = b->comp_sz;
176 b->comp_sz = 0;
177 continue;
178 }
179
180 /* in=compressed out=uncompressed, but used as input (sorry!) */
181 n = bam_uncompress_input(b);
182 if (n == -1)
183 return -1;
184 if (n == 0)
185 return nb;
186 }
187
188 return nb;
189 }
190
191 /*
192 * Reads a line of text of unknown length.
193 * 'str' is both input and output. If *str == NULL then memory is allocated
194 * for the line. If *str != NULL then it is expected to point to an existing
195 * block of memory that we can write into and realloc as required.
196 *
197 * Similarly *len is both input and output. It is expected to hold the
198 * current allocated size of *str. It is modified if we realloc it.
199 *
200 * Lines have the \n removed and will be null terminated.
201 *
202 * Returns actual line length used (note note the same as *len) on success
203 * -1 on failure
204 */
bam_get_line(bam_file_t * b,unsigned char ** str,size_t * len)205 static int bam_get_line(bam_file_t *b, unsigned char **str, size_t *len) {
206 unsigned char *buf = *str;
207 int used_l = 0;
208 size_t alloc_l = *len;
209 int next_condition, r = 0;
210
211 while (b->uncomp_sz || (r=bam_uncompress_input(b)) > 0) {
212 int tmp;
213 unsigned char *from = b->uncomp_p;
214 unsigned char *to = &buf[used_l];
215
216 /*
217 * Next condition is the number of loop iterations before something
218 * has to be done - either getting more uncompressed output or
219 * resizing the buffer. We don't care which, but it allows us to
220 * have just one check per loop instead of two. Once out of the loop
221 * we can then afford to determine which case is and deal with it.
222 */
223 tmp = next_condition = MIN(b->uncomp_sz, alloc_l-used_l);
224
225 /*
226 * Consume 32 or 64 bits at a time, looking for \n in any byte.
227 * On 64-bit OS this function becomes 3x faster.
228 */
229 #ifdef ALLOW_UAC
230 #if SIZEOF_LONG == 8 && ULONG_MAX != 0xffffffff
231 #define hasless(x,n) (((x)-0x0101010101010101UL*(n))&~(x)&0x8080808080808080UL)
232 #define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL)
233 {
234 uint64_t *fromi = (uint64_t *)from;
235 uint64_t *toi = (uint64_t *)to;
236 while (next_condition >= 8) {
237 uint64_t w = *fromi ^ 0x0a0a0a0a0a0a0a0aUL;
238 if (haszero(w))
239 break;
240
241 *toi++ = *fromi++;
242 next_condition -= 8;
243 }
244 }
245 #else
246 #define hasless(x,n) (((x)-0x01010101UL*(n))&~(x)&0x80808080UL)
247 #define haszero(x) (((x)-0x01010101UL)&~(x)&0x80808080UL)
248 {
249 uint32_t *fromi = (uint32_t *)from;
250 uint32_t *toi = (uint32_t *)to;
251 while (next_condition >= 4) {
252 uint32_t w = *fromi ^ 0x0a0a0a0aUL;
253 if (haszero(w))
254 break;
255
256 *toi++ = *fromi++;
257 next_condition -= 4;
258 }
259 }
260 #endif
261 from += tmp-next_condition;
262 to += tmp-next_condition;
263 #endif
264
265 while (next_condition-- > 0) { /* these 3 lines are 50% of SAM cpu */
266 if (*from != '\n') {
267 *to++ = *from++;
268 } else {
269 if (to > buf && to[-1] == '\r') *--to = 0; // handle \r\n too
270 b->uncomp_p = from;
271 used_l = to-buf;
272 b->uncomp_p++;
273 buf[used_l] = 0;
274 b->uncomp_sz -= (tmp - next_condition);
275 return used_l;
276 }
277 }
278
279 used_l = to-buf;
280 b->uncomp_p = from;
281 b->uncomp_sz -= tmp;
282
283 if (used_l >= alloc_l) {
284 alloc_l = alloc_l ? alloc_l * 2 : 1024;
285 // +8 to cope with the 64-bit copy function in the
286 // COPY_CPF_TO_CPTM macro.
287 if (NULL == (buf = realloc(buf, alloc_l+8)))
288 return -1;
289 *str = buf;
290 *len = alloc_l;
291 }
292 }
293
294 if (r == -1)
295 return -1;
296 if (b->uncomp_sz)
297 return -1;
298
299 b->eof_block = 1; // expected eof
300 return 0;
301 }
302
load_bam_header(bam_file_t * b)303 static int load_bam_header(bam_file_t *b) {
304 char magic[4], *header;
305 int i;
306 int32_t header_len, nref;
307
308 if (4 != bam_read(b, magic, 4))
309 return -1;
310 if (memcmp(magic, "BAM\x01",4) != 0)
311 return -1;
312 if (4 != bam_read(b, &header_len, 4))
313 return -1;
314 header_len = le_int4(header_len);
315 if (!(header = malloc(header_len+1)))
316 return -1;
317 *header = 0;
318 if (header_len != bam_read(b, header, header_len))
319 return -1;
320
321 if (!(b->header = sam_hdr_parse(header, header_len)))
322 return -1;
323 free(header);
324
325 /* Load the reference data and check it matches the parsed header */
326 if (4 != bam_read(b, &nref, 4))
327 return -1;
328 nref = le_int4(nref);
329 if (b->header->nref != nref && b->header->nref) {
330 fprintf(stderr, "Error: @RG lines are at odds with "
331 "binary encoded reference data\n");
332 return -1;
333 }
334
335 for (i = 0; i < nref; i++) {
336 uint32_t nlen, len;
337 char name_a[1024], *name;
338
339 if (4 != bam_read(b, &nlen, 4))
340 return -1;
341 nlen = le_int4(nlen);
342 name = (nlen < 1023 ? name_a
343 : (nlen < UINT32_MAX ? malloc(nlen + 1) : NULL));
344 if (!name)
345 return -1;
346 if (nlen != bam_read(b, name, nlen))
347 return -1;
348 name[nlen] = 0;
349
350 if (4 != bam_read(b, &len, 4))
351 return -1;
352 len = le_int4(len);
353
354 if (i < b->header->nref && b->header->ref[i].name) {
355 if (strcmp(b->header->ref[i].name, name)) {
356 fprintf(stderr, "Error: @SQ lines are at odds with "
357 "binary encoded reference data\n");
358 return -1;
359 }
360
361 if (b->header->ref[i].len != len) {
362 fprintf(stderr, "Error: @SQ lines are at odds with "
363 "binary encoded reference data\n");
364 return -1;
365 }
366 } else {
367 char len_c[100];
368 sprintf(len_c, "%d", len);
369 if (sam_hdr_add(b->header, "SQ", "SN", name, "LN", len_c, NULL)<0)
370 return -1;
371 }
372
373 if (name != name_a)
374 free(name);
375 }
376
377 b->line = 0; // FIXME
378
379 return 0;
380 }
381
load_sam_header(bam_file_t * b)382 static int load_sam_header(bam_file_t *b) {
383 unsigned char *str = NULL;
384 size_t alloc = 0, len;
385 dstring_t *header = dstring_create(NULL);;
386 int r = 0;
387
388 while ((b->uncomp_sz > 0 || (r=bam_uncompress_input(b)) > 0) && *b->uncomp_p == '@') {
389 b->line++;
390 if ((len = bam_get_line(b, &str, &alloc)) == -1)
391 return -1;
392
393 if (-1 == dstring_nappend(header, (char *)str, len))
394 return -1;
395 if (-1 == dstring_append_char(header, '\n'))
396 return -1;
397 }
398 if (r == -1)
399 return -1;
400 b->line = 0; // FIXME
401
402 if (!(b->header = sam_hdr_parse((char *)dstring_str(header),
403 dstring_length(header))))
404 return -1;
405
406 dstring_destroy(header);
407 free(str);
408
409 return 0;
410 }
411
412 /* --------------------------------------------------------------------------
413 *
414 */
415
416 #ifndef O_BINARY
417 # define O_BINARY 0
418 #endif
419
bam_file_init(bam_file_t * b)420 static void bam_file_init(bam_file_t *b) {
421 b->comp_p = b->comp;
422 b->comp_sz = 0;
423 b->uncomp_p = b->uncomp;
424 b->uncomp_sz = 0;
425 b->next_len = -1;
426 b->bs = NULL;
427 b->bs_size = 0;
428 b->z_finish = 1;
429 b->bgzf = 0;
430 b->no_aux = 0;
431 b->line = 0;
432 b->binary = 0;
433 b->level = Z_DEFAULT_COMPRESSION;
434 b->sam_str = NULL;
435 b->pool = NULL;
436 b->equeue = NULL;
437 b->dqueue = NULL;
438 b->job_pending = NULL;
439 b->eof = 0;
440 b->nd_jobs = 0;
441 b->ne_jobs = 0;
442 b->idx = NULL;
443 b->current_block = 0;
444 b->bgbuf_p = b->bgbuf;
445 b->bgbuf_sz = 0;
446 b->idx_fn = NULL;
447 }
448
449 /*! Opens a SAM or BAM file.
450 *
451 * The mode parameter indicates the file
452 * type (if not auto-detecting) and whether it is for reading or
453 * writing. Use "rb" or "wb" for reading or writing BAM and "r" or
454 * "w" or reading or writing SAM. When writing BAM, the mode may end
455 * with a digit from 0 to 9 to indicate the compression to use with 0
456 * indicating uncompressed data.
457 *
458 * @param fn The filename to open or create.
459 * @param mode The input/output mode, similar to fopen().
460 *
461 * @return
462 * Returns a bam_file_t pointer on success;
463 * NULL on failure.
464 */
bam_open(const char * fn,const char * mode)465 bam_file_t *bam_open(const char *fn, const char *mode) {
466 bam_file_t *b = calloc(1, sizeof *b);
467
468 if (!b)
469 return NULL;
470
471 bam_file_init(b);
472
473 /* Creation */
474 if (*mode == 'w') {
475 b->mode = O_WRONLY | O_TRUNC | O_CREAT;
476 if (mode[1] == 'b') {
477 b->mode |= O_BINARY;
478 b->binary = 1;
479 }
480 if (mode[2] >= '0' && mode[2] <= '9')
481 b->level = mode[2] - '0';
482
483 if (strcmp(fn, "-") == 0) {
484 b->fp = stdout; /* Stdout */
485 #ifdef _WIN32
486 _setmode(1, _O_BINARY);
487 #endif
488 } else {
489 if (NULL == (b->fp = fopen(fn, "wb")))
490 goto error;
491 }
492
493 return b;
494 }
495
496 if (*mode != 'r')
497 return NULL;
498
499 if (strcmp(mode, "rb") == 0) {
500 b->mode = O_RDONLY | O_BINARY;
501 } else {
502 b->mode = O_RDONLY;
503 }
504 if (strcmp(fn, "-") == 0) {
505 b->fp = stdin;
506 } else {
507 if (NULL == (b->fp = fopen(fn, "rb")))
508 goto error;
509 }
510
511 if (NULL == (b->idx = gzi_index_init()))
512 goto error;
513
514 /* Load first block so we can check */
515 bam_more_input(b);
516 if (b->comp_sz >= 2 && b->comp_p[0] == 31 && b->comp_p[1] == 139)
517 b->gzip = 1;
518 else
519 b->gzip = 0;
520
521 if (b->gzip) {
522 /* Set up zlib */
523 b->s.zalloc = NULL;
524 b->s.zfree = NULL;
525 b->s.opaque = NULL;
526 inflateInit2(&b->s, -15);
527 }
528
529 if (-1 == bam_uncompress_input(b))
530 return NULL;
531 /* Auto-correct open file type if we detect a BAM */
532 if (b->uncomp_sz >= 4 && strncmp("BAM\001", (char *)b->uncomp_p, 4) == 0) {
533 b->mode |= O_BINARY;
534 b->binary = 1;
535 mode = "rb";
536 } else {
537 b->mode &= ~O_BINARY;
538 mode = "r";
539 }
540
541 /* Load header */
542 if (strcmp(mode, "rb") == 0) {
543 if (-1 == load_bam_header(b))
544 goto error;
545 b->bam = 1;
546 } else {
547 if (-1 == load_sam_header(b))
548 goto error;
549 b->bam = 0;
550 }
551
552 return b;
553
554 error:
555 if (b) {
556 if (b->header)
557 free(b->header);
558 free(b);
559 }
560
561 return NULL;
562 }
563
bam_open_block(const char * blk,size_t blk_size,SAM_hdr * sh)564 bam_file_t *bam_open_block(const char *blk, size_t blk_size, SAM_hdr *sh) {
565 bam_file_t *b = calloc(1, sizeof *b);
566
567 if (!b)
568 return NULL;
569
570 bam_file_init(b);
571
572 b->fp = NULL; // forces bam_more_input() to fail
573 b->bam = 1;
574 b->gzip = 0;
575 b->comp_sz = 0;
576 b->uncomp_p = (unsigned char *) blk;
577 b->uncomp_sz = blk_size;
578 b->header = sh;
579
580 sam_hdr_incr_ref(sh);
581
582 return b;
583 }
584
585
bam_close(bam_file_t * b)586 int bam_close(bam_file_t *b) {
587 int r = 0;
588
589 if (!b)
590 return 0;
591
592 if (b->mode & O_WRONLY) {
593 if (b->binary) {
594 if (bgzf_block_write(b, b->level, b->uncomp,
595 b->uncomp_p - b->uncomp)) {
596 fprintf(stderr, "Write failed in bam_close()\n");
597 }
598
599 BGZF_FLUSH(b);
600
601 /* Output a blank BGZF block too to mark EOF */
602 if (28 != fwrite(EOF_BLOCK, 1, 28, b->fp)) {
603 fprintf(stderr, "Write failed in bam_close()\n");
604 }
605 } else {
606 BGZF_FLUSH(b);
607
608 if (b->uncomp_p - b->uncomp !=
609 fwrite(b->uncomp, 1, b->uncomp_p - b->uncomp, b->fp)) {
610 fprintf(stderr, "Write failed in bam_close()\n");
611 }
612 }
613 }
614
615 if (b->bs)
616 free(b->bs);
617
618 if (b->header)
619 sam_hdr_free(b->header);
620
621 if (b->gzip)
622 inflateEnd(&b->s);
623
624 if (b->sam_str)
625 free(b->sam_str);
626
627 if (b->fp)
628 r = fclose(b->fp);
629
630 if (b->idx) {
631 if ((b->mode == O_RDONLY) && b->idx_fn) {
632 gzi_index_dump(b->idx, b->idx_fn, NULL);
633 }
634 gzi_index_free(b->idx);
635 }
636
637 if (b->pool) {
638 /* Should be no BAM jobs left in the pool, but if we abort on
639 * and error and close early then we need to drain the pool of
640 * jobs before destroying the results queue they are about to
641 * append to.
642 *
643 * Consider adding a t_pool_terminate function or similar to
644 * abort in-flight jobs connected to this specific results queue.
645 */
646 //fprintf(stderr, "BAM: Draining pool\n");
647 t_pool_flush(b->pool);
648 }
649
650 //fprintf(stderr, "BAM: destroying equeue %p, dqueue %p\n",
651 // b->equeue, b->dqueue);
652
653 if (b->equeue)
654 t_results_queue_destroy(b->equeue);
655 if (b->dqueue)
656 t_results_queue_destroy(b->dqueue);
657
658 free(b);
659
660 return r;
661 }
662
663 /*
664 * Loads more data into the input (compressed) buffer.
665 *
666 * Returns 0 on success
667 * -1 on failure.
668 */
bam_more_input(bam_file_t * b)669 static int bam_more_input(bam_file_t *b) {
670 size_t l;
671
672 if (!b->fp)
673 return -1;
674
675 if (b->comp != b->comp_p) {
676 memmove(b->comp, b->comp_p, b->comp_sz);
677 b->comp_p = b->comp;
678 }
679
680 l = fread(&b->comp[b->comp_sz], 1, Z_BUFF_SIZE - b->comp_sz, b->fp);
681 if (l <= 0)
682 return -1;
683
684 b->comp_sz += l;
685 return 0;
686 }
687
688 typedef struct {
689 unsigned char comp[Z_BUFF_SIZE];
690 unsigned char uncomp[Z_BUFF_SIZE];
691 size_t comp_sz, uncomp_sz;
692 int ignore_chksum;
693 } bgzf_decode_job;
694 static bgzf_decode_job *last_job = NULL;
695
696
697 /*
698 * Uncompresses a single zlib buffer.
699 */
700 #ifdef HAVE_LIBDEFLATE
bgzf_decode_thread(void * arg)701 void *bgzf_decode_thread(void *arg) {
702 bgzf_decode_job *j = (bgzf_decode_job *)arg;
703 struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
704 if (!z) return NULL;
705
706 int err = libdeflate_deflate_decompress(z, j->comp, j->comp_sz,
707 j->uncomp, Z_BUFF_SIZE, &j->uncomp_sz);
708
709 libdeflate_free_decompressor(z);
710 if (err != LIBDEFLATE_SUCCESS) {
711 fprintf(stderr, "Libdeflate returned error code %d\n", err);
712 return NULL;
713 }
714
715 if (!j->ignore_chksum) {
716 uint32_t crc1=libdeflate_crc32(0L, (unsigned char *)j->uncomp, j->uncomp_sz);
717 uint32_t crc2;
718 memcpy(&crc2, j->comp + j->comp_sz, 4);
719 crc2 = le_int2(crc2);
720 if (crc1 != crc2) {
721 fprintf(stderr, "Invalid CRC in Deflate stream: %08x vs %08x\n",
722 crc1, crc2);
723 return NULL;
724 }
725 }
726
727 return j;
728 }
729 #else
bgzf_decode_thread(void * arg)730 void *bgzf_decode_thread(void *arg) {
731 bgzf_decode_job *j = (bgzf_decode_job *)arg;
732 int err;
733 z_stream s;
734
735 s.avail_in = j->comp_sz;
736 s.next_in = j->comp;
737 s.avail_out = Z_BUFF_SIZE;
738 s.next_out = j->uncomp;
739 s.total_out = 0;
740 s.zalloc = NULL;
741 s.zfree = NULL;
742 s.opaque = NULL;
743
744 inflateInit2(&s, -15);
745 err = inflate(&s, Z_FINISH);
746 inflateEnd(&s);
747
748 if (err != Z_STREAM_END) {
749 fprintf(stderr, "Inflate returned error code %d\n", err);
750 return NULL;
751 }
752
753 if (!j->ignore_chksum) {
754 uint32_t crc1=iolib_crc32(0L, (unsigned char *)j->uncomp, s.total_out);
755 uint32_t crc2;
756 memcpy(&crc2, j->comp + j->comp_sz, 4);
757 crc2 = le_int2(crc2);
758 if (crc1 != crc2) {
759 fprintf(stderr, "Invalid CRC in Deflate stream: %08x vs %08x\n",
760 crc1, crc2);
761 return NULL;
762 }
763 }
764
765 j->uncomp_sz = s.total_out;
766
767 return j;
768 }
769 #endif
770
771 /*
772 * Converts compressed input to the uncompressed output buffer
773 *
774 * Returns number of additional output bytes on success
775 * 0 on eof
776 * -1 on failure.
777 */
bam_uncompress_input(bam_file_t * b)778 static int bam_uncompress_input(bam_file_t *b) {
779 int err = Z_OK;
780 unsigned char *bgzf;
781 int xlen, bsize;
782 bgzf_decode_job *j;
783
784 assert(b->uncomp_sz == 0);
785
786 if (!b->gzip) {
787 /* Already uncompressed, so easy to deal with */
788 if (!b->comp_sz)
789 if (-1 == bam_more_input(b))
790 return 0;
791
792 b->uncomp_p = b->comp_p;
793 b->uncomp_sz = b->comp_sz;
794 b->comp_sz = 0;
795 return b->uncomp_sz;
796 }
797
798 if (b->pool) {
799 t_pool_result *res;
800
801 /* Multi-threaded decoding. Assume BGZF for now */
802 //while (b->nd_jobs < b->pool->qsize) {
803 while (t_pool_results_queue_sz(b->dqueue) < b->pool->qsize) {
804 bgzf_decode_job *j;
805 int nonblock;
806
807 if (b->job_pending) {
808 j = b->job_pending;
809 } else {
810 if (!(j = malloc(sizeof(*j))))
811 return -1;
812
813 empty_block_1:
814 if (b->comp_sz < 28 && !b->eof) {
815 if (-1 == bam_more_input(b)) {
816 b->eof = 1;
817 if (b->comp_sz < 28) {
818 b->eof = 2;
819 free(j);
820 break;
821 }
822 }
823 } else if (b->comp_sz == 0 && b->eof) {
824 b->eof = 2;
825 free(j);
826 break;
827 }
828
829 if (!b->eof) {
830 if (memcmp(b->comp_p, EOF_BLOCK, 28) == 0) {
831 b->eof_block = 1;
832 b->comp_p += 28; b->comp_sz -= 28;
833 goto empty_block_1;
834 } else {
835 b->eof_block = 0;
836 }
837 }
838
839 bgzf = b->comp_p;
840 b->comp_p += 10; b->comp_sz -= 10;
841
842 if (bgzf[0] != 31 || bgzf[1] != 139) {
843 fprintf(stderr, "Zlib magic number failure\n");
844 free(j);
845 return -1; /* magic number failure */
846 }
847
848 if ((bgzf[3] & 4) == 4) {
849 /* has extra fields, eg BGZF */
850 xlen = bgzf[10] + bgzf[11]*256;
851 b->comp_p += 2; b->comp_sz -= 2;
852 } else {
853 fprintf(stderr, "Not BGZF\n");
854 free(j);
855 return -1;
856 }
857
858 if (xlen != 6) {
859 fprintf(stderr, "XLEN != 6\n");
860 free(j);
861 return -1;
862 }
863
864 b->comp_p += 6;
865 b->comp_sz -= 6;
866
867 if (bgzf[12] != 'B' || bgzf[13] != 'C' ||
868 bgzf[14] != 2 || bgzf[15] != 0) {
869 fprintf(stderr, "BGZF XLEN block incorrect\n");
870 free(j);
871 return -1;
872 }
873 bsize = bgzf[16] + bgzf[17]*256;
874 bsize -= 6+19;
875
876 if (b->comp_sz < bsize + 8) {
877 do {
878 if (bam_more_input(b) == -1) {
879 fprintf(stderr, "EOF - truncated block\n");
880 free(j);
881 return -1; /* Truncated */
882 }
883 } while (b->comp_sz < bsize + 8);
884 }
885
886 memcpy(j->comp, b->comp_p, bsize+8);
887 j->comp_sz = bsize;
888 j->ignore_chksum = b->ignore_chksum;
889
890 b->comp_p += bsize + 8; // crc & isize
891 b->comp_sz -= bsize + 8; // crc & isize
892 }
893
894 //nonblock = b->nd_jobs ? 1 : 0;
895 nonblock = t_pool_results_queue_len(b->dqueue) ? 1 : 0;
896
897 if (-1 == t_pool_dispatch2(b->pool, b->dqueue,
898 bgzf_decode_thread, j, nonblock)) {
899 /* Would block */
900 b->job_pending = j;
901 break;
902 } else {
903 b->job_pending = NULL;
904 b->nd_jobs++;
905 }
906 }
907
908 if (b->eof == 2 && t_pool_results_queue_empty(b->dqueue))
909 return 0;
910
911 res = t_pool_next_result_wait(b->dqueue);
912 if (!res || !res->data) {
913 fprintf(stderr, "t_pool_next_result failure\n");
914 return -1;
915 }
916
917 b->nd_jobs--;
918
919 /* make a start on the next job as we know there is room now */
920 if (b->job_pending) {
921 if (0 == t_pool_dispatch2(b->pool, b->dqueue,
922 bgzf_decode_thread,
923 b->job_pending, 1)) {
924 b->job_pending = NULL;
925 b->nd_jobs++;
926 }
927 }
928
929 j = (bgzf_decode_job *)res->data;
930
931 #if 0
932 memcpy(b->uncomp, j->uncomp, j->uncomp_sz);
933 b->uncomp_p = b->uncomp;
934 #else
935 if (last_job)
936 free(last_job);
937 last_job = j;
938 b->uncomp_p = j->uncomp;
939 #endif
940 b->uncomp_sz = j->uncomp_sz;
941 t_pool_delete_result(res, 0);
942 if (b->idx){
943 if (gzi_index_add_block(b->idx, j->comp_sz + 26, b->uncomp_sz))
944 return -1;
945 }
946 } else {
947 /* Single threaded version, or non-bgzf format data */
948
949 /* Uncompress another BGZF block */
950 /* BGZF header */
951 empty_block_2:
952 if (b->comp_sz < 28) {
953 if (-1 == bam_more_input(b))
954 return 0;
955 if (b->comp_sz < 28)
956 return -1;
957 }
958
959 if (memcmp(b->comp_p, EOF_BLOCK, 28) == 0) {
960 b->eof_block = 1;
961 b->comp_p += 28; b->comp_sz -= 28;
962 goto empty_block_2;
963 } else {
964 b->eof_block = 0;
965 }
966
967 if (b->z_finish) {
968 /*
969 * BGZF header is gzip + extra fields.
970 */
971 bgzf = b->comp_p;
972 b->comp_p += 10; b->comp_sz -= 10;
973
974 if (bgzf[0] != 31 || bgzf[1] != 139)
975 return -1; /* magic number failure */
976 if ((bgzf[3] & 4) == 4) {
977 /* has extra fields, eg BGZF */
978 xlen = bgzf[10] + bgzf[11]*256;
979 b->comp_p += 2; b->comp_sz -= 2;
980 } else {
981 xlen = 0;
982 }
983 } else {
984 /* Continuing with an existing data stream */
985 xlen = 0;
986 }
987
988
989 /* BGZF */
990 if (xlen == 6) {
991 b->bgzf = 1;
992 b->comp_p += 6; b->comp_sz -= 6;
993
994 if (bgzf[12] != 'B' || bgzf[13] != 'C' ||
995 bgzf[14] != 2 || bgzf[15] != 0)
996 return -1;
997 bsize = bgzf[16] + bgzf[17]*256;
998 bsize -= 6+19;
999
1000 /* Inflate */
1001 if (b->comp_sz < bsize + 8) {
1002 do {
1003 if (bam_more_input(b) == -1) {
1004 fprintf(stderr, "EOF - truncated block\n");
1005 return -1; /* Truncated */
1006 }
1007 } while (b->comp_sz < bsize + 8);
1008 }
1009
1010 #ifdef HAVE_LIBDEFLATE
1011 struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1012 if (!z) return -1;
1013
1014 err = libdeflate_deflate_decompress(z, b->comp_p, bsize,
1015 b->uncomp, Z_BUFF_SIZE, &b->uncomp_sz);
1016
1017 libdeflate_free_decompressor(z);
1018 if (err != LIBDEFLATE_SUCCESS) {
1019 fprintf(stderr, "Libdeflate returned error code %d\n", err);
1020 return -1;
1021 }
1022
1023 b->comp_p += bsize + 8; /* crc & isize */
1024 b->comp_sz -= bsize + 8;
1025 b->uncomp_p = b->uncomp;
1026 #else
1027 b->s.avail_in = bsize;
1028 b->s.next_in = b->comp_p;
1029 b->s.avail_out = Z_BUFF_SIZE;
1030 b->s.next_out = b->uncomp;
1031 b->s.total_out = 0;
1032
1033 inflateReset(&b->s);
1034 err = inflate(&b->s, Z_FINISH);
1035
1036 if (err != Z_STREAM_END) {
1037 fprintf(stderr, "Inflate returned error code %d\n", err);
1038 return -1;
1039 }
1040 b->z_finish = 1;
1041
1042 b->comp_p += bsize + 8; /* crc & isize */
1043 b->comp_sz -= bsize + 8;
1044 b->uncomp_sz = b->s.total_out;
1045 b->uncomp_p = b->uncomp;
1046 #endif
1047
1048 if (b->idx){
1049 if (gzi_index_add_block(b->idx, bsize + 26, b->uncomp_sz))
1050 return -1;
1051 }
1052
1053 if (!b->ignore_chksum) {
1054 uint32_t crc1 = iolib_crc32(0L, (unsigned char *)b->uncomp,
1055 b->uncomp_sz);
1056 uint32_t crc2;
1057 memcpy(&crc2, b->comp_p-8, 4);
1058 crc2 = le_int2(crc2);
1059 if (crc1 != crc2) {
1060 fprintf(stderr, "Invalid CRC in Deflate stream: "
1061 "%08x vs %08x\n", crc1, crc2);
1062 return -1;
1063 }
1064 }
1065 } else {
1066 /* Some other gzip variant, but possibly still having xlen */
1067 /* NB: we don't check CRCs here, but I don't think these BAMs exist either */
1068 while (xlen) {
1069 int d = MIN(b->comp_sz, xlen);
1070 xlen -= d;
1071 b->comp_p += d;
1072 b->comp_sz -= d;
1073 if (b->comp_sz == 0)
1074 bam_more_input(b);
1075 if (b->comp_sz == 0)
1076 return -1; /* truncated file */
1077 }
1078
1079 b->s.avail_in = b->comp_sz;
1080 b->s.next_in = b->comp_p;
1081 b->s.avail_out = Z_BUFF_SIZE;
1082 b->s.next_out = b->uncomp;
1083 b->s.total_out = 0;
1084 if (b->z_finish)
1085 inflateReset(&b->s);
1086
1087 err = inflate(&b->s, Z_BLOCK);
1088 //printf("err=%d\n", err);
1089
1090 if (err == Z_OK || err == Z_STREAM_END || err == Z_BUF_ERROR) {
1091 b->comp_p += b->comp_sz - b->s.avail_in;
1092 b->comp_sz = b->s.avail_in;
1093 b->uncomp_sz = b->s.total_out;
1094 b->uncomp_p = b->uncomp;
1095
1096 if (err == Z_STREAM_END) {
1097 b->z_finish = 1;
1098
1099 /* Consume (ignore) CRC & ISIZE */
1100 if (b->comp_sz < 8)
1101 bam_more_input(b);
1102
1103 if (b->comp_sz < 8)
1104 return -1; /* truncated file */
1105
1106 b->comp_sz -= 8;
1107 b->comp_p += 8;
1108 } else {
1109 b->z_finish = 0;
1110 }
1111 } else {
1112 fprintf(stderr, "Inflate returned error code %d\n", err);
1113 return -1;
1114 }
1115 }
1116 }
1117
1118 /*
1119 * Zero length blocks may not actually be EOF, just bizarre. We return
1120 * 0 elsewhere for the EOF case, so if we got here and b->uncomp_sz is 0
1121 * then go around again.
1122 */
1123 return b->uncomp_sz ? b->uncomp_sz : bam_uncompress_input(b);
1124
1125 }
1126
1127 #ifdef ALLOW_UAC
1128 #if SIZEOF_LONG == 8 && ULONG_MAX != 0xffffffff
1129 #define COPY_CPF_TO_CPTM(n) \
1130 do { \
1131 uint64_t *cpfi = (uint64_t *)cpf; \
1132 uint64_t *cpti = (uint64_t *)cpt; \
1133 uint64_t *orig = cpfi; \
1134 while (!hasless(*cpfi,10)) { \
1135 *cpti++ = *cpfi++ - (n)*0x0101010101010101UL; \
1136 } \
1137 cpf += (cpfi-orig)*8; cpt += (cpfi-orig)*8; \
1138 while (*cpf > '\t') \
1139 *cpt++ = *cpf++ - (n); \
1140 } while (0)
1141
1142 #define CPF_SKIP() \
1143 do { \
1144 uint64_t *cpfi = (uint64_t *)cpf; \
1145 uint64_t *orig = cpfi; \
1146 while(!hasless(*cpfi,10)) \
1147 cpfi++; \
1148 cpf += (cpfi-orig)*8; \
1149 while (*cpf > '\t') \
1150 cpf++; \
1151 } while (0)
1152
1153 #else
1154 #define COPY_CPF_TO_CPTM(n) \
1155 do { \
1156 uint32_t *cpfi = (uint32_t *)cpf; \
1157 uint32_t *cpti = (uint32_t *)cpt; \
1158 uint32_t *orig = cpfi; \
1159 while (!hasless(*cpfi,10)) { \
1160 *cpti++ = *cpfi++ - (n)*0x01010101; \
1161 } \
1162 cpf += (cpfi-orig)*4; cpt += (cpfi-orig)*4; \
1163 while (*cpf > '\t') \
1164 *cpt++ = *cpf++ - (n); \
1165 } while (0)
1166
1167 #define CPF_SKIP() \
1168 do { \
1169 uint32_t *cpfi = (uint32_t *)cpf; \
1170 uint32_t *orig = cpfi; \
1171 while(!hasless(*cpfi,10)) \
1172 cpfi++; \
1173 cpf += (cpfi-orig)*4; \
1174 while (*cpf > '\t') \
1175 cpf++; \
1176 } while (0)
1177 #endif
1178
1179 #else /* !ALLOW_UAC */
1180 #define COPY_CPF_TO_CPTM(n) \
1181 do { \
1182 while (*cpf > '\t') \
1183 *cpt++ = *cpf++ - (n); \
1184 } while (0);
1185
1186 #define CPF_SKIP() \
1187 do { \
1188 while (*cpf > '\t') \
1189 cpf++; \
1190 } while (0)
1191 #endif
1192
1193
1194 /* Custom strtol for aux tags, always base 10 */
STRTOL64(const char * v,const char ** rv,int b)1195 static int64_t inline STRTOL64(const char *v, const char **rv, int b) {
1196 int64_t n = 0;
1197 int neg = 1;
1198 switch(*v) {
1199 case '-':
1200 neg=-1;
1201 break;
1202 case '+':
1203 break;
1204 case '0': case '1': case '2': case '3': case '4':
1205 case '5': case '6': case '7': case '8': case '9':
1206 n = *v - '0';
1207 break;
1208 default:
1209 *rv = v;
1210 return 0;
1211 }
1212 v++;
1213
1214 while (isdigit(*v))
1215 n = n*10 + *v++ - '0';
1216 *rv = v;
1217 return neg*n;
1218 }
1219
1220 /*
1221 * Decodes the next line of SAM into a bam_seq_t struct.
1222 *
1223 * Returns 1 on success
1224 * 0 on eof
1225 * -1 on error
1226 */
sam_next_seq(bam_file_t * b,bam_seq_t ** bsp)1227 static int sam_next_seq(bam_file_t *b, bam_seq_t **bsp) {
1228 int used_l, sign;
1229 int64_t n;
1230 unsigned char *cpf, *cpt, *cp;
1231 int cigar_len;
1232 bam_seq_t *bs;
1233 HashItem *hi;
1234 int64_t start, end;
1235 SAM_hdr *sh = b->header;
1236
1237 static const char lookup[256] = {
1238 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* 00 */
1239 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* 10 */
1240 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* 20 */
1241 15,15,15,15,15,15,15,15, 15,15,15,15,15, 0,15,15, /* 30 */
1242 15, 1,14, 2,13,15,15, 4, 11,15,15,12,15, 3,15,15, /* 40 */
1243 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15,15,15,15,15, /* 50 */
1244 15, 1,14, 2,13,15,15, 4, 11,15,15,12,15, 3,15,15, /* 60 */
1245 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15,15,15,15,15, /* 70 */
1246 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* 80 */
1247 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* 90 */
1248 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* a0 */
1249 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* b0 */
1250 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* c0 */
1251 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* d0 */
1252 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, /* e0 */
1253 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15};/* f0 */
1254
1255 /* Fetch a single line */
1256 if ((used_l = bam_get_line(b, &b->sam_str, &b->alloc_l)) <= 0) {
1257 return used_l;
1258 }
1259
1260 used_l *= 4; // FIXME, what is the correct max size?
1261
1262 /* Over sized memory, for worst case? FIXME: cigar can break this! */
1263 if (!*bsp || used_l + sizeof(*bs) > (*bsp)->alloc) {
1264 if (!(*bsp = realloc(*bsp, used_l + sizeof(*bs))))
1265 return -1;
1266 (*bsp)->alloc = used_l + sizeof(*bs);
1267 (*bsp)->blk_size = 0; /* compute later */
1268 }
1269
1270 bs = *bsp;
1271 bs->flag_packed = 0;
1272 bs->bin_packed = 0;
1273
1274 /* Decode line */
1275 cpf = b->sam_str;
1276 cpt = (unsigned char *)&bs->data;
1277
1278 /* Name */
1279 cp = cpf;
1280 //while (*cpf && *cpf != '\t')
1281 COPY_CPF_TO_CPTM(0);
1282 // while (*cpf > '\t')
1283 // *cpt++ = *cpf++;
1284 *cpt++ = 0;
1285 if (!*cpf++) return -1;
1286 if (cpf-cp > 255) {
1287 fprintf(stderr, "SAM name length >= 256 characters are invalid\n");
1288 return -1;
1289 }
1290 bam_set_name_len(bs, cpf-cp);
1291
1292 /* flag */
1293 n = 0;
1294 //while (*cpf && *cpf != '\t')
1295 while (*cpf > '\t')
1296 n = n*10 + *cpf++-'0';
1297 if (!*cpf++) return -1;
1298 bam_set_flag(bs, n);
1299
1300 /* ref */
1301 cp = cpf;
1302 CPF_SKIP();
1303 if (*cp == '*') {
1304 /* Unmapped */
1305 bs->ref = -1;
1306 } else {
1307 hi = HashTableSearch(b->header->ref_hash, (char *)cp, cpf-cp);
1308 if (!hi) {
1309 SAM_hdr *sh = b->header;
1310 HashData hd;
1311
1312 fprintf(stderr, "Reference seq %.*s unknown\n", (int)(cpf-cp), cp);
1313
1314 /* Fabricate it instead */
1315 sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
1316 if (!sh->ref)
1317 return -1;
1318 sh->ref[sh->nref].len = 0; /* Unknown value */
1319 sh->ref[sh->nref].name = malloc(cpf-cp+1);
1320 if (!sh->ref[sh->nref].name)
1321 return -1;
1322 memcpy(sh->ref[sh->nref].name, cp, cpf-cp);
1323 sh->ref[sh->nref].name[cpf-cp] = 0;
1324
1325 hd.i = sh->nref;
1326 hi = HashTableAdd(sh->ref_hash, sh->ref[sh->nref].name, 0,
1327 hd, NULL);
1328 sh->nref++;
1329 }
1330 bs->ref = hi->data.i;
1331 }
1332 if (!*cpf++) return -1;
1333
1334 /* Pos */
1335 n = 0;
1336 //while (*cpf && *cpf != '\t')
1337 while (*cpf > '\t')
1338 n = n*10 + *cpf++-'0';
1339 if (!*cpf++) return -1;
1340 bs->pos = n-1;
1341 start = end = bs->pos;
1342
1343 /* map qual */
1344 n = 0;
1345 //while (*cpf && *cpf != '\t')
1346 while (*cpf > '\t')
1347 n = n*10 + *cpf++-'0';
1348 if (!*cpf++) return -1;
1349 bam_set_map_qual(bs, n);
1350
1351 /* cigar */
1352 n = 0;
1353 cigar_len = 0;
1354 cpt = (unsigned char *)bam_cigar(bs);
1355 if (*cpf == '*') {
1356 cpf++;
1357 } else {
1358 //while (*cpf && *cpf != '\t') {
1359 while (*cpf > '\t') {
1360 if (isdigit(*cpf)) {
1361 n = n*10 + *cpf++ - '0';
1362 } else {
1363 unsigned char op;
1364 union {
1365 unsigned char c[4];
1366 uint32_t i;
1367 } c4i;
1368
1369 switch (*cpf++) {
1370 case 'M': op=BAM_CMATCH; end+=n; break;
1371 case 'I': op=BAM_CINS; break;
1372 case 'D': op=BAM_CDEL; end+=n; break;
1373 case 'N': op=BAM_CREF_SKIP; end+=n; break;
1374 case 'S': op=BAM_CSOFT_CLIP; break;
1375 case 'H': op=BAM_CHARD_CLIP; break;
1376 case 'P': op=BAM_CPAD; break;
1377 case '=': op=BAM_CBASE_MATCH; end+=n; break;
1378 case 'X': op=BAM_CBASE_MISMATCH; end+=n; break;
1379 default:
1380 fprintf(stderr, "Unknown cigar opcode '%c'\n", cpf[-1]);
1381 return -1;
1382 }
1383
1384 c4i.i = (n << 4) | op;
1385 *cpt++ = c4i.c[0];
1386 *cpt++ = c4i.c[1];
1387 *cpt++ = c4i.c[2];
1388 *cpt++ = c4i.c[3];
1389
1390 n = 0;
1391 cigar_len++;
1392 }
1393 }
1394 }
1395 bam_set_cigar_len(bs, cigar_len);
1396 //printf("pos %d, %d..%d => bin %d\n", bs->pos, start, end, reg2bin(start, end));
1397 // SAM spec changed 8th April 2014 (f7651377) for unmapped data.
1398 bam_set_bin(bs, reg2bin(start, bs->flag & BAM_FUNMAP ? start+1 : end));
1399 if (!*cpf++) return -1;
1400
1401 /* mate ref name */
1402 cp = cpf;
1403 CPF_SKIP();
1404 if (*cp == '*' && cp[1] == '\t') {
1405 bs->mate_ref = -1;
1406 } else if (*cp == '=' && cp[1] == '\t') {
1407 bs->mate_ref = bs->ref;
1408 } else {
1409 hi = HashTableSearch(sh->ref_hash, (char *)cp, cpf-cp);
1410 if (!hi) {
1411 HashData hd;
1412
1413 fprintf(stderr, "Mate ref seq \"%.*s\" unknown\n", (int)(cpf-cp), cp);
1414
1415 /* Fabricate it instead */
1416 sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
1417 if (!sh->ref)
1418 return -1;
1419 sh->ref[sh->nref].len = 0; /* Unknown value */
1420 sh->ref[sh->nref].name = malloc(cpf-cp+1);
1421 if (!sh->ref[sh->nref].name)
1422 return -1;
1423 memcpy(sh->ref[sh->nref].name, cp, cpf-cp);
1424 sh->ref[sh->nref].name[cpf-cp] = 0;
1425
1426 hd.i = sh->nref;
1427 hi = HashTableAdd(sh->ref_hash, sh->ref[sh->nref].name, 0,
1428 hd, NULL);
1429 sh->nref++;
1430 }
1431 bs->mate_ref = hi->data.i;
1432 }
1433 if (!*cpf++) return -1;
1434
1435 /* mate pos */
1436 n = 0;
1437 //while (*cpf && *cpf != '\t')
1438 while (*cpf > '\t')
1439 n = n*10 + *cpf++-'0';
1440 if (!*cpf++) return -1;
1441 bs->mate_pos = n-1;
1442
1443 /* insert size */
1444 n = 0;
1445 if (*cpf == '-') {
1446 sign = -1;
1447 cpf++;
1448 } else {
1449 sign = 1;
1450 }
1451 //while (*cpf && *cpf != '\t')
1452 while (*cpf > '\t')
1453 n = n*10 + *cpf++-'0';
1454 if (!*cpf++) return -1;
1455 bs->ins_size = n*sign;
1456
1457 /* seq */
1458 cp = cpf;
1459 //while (*cpf && *cpf != '\t') {
1460 if (cpf[0] == '*' && cpf[1] == '\t') {
1461 cpf++;
1462 bs->len = 0;
1463 } else {
1464 while (cpf[0] > '\t' && cpf[1] > '\t') {
1465 /* 9% of cpu time is here */
1466 *cpt++ = (lookup[cpf[0]]<<4) | lookup[cpf[1]];
1467 cpf+=2;
1468 }
1469
1470 if (*cpf > '\t')
1471 *cpt++ = lookup[*cpf++]<<4;
1472
1473 // while (*cpf > '\t') {
1474 // *cpt = lookup[*cpf]<<4;
1475 // if (*++cpf <= '\t') {
1476 // cpt++;
1477 // break;
1478 // }
1479 // *cpt++ |= lookup[*cpf];
1480 // cpf++;
1481 // }
1482 bs->len = cpf-cp;
1483 }
1484 if (!*cpf++) return -1;
1485
1486 /* qual */
1487 cp = cpf;
1488 //while (*cpf && *cpf != '\t')
1489 if (cpf[0] == '*' && (cpf[1] == '\0' || cpf[1] == '\t')) {
1490 /* no qual */
1491 memset(cpt, '\xFF', bs->len);
1492 cpt += bs->len;
1493 cpf++;
1494 } else {
1495 COPY_CPF_TO_CPTM('!');
1496 // while (*cpf > '\t')
1497 // *cpt++ = *cpf++ - '!';
1498 }
1499
1500 if ((char *)cpt != (char *)(bam_aux(bs))) return -1;
1501
1502 if (!*cpf++ || b->no_aux) goto skip_aux;
1503
1504 /* aux */
1505 while (*cpf) {
1506 unsigned char *key = cpf, *value;
1507 if (!(key[0] && key[1] && key[2] == ':' && key[3] && key[4] == ':'))
1508 return -1;
1509 cpf += 5;
1510
1511 value = cpf;
1512 CPF_SKIP();
1513
1514 *cpt++ = key[0];
1515 *cpt++ = key[1];
1516
1517 switch(key[3]) {
1518 int64_t n;
1519 case 'A':
1520 *cpt++ = 'A';
1521 *cpt++ = *value;
1522 break;
1523
1524 case 'i':
1525 //n = atoi((char *)value);
1526 n = STRTOL64((char *)value, (const char **)&value, 10);
1527 if (n >= 0) {
1528 if (n < 256) {
1529 *cpt++ = 'C';
1530 *cpt++ = n;
1531 } else if (n < 65536) {
1532 *cpt++ = 'S';
1533 STORE_UINT16(cpt, n);
1534 } else {
1535 *cpt++ = 'I';
1536 STORE_UINT32(cpt, n);
1537 }
1538 } else {
1539 if (n >= -128 && n < 128) {
1540 *cpt++ = 'c';
1541 *cpt++ = n;
1542 } else if (n >= -32768 && n < 32768) {
1543 *cpt++ = 's';
1544 STORE_UINT16(cpt, n);
1545 } else {
1546 *cpt++ = 'i';
1547 STORE_UINT32(cpt, n);
1548 }
1549 }
1550 break;
1551
1552 case 'f': {
1553 union {
1554 float f;
1555 int i;
1556 } u;
1557 u.f = atof((char *)value);
1558 *cpt++ = 'f';
1559 STORE_UINT32(cpt, u.i);
1560 break;
1561 }
1562
1563 case 'Z':
1564 *cpt++ = 'Z';
1565 memcpy(cpt, value, cpf-value); cpt += cpf-value;
1566 *cpt++ = 0;
1567 break;
1568
1569 case 'H':
1570 *cpt++ = 'H';
1571 memcpy(cpt, value, cpf-value); cpt += cpf-value;
1572 *cpt++ = 0;
1573 break;
1574
1575 case 'B': {
1576 char subtype = *value++;
1577 unsigned char *sz;
1578 int count = 0;
1579
1580 if (subtype == '\0')
1581 break;
1582
1583 *cpt++ = 'B';
1584 *cpt++ = subtype;
1585 sz = cpt; cpt += 4; /* Fill out later */
1586
1587 while (*value == ',') {
1588 value++;
1589 switch (subtype) {
1590 case 'c': case 'C':
1591 *cpt++ = strtol((char *)value, (char **)&value, 10);
1592 break;
1593
1594 case 's': case 'S':
1595 n = strtol((char *)value, (char **)&value, 10);
1596 STORE_UINT16(cpt, n);
1597 break;
1598
1599 case 'i': case 'I':
1600 n = strtoll((char *)value, (char **)&value, 10);
1601 STORE_UINT32(cpt, n);
1602 break;
1603
1604 case 'f': {
1605 union {
1606 float f;
1607 int i;
1608 } u;
1609
1610 u.f = strtod((char *)value, (char **)&value);
1611 STORE_UINT32(cpt, u.i);
1612 break;
1613 }
1614 }
1615 count++;
1616 }
1617 if (value != cpf) {
1618 fprintf(stderr, "Malformed %c%c:B:... auxiliary field\n",
1619 key[0], key[1]);
1620 value = cpf;
1621 }
1622 STORE_UINT32(sz, count);
1623 break;
1624 }
1625
1626 default:
1627 fprintf(stderr, "Unknown aux format code '%c'\n", key[3]);
1628 break;
1629 }
1630
1631 if (*cpf == '\t')
1632 cpf++;
1633 }
1634
1635 skip_aux:
1636 *cpt++ = 0;
1637
1638 bs->blk_size = (unsigned char *)cpt - (unsigned char *)&bs->ref - 1;
1639 if (bs->blk_size >= bs->alloc)
1640 abort();
1641
1642 return 1;
1643 }
1644
1645 /*
1646 * Fills out the next bam_seq_t struct.
1647 * bs must be non-null, but *bs may be NULL or an existing bam_seq_t pointer.
1648 * This function will alloc and/or grow the memory accordingly, allowing for
1649 * efficient reuse.
1650 *
1651 * Returns 1 on success
1652 * 0 on eof
1653 * -1 on error
1654 */
1655 #ifdef ALLOW_UAC
bam_get_seq(bam_file_t * b,bam_seq_t ** bsp)1656 int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
1657 int32_t blk_size, blk_ret;
1658 bam_seq_t *bs;
1659 uint32_t i32;
1660
1661 b->line++;
1662
1663 if (!b->bam)
1664 return sam_next_seq(b, bsp);
1665
1666 if (b->next_len > 0) {
1667 blk_size = b->next_len;
1668 } else {
1669 if (4 != bam_read(b, &blk_size, 4))
1670 return 0;
1671 blk_size = le_int4(blk_size);
1672 if (blk_size < 36) /* Minimum valid BAM record size */
1673 return -1;
1674 }
1675
1676 if (!*bsp || blk_size+44 > (*bsp)->alloc) {
1677 /* 44 extra is for bs->alloc to bs->cigar_len plus next_len */
1678 if (!(bs = realloc(*bsp, blk_size+44)))
1679 return -1;
1680 *bsp = bs;
1681 (*bsp)->alloc = blk_size+44;
1682 (*bsp)->blk_size = blk_size;
1683 }
1684 bs = *bsp;
1685
1686 if ((blk_ret = bam_read(b, &bs->ref, blk_size+4)) == 0)
1687 return 0;
1688
1689 if (blk_size+4 != blk_ret) {
1690 if (blk_size != blk_ret) {
1691 return -1;
1692 } else {
1693 b->next_len = 0;
1694 ((char *)(&bs->ref))[blk_size] = 0;
1695 }
1696 } else {
1697 memcpy(&b->next_len, &((char *)(&bs->ref))[blk_size], 4);
1698 ((char *)(&bs->ref))[blk_size] = 0;
1699 }
1700 b->next_len = le_int4(b->next_len);
1701
1702 bs->blk_size = blk_size;
1703 bs->ref = le_int4(bs->ref);
1704 bs->pos = le_int4(bs->pos_32);
1705
1706 // order of bit-fields in struct is platform specific, so manually decode
1707 i32 = le_int4(bs->bin_packed);
1708 bs->bin = i32 >> 16;
1709 bs->map_qual = (i32 >> 8) & 0xff;
1710 bs->name_len = i32 & 0xff;
1711
1712 i32 = le_int4(bs->flag_packed);
1713 bs->flag = i32 >> 16;
1714 bs->cigar_len = i32 & 0xffff;
1715
1716 bs->len = le_int4(bs->len);
1717 bs->mate_ref = le_int4(bs->mate_ref);
1718 bs->mate_pos = le_int4(bs->mate_pos_32);
1719 bs->ins_size = le_int4(bs->ins_size_32);
1720
1721 if (10 == be_int4(10)) {
1722 int i, cigar_len = bam_cigar_len(bs);
1723 uint32_t *cigar = bam_cigar(bs);
1724 for (i = 0; i < cigar_len; i++) {
1725 cigar[i] = le_int4(cigar[i]);
1726 }
1727 }
1728
1729 return 1;
1730 }
1731
1732 #else
1733
bam_get_seq(bam_file_t * b,bam_seq_t ** bsp)1734 int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
1735 int32_t blk_size, blk_ret;
1736 bam_seq_t *bs;
1737 uint32_t i32;
1738
1739 b->line++;
1740
1741 if (!b->bam)
1742 return sam_next_seq(b, bsp);
1743
1744 if (b->next_len > 0) {
1745 blk_size = b->next_len;
1746 } else {
1747 if (4 != bam_read(b, &blk_size, 4))
1748 return 0;
1749 blk_size = le_int4(blk_size);
1750 if (blk_size < 36) /* Minimum valid BAM record size */
1751 return -1;
1752 }
1753
1754 if (!*bsp || blk_size+24 > (*bsp)->alloc) {
1755 if (!(bs = realloc(*bsp, blk_size+24)))
1756 return -1;
1757 *bsp = bs;
1758 (*bsp)->alloc = blk_size+24;
1759 (*bsp)->blk_size = blk_size;
1760 }
1761 bs = *bsp;
1762
1763 /* The fixed-sized fields */
1764 if ((blk_ret = bam_read(b, &bs->ref, 32)) == 0)
1765 return 0;
1766
1767 if (blk_ret != 32)
1768 return -1;
1769
1770 bs->blk_size = blk_size;
1771 bs->ref = le_int4(bs->ref);
1772 bs->pos = le_int4(bs->pos_32);
1773
1774 // order of bit-fields in struct is platform specific, so manually decode
1775 i32 = le_int4(bs->bin_packed);
1776 bs->bin = i32 >> 16;
1777 bs->map_qual = (i32 >> 8) & 0xff;
1778 bs->name_len = i32 & 0xff;
1779
1780 i32 = le_int4(bs->flag_packed);
1781 bs->flag = i32 >> 16;
1782 bs->cigar_len = i32 & 0xffff;
1783
1784 bs->len = le_int4(bs->len);
1785 bs->mate_ref = le_int4(bs->mate_ref);
1786 bs->mate_pos = le_int4(bs->mate_pos_32);
1787 bs->ins_size = le_int4(bs->ins_size_32);
1788
1789 /* Name */
1790 if (bam_read(b, &bs->data, bam_name_len(bs)) != bam_name_len(bs))
1791 return -1;
1792
1793 /* Pad name out to end on a word-aligned boundary */
1794 blk_ret = blk_size - 32 - bam_name_len(bs);
1795 //bam_set_name_len(bs, round4(bam_name_len(bs)));
1796
1797 bs->blk_size += round4(bam_name_len(bs)) - bam_name_len(bs);
1798
1799 /* The remainder, word aligned */
1800 blk_size = blk_ret;
1801 if ((blk_ret = bam_read(b, (char *)bam_cigar(bs), blk_size+4)) == 0 &&
1802 blk_size != 0)
1803 return 0;
1804 if (blk_size+4 != blk_ret) {
1805 if (blk_size != blk_ret) {
1806 return -1;
1807 } else {
1808 b->next_len = 0;
1809 ((char *)bam_cigar(bs))[blk_size] = 0;
1810 }
1811 } else {
1812 memcpy(&b->next_len, &((char *)bam_cigar(bs))[blk_size], 4);
1813 ((char *)bam_cigar(bs))[blk_size] = 0;
1814 }
1815 b->next_len = le_int4(b->next_len);
1816
1817 if (10 == be_int4(10)) {
1818 int i, cigar_len = bam_cigar_len(bs);
1819 uint32_t *cigar = bam_cigar(bs);
1820 for (i = 0; i < cigar_len; i++) {
1821 cigar[i] = le_int4(cigar[i]);
1822 }
1823 }
1824
1825 return 1;
1826 }
1827 #endif
1828
1829 /* Old name */
bam_next_seq(bam_file_t * b,bam_seq_t ** bsp)1830 int bam_next_seq(bam_file_t *b, bam_seq_t **bsp) {
1831 return bam_get_seq(b, bsp);
1832 }
1833
1834 static int8_t aux_type_size[256] = {
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1838 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1839 0, 1, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
1840 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1841 0, 0, 0, 1, 8, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0,
1842 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1843 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1846 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1847 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1851 };
1852
1853 /*
1854 * Looks for aux field 'key' and returns the type + value.
1855 * The type is the first char and the value is the 2nd character onwards.
1856 *
1857 * Returns NULL if not found.
1858 */
bam_aux_find(bam_seq_t * b,const char * key)1859 char *bam_aux_find(bam_seq_t *b, const char *key) {
1860 char *cp = bam_aux(b);
1861
1862 while (*cp) {
1863 int sz;
1864
1865 //printf("%c%c:%c:?\n", cp[0], cp[1], cp[2]);
1866
1867 if (cp[0] == key[0] && cp[1] == key[1])
1868 return cp+2;
1869
1870 if ((sz = aux_type_size[(uint8_t) cp[2]])) {
1871 /* Fixed length fields */
1872 cp += sz + 3;
1873 } else {
1874 switch (cp[2]) {
1875 case 'Z':
1876 case 'H': { /* Variable length, null terminated */
1877 cp += 3;
1878 while (*cp++)
1879 ;
1880 break;
1881 }
1882 case 'B': { /* Array types */
1883 uint32_t count;
1884 if ((sz = aux_type_size[(uint8_t) cp[3]])) {
1885 count = ((uint32_t) cp[4]
1886 + ((uint32_t) cp[5] << 8)
1887 + ((uint32_t) cp[6] << 16)
1888 + ((uint32_t) cp[7] << 24));
1889 cp += 8 + count * sz;
1890 } else {
1891 return NULL;
1892 }
1893 }
1894 default:
1895 return NULL;
1896 }
1897 }
1898 }
1899
1900 return NULL;
1901 }
1902
bam_aux_i(const uint8_t * dat)1903 int32_t bam_aux_i(const uint8_t *dat) {
1904 switch(dat[0]) {
1905 case 'i':
1906 return (int32_t)(dat[1] + (dat[2]<<8) + (dat[3]<<16) + (dat[4]<<24));
1907 case 'I':
1908 return (uint32_t)(dat[1] + (dat[2]<<8) + (dat[3]<<16) + (dat[4]<<24));
1909 break;
1910 case 's':
1911 return (int16_t)(dat[1] + (dat[2]<<8));
1912 case 'S':
1913 return (uint16_t)(dat[1] + (dat[2]<<8));
1914 case 'c':
1915 return (int8_t)dat[1];
1916 case 'C':
1917 return (uint8_t)dat[1];
1918 }
1919
1920 abort();
1921 }
1922
bam_aux_f(const uint8_t * dat)1923 float bam_aux_f(const uint8_t *dat) {
1924 assert(dat[0] == 'f');
1925 return (float)((int32_t)((dat[1]<<0)+
1926 (dat[2]<<8)+
1927 (dat[3]<<16)+
1928 (dat[4]<<24)));
1929 }
1930
bam_aux_d(const uint8_t * dat)1931 double bam_aux_d(const uint8_t *dat) {
1932 assert(dat[0] == 'd');
1933 return (double)((int64_t)((((uint64_t)dat[1])<<0)+
1934 (((uint64_t)dat[2])<<8)+
1935 (((uint64_t)dat[3])<<16)+
1936 (((uint64_t)dat[4])<<24)+
1937 (((uint64_t)dat[5])<<32)+
1938 (((uint64_t)dat[6])<<40)+
1939 (((uint64_t)dat[7])<<48)+
1940 (((uint64_t)dat[8])<<54)));
1941 }
1942
bam_aux_A(const uint8_t * dat)1943 char bam_aux_A(const uint8_t *dat) {
1944 assert(dat[0] == 'A');
1945 return dat[1];
1946 }
1947
bam_aux_Z(const uint8_t * dat)1948 char *bam_aux_Z(const uint8_t *dat) {
1949 assert(dat[0] == 'Z' || dat[0] == 'H');
1950 return (char *)(dat+1);
1951 }
1952
1953 /*
1954 * An iterator on bam aux fields. NB: This code is not reentrant or multi-
1955 * thread capable. The values returned are valid until the next call to
1956 * this function.
1957 * key: points to an array of 3 characters (eg "RGi", "NMC")
1958 * type: points to an address of 1 character (eg 'Z', 'i') for the SAM type
1959 * val: points to an address of a bam_aux_t union.
1960 *
1961 * The first two bytes of key are the real key and the next byte is the
1962 * BAM type field. Note that this may differ to the returned SAM type
1963 * field. For example key[2] == 'S' for unsigned short while *type is
1964 * set to 'i'.
1965 *
1966 * Pass in *iter_handle as NULL to initialise the search and then
1967 * pass in the modified value on each subsequent call to continue the search.
1968 *
1969 * Returns 0 if the next value is valid, setting key, type and val.
1970 * -1 when no more found.
1971 */
bam_aux_iter_full(bam_seq_t * b,char ** iter_handle,char * key,char * type,bam_aux_t * val)1972 int bam_aux_iter_full(bam_seq_t *b, char **iter_handle,
1973 char *key, char *type, bam_aux_t *val) {
1974 char *s;
1975
1976 if (!iter_handle || !*iter_handle) {
1977 s = (char *)bam_aux(b);
1978 } else {
1979 s = *iter_handle;
1980 }
1981
1982 /* We null terminate our aux list for ease */
1983 if (s[0] == 0)
1984 return -1;
1985
1986 key[0] = s[0];
1987 key[1] = s[1];
1988 key[2] = s[2];
1989
1990 switch (s[2]) {
1991 case 'A':
1992 if (type) *type = 'A';
1993 if (val) val->i = *(s+3);
1994 s+=4;
1995 break;
1996
1997 case 'C':
1998 if (type) *type = 'i';
1999 if (val) val->i = *(uint8_t *)(s+3);
2000 s+=4;
2001 break;
2002
2003 case 'c':
2004 if (type) *type = 'i';
2005 if (val) val->i = *(int8_t *)(s+3);
2006 s+=4;
2007 break;
2008
2009 case 'S':
2010 if (type) *type = 'i';
2011 if (val)
2012 val->i = (uint16_t)((((unsigned char *)s)[3]<< 0) +
2013 (((unsigned char *)s)[4]<< 8));
2014 s+=5;
2015 break;
2016
2017 case 's':
2018 if (type) *type = 'i';
2019 if (val)
2020 val->i = (int16_t)((((unsigned char *)s)[3]<< 0) +
2021 (((unsigned char *)s)[4]<< 8));
2022 s+=5;
2023 break;
2024
2025 case 'I':
2026 if (type) *type = 'i';
2027 if (val)
2028 val->i = (uint32_t)((((unsigned char *)s)[3]<< 0) +
2029 (((unsigned char *)s)[4]<< 8) +
2030 (((unsigned char *)s)[5]<<16) +
2031 (((unsigned char *)s)[6]<<24));
2032 s+=7;
2033 break;
2034
2035 case 'i':
2036 if (type) *type = 'i';
2037 if (val)
2038 val->i = (int32_t)((((unsigned char *)s)[3]<< 0) +
2039 (((unsigned char *)s)[4]<< 8) +
2040 (((unsigned char *)s)[5]<<16) +
2041 (((unsigned char *)s)[6]<<24));
2042 s+=7;
2043 break;
2044
2045 case 'f':
2046 if (type) *type = 'f';
2047 if (val) /* Assume same endianness as integer */
2048 val->i = (int32_t)((((unsigned char *)s)[3]<< 0) +
2049 (((unsigned char *)s)[4]<< 8) +
2050 (((unsigned char *)s)[5]<<16) +
2051 (((unsigned char *)s)[6]<<24));
2052 s+=7;
2053 break;
2054
2055 case 'd':
2056 if (type) *type = 'd';
2057 if (val) /* Assume same endianness as integer */
2058 val->i64 = (uint64_t)(((uint64_t)(((unsigned char *)s)[ 3])<< 0) +
2059 ((uint64_t)(((unsigned char *)s)[ 4])<< 8) +
2060 ((uint64_t)(((unsigned char *)s)[ 5])<<16) +
2061 ((uint64_t)(((unsigned char *)s)[ 6])<<24) +
2062 ((uint64_t)(((unsigned char *)s)[ 7])<<32) +
2063 ((uint64_t)(((unsigned char *)s)[ 8])<<40) +
2064 ((uint64_t)(((unsigned char *)s)[ 9])<<48) +
2065 ((uint64_t)(((unsigned char *)s)[10])<<54));
2066 s+=11;
2067 break;
2068
2069 case 'Z': case 'H':
2070 if (type) *type = s[2];
2071 s+=3;
2072 if (val) val->s = s;
2073 while (*s++);
2074 break;
2075
2076 case 'B': {
2077 uint32_t count;
2078 if (type) *type = 'B';
2079 count = (unsigned int)((((unsigned char *)s)[4]<< 0) +
2080 (((unsigned char *)s)[5]<< 8) +
2081 (((unsigned char *)s)[6]<<16) +
2082 (((unsigned char *)s)[7]<<24));
2083
2084 if (val) {
2085 val->B.n = count;
2086 val->B.t = s[3];
2087 val->B.s = (unsigned char *)s+8;
2088 }
2089 s+=8;
2090
2091 switch(val->B.t) {
2092 case 'c': case 'C': s += count; break;
2093 case 's': case 'S': s += 2*count; break;
2094 case 'i': case 'I': s += 4*count; break;
2095 case 'f': s += 4*count; break;
2096 default:
2097 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2098 val->B.t);
2099 return -1;
2100 }
2101 break;
2102 }
2103
2104 default:
2105 fprintf(stderr, "Unknown aux type '%c'\n", s[2]);
2106 return -1;
2107 }
2108
2109 if (iter_handle)
2110 *iter_handle = s;
2111
2112 return 0;
2113 }
2114
2115 /*
2116 * As above, but only 2 characters of the key are returned so the
2117 * original BAM type is not visible.
2118 *
2119 * Note this can cause ambiguities if you wish to distinguish between
2120 * -1 billion and +3 billion.
2121 */
bam_aux_iter(bam_seq_t * b,char ** iter_handle,char * key,char * type,bam_aux_t * val)2122 int bam_aux_iter(bam_seq_t *b, char **iter_handle,
2123 char *key, char *type, bam_aux_t *val) {
2124 char k3[3];
2125 int r = bam_aux_iter_full(b, iter_handle, k3, type, val);
2126
2127 if (r == 0) {
2128 key[0] = k3[0];
2129 key[1] = k3[1];
2130 }
2131
2132 return r;
2133 }
2134
reg2bin(int start,int end)2135 static int reg2bin(int start, int end) {
2136 if (end>start) end--;
2137 if ((start>>14) == (end>>14)) return ((1<<15)-1)/7 + (start>>14);
2138 if ((start>>17) == (end>>17)) return ((1<<12)-1)/7 + (start>>17);
2139 if ((start>>20) == (end>>20)) return ((1<<9 )-1)/7 + (start>>20);
2140 if ((start>>23) == (end>>23)) return ((1<<6 )-1)/7 + (start>>23);
2141 if ((start>>26) == (end>>26)) return ((1<<3 )-1)/7 + (start>>26);
2142 return 0;
2143 }
2144
2145 /*
2146 * Constructs a bam_seq_t from components.
2147 * Ignores auxiliary tags for now.
2148 *
2149 * Returns -1 on error
2150 * number of bytes written to bam_seq_t on success (ie tag offset)
2151 */
bam_construct_seq(bam_seq_t ** b,size_t extra_len,const char * qname,size_t qname_len,int flag,int rname,int64_t pos,int64_t end,int mapq,uint32_t ncigar,const uint32_t * cigar,int mrnm,int64_t mpos,int64_t isize,int len,const char * seq,const char * qual)2152 int bam_construct_seq(bam_seq_t **b, size_t extra_len,
2153 const char *qname, size_t qname_len,
2154 int flag,
2155 int rname, // Ref ID
2156 int64_t pos, // first aligned base (1-based)
2157 int64_t end, // last aligned base (to calculate bin)
2158 int mapq,
2159 uint32_t ncigar, const uint32_t *cigar,
2160 int mrnm, // Mate Ref ID
2161 int64_t mpos,
2162 int64_t isize,
2163 int len,
2164 const char *seq,
2165 const char *qual) {
2166 size_t required;
2167 char *cp;
2168 int i;
2169 uint32_t *ip;
2170
2171 /*
2172 * cp = "=ACMGRSVTWYHKDBN";
2173 * memset(L, 15, 256);
2174 * for (i = 0; i < 16; i++) {
2175 * L[cp[i]] = L[tolower(cp[i])] = i;
2176 * }
2177 */
2178 static const char L[256] = {
2179 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2180 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2181 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2182 15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15,
2183 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
2184 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
2185 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
2186 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
2187 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2188 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2189 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2190 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2191 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2192 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2193 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
2194 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
2195 };
2196
2197 /* Sanity checks */
2198 if (NULL == b) return -1;
2199 if (len < 0) return -1; /* not sure why the spec has it as an int */
2200 if (qname_len > 0 && NULL == qname) return -1;
2201 if (ncigar > 0 && NULL == cigar) return -1;
2202 if (len > 0 && NULL == seq) return -1;
2203
2204 /* Reallocate if needed */
2205 required = (sizeof(**b) /* the struct itself */
2206 #ifdef ALLOW_UAC
2207 + qname_len + 1 /* query name (unaligned) */
2208 #else
2209 + round4(qname_len + 1) /* query name (aligned) */
2210 #endif
2211 + 4 * ncigar /* CIGAR string */
2212 + (len + 1) / 2 /* Sequence, 2 bases per byte */
2213 + len /* Quality */
2214 + extra_len + 1); /* Extra for optional tags */
2215
2216 if (NULL == *b || (*b)->alloc < required) {
2217 bam_seq_t *new_bam = realloc(*b, required);
2218 if (NULL == new_bam) return -1;
2219 *b = new_bam;
2220 (*b)->alloc = required;
2221 }
2222
2223 (*b)->ref = rname;
2224 (*b)->pos = pos-1;
2225 bam_set_map_qual(*b, mapq);
2226 bam_set_name_len(*b, qname_len+1);
2227 bam_set_flag(*b, flag);
2228 bam_set_cigar_len(*b, ncigar);
2229 (*b)->len = len;
2230 (*b)->mate_ref = mrnm;
2231 (*b)->mate_pos = mpos-1;
2232 (*b)->ins_size = isize;
2233
2234 cp = bam_name(*b);
2235 memcpy(cp, qname, qname_len);
2236 cp[qname_len] = 0;
2237
2238 /* Cigar */
2239 cp = (char *)bam_cigar(*b);
2240 ip = (uint32_t *)cp;
2241 for (i = 0; i < ncigar; i++) {
2242 ip[i] = cigar[i];
2243 }
2244 cp += ncigar*4;
2245
2246 /* Bin */
2247 if (!((*b)->flag & BAM_CIGAR32)) {
2248 if (0 == end) { /* Calculate end from pos and cigar */
2249 end = pos;
2250 for (i = 0; i < ncigar; i++) {
2251 if (BAM_CONSUME_REF(cigar[i] & BAM_CIGAR_MASK))
2252 end += cigar[i] >> BAM_CIGAR_SHIFT;
2253 }
2254 }
2255
2256 // range is [beg,end) and zero based.
2257 bam_set_bin(*b, reg2bin(pos-1,end));
2258 }
2259
2260 /* Seq */
2261 for (i = 0; i < len-1; i += 2) {
2262 *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]];
2263 }
2264 if (i < len)
2265 *cp++ = L[(uc)seq[i]]<<4;
2266
2267 /* Qual */
2268 if (qual) {
2269 memcpy(cp, qual, len);
2270 cp += len;
2271 } else {
2272 for (i = 0; i < len; i++) {
2273 *cp++ = '\xff';
2274 }
2275 }
2276
2277 *cp = 0; /* terminate aux list, for ease of parsing later */
2278
2279 /* cp now points to the auxiliary tags if required */
2280 (*b)->blk_size = (int)(cp-(char *)&(*b)->ref);
2281 return (int)(cp-(char *)(*b));
2282 }
2283
bam_aux_add(bam_seq_t ** b,const char tag[2],char type,uint32_t array_len,const void * data)2284 int bam_aux_add(bam_seq_t **b, const char tag[2], char type,
2285 uint32_t array_len, const void *data) {
2286 int tlen;
2287 size_t len;
2288 size_t used;
2289 uint8_t *cp;
2290 #ifndef SP_LITTLE_ENDIAN
2291 uint32_t i;
2292 #endif
2293
2294 if (NULL == b || NULL == *b) return -1;
2295
2296 /* Find size of data type and how much space is needed */
2297 if (0 == (tlen = aux_type_size[(uint8_t) type])) {
2298 if (type == 'H' || type == 'Z') { /* Variable length types */
2299 if (array_len != 0) return -1; /* No arrays for these allowed */
2300 tlen = strlen((const char *) data) + 1;
2301 } else {
2302 /* unknown type */
2303 return -1;
2304 }
2305 }
2306
2307 len = array_len > 0 ? 8 + tlen * array_len : 3 + tlen;
2308
2309 /* Find the end of the existing tags and ensure there is enough space */
2310 cp = (uint8_t *)&(*b)->ref + (*b)->blk_size;
2311 used = cp - (uint8_t *)(*b);
2312
2313 if ((*b)->alloc < used + len + 1) { /* + 1 for NUL terminator */
2314 size_t required = used + len + 1;
2315 bam_seq_t *new_bam = realloc((*b), required);
2316 if (NULL == new_bam) return -1;
2317 *b = new_bam;
2318 (*b)->alloc = required;
2319 cp = (uint8_t *)new_bam + used;
2320 }
2321
2322 /* Append the data */
2323 *cp++ = tag[0];
2324 *cp++ = tag[1];
2325 if (array_len > 0) { /* Array type */
2326 *cp++ = 'B';
2327 *cp++ = type;
2328 STORE_UINT32(cp, array_len);
2329 } else {
2330 *cp++ = type;
2331 }
2332
2333 if (array_len == 0) array_len = 1;
2334 #ifdef SP_LITTLE_ENDIAN
2335 memcpy(cp, data, array_len * tlen);
2336 cp += array_len * tlen;
2337 #else
2338 switch (type) {
2339 case 'A': case 'c': case 'C':
2340 memcpy(cp, data, array_len);
2341 cp += array_len;
2342 break;
2343 case 's':
2344 case 'S': {
2345 uint16_t *sdata = (uint16_t *) data;
2346 for (i = 0; i < array_len; i++) {
2347 STORE_UINT16(cp, sdata[i]);
2348 }
2349 break;
2350 }
2351 case 'i':
2352 case 'I':
2353 case 'f': {
2354 uint32_t *idata = (uint32_t *) data;
2355 for (i = 0; i < array_len; i++) {
2356 STORE_UINT32(cp, idata[i]);
2357 }
2358 break;
2359 }
2360 case 'd': {
2361 uint64_t *ddata = (uint64_t *) data;
2362 for (i = 0; i < array_len; i++) {
2363 STORE_UINT64(cp, ddata[i]);
2364 }
2365 break;
2366 }
2367 case 'H': case 'Z':
2368 memcpy(cp, data, tlen);
2369 cp += tlen;
2370 break;
2371 }
2372 #endif
2373
2374 /* Put a NUL at the end for bam_aux_iter */
2375 *cp = 0;
2376
2377 /* Update block_size */
2378 (*b)->blk_size = (uint32_t)(cp - (uint8_t *)&(*b)->ref);
2379
2380 return 0;
2381 }
2382
2383 /* Calculate space needed to store tags */
2384
bam_aux_size_vec(uint32_t count,bam_aux_tag_t * tags)2385 ssize_t bam_aux_size_vec(uint32_t count, bam_aux_tag_t *tags) {
2386 uint32_t i;
2387 ssize_t len = 0;
2388 int sz;
2389
2390 if (NULL == tags) return -1;
2391
2392 for (i = 0; i < count; i++) {
2393 switch (tags[i].type) {
2394 case 'C': case 'S': case 'I':
2395 if (tags[i].value.ui < 256) {
2396 sz = 1;
2397 } else if (tags[i].value.ui < 65536) {
2398 sz = 2;
2399 } else {
2400 sz = 4;
2401 }
2402 break;
2403 case 'c': case 's': case 'i':
2404 if (tags[i].value.i >= -128 && tags[i].value.i < 128) {
2405 sz = 1;
2406 } else if (tags[i].value.i >= -32768 && tags[i].value.i < 32768) {
2407 sz = 2;
2408 } else {
2409 sz = 4;
2410 }
2411 break;
2412 case 'A':
2413 sz = 1;
2414 break;
2415 case 'f':
2416 sz = 4;
2417 break;
2418 case 'd':
2419 sz = 8;
2420 break;
2421 case 'H': case 'Z':
2422 if (tags[i].array_len != 0) return -1;
2423 sz = strlen(tags[i].value.z) + 1;
2424 break;
2425 default:
2426 return -1; /* bad data type */
2427 }
2428 len += tags[i].array_len == 0 ? sz + 3 : sz * tags[i].array_len + 8;
2429 }
2430
2431 return len + 1; /* + 1 for NUL byte at end */
2432 }
2433
bam_aux_add_vec(bam_seq_t ** b,uint32_t count,bam_aux_tag_t * tags)2434 int bam_aux_add_vec(bam_seq_t **b, uint32_t count, bam_aux_tag_t *tags) {
2435 ssize_t required = bam_aux_size_vec(count, tags);
2436 uint32_t i;
2437 size_t used;
2438
2439 if (required < 0) return -1;
2440 if (NULL == b || NULL == *b) return -1;
2441
2442 /* Find the end of the existing tags and ensure there is enough space.
2443 Do this once for the entire vector so we don't keep reallocing */
2444 used = (uint8_t *)&(*b)->ref + (*b)->blk_size - (uint8_t *)(*b);
2445
2446 if ((*b)->alloc < used + required) {
2447 bam_seq_t *new_bam = realloc((*b), used + required);
2448 if (NULL == new_bam) return -1;
2449 *b = new_bam;
2450 (*b)->alloc = used + required;
2451 }
2452
2453 /* Add the tags, storing integers in the most appropriate size */
2454 for (i = 0; i < count; i++) {
2455 if (tags[i].array_len > 0) {
2456 /* Deal with array tags */
2457 if (bam_aux_add(b, tags[i].tag, tags[i].type,
2458 tags[i].array_len, tags[i].value.array)) return -1;
2459 continue;
2460 }
2461
2462 /* Non-array tags, storing integers as the most appropriate size */
2463 switch (tags[i].type) {
2464 case 'C': case 'S': case 'I':
2465 if (tags[i].value.ui < 256) {
2466 uint8_t byte = tags[i].value.ui;
2467 if (bam_aux_add(b, tags[i].tag, 'C', 0, &byte)) return -1;
2468 } else if (tags[i].value.ui < 65536) {
2469 uint16_t word = tags[i].value.ui;
2470 if (bam_aux_add(b, tags[i].tag, 'S', 0, &word)) return -1;
2471 } else {
2472 if (bam_aux_add(b, tags[i].tag, 'I', 0, &tags[i].value.ui)) {
2473 return -1;
2474 }
2475 }
2476 break;
2477 case 'c': case 's': case 'i':
2478 if (tags[i].value.i >= -128 && tags[i].value.i < 128) {
2479 int8_t byte = tags[i].value.i;
2480 if (bam_aux_add(b, tags[i].tag, 'c', 0, &byte)) return -1;
2481 } else if (tags[i].value.i >= -32768 && tags[i].value.i < 32768) {
2482 int16_t word = tags[i].value.i;
2483 if (bam_aux_add(b, tags[i].tag, 's', 0, &word)) return -1;
2484 } else {
2485 if (bam_aux_add(b, tags[i].tag, 'i', 0, &tags[i].value.i))
2486 return -1;
2487 }
2488 break;
2489 case 'A':
2490 if (bam_aux_add(b, tags[i].tag, 'A', 0, &tags[i].value.a))
2491 return -1;
2492 break;
2493 case 'f': case 'd':
2494 if (bam_aux_add(b, tags[i].tag, tags[i].type, 0, &tags[i].value.f))
2495 return -1;
2496 break;
2497 case 'H': case 'Z':
2498 if (bam_aux_add(b, tags[i].tag, tags[i].type, 0, tags[i].value.z))
2499 return -1;
2500 break;
2501 default:
2502 return -1; /* unknown type */
2503 }
2504 }
2505 return 0;
2506 }
2507
2508 /* Add SAM-formatted aux tags to a bam_seq_t struct.
2509 This is basically a copy of the code in sam_next_seq. Unfortunately
2510 trying to get them to use a common version slows sam_next_seq down
2511 rather a lot, even when inlined. Hence this extra copy. */
2512
bam_aux_add_from_sam(bam_seq_t ** bsp,char * sam)2513 int bam_aux_add_from_sam(bam_seq_t **bsp, char *sam) {
2514 unsigned char *cpf = (unsigned char *) sam;
2515 unsigned char *cpt = (unsigned char *)&(*bsp)->ref + (*bsp)->blk_size;
2516 unsigned char *end = (unsigned char *)(*bsp) + (*bsp)->alloc;
2517
2518 while (*cpf) {
2519 unsigned char *key = cpf, *value;
2520 size_t max_len;
2521
2522 if (!(key[0] && key[1] && key[2] == ':' && key[3] && key[4] == ':'))
2523 return -1;
2524 cpf += 5;
2525
2526 value = cpf;
2527 while (*cpf && *cpf != '\t')
2528 cpf++;
2529
2530 if (aux_type_size[key[3]]) {
2531 max_len = aux_type_size[key[3]] + 3;
2532 } else if (key[3] != 'B') {
2533 max_len = cpf - value + 4;
2534 } else {
2535 /* Worst case */
2536 max_len = (cpf - value) * 4 + 8;
2537 }
2538
2539 /* ensure we have enough room */
2540 if (end - cpt < max_len) {
2541 size_t used = cpt - (unsigned char *)(*bsp);
2542 bam_seq_t *new_bam = realloc(*bsp, used + max_len);
2543 if (NULL == new_bam) return -1;
2544 *bsp = new_bam;
2545 (*bsp)->alloc += used + max_len;
2546 cpt = (unsigned char *)(*bsp) + used;
2547 end = (unsigned char *)(*bsp) + (*bsp)->alloc;
2548 }
2549
2550 *cpt++ = key[0];
2551 *cpt++ = key[1];
2552
2553 switch(key[3]) {
2554 int64_t n;
2555
2556 case 'A':
2557 *cpt++ = 'A';
2558 *cpt++ = *value;
2559 break;
2560
2561 case 'i':
2562 //n = atoi((char *)value);
2563 n = STRTOL64((char *)value, (const char **)&value, 10);
2564 if (n >= 0) {
2565 if (n < 256) {
2566 *cpt++ = 'C';
2567 *cpt++ = n;
2568 } else if (n < 65536) {
2569 *cpt++ = 'S';
2570 STORE_UINT16(cpt, n);
2571 } else {
2572 *cpt++ = 'I';
2573 STORE_UINT32(cpt, n);
2574 }
2575 } else {
2576 if (n >= -128 && n < 128) {
2577 *cpt++ = 'c';
2578 *cpt++ = n;
2579 } else if (n >= -32768 && n < 32768) {
2580 *cpt++ = 's';
2581 STORE_UINT16(cpt, n);
2582 } else {
2583 *cpt++ = 'i';
2584 STORE_UINT32(cpt, n);
2585 }
2586 }
2587 break;
2588
2589 case 'f': {
2590 union {
2591 float f;
2592 int i;
2593 } u;
2594 u.f = atof((char *)value);
2595 *cpt++ = 'f';
2596 STORE_UINT32(cpt, u.i);
2597 break;
2598 }
2599
2600 case 'Z':
2601 *cpt++ = 'Z';
2602 while (value != cpf)
2603 *cpt++=*value++;
2604 *cpt++ = 0;
2605 break;
2606
2607 case 'H':
2608 *cpt++ = 'H';
2609 while (value != cpf)
2610 *cpt++=*value++;
2611 *cpt++ = 0;
2612 break;
2613
2614 case 'B': {
2615 char subtype = *value++;
2616 unsigned char *sz;
2617 int count = 0;
2618
2619 *cpt++ = 'B';
2620 *cpt++ = subtype;
2621 sz = cpt; cpt += 4; /* Fill out later */
2622
2623 while (*value == ',') {
2624 value++;
2625 switch (subtype) {
2626 case 'c': case 'C':
2627 *cpt++ = strtol((char *)value, (char **)&value, 10);
2628 break;
2629
2630 case 's': case 'S':
2631 n = strtol((char *)value, (char **)&value, 10);
2632 STORE_UINT16(cpt, n);
2633 break;
2634
2635 case 'i': case 'I':
2636 n = strtoll((char *)value, (char **)&value, 10);
2637 STORE_UINT32(cpt, n);
2638 break;
2639
2640 case 'f': {
2641 union {
2642 float f;
2643 int i;
2644 } u;
2645
2646 u.f = strtod((char *)value, (char **)&value);
2647 STORE_UINT32(cpt, u.i);
2648 break;
2649 }
2650 }
2651 count++;
2652 }
2653 if (value != cpf) {
2654 fprintf(stderr, "Malformed %c%c:B:... auxiliary field\n",
2655 key[0], key[1]);
2656 value = cpf;
2657 }
2658 STORE_UINT32(sz, count);
2659 break;
2660 }
2661
2662 default:
2663 fprintf(stderr, "Unknown aux format code '%c'\n", key[3]);
2664 break;
2665 }
2666
2667 if (*cpf == '\t')
2668 cpf++;
2669 }
2670
2671 if (cpt == end) { /* Hopefully very unlikely */
2672 size_t used = cpt - (unsigned char *)(*bsp);
2673 bam_seq_t *new_bam = realloc(*bsp, used + 1);
2674 if (NULL == new_bam) return -1;
2675 *bsp = new_bam;
2676 (*bsp)->alloc += used + 1;
2677 cpt = (unsigned char *)(*bsp) + used;
2678 }
2679
2680 *cpt = 0;
2681 (*bsp)->blk_size = cpt - (unsigned char *)&(*bsp)->ref;
2682 return 0;
2683 }
2684
2685 /*! Add preformated raw aux data to the bam_seq.
2686 *
2687 * Consider using bam_aux_add instead if you have information in a more
2688 * integer or string form.
2689 *
2690 * Returns 0 on success;
2691 * -1 on failure
2692 */
bam_aux_add_data(bam_seq_t ** b,const char tag[2],char type,size_t len,const uint8_t * data)2693 int bam_aux_add_data(bam_seq_t **b, const char tag[2], char type,
2694 size_t len, const uint8_t *data) {
2695 uint8_t *cp;
2696 size_t used;
2697
2698 if (NULL == b || NULL == data) return -1;
2699
2700 /* Find the end of the existing tags and ensure there is enough space */
2701 cp = (uint8_t *)&(*b)->ref + (*b)->blk_size;
2702 used = cp - (uint8_t *)(*b);
2703
2704 if ((*b)->alloc < used + len + 4) {
2705 size_t required = used + len + 4;
2706 bam_seq_t *new_bam = realloc((*b), required);
2707 if (NULL == new_bam) return -1;
2708 *b = new_bam;
2709 (*b)->alloc = required;
2710 cp = (uint8_t *) new_bam + used;
2711 }
2712
2713 *cp++ = tag[0];
2714 *cp++ = tag[1];
2715 *cp++ = type;
2716 memcpy(cp, data, len);
2717 cp += len;
2718 *cp = 0;
2719
2720 (*b)->blk_size = (uint32_t)(cp - (uint8_t *)&(*b)->ref);
2721
2722 return 0;
2723 }
2724
2725 /*! Add raw data to a bam structure.
2726 *
2727 * This could be useful if you wish to manually construct your own bam
2728 * entries or if you need to append an entire block of preformatting
2729 * aux data.
2730 *
2731 * Returns 0 on success;
2732 * -1 on failure
2733 */
bam_add_raw(bam_seq_t ** b,size_t len,const uint8_t * data)2734 int bam_add_raw(bam_seq_t **b, size_t len, const uint8_t *data) {
2735 uint8_t *cp;
2736 size_t used;
2737
2738 if (NULL == b || NULL == data) return -1;
2739
2740 /* Find the end of the existing tags and ensure there is enough space */
2741 cp = (uint8_t *)&(*b)->ref + (*b)->blk_size;
2742 used = cp - (uint8_t *)(*b);
2743
2744 if ((*b)->alloc < used + len + 1) {
2745 size_t required = used + len + 1;
2746 bam_seq_t *new_bam = realloc((*b), required);
2747 if (NULL == new_bam) return -1;
2748 *b = new_bam;
2749 (*b)->alloc = required;
2750 cp = (uint8_t *) new_bam + used;
2751 }
2752
2753 memcpy(cp, data, len);
2754 cp += len;
2755 *cp = 0;
2756
2757 (*b)->blk_size = (uint32_t)(cp - (uint8_t *)&(*b)->ref);
2758
2759 return 0;
2760 }
2761
2762
2763 /*! Duplicates a bam_seq_t structure.
2764 *
2765 * @return
2766 * Returns the new bam_seq_t pointer on success;
2767 * NULL on failure.
2768 */
bam_dup(bam_seq_t * b)2769 bam_seq_t *bam_dup(bam_seq_t *b) {
2770 bam_seq_t *d;
2771 int a = ((int)((b->alloc+15)/16))*16;
2772
2773 if (!b)
2774 return NULL;
2775
2776 if (!(d = malloc(a)))
2777 return NULL;
2778
2779 memcpy(d, b, b->alloc);
2780 d->alloc = a;
2781 return d;
2782 }
2783
2784
append_int(unsigned char * cp,int32_t i)2785 unsigned char *append_int(unsigned char *cp, int32_t i) {
2786 int32_t j;
2787
2788 if (i < 0) {
2789 *cp++ = '-';
2790 if (i == INT_MIN) {
2791 *cp++ = '2'; *cp++ = '1'; *cp++ = '4'; *cp++ = '7';
2792 *cp++ = '4'; *cp++ = '8'; *cp++ = '3'; *cp++ = '6';
2793 *cp++ = '4'; *cp++ = '8';
2794 return cp;
2795 }
2796
2797 i = -i;
2798 } else if (i == 0) {
2799 *cp++ = '0';
2800 return cp;
2801 }
2802
2803 //if (i < 10) goto b0;
2804 if (i < 100) goto b1;
2805 //if (i < 1000) goto b2;
2806 if (i < 10000) goto b3;
2807 //if (i < 100000) goto b4;
2808 if (i < 1000000) goto b5;
2809 //if (i < 10000000) goto b6;
2810 if (i < 100000000) goto b7;
2811
2812 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
2813 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
2814 b7: if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
2815 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
2816 b5: if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
2817 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
2818 b3: if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
2819 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
2820 b1: if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
2821 if (i) *cp++ = i + '0';
2822 return cp;
2823
2824 x8: *cp++ = i / 100000000 + '0', i %= 100000000;
2825 x7: *cp++ = i / 10000000 + '0', i %= 10000000;
2826 x6: *cp++ = i / 1000000 + '0', i %= 1000000;
2827 x5: *cp++ = i / 100000 + '0', i %= 100000;
2828 x4: *cp++ = i / 10000 + '0', i %= 10000;
2829 x3: *cp++ = i / 1000 + '0', i %= 1000;
2830 x2: *cp++ = i / 100 + '0', i %= 100;
2831 x1: *cp++ = i / 10 + '0', i %= 10;
2832 x0: *cp++ = i + '0';
2833
2834 return cp;
2835 }
2836
2837 /*
2838 * Unsigned version of above.
2839 * Only differs when the int has the top bit set (~2.15 billion and above).
2840 */
append_uint(unsigned char * cp,uint32_t i)2841 unsigned char *append_uint(unsigned char *cp, uint32_t i) {
2842 uint32_t j;
2843
2844 if (i == 0) {
2845 *cp++ = '0';
2846 return cp;
2847 }
2848
2849 //if (i < 10) goto b0;
2850 if (i < 100) goto b1;
2851 //if (i < 1000) goto b2;
2852 if (i < 10000) goto b3;
2853 //if (i < 100000) goto b4;
2854 if (i < 1000000) goto b5;
2855 //if (i < 10000000) goto b6;
2856 if (i < 100000000) goto b7;
2857
2858 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
2859 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
2860 b7: if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
2861 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
2862 b5: if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
2863 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
2864 b3: if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
2865 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
2866 b1: if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
2867 if (i) *cp++ = i + '0';
2868 return cp;
2869
2870 x8: *cp++ = i / 100000000 + '0', i %= 100000000;
2871 x7: *cp++ = i / 10000000 + '0', i %= 10000000;
2872 x6: *cp++ = i / 1000000 + '0', i %= 1000000;
2873 x5: *cp++ = i / 100000 + '0', i %= 100000;
2874 x4: *cp++ = i / 10000 + '0', i %= 10000;
2875 x3: *cp++ = i / 1000 + '0', i %= 1000;
2876 x2: *cp++ = i / 100 + '0', i %= 100;
2877 x1: *cp++ = i / 10 + '0', i %= 10;
2878 x0: *cp++ = i + '0';
2879
2880 return cp;
2881 }
2882
append_int64(unsigned char * cp,int64_t i)2883 unsigned char *append_int64(unsigned char *cp, int64_t i) {
2884 int64_t j;
2885
2886 if (i < 0) {
2887 *cp++ = '-';
2888 if (i == INT_MIN) {
2889 *cp++ = '2'; *cp++ = '1'; *cp++ = '4'; *cp++ = '7';
2890 *cp++ = '4'; *cp++ = '8'; *cp++ = '3'; *cp++ = '6';
2891 *cp++ = '4'; *cp++ = '8';
2892 return cp;
2893 }
2894
2895 i = -i;
2896 } else if (i == 0) {
2897 *cp++ = '0';
2898 return cp;
2899 }
2900
2901 //if (i < 10) goto b0;
2902 if (i < 100) goto b1;
2903 //if (i < 1000) goto b2;
2904 if (i < 10000) goto b3;
2905 //if (i < 100000) goto b4;
2906 if (i < 1000000) goto b5;
2907 //if (i < 10000000) goto b6;
2908 if (i < 100000000) goto b7;
2909
2910 if ((j = i / 1000000000000000000)) {*cp++ = j + '0'; i -= j*1000000000000000000; goto xh;}
2911 if ((j = i / 100000000000000000)) {*cp++ = j + '0'; i -= j*100000000000000000; goto xg;}
2912 if ((j = i / 10000000000000000)) {*cp++ = j + '0'; i -= j*10000000000000000; goto xf;}
2913 if ((j = i / 1000000000000000)) {*cp++ = j + '0'; i -= j*1000000000000000; goto xe;}
2914 if ((j = i / 100000000000000)) {*cp++ = j + '0'; i -= j*100000000000000; goto xd;}
2915 if ((j = i / 10000000000000)) {*cp++ = j + '0'; i -= j*10000000000000; goto xc;}
2916 if ((j = i / 1000000000000)) {*cp++ = j + '0'; i -= j*1000000000000; goto xb;}
2917 if ((j = i / 100000000000)) {*cp++ = j + '0'; i -= j*100000000000; goto xa;}
2918 if ((j = i / 10000000000)) {*cp++ = j + '0'; i -= j*10000000000; goto x9;}
2919 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
2920 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
2921 b7: if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
2922 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
2923 b5: if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
2924 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
2925 b3: if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
2926 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
2927 b1: if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
2928 if (i) *cp++ = i + '0';
2929 return cp;
2930
2931 xh: *cp++ = i / 100000000000000000 + '0', i %= 100000000000000000;
2932 xg: *cp++ = i / 10000000000000000 + '0', i %= 10000000000000000;
2933 xf: *cp++ = i / 1000000000000000 + '0', i %= 1000000000000000;
2934 xe: *cp++ = i / 100000000000000 + '0', i %= 100000000000000;
2935 xd: *cp++ = i / 10000000000000 + '0', i %= 10000000000000;
2936 xc: *cp++ = i / 1000000000000 + '0', i %= 1000000000000;
2937 xb: *cp++ = i / 100000000000 + '0', i %= 100000000000;
2938 xa: *cp++ = i / 10000000000 + '0', i %= 10000000000;
2939 x9: *cp++ = i / 1000000000 + '0', i %= 1000000000;
2940 x8: *cp++ = i / 100000000 + '0', i %= 100000000;
2941 x7: *cp++ = i / 10000000 + '0', i %= 10000000;
2942 x6: *cp++ = i / 1000000 + '0', i %= 1000000;
2943 x5: *cp++ = i / 100000 + '0', i %= 100000;
2944 x4: *cp++ = i / 10000 + '0', i %= 10000;
2945 x3: *cp++ = i / 1000 + '0', i %= 1000;
2946 x2: *cp++ = i / 100 + '0', i %= 100;
2947 x1: *cp++ = i / 10 + '0', i %= 10;
2948 x0: *cp++ = i + '0';
2949
2950 return cp;
2951 }
2952
2953 /*
2954 * This is set up so that count should never be more than BGZF_BUFF_SIZE.
2955 * This has been chosen to deliberately be small enough such that the
2956 * bgzf header/footer + worst-case expansion (deflateBound() func) of 'buf'
2957 * are <= 65536, thus ensuring BGZF BSIZE is always 16-bit.
2958 *
2959 * Returns 0 on success;
2960 * -1 on error
2961 */
2962 #ifdef HAVE_LIBDEFLATE
bgzf_encode(int level,const void * buf,uint32_t in_sz,void * out,uint32_t * out_sz)2963 static int bgzf_encode(int level,
2964 const void *buf, uint32_t in_sz,
2965 void *out, uint32_t *out_sz) {
2966 size_t clen;
2967 unsigned char *blk = out;
2968 level = level >= 0 ? level : 6; // libdeflate doesn't honour -1 as default
2969 if (level > 7) level++;
2970 if (level > 9) level+=2; // max 12
2971
2972 if (level == 0) {
2973 if (Z_BUFF_SIZE < in_sz+5+18)
2974 return -1;
2975 blk[18] = 1; // BFINAL=1
2976 blk[19] = (in_sz >> 0) & 0xff;
2977 blk[20] = (in_sz >> 8) & 0xff;
2978 blk[21] = (~in_sz >> 0) & 0xff;
2979 blk[22] = (~in_sz >> 8) & 0xff;
2980 memcpy(blk+18+5, buf, in_sz);
2981 clen = in_sz+5;
2982 } else {
2983 struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
2984 if (!z)
2985 return -1;
2986
2987 clen = libdeflate_deflate_compress(z, buf, in_sz, blk + 18, Z_BUFF_SIZE);
2988 libdeflate_free_compressor(z);
2989 if (clen <= 0) {
2990 fprintf(stderr, "Libdeflate failed to compress\n");
2991 return -1;
2992 }
2993 }
2994
2995 /* Fill out gzip header */
2996 blk[ 0] = 31; // ID1
2997 blk[ 1] = 139; // ID2
2998 blk[ 2] = 8; // CM (deflate)
2999 blk[ 3] = 4; // FLAGS (FEXTRA)
3000 blk[ 4] = blk[5] = blk[6] = blk[7] = 0; // MTIME
3001 blk[ 8] = 0; // XFL
3002 blk[ 9] = 255; // OS (unknown)
3003 blk[10] = 6; // XLEN
3004 blk[11] = 0; // XLEN
3005
3006 /* Extra BGZF fields */
3007 blk[12] = 66; // SI1
3008 blk[13] = 67; // SI2
3009 blk[14] = 2; // SLEN
3010 blk[15] = 0; // SLEN
3011 blk[16] = ((clen + 25) >> 0) & 0xff;
3012 blk[17] = ((clen + 25) >> 8) & 0xff;
3013
3014 uint32_t crc;
3015 crc = libdeflate_crc32(0L, NULL, 0L);
3016 crc = libdeflate_crc32(crc, (unsigned char *)buf, in_sz);
3017 blk[18+clen+0] = (crc >> 0) & 0xff;
3018 blk[18+clen+1] = (crc >> 8) & 0xff;
3019 blk[18+clen+2] = (crc >>16) & 0xff;
3020 blk[18+clen+3] = (crc >>24) & 0xff;
3021 blk[18+clen+4] = (in_sz >> 0) & 0xff;
3022 blk[18+clen+5] = (in_sz >> 8) & 0xff;
3023 blk[18+clen+6] = (in_sz >>16) & 0xff;
3024 blk[18+clen+7] = (in_sz >>24) & 0xff;
3025
3026 *out_sz = 18+clen+8;
3027
3028 return 0;
3029 }
3030 #else
bgzf_encode(int level,const void * buf,uint32_t in_sz,void * out,uint32_t * out_sz)3031 static int bgzf_encode(int level,
3032 const void *buf, uint32_t in_sz,
3033 void *out, uint32_t *out_sz) {
3034 unsigned char *blk = out;
3035 z_stream s;
3036 int cdata_pos;
3037 int cdata_size;
3038 int cdata_alloc;
3039 int err;
3040 uint32_t crc;
3041
3042 /* Initialise zlib stream */
3043 cdata_pos = 18;
3044 cdata_alloc = Z_BUFF_SIZE;
3045 s.zalloc = Z_NULL; /* use default allocation functions */
3046 s.zfree = Z_NULL;
3047 s.opaque = Z_NULL;
3048 s.next_in = (unsigned char *)buf;
3049 s.avail_in = in_sz;
3050 s.total_in = 0;
3051 s.next_out = blk + cdata_pos;
3052 s.avail_out = cdata_alloc;
3053 s.total_out = 0;
3054 s.data_type = Z_BINARY;
3055
3056 /* Compress it */
3057 err = deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
3058 //err = deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_FILTERED);
3059
3060 if (err != Z_OK) {
3061 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
3062 return -1;
3063 }
3064
3065 /* Encode to 'cdata' array */
3066 for (;s.avail_in;) {
3067 s.next_out = blk + cdata_pos;
3068 s.avail_out = cdata_alloc - cdata_pos;
3069 if (cdata_alloc - cdata_pos <= 0) {
3070 fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
3071 return -1;
3072 }
3073 err = deflate(&s, Z_NO_FLUSH); // or Z_FINISH?
3074 cdata_pos = cdata_alloc - s.avail_out;
3075 if (err != Z_OK) {
3076 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
3077 break;
3078 }
3079 }
3080 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
3081 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
3082 }
3083 cdata_size = s.total_out;
3084
3085 if (deflateEnd(&s) != Z_OK) {
3086 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
3087 }
3088
3089 assert(cdata_size <= 65536);
3090
3091 /* Fill out gzip header */
3092 blk[ 0] = 31; // ID1
3093 blk[ 1] = 139; // ID2
3094 blk[ 2] = 8; // CM (deflate)
3095 blk[ 3] = 4; // FLAGS (FEXTRA)
3096 blk[ 4] = blk[5] = blk[6] = blk[7] = 0; // MTIME
3097 blk[ 8] = 0; // XFL
3098 blk[ 9] = 255; // OS (unknown)
3099 blk[10] = 6; // XLEN
3100 blk[11] = 0; // XLEN
3101
3102 /* Extra BGZF fields */
3103 blk[12] = 66; // SI1
3104 blk[13] = 67; // SI2
3105 blk[14] = 2; // SLEN
3106 blk[15] = 0; // SLEN
3107 blk[16] = ((cdata_size + 25) >> 0) & 0xff;
3108 blk[17] = ((cdata_size + 25) >> 8) & 0xff;
3109
3110 crc = iolib_crc32(0L, NULL, 0L);
3111 crc = iolib_crc32(crc, (unsigned char *)buf, in_sz);
3112 blk[18+cdata_size+0] = (crc >> 0) & 0xff;
3113 blk[18+cdata_size+1] = (crc >> 8) & 0xff;
3114 blk[18+cdata_size+2] = (crc >>16) & 0xff;
3115 blk[18+cdata_size+3] = (crc >>24) & 0xff;
3116 blk[18+cdata_size+4] = (in_sz >> 0) & 0xff;
3117 blk[18+cdata_size+5] = (in_sz >> 8) & 0xff;
3118 blk[18+cdata_size+6] = (in_sz >>16) & 0xff;
3119 blk[18+cdata_size+7] = (in_sz >>24) & 0xff;
3120
3121 *out_sz = 18+cdata_size+8;
3122
3123 return 0;
3124 }
3125 #endif
3126
bgzf_block_write(bam_file_t * bf,int level,const void * buf,size_t count)3127 static int bgzf_block_write(bam_file_t *bf, int level,
3128 const void *buf, size_t count) {
3129 if (!bf->idx)
3130 return BGZF_WRITE(bf, level, buf, count);
3131
3132 const uint8_t *input = (const uint8_t*)buf;
3133 // amount of uncompressed data to be fed into next block
3134 uint64_t ublock_size;
3135 ssize_t remaining = count;
3136
3137 while (remaining > 0) {
3138 if ((bf->current_block) == (bf->idx->n)) { //
3139 fprintf(stderr, "ERROR: reached end of bgzip index with more data to write\n");
3140 return -1;
3141 }
3142 ublock_size = bf->idx->u_off[bf->current_block+1]
3143 - bf->idx->u_off[bf->current_block];
3144
3145 int copy_length = ublock_size - bf->bgbuf_sz;
3146 if (copy_length > remaining)
3147 copy_length = remaining;
3148
3149 memcpy(bf->bgbuf_p + bf->bgbuf_sz, input, copy_length);
3150 input += copy_length;
3151 bf->bgbuf_sz += copy_length;
3152 remaining -= copy_length;
3153
3154 if (bf->bgbuf_sz == ublock_size) {
3155 BGZF_WRITE(bf, level, bf->bgbuf_p, ublock_size);
3156 bf->bgbuf_sz=0;
3157 bf->current_block++; // track the blocks
3158 }
3159 }
3160
3161 return 0;
3162 }
3163
3164
bgzf_write(bam_file_t * bf,int level,const void * buf,size_t count)3165 static int bgzf_write(bam_file_t *bf, int level, const void *buf, size_t count) {
3166 unsigned char blk[Z_BUFF_SIZE+4];
3167 uint32_t len;
3168
3169 if (0 != bgzf_encode(level, buf, count, blk, &len))
3170 return -1;
3171
3172 if (len != fwrite(blk, 1, len, bf->fp))
3173 return -1;
3174
3175 return 0;
3176 }
3177
3178 typedef struct {
3179 int level;
3180 unsigned char in[Z_BUFF_SIZE];
3181 unsigned char out[Z_BUFF_SIZE];
3182 uint32_t in_sz, out_sz;
3183 } bgzf_encode_job;
3184
bgzf_encode_thread(void * arg)3185 void *bgzf_encode_thread(void *arg) {
3186 bgzf_encode_job *j = (bgzf_encode_job *)arg;
3187
3188 bgzf_encode(j->level, j->in, j->in_sz, j->out, &j->out_sz);
3189 return arg;
3190 }
3191
bgzf_write_mt(bam_file_t * bf,int level,const void * buf,size_t count)3192 static int bgzf_write_mt(bam_file_t *bf, int level, const void *buf,
3193 size_t count) {
3194 bgzf_encode_job *j;
3195 t_pool_result *r;
3196
3197 if (!bf->pool)
3198 return bgzf_write(bf, level, buf, count);
3199
3200 j = malloc(sizeof(*j));
3201 j->level = level;
3202 memcpy(j->in, buf, count);
3203 j->in_sz = count;
3204 t_pool_dispatch(bf->pool, bf->equeue, bgzf_encode_thread, j);
3205
3206 while ((r = t_pool_next_result(bf->equeue))) {
3207 j = (bgzf_encode_job *)r->data;
3208 if (j->out_sz != fwrite(j->out, 1, j->out_sz, bf->fp))
3209 return -1;
3210 t_pool_delete_result(r, 1);
3211 }
3212
3213 return 0;
3214 }
3215
3216 #ifdef USE_MT
bgzf_flush_mt(bam_file_t * bf)3217 static int bgzf_flush_mt(bam_file_t *bf) {
3218 t_pool_result *r;
3219 bgzf_encode_job *j;
3220
3221 if (!bf->pool)
3222 return 0;
3223
3224 t_pool_flush(bf->pool);
3225
3226 while ((r = t_pool_next_result(bf->equeue))) {
3227 j = (bgzf_encode_job *)r->data;
3228 if (j->out_sz != fwrite(j->out, 1, j->out_sz, bf->fp))
3229 return -1;
3230 t_pool_delete_result(r, 1);
3231 }
3232
3233 return 0;
3234 }
3235
3236 #else
bgzf_flush(bam_file_t * bf)3237 static int bgzf_flush(bam_file_t *bf) {
3238 return 0;
3239 }
3240 #endif
3241
3242 /*
3243 * Writes a single bam sequence object.
3244 * Returns 0 on success
3245 * -1 on failure
3246 */
bam_put_seq(bam_file_t * fp,bam_seq_t * b)3247 int bam_put_seq(bam_file_t *fp, bam_seq_t *b) {
3248 char *auxh, aux_key[3], type;
3249 bam_aux_t val;
3250
3251 /*
3252 * Thread safe version of:
3253 *
3254 * static int init_done = 0;
3255 * static uint16_t code2base[256];
3256 *
3257 * if (!init_done) {
3258 * int i;
3259 * char c[2];
3260 * uint16_t s;
3261 * for (i = 0; i < 256; i++) {
3262 * c[0] = "=ACMGRSVTWYHKDBN"[i >> 4];
3263 * c[1] = "=ACMGRSVTWYHKDBN"[i & 15];
3264 * s = *(uint16_t *)c;
3265 * code2base[i] = s;
3266 *
3267 * //printf("%5d,%c", code2base[i],i%8==7?'\n':' ');
3268 * }
3269 * init_done = 1;
3270 * }
3271 */
3272 #ifdef ALLOW_UAC
3273 static const uint16_t code2base[256] = {
3274 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
3275 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
3276 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
3277 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
3278 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
3279 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
3280 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
3281 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
3282 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
3283 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
3284 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
3285 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
3286 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
3287 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
3288 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
3289 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
3290 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
3291 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
3292 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
3293 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
3294 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
3295 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
3296 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
3297 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
3298 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
3299 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
3300 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
3301 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
3302 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
3303 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
3304 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
3305 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
3306 };
3307 #endif
3308
3309 if (!fp->binary) {
3310 /* SAM */
3311 unsigned char *end = fp->uncomp + BGZF_BUFF_SIZE, *dat;
3312 int sz, i, n;
3313
3314 #define BF_FLUSH() \
3315 do { \
3316 if (fp->uncomp_p - fp->uncomp != \
3317 fwrite(fp->uncomp, 1, fp->uncomp_p - fp->uncomp, fp->fp)) \
3318 return -1; \
3319 fp->uncomp_p=fp->uncomp; \
3320 } while(0)
3321
3322 /* QNAME */
3323 if (end - fp->uncomp_p < (sz = bam_name_len(b))) BF_FLUSH();
3324 if (bam_name(b) - (char *)b + sz-1 >
3325 b->blk_size + offsetof(bam_seq_t, ref)) {
3326 fprintf(stderr, "Name length too large for bam block\n");
3327 return -1;
3328 }
3329 memcpy(fp->uncomp_p, bam_name(b), sz-1); fp->uncomp_p += sz-1;
3330 *fp->uncomp_p++ = '\t';
3331
3332 /* FLAG */
3333 if (end-fp->uncomp_p < 5) BF_FLUSH();
3334 fp->uncomp_p = append_int(fp->uncomp_p, bam_flag(b));
3335 *fp->uncomp_p++ = '\t';
3336
3337 /* RNAME */
3338 if (b->ref < -1 || b->ref >= fp->header->nref)
3339 return -1;
3340
3341 if (b->ref != -1) {
3342 size_t l = strlen(fp->header->ref[b->ref].name);
3343 if (end-fp->uncomp_p < l+1) BF_FLUSH();
3344 memcpy(fp->uncomp_p, fp->header->ref[b->ref].name, l);
3345 fp->uncomp_p += l;
3346 } else {
3347 if (end-fp->uncomp_p < 2) BF_FLUSH();
3348 *fp->uncomp_p++ = '*';
3349 }
3350 *fp->uncomp_p++ = '\t';
3351
3352 /* POS */
3353 if (b->pos < -1) return -1;
3354 if (end-fp->uncomp_p < 12) BF_FLUSH();
3355 fp->uncomp_p = append_int64(fp->uncomp_p, b->pos+1); *fp->uncomp_p++ = '\t';
3356
3357 /* MAPQ */
3358 if (end-fp->uncomp_p < 5) BF_FLUSH();
3359 fp->uncomp_p = append_int(fp->uncomp_p, bam_map_qual(b)); *fp->uncomp_p++ = '\t';
3360
3361 /* CIGAR */
3362 n = bam_cigar_len(b);dat = (uc *)bam_cigar(b);
3363 if (n < 0 ||
3364 dat - (uc *)b + n*4 > b->blk_size + offsetof(bam_seq_t, ref))
3365 return -1;
3366 for (i = 0; i < n; i++, dat+=4) {
3367 uint32_t c = *(uint32_t *)dat;
3368 if (end-fp->uncomp_p < 13) BF_FLUSH();
3369 fp->uncomp_p = append_int(fp->uncomp_p, c>>4);
3370 *fp->uncomp_p++="MIDNSHP=X???????"[c&15];
3371 }
3372 if (n==0) {
3373 if (end-fp->uncomp_p < 2) BF_FLUSH();
3374 *fp->uncomp_p++='*';
3375 }
3376 *fp->uncomp_p++='\t';
3377
3378 /* NRNM */
3379 if (b->mate_ref < -1 || b->mate_ref >= fp->header->nref)
3380 return -1;
3381
3382 if (b->mate_ref != -1) {
3383 if (b->mate_ref == b->ref) {
3384 if (end-fp->uncomp_p < 2) BF_FLUSH();
3385 *fp->uncomp_p++ = '=';
3386 } else {
3387 size_t l = strlen(fp->header->ref[b->mate_ref].name);
3388 if (end-fp->uncomp_p < l+1) BF_FLUSH();
3389 memcpy(fp->uncomp_p, fp->header->ref[b->mate_ref].name, l);
3390 fp->uncomp_p += l;
3391 }
3392 } else {
3393 if (end-fp->uncomp_p < 2) BF_FLUSH();
3394 *fp->uncomp_p++ = '*';
3395 }
3396 *fp->uncomp_p++ = '\t';
3397
3398 /* MPOS */
3399 if (end-fp->uncomp_p < 12) BF_FLUSH();
3400 fp->uncomp_p = append_int64(fp->uncomp_p, b->mate_pos+1); *fp->uncomp_p++ = '\t';
3401
3402 /* ISIZE */
3403 if (end-fp->uncomp_p < 12) BF_FLUSH();
3404 fp->uncomp_p = append_int64(fp->uncomp_p, b->ins_size); *fp->uncomp_p++ = '\t';
3405
3406 /* SEQ */
3407 n = (b->len+1)/2;
3408 dat = (uc *)bam_seq(b);
3409
3410 if (dat - (uc *)b + b->len > b->blk_size + offsetof(bam_seq_t, ref)) {
3411 fprintf(stderr, "Sequence length too large for bam block\n");
3412 return -1;
3413 }
3414
3415 /* BAM encoding */
3416 // while (n) {
3417 // int l = end-fp->uncomp_p < n ? end-fp->uncomp_p : n;
3418 // memcpy(fp->uncomp_p, dat, l); fp->uncomp_p += l;
3419 // n -= l; dat += l;
3420 // if (end == fp->uncomp_p) BF_FLUSH();
3421 // }
3422 if (b->len != 0) {
3423 if (end - fp->uncomp_p < b->len + 3) BF_FLUSH();
3424 if (end - fp->uncomp_p < b->len + 3) {
3425 /* Extra long seqs need more regular checks */
3426 for (i = 0; i < b->len-1; i+=2) {
3427 if (end - fp->uncomp_p < 3) BF_FLUSH();
3428 *fp->uncomp_p++ = "=ACMGRSVTWYHKDBN"[*dat >> 4];
3429 *fp->uncomp_p++ = "=ACMGRSVTWYHKDBN"[*dat++ & 15];
3430 }
3431 if (i < b->len) {
3432 if (end - fp->uncomp_p < 3) BF_FLUSH();
3433 *fp->uncomp_p++ = "=ACMGRSVTWYHKDBN"[*dat >> 4];
3434 }
3435 } else {
3436 unsigned char *cp = fp->uncomp_p;
3437 int n = b->len & ~1;
3438 for (i = 0; i < n; i+=2) {
3439 #ifdef ALLOW_UAC
3440 *(int16_u *)cp = le_int2(code2base[*dat++]);
3441 cp += 2;
3442 #else
3443 cp[0] = "=ACMGRSVTWYHKDBN"[*dat >> 4];
3444 cp[1] = "=ACMGRSVTWYHKDBN"[*dat++ & 15];
3445 cp += 2;
3446 #endif
3447 }
3448 if (i < b->len) {
3449 *cp++ = "=ACMGRSVTWYHKDBN"[*dat >> 4];
3450 }
3451 fp->uncomp_p = cp;
3452 }
3453 } else {
3454 if (end - fp->uncomp_p < 2) BF_FLUSH();
3455 *fp->uncomp_p++ = '*';
3456 }
3457 *fp->uncomp_p++ = '\t';
3458
3459 /* QUAL */
3460 n = b->len;
3461 if (b->len < 0) return -1;
3462 dat = (uc *)bam_qual(b);
3463 if (dat - (uc *)b + b->len > b->blk_size + offsetof(bam_seq_t, ref))
3464 return -1;
3465 /* BAM encoding */
3466 // while (n) {
3467 // int l = end-fp->uncomp_p < n ? end-fp->uncomp_p : n;
3468 // memcpy(fp->uncomp_p, dat, l); fp->uncomp_p += l;
3469 // n -= l; dat += l;
3470 // if (end == fp->uncomp_p) BF_FLUSH();
3471 // }
3472 if (b->len != 0) {
3473 if (*dat == 0xff) {
3474 if (end - fp->uncomp_p < 2) BF_FLUSH();
3475 *fp->uncomp_p++ = '*';
3476 dat += b->len;
3477 } else {
3478 if (end - fp->uncomp_p < b->len + 3) BF_FLUSH();
3479 if (end - fp->uncomp_p < b->len + 3 ||
3480 fp->binning == BINNING_ILLUMINA) {
3481
3482 /* Long seqs */
3483 if (fp->binning == BINNING_ILLUMINA) {
3484 for (i = 0; i < b->len; i++) {
3485 if (end - fp->uncomp_p < 3) BF_FLUSH();
3486 *fp->uncomp_p++ = illumina_bin_33[(uc)*dat++];
3487 }
3488 } else {
3489 for (i = 0; i < b->len; i++) {
3490 if (end - fp->uncomp_p < 3) BF_FLUSH();
3491 *fp->uncomp_p++ = *dat++ + '!';
3492 }
3493 }
3494 } else {
3495 unsigned char *cp = fp->uncomp_p;
3496 i = 0;
3497 #ifdef ALLOW_UAC
3498 int n = b->len & ~3;
3499 for (; i < n; i+=4) {
3500 //*cp++ = *dat++ + '!';
3501 *(uint32_u *)cp = *(uint32_u *)dat + 0x21212121;
3502 cp += 4;
3503 dat += 4;
3504 }
3505 #endif
3506 for (; i < b->len; i++) {
3507 *cp++ = *dat++ + '!';
3508 }
3509 fp->uncomp_p = cp;
3510 }
3511 }
3512 } else {
3513 if (end - fp->uncomp_p < 2) BF_FLUSH();
3514 *fp->uncomp_p++ = '*';
3515 }
3516
3517 /* Auxiliary tags */
3518 auxh = NULL;
3519 while (0 == bam_aux_iter_full(b, &auxh, aux_key, &type, &val)) {
3520 if (end - fp->uncomp_p < 20) BF_FLUSH();
3521 *fp->uncomp_p++ = '\t';
3522 *fp->uncomp_p++ = aux_key[0];
3523 *fp->uncomp_p++ = aux_key[1];
3524 *fp->uncomp_p++ = ':';
3525 *fp->uncomp_p++ = type;
3526 *fp->uncomp_p++ = ':';
3527 switch(aux_key[2]) {
3528 case 'A':
3529 *fp->uncomp_p++ = val.i;
3530 break;
3531
3532 case 'C':
3533 fp->uncomp_p = append_uint(fp->uncomp_p, (uint8_t)val.i);
3534 break;
3535
3536 case 'c':
3537 fp->uncomp_p = append_int(fp->uncomp_p, (int8_t)val.i);
3538 break;
3539
3540 case 'S':
3541 fp->uncomp_p = append_uint(fp->uncomp_p, (uint16_t)val.i);
3542 break;
3543
3544 case 's':
3545 fp->uncomp_p = append_int(fp->uncomp_p, (int16_t)val.i);
3546 break;
3547
3548 case 'I':
3549 fp->uncomp_p = append_uint(fp->uncomp_p, (uint32_t)val.i);
3550 break;
3551
3552 case 'i':
3553 fp->uncomp_p = append_int(fp->uncomp_p, (int32_t)val.i);
3554 break;
3555
3556 case 'f':
3557 fp->uncomp_p += sprintf((char *)fp->uncomp_p, "%g", val.f);
3558 break;
3559
3560 case 'd':
3561 fp->uncomp_p += sprintf((char *)fp->uncomp_p, "%g", val.d);
3562 break;
3563
3564 case 'Z':
3565 case 'H': {
3566 size_t l = strlen(val.s), l2;
3567 char *dat = val.s;
3568 do {
3569 if (end - fp->uncomp_p < l+2) BF_FLUSH();
3570 l2 = MIN(l, end-fp->uncomp_p);
3571 memcpy(fp->uncomp_p, dat, l2);
3572 fp->uncomp_p += l2;
3573 l -= l2;
3574 dat += l2;
3575 } while (l);
3576 break;
3577 }
3578
3579 case 'B': {
3580 uint32_t count = val.B.n, sz, j;
3581 unsigned char *s = val.B.s;
3582 *fp->uncomp_p++ = val.B.t;
3583
3584 /*
3585 * Chew through count items 4000 at a time.
3586 * This is because 4000*14 (biggest %g output plus comma?)
3587 * is just shy of 64k, so we avoid buffer overflows.
3588 */
3589 switch (val.B.t) {
3590 case 'C': case 'c': sz = 4; break;
3591 case 'S': case 's': sz = 6; break;
3592 default: sz = 14; break;
3593 }
3594
3595 for (j = 0; j < count; j += 4000) {
3596 int i_start = j;
3597 int i_end = j + 4000 < count ? j + 4000 : count;
3598
3599 if (end - fp->uncomp_p < 5+(i_end-i_start)*sz) BF_FLUSH();
3600
3601 switch (val.B.t) {
3602 int i;
3603 case 'C':
3604 for (i = i_start; i < i_end; i++, s++) {
3605 *fp->uncomp_p++ = ',';
3606 fp->uncomp_p = append_int(fp->uncomp_p, (uint8_t)s[0]);
3607 }
3608 break;
3609
3610 case 'c':
3611 for (i = i_start; i < i_end; i++, s++) {
3612 *fp->uncomp_p++ = ',';
3613 fp->uncomp_p = append_int(fp->uncomp_p, (int8_t)s[0]);
3614 }
3615 break;
3616
3617 case 'S':
3618 for (i = i_start; i < i_end; i++, s+=2) {
3619 *fp->uncomp_p++ = ',';
3620 fp->uncomp_p = append_int(fp->uncomp_p,
3621 (uint16_t)((s[0] << 0) +
3622 (s[1] << 8)));
3623 }
3624 break;
3625
3626 case 's':
3627 for (i = i_start; i < i_end; i++, s+=2) {
3628 *fp->uncomp_p++ = ',';
3629 fp->uncomp_p = append_int(fp->uncomp_p,
3630 (int16_t)((s[0] << 0) +
3631 (s[1] << 8)));
3632 }
3633 break;
3634
3635 case 'I':
3636 for (i = i_start; i < i_end; i++, s+=4) {
3637 *fp->uncomp_p++ = ',';
3638 fp->uncomp_p = append_uint(fp->uncomp_p,
3639 (uint32_t)((s[0] << 0) +
3640 (s[1] << 8) +
3641 (s[2] <<16) +
3642 (s[3] <<24)));
3643 }
3644 break;
3645
3646 case 'i':
3647 for (i = i_start; i < i_end; i++, s+=4) {
3648 *fp->uncomp_p++ = ',';
3649 fp->uncomp_p = append_int(fp->uncomp_p,
3650 (int32_t)((s[0] << 0) +
3651 (s[1] << 8) +
3652 (s[2] <<16) +
3653 (s[3] <<24)));
3654 }
3655 break;
3656
3657 case 'f': {
3658 union {
3659 float f;
3660 unsigned char c[4];
3661 } u;
3662 for (i = i_start; i < i_end; i++, s+=4) {
3663 *fp->uncomp_p++ = ',';
3664 u.c[0] = s[0];
3665 u.c[1] = s[1];
3666 u.c[2] = s[2];
3667 u.c[3] = s[3];
3668 fp->uncomp_p += sprintf((char *)fp->uncomp_p, "%g", u.f);
3669 }
3670 break;
3671 }
3672
3673 default:
3674 fprintf(stderr, "Unhandled sub-type of aux type B\n");
3675 }
3676 }
3677 break;
3678 }
3679
3680 default:
3681 fprintf(stderr, "Unhandled auxilary type '%c' in "
3682 "bam_put_seq()\n", type);
3683 }
3684 }
3685
3686 *fp->uncomp_p++ = '\n';
3687
3688 } else {
3689 /* BAM */
3690 unsigned char *end = fp->uncomp + BGZF_BUFF_SIZE, *ptr;
3691 size_t to_write;
3692 uint32_t i32;
3693 #ifndef ALLOW_UAC
3694 int name_len = bam_name_len(b);
3695 #endif
3696 #if !defined(ALLOW_UAC) || defined(SP_BIG_ENDIAN)
3697 uint32_t *cigar = bam_cigar(b);
3698 #endif
3699 #if defined(SP_BIG_ENDIAN)
3700 int i, n = bam_cigar_len(b);
3701 #endif
3702
3703 #define CF_FLUSH() \
3704 do { \
3705 if (bgzf_block_write(fp, fp->level, fp->uncomp, \
3706 fp->uncomp_p - fp->uncomp)) \
3707 return -1; \
3708 fp->uncomp_p=fp->uncomp; \
3709 } while(0)
3710
3711 /* If big endian, byte swap inline, write it out, and byte swap back */
3712 b->bin_packed = (b->bin << 16) | (b->map_qual << 8) | b->name_len;
3713 b->flag_packed = (b->flag << 16) | b->cigar_len;
3714
3715 #ifdef SP_BIG_ENDIAN
3716 b->ref = le_int4(b->ref);
3717 b->pos_32 = le_int4(b->pos);
3718 b->bin_packed = le_int4(b->bin_packed);
3719 b->flag_packed = le_int4(b->flag_packed);
3720 b->len = le_int4(b->len);
3721 b->mate_ref = le_int4(b->mate_ref);
3722 b->mate_pos_32 = le_int4(b->mate_pos);
3723 b->ins_size_32 = le_int4(b->ins_size);
3724
3725 for (i = 0; i < n; i++) {
3726 cigar[i] = le_int4(cigar[i]);
3727 }
3728 #else
3729 b->pos_32 = b->pos;
3730 b->mate_pos_32 = b->mate_pos;
3731 b->ins_size_32 = b->ins_size;
3732 #endif
3733
3734 #ifdef ALLOW_UAC
3735 /* Room for fixed size bits + name */
3736 if (end - fp->uncomp_p < 4) CF_FLUSH();
3737 to_write = b->blk_size;
3738 STORE_UINT32(fp->uncomp_p, to_write);
3739
3740 ptr = (unsigned char *)&b->ref;
3741 #else
3742 /* Room for fixed size bits + name */
3743 if (end - fp->uncomp_p < 36+257) CF_FLUSH();
3744 to_write = b->blk_size - (round4(name_len) - name_len);
3745 //to_write = b->blk_size;
3746 STORE_UINT32(fp->uncomp_p, to_write);
3747
3748 ptr = (unsigned char *)&b->ref;
3749
3750 /* Do fixed size bits + name first */
3751 memcpy(fp->uncomp_p, ptr, 32 + name_len);
3752 fp->uncomp_p += 32 + name_len;
3753 to_write -= 32 + name_len;
3754 ptr = (unsigned char *)cigar;
3755 #endif
3756
3757 if (fp->binning == BINNING_ILLUMINA) {
3758 int i;
3759 uc *q = (uc *)bam_qual(b);
3760 for (i = 0; i < b->len; i++) {
3761 q[i] = illumina_bin[q[i]];
3762 }
3763 }
3764
3765 do {
3766 size_t blk_len = MIN(to_write, end - fp->uncomp_p);
3767 memcpy(fp->uncomp_p, ptr, blk_len);
3768 fp->uncomp_p += blk_len;
3769 to_write -= blk_len;
3770 ptr += blk_len;
3771
3772 if (to_write) {
3773 //printf("flushing %d+%d\n",
3774 // (int)(ptr-(unsigned char *)&b->ref),
3775 // (int)(fp->uncomp_p-fp->uncomp));
3776 CF_FLUSH();
3777 }
3778 } while(to_write > 0);
3779
3780 #ifdef SP_BIG_ENDIAN
3781 /* Swap back again */
3782 b->ref = le_int4(b->ref);
3783 b->pos = le_int4(b->pos);
3784 b->bin_packed = le_int4(b->bin_packed);
3785 b->flag_packed = le_int4(b->flag_packed);
3786 b->len = le_int4(b->len);
3787 b->mate_ref = le_int4(b->mate_ref);
3788 b->mate_pos = le_int4(b->mate_pos);
3789 b->ins_size = le_int4(b->ins_size);
3790
3791 for (i = 0; i < n; i++) {
3792 cigar[i] = le_int4(cigar[i]);
3793 }
3794 #endif
3795
3796 i32 = b->bin_packed;
3797 b->bin = i32 >> 16;
3798 b->map_qual = (i32 >> 8) & 0xff;
3799 b->name_len = i32 & 0xff;
3800
3801 i32 = b->flag_packed;
3802 b->flag = i32 >> 16;
3803 b->cigar_len = i32 & 0xffff;
3804 }
3805
3806 return 0;
3807 }
3808
3809 /*
3810 * Writes a header block.
3811 * Returns 0 for success
3812 * -1 for failure
3813 */
bam_write_header(bam_file_t * out)3814 int bam_write_header(bam_file_t *out) {
3815 char *header, *hp, *htext;
3816 size_t hdr_size;
3817 int i, htext_len;
3818
3819 if (sam_hdr_rebuild(out->header))
3820 return -1;
3821
3822 htext = sam_hdr_str(out->header);
3823 htext_len = sam_hdr_length(out->header);
3824
3825 hdr_size = 12 + htext_len+1;
3826 for (i = 0; i < out->header->nref; i++) {
3827 hdr_size += strlen(out->header->ref[i].name)+1 + 8;
3828 }
3829 if (NULL == (hp = header = malloc(hdr_size)))
3830 return -1;
3831
3832 if (out->binary) {
3833 *hp++ = 'B'; *hp++ = 'A'; *hp++ = 'M'; *hp++ = 1;
3834 STORE_UINT32(hp, htext_len);
3835 }
3836 memcpy(hp, htext, htext_len);
3837 hp += htext_len;
3838
3839 if (out->binary) {
3840 int i;
3841
3842 STORE_UINT32(hp, out->header->nref);
3843
3844 for (i = 0; i < out->header->nref; i++) {
3845 size_t l = strlen(out->header->ref[i].name)+1;
3846 STORE_UINT32(hp, l);
3847
3848 strcpy(hp, out->header->ref[i].name);
3849 hp += l;
3850
3851 l = out->header->ref[i].len;
3852 STORE_UINT32(hp, l);
3853 }
3854 }
3855
3856 if (out->binary) {
3857 int len = hp-header;
3858 char *cp = header;
3859
3860 while (len) {
3861 int sz = BGZF_BUFF_SIZE < len ? BGZF_BUFF_SIZE : len;
3862 if (bgzf_block_write(out, out->level, cp, sz))
3863 return -1;
3864 cp += sz;
3865 len -= sz;
3866 }
3867 } else {
3868 if (hp-header != fwrite(header, 1, hp-header, out->fp))
3869 return -1;
3870 }
3871
3872 free(header);
3873
3874 return 0;
3875 }
3876
3877
3878 /*
3879 * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
3880 * Use this immediately after opening.
3881 *
3882 * Returns 0 on success
3883 * -1 on failure
3884 */
bam_set_option(bam_file_t * fd,enum bam_option opt,...)3885 int bam_set_option(bam_file_t *fd, enum bam_option opt, ...) {
3886 int r;
3887 va_list args;
3888
3889 va_start(args, opt);
3890 r = bam_set_voption(fd, opt, args);
3891 va_end(args);
3892
3893 return r;
3894 }
3895
3896 /*
3897 * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
3898 * Use this immediately after opening.
3899 *
3900 * Returns 0 on success
3901 * -1 on failure
3902 */
bam_set_voption(bam_file_t * fd,enum bam_option opt,va_list args)3903 int bam_set_voption(bam_file_t *fd, enum bam_option opt, va_list args) {
3904 switch (opt) {
3905 case BAM_OPT_THREAD_POOL:
3906 fd->pool = va_arg(args, t_pool *);
3907 fd->equeue = t_results_queue_init();
3908 fd->dqueue = t_results_queue_init();
3909 break;
3910
3911 case BAM_OPT_BINNING:
3912 fd->binning = va_arg(args, int);
3913 break;
3914
3915 case BAM_OPT_IGNORE_CHKSUM:
3916 fd->ignore_chksum = va_arg(args, int);
3917 break;
3918 case BAM_OPT_WITH_BGZIP_IDX:
3919 fd->idx = va_arg(args, gzi *);
3920 break;
3921 case BAM_OPT_OUTPUT_BGZIP_IDX:
3922 fd->idx_fn = va_arg(args, char *);
3923 break;
3924 }
3925
3926 return 0;
3927 }
3928