1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45 typedef khash_t(s2i) sdict_t;
46
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50
51 /**********************
52 *** BAM header I/O ***
53 **********************/
54
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62 int32_t i;
63 if (h == NULL) return;
64 if (h->target_name) {
65 for (i = 0; i < h->n_targets; ++i)
66 free(h->target_name[i]);
67 free(h->target_name);
68 free(h->target_len);
69 }
70 free(h->text); free(h->cigar_tab);
71 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72 free(h);
73 }
74
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77 if (h0 == NULL) return NULL;
78 bam_hdr_t *h;
79 if ((h = bam_hdr_init()) == NULL) return NULL;
80 // copy the simple data
81 h->n_targets = h0->n_targets;
82 h->ignore_sam_err = h0->ignore_sam_err;
83 h->l_text = h0->l_text;
84 // Then the pointery stuff
85 h->cigar_tab = NULL;
86 h->sdict = NULL;
87 // TODO Check for memory allocation failures
88 h->text = (char*)calloc(h->l_text + 1, 1);
89 memcpy(h->text, h0->text, h->l_text);
90 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92 int i;
93 for (i = 0; i < h->n_targets; ++i) {
94 h->target_len[i] = h0->target_len[i];
95 h->target_name[i] = strdup(h0->target_name[i]);
96 }
97 return h;
98 }
99
100
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103 bam_hdr_t *h;
104 khint_t k;
105 h = bam_hdr_init();
106 h->sdict = d;
107 h->n_targets = kh_size(d);
108 // TODO Check for memory allocation failures
109 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111 for (k = kh_begin(d); k != kh_end(d); ++k) {
112 if (!kh_exist(d, k)) continue;
113 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114 h->target_len[kh_val(d, k)>>32] = kh_val(d, k) & 0xffffffffUL;
115 kh_val(d, k) >>= 32;
116 }
117 return h;
118 }
119
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122 bam_hdr_t *h;
123 char buf[4];
124 int magic_len, has_EOF;
125 int32_t i, name_len, num_names = 0;
126 size_t bufsize;
127 ssize_t bytes;
128 // check EOF
129 has_EOF = bgzf_check_EOF(fp);
130 if (has_EOF < 0) {
131 perror("[W::bam_hdr_read] bgzf_check_EOF");
132 } else if (has_EOF == 0) {
133 hts_log_warning("EOF marker is absent. The input is probably truncated");
134 }
135 // read "BAM1"
136 magic_len = bgzf_read(fp, buf, 4);
137 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138 hts_log_error("Invalid BAM binary header");
139 return 0;
140 }
141 h = bam_hdr_init();
142 if (!h) goto nomem;
143
144 // read plain text and the number of reference sequences
145 bytes = bgzf_read(fp, &h->l_text, 4);
146 if (bytes != 4) goto read_err;
147 if (fp->is_be) ed_swap_4p(&h->l_text);
148
149 bufsize = ((size_t) h->l_text) + 1;
150 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151 h->text = (char*)malloc(bufsize);
152 if (!h->text) goto nomem;
153 h->text[h->l_text] = 0; // make sure it is NULL terminated
154 bytes = bgzf_read(fp, h->text, h->l_text);
155 if (bytes != h->l_text) goto read_err;
156
157 bytes = bgzf_read(fp, &h->n_targets, 4);
158 if (bytes != 4) goto read_err;
159 if (fp->is_be) ed_swap_4p(&h->n_targets);
160
161 if (h->n_targets < 0) goto invalid;
162
163 // read reference sequence names and lengths
164 if (h->n_targets > 0) {
165 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166 if (!h->target_name) goto nomem;
167 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168 if (!h->target_len) goto nomem;
169 }
170 else {
171 h->target_name = NULL;
172 h->target_len = NULL;
173 }
174
175 for (i = 0; i != h->n_targets; ++i) {
176 bytes = bgzf_read(fp, &name_len, 4);
177 if (bytes != 4) goto read_err;
178 if (fp->is_be) ed_swap_4p(&name_len);
179 if (name_len <= 0) goto invalid;
180
181 h->target_name[i] = (char*)malloc(name_len);
182 if (!h->target_name[i]) goto nomem;
183 num_names++;
184
185 bytes = bgzf_read(fp, h->target_name[i], name_len);
186 if (bytes != name_len) goto read_err;
187
188 if (h->target_name[i][name_len - 1] != '\0') {
189 /* Fix missing NUL-termination. Is this being too nice?
190 We could alternatively bail out with an error. */
191 char *new_name;
192 if (name_len == INT32_MAX) goto invalid;
193 new_name = realloc(h->target_name[i], name_len + 1);
194 if (new_name == NULL) goto nomem;
195 h->target_name[i] = new_name;
196 h->target_name[i][name_len] = '\0';
197 }
198
199 bytes = bgzf_read(fp, &h->target_len[i], 4);
200 if (bytes != 4) goto read_err;
201 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202 }
203 return h;
204
205 nomem:
206 hts_log_error("Out of memory");
207 goto clean;
208
209 read_err:
210 if (bytes < 0) {
211 hts_log_error("Error reading BGZF stream");
212 } else {
213 hts_log_error("Truncated BAM header");
214 }
215 goto clean;
216
217 invalid:
218 hts_log_error("Invalid BAM binary header");
219
220 clean:
221 if (h != NULL) {
222 h->n_targets = num_names; // ensure we free only allocated target_names
223 bam_hdr_destroy(h);
224 }
225 return NULL;
226 }
227
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230 int32_t i, name_len, x;
231 // write "BAM1"
232 if (bgzf_write(fp, "BAM\1", 4) < 0) return -1;
233 // write plain text and the number of reference sequences
234 if (fp->is_be) {
235 x = ed_swap_4(h->l_text);
236 if (bgzf_write(fp, &x, 4) < 0) return -1;
237 if (h->l_text) {
238 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
239 }
240 x = ed_swap_4(h->n_targets);
241 if (bgzf_write(fp, &x, 4) < 0) return -1;
242 } else {
243 if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
244 if (h->l_text) {
245 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
246 }
247 if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
248 }
249 // write sequence names and lengths
250 for (i = 0; i != h->n_targets; ++i) {
251 char *p = h->target_name[i];
252 name_len = strlen(p) + 1;
253 if (fp->is_be) {
254 x = ed_swap_4(name_len);
255 if (bgzf_write(fp, &x, 4) < 0) return -1;
256 } else {
257 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
258 }
259 if (bgzf_write(fp, p, name_len) < 0) return -1;
260 if (fp->is_be) {
261 x = ed_swap_4(h->target_len[i]);
262 if (bgzf_write(fp, &x, 4) < 0) return -1;
263 } else {
264 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
265 }
266 }
267 if (bgzf_flush(fp) < 0) return -1;
268 return 0;
269 }
270
bam_name2id(bam_hdr_t * h,const char * ref)271 int bam_name2id(bam_hdr_t *h, const char *ref)
272 {
273 sdict_t *d = (sdict_t*)h->sdict;
274 khint_t k;
275 if (h->sdict == 0) {
276 int i, absent;
277 d = kh_init(s2i);
278 for (i = 0; i < h->n_targets; ++i) {
279 k = kh_put(s2i, d, h->target_name[i], &absent);
280 kh_val(d, k) = i;
281 }
282 h->sdict = d;
283 }
284 k = kh_get(s2i, d, ref);
285 return k == kh_end(d)? -1 : kh_val(d, k);
286 }
287
288 /*************************
289 *** BAM alignment I/O ***
290 *************************/
291
bam_init1()292 bam1_t *bam_init1()
293 {
294 return (bam1_t*)calloc(1, sizeof(bam1_t));
295 }
296
do_realloc_bam_data(bam1_t * b,size_t desired)297 static int do_realloc_bam_data(bam1_t *b, size_t desired)
298 {
299 uint32_t new_m_data;
300 uint8_t *new_data;
301 new_m_data = desired;
302 kroundup32(new_m_data);
303 if (new_m_data < desired) {
304 errno = ENOMEM; // Not strictly true but we can't store the size
305 return -1;
306 }
307 new_data = realloc(b->data, new_m_data);
308 if (!new_data) return -1;
309 b->data = new_data;
310 b->m_data = new_m_data;
311 return 0;
312 }
313
realloc_bam_data(bam1_t * b,size_t desired)314 static inline int realloc_bam_data(bam1_t *b, size_t desired)
315 {
316 if (desired <= b->m_data) return 0;
317 return do_realloc_bam_data(b, desired);
318 }
319
possibly_expand_bam_data(bam1_t * b,size_t bytes)320 static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
321 uint32_t new_len = b->l_data + bytes;
322
323 if (new_len > INT32_MAX || new_len < b->l_data) {
324 errno = ENOMEM;
325 return -1;
326 }
327 if (new_len <= b->m_data) return 0;
328 return do_realloc_bam_data(b, new_len);
329 }
330
bam_destroy1(bam1_t * b)331 void bam_destroy1(bam1_t *b)
332 {
333 if (b == 0) return;
334 free(b->data); free(b);
335 }
336
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)337 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
338 {
339 uint8_t *data = bdst->data;
340 int m_data = bdst->m_data; // backup data and m_data
341 if (m_data < bsrc->l_data) { // double the capacity
342 m_data = bsrc->l_data; kroundup32(m_data);
343 data = (uint8_t*)realloc(data, m_data);
344 }
345 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
346 *bdst = *bsrc; // copy the rest
347 // restore the backup
348 bdst->m_data = m_data;
349 bdst->data = data;
350 return bdst;
351 }
352
bam_dup1(const bam1_t * bsrc)353 bam1_t *bam_dup1(const bam1_t *bsrc)
354 {
355 if (bsrc == NULL) return NULL;
356 bam1_t *bdst = bam_init1();
357 if (bdst == NULL) return NULL;
358 return bam_copy1(bdst, bsrc);
359 }
360
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)361 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
362 {
363 int k;
364 *rlen = *qlen = 0;
365 for (k = 0; k < n_cigar; ++k) {
366 int type = bam_cigar_type(bam_cigar_op(cigar[k]));
367 int len = bam_cigar_oplen(cigar[k]);
368 if (type & 1) *qlen += len;
369 if (type & 2) *rlen += len;
370 }
371 }
372
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)373 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
374 {
375 int k, l;
376 for (k = l = 0; k < n_cigar; ++k)
377 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
378 l += bam_cigar_oplen(cigar[k]);
379 return l;
380 }
381
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)382 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
383 {
384 int k, l;
385 for (k = l = 0; k < n_cigar; ++k)
386 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
387 l += bam_cigar_oplen(cigar[k]);
388 return l;
389 }
390
bam_endpos(const bam1_t * b)391 int32_t bam_endpos(const bam1_t *b)
392 {
393 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
394 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
395 else
396 return b->core.pos + 1;
397 }
398
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)399 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
400 {
401 bam1_core_t *c = &b->core;
402 uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
403 uint8_t *CG;
404
405 // test where there is a real CIGAR in the CG tag to move
406 if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
407 cigar0 = bam_get_cigar(b);
408 if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
409 fake_bytes = c->n_cigar * 4;
410 if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
411 if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
412 CG_len = le_to_u32(CG + 2);
413 if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
414
415 // move from the CG tag to the right position
416 cigar_st = (uint8_t*)cigar0 - b->data;
417 c->n_cigar = CG_len;
418 n_cigar4 = c->n_cigar * 4;
419 CG_st = CG - b->data - 2;
420 CG_en = CG_st + 8 + n_cigar4;
421 if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
422 b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
423 memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
424 memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
425 if (ori_len > CG_en) // move data after the CG tag
426 memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
427 b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
428 if (recal_bin)
429 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
430 if (give_warning)
431 hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
432 return 1;
433 }
434
aux_type2size(uint8_t type)435 static inline int aux_type2size(uint8_t type)
436 {
437 switch (type) {
438 case 'A': case 'c': case 'C':
439 return 1;
440 case 's': case 'S':
441 return 2;
442 case 'i': case 'I': case 'f':
443 return 4;
444 case 'd':
445 return 8;
446 case 'Z': case 'H': case 'B':
447 return type;
448 default:
449 return 0;
450 }
451 }
452
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)453 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
454 {
455 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
456 uint32_t i;
457 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
458 }
459
bam_read1(BGZF * fp,bam1_t * b)460 int bam_read1(BGZF *fp, bam1_t *b)
461 {
462 bam1_core_t *c = &b->core;
463 int32_t block_len, ret, i;
464 uint32_t x[8], new_l_data;
465 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
466 if (ret == 0) return -1; // normal end-of-file
467 else return -2; // truncated
468 }
469 if (fp->is_be)
470 ed_swap_4p(&block_len);
471 if (block_len < 32) return -4; // block_len includes core data
472 if (bgzf_read(fp, x, 32) != 32) return -3;
473 if (fp->is_be) {
474 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
475 }
476 c->tid = x[0]; c->pos = x[1];
477 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
478 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
479 if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
480 return -4;
481 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
482 c->l_qseq = x[4];
483 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
484
485 new_l_data = block_len - 32 + c->l_extranul;
486 if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
487 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
488 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
489 return -4;
490 if (realloc_bam_data(b, new_l_data) < 0) return -4;
491 b->l_data = new_l_data;
492
493 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
494 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
495 c->l_qname += c->l_extranul;
496 if (b->l_data < c->l_qname ||
497 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
498 return -4;
499 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
500 if (bam_tag2cigar(b, 0, 0) < 0)
501 return -4;
502
503 if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
504 int rlen, qlen;
505 bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
506 if ((b->core.flag & BAM_FUNMAP)) rlen=1;
507 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
508 // Sanity check for broken CIGAR alignments
509 if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
510 hts_log_error("CIGAR and query sequence lengths differ for %s",
511 bam_get_qname(b));
512 return -4;
513 }
514 }
515
516 return 4 + block_len;
517 }
518
bam_write1(BGZF * fp,const bam1_t * b)519 int bam_write1(BGZF *fp, const bam1_t *b)
520 {
521 const bam1_core_t *c = &b->core;
522 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
523 int i, ok;
524 if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
525 x[0] = c->tid;
526 x[1] = c->pos;
527 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
528 if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
529 else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
530 x[4] = c->l_qseq;
531 x[5] = c->mtid;
532 x[6] = c->mpos;
533 x[7] = c->isize;
534 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
535 if (fp->is_be) {
536 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
537 y = block_len;
538 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
539 swap_data(c, b->l_data, b->data, 1);
540 } else {
541 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
542 }
543 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
544 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
545 if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
546 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
547 } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
548 uint8_t buf[8];
549 uint32_t cigar_st, cigar_en, cigar[2];
550 cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
551 cigar_en = cigar_st + c->n_cigar * 4;
552 cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
553 cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
554 u32_to_le(cigar[0], buf);
555 u32_to_le(cigar[1], buf + 4);
556 if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
557 if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
558 if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
559 u32_to_le(c->n_cigar, buf);
560 if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
561 if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
562 }
563 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
564 return ok? 4 + block_len : -1;
565 }
566
567 /********************
568 *** BAM indexing ***
569 ********************/
570
bam_index(BGZF * fp,int min_shift)571 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
572 {
573 int n_lvls, i, fmt, ret;
574 bam1_t *b;
575 hts_idx_t *idx;
576 bam_hdr_t *h;
577 h = bam_hdr_read(fp);
578 if (h == NULL) return NULL;
579 if (min_shift > 0) {
580 int64_t max_len = 0, s;
581 for (i = 0; i < h->n_targets; ++i)
582 if (max_len < h->target_len[i]) max_len = h->target_len[i];
583 max_len += 256;
584 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
585 fmt = HTS_FMT_CSI;
586 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
587 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
588 bam_hdr_destroy(h);
589 b = bam_init1();
590 while ((ret = bam_read1(fp, b)) >= 0) {
591 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
592 if (ret < 0) goto err; // unsorted
593 }
594 if (ret < -1) goto err; // corrupted BAM file
595
596 hts_idx_finish(idx, bgzf_tell(fp));
597 bam_destroy1(b);
598 return idx;
599
600 err:
601 bam_destroy1(b);
602 hts_idx_destroy(idx);
603 return NULL;
604 }
605
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)606 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
607 {
608 hts_idx_t *idx;
609 htsFile *fp;
610 int ret = 0;
611
612 if ((fp = hts_open(fn, "r")) == 0) return -2;
613 if (nthreads)
614 hts_set_threads(fp, nthreads);
615
616 switch (fp->format.format) {
617 case cram:
618 ret = cram_index_build(fp->fp.cram, fn, fnidx);
619 break;
620
621 case bam:
622 idx = bam_index(fp->fp.bgzf, min_shift);
623 if (idx) {
624 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
625 if (ret < 0) ret = -4;
626 hts_idx_destroy(idx);
627 }
628 else ret = -1;
629 break;
630
631 default:
632 ret = -3;
633 break;
634 }
635 hts_close(fp);
636
637 return ret;
638 }
639
sam_index_build2(const char * fn,const char * fnidx,int min_shift)640 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
641 {
642 return sam_index_build3(fn, fnidx, min_shift, 0);
643 }
644
sam_index_build(const char * fn,int min_shift)645 int sam_index_build(const char *fn, int min_shift)
646 {
647 return sam_index_build3(fn, NULL, min_shift, 0);
648 }
649
650 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
651 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)652 int bam_index_build(const char *fn, int min_shift)
653 {
654 return sam_index_build2(fn, NULL, min_shift);
655 }
656
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)657 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
658 {
659 bam1_t *b = bv;
660 int ret;
661 if ((ret = bam_read1(fp, b)) >= 0) {
662 *tid = b->core.tid;
663 *beg = b->core.pos;
664 *end = bam_endpos(b);
665 }
666 return ret;
667 }
668
669 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)670 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
671 {
672 htsFile *fp = fpv;
673 bam1_t *b = bv;
674 int ret = cram_get_bam_seq(fp->fp.cram, &b);
675 if (ret < 0)
676 return cram_eof(fp->fp.cram) ? -1 : -2;
677
678 if (bam_tag2cigar(b, 1, 1) < 0)
679 return -2;
680
681 *tid = b->core.tid;
682 *beg = b->core.pos;
683 *end = bam_endpos(b);
684
685 return ret;
686 }
687
cram_pseek(void * fp,int64_t offset,int whence)688 static int cram_pseek(void *fp, int64_t offset, int whence)
689 {
690 cram_fd *fd = (cram_fd *)fp;
691
692 if ((0 != cram_seek(fd, offset, SEEK_SET))
693 && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
694 return -1;
695
696 if (fd->ctr) {
697 cram_free_container(fd->ctr);
698 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
699 cram_free_container(fd->ctr_mt);
700
701 fd->ctr = NULL;
702 fd->ctr_mt = NULL;
703 fd->ooc = 0;
704 }
705
706 return 0;
707 }
708
709 /*
710 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
711 * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
712 * container previously fetched. It was designed like this to integrate with the functionality
713 * of the iterator stepping logic.
714 */
715
cram_ptell(void * fp)716 static int64_t cram_ptell(void *fp)
717 {
718 cram_fd *fd = (cram_fd *)fp;
719 cram_container *c;
720 int64_t ret = -1L;
721
722 if (fd && fd->fp) {
723 ret = htell(fd->fp);
724 if ((c = fd->ctr) != NULL) {
725 ret -= ((c->curr_slice < c->max_slice || c->curr_rec < c->num_records) ? c->offset + 1 : 0);
726 }
727 }
728
729 return ret;
730 }
731
bam_pseek(void * fp,int64_t offset,int whence)732 static int bam_pseek(void *fp, int64_t offset, int whence)
733 {
734 BGZF *fd = (BGZF *)fp;
735
736 return bgzf_seek(fd, offset, whence);
737 }
738
bam_ptell(void * fp)739 static int64_t bam_ptell(void *fp)
740 {
741 BGZF *fd = (BGZF *)fp;
742 if (!fd)
743 return -1L;
744
745 return bgzf_tell(fd);
746 }
747
748 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)749 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
750 {
751 htsFile *fp = fpv;
752 bam1_t *b = bv;
753 switch (fp->format.format) {
754 case bam: return bam_read1(bgzfp, b);
755 case cram: {
756 int ret = cram_get_bam_seq(fp->fp.cram, &b);
757 if (ret < 0)
758 return cram_eof(fp->fp.cram) ? -1 : -2;
759
760 if (bam_tag2cigar(b, 1, 1) < 0)
761 return -2;
762 return ret;
763 }
764 default:
765 // TODO Need headers available to implement this for SAM files
766 hts_log_error("Not implemented for SAM files");
767 abort();
768 }
769 }
770
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)771 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
772 {
773 switch (fp->format.format) {
774 case bam:
775 return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
776
777 case cram: {
778 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
779 // Cons up a fake "index" just pointing at the associated cram_fd:
780 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
781 if (idx == NULL) return NULL;
782 idx->fmt = HTS_FMT_CRAI;
783 idx->cram = fp->fp.cram;
784 return (hts_idx_t *) idx;
785 }
786
787 default:
788 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
789 }
790 }
791
sam_index_load(htsFile * fp,const char * fn)792 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
793 {
794 return sam_index_load2(fp, fn, NULL);
795 }
796
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)797 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
798 {
799 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
800 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
801 if (iter == NULL) return NULL;
802
803 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
804 // the readrec function:
805 iter->is_cram = 1;
806 iter->read_rest = 1;
807 iter->off = NULL;
808 iter->bins.a = NULL;
809 iter->readrec = readrec;
810
811 if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
812 cram_range r = { tid, beg+1, end };
813 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
814
815 iter->curr_off = 0;
816 // The following fields are not required by hts_itr_next(), but are
817 // filled in in case user code wants to look at them.
818 iter->tid = tid;
819 iter->beg = beg;
820 iter->end = end;
821
822 switch (ret) {
823 case 0:
824 break;
825
826 case -2:
827 // No data vs this ref, so mark iterator as completed.
828 // Same as HTS_IDX_NONE.
829 iter->finished = 1;
830 break;
831
832 default:
833 free(iter);
834 return NULL;
835 }
836 }
837 else switch (tid) {
838 case HTS_IDX_REST:
839 iter->curr_off = 0;
840 break;
841 case HTS_IDX_NONE:
842 iter->curr_off = 0;
843 iter->finished = 1;
844 break;
845 default:
846 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
847 abort();
848 break;
849 }
850
851 return iter;
852 }
853
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)854 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
855 {
856 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
857 if (idx == NULL)
858 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
859 else if (cidx->fmt == HTS_FMT_CRAI)
860 return cram_itr_query(idx, tid, beg, end, cram_readrec);
861 else
862 return hts_itr_query(idx, tid, beg, end, bam_readrec);
863 }
864
cram_name2id(void * fdv,const char * ref)865 static int cram_name2id(void *fdv, const char *ref)
866 {
867 cram_fd *fd = (cram_fd *) fdv;
868 return sam_hdr_name2ref(fd->header, ref);
869 }
870
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)871 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
872 {
873 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
874 if (cidx->fmt == HTS_FMT_CRAI)
875 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
876 else
877 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
878 }
879
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)880 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
881 {
882 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
883 if (cidx->fmt == HTS_FMT_CRAI)
884 return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
885 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
886 else
887 return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
888 hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
889 }
890
891 /**********************
892 *** SAM header I/O ***
893 **********************/
894
895 #include "htslib/kseq.h"
896 #include "htslib/kstring.h"
897
sam_hdr_parse(int l_text,const char * text)898 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
899 {
900 const char *q, *r, *p;
901 khash_t(s2i) *d;
902 d = kh_init(s2i);
903 for (p = text; *p; ++p) {
904 if (strncmp(p, "@SQ\t", 4) == 0) {
905 char *sn = 0;
906 int ln = -1;
907 for (q = p + 4;; ++q) {
908 if (strncmp(q, "SN:", 3) == 0) {
909 q += 3;
910 for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
911 sn = (char*)calloc(r - q + 1, 1);
912 strncpy(sn, q, r - q);
913 q = r;
914 } else if (strncmp(q, "LN:", 3) == 0)
915 ln = strtol(q + 3, (char**)&q, 10);
916 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
917 if (*q == '\0' || *q == '\n') break;
918 }
919 p = q;
920 if (sn && ln >= 0) {
921 khint_t k;
922 int absent;
923 k = kh_put(s2i, d, sn, &absent);
924 if (!absent) {
925 hts_log_warning("Duplicated sequence '%s'", sn);
926 free(sn);
927 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
928 }
929 }
930 while (*p != '\0' && *p != '\n') ++p;
931 }
932 return hdr_from_dict(d);
933 }
934
935 // Minimal sanitisation of a header to ensure.
936 // - null terminated string.
937 // - all lines start with @ (also implies no blank lines).
938 //
939 // Much more could be done, but currently is not, including:
940 // - checking header types are known (HD, SQ, etc).
941 // - syntax (eg checking tab separated fields).
942 // - validating n_targets matches @SQ records.
943 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)944 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
945 if (!h)
946 return NULL;
947
948 // Special case for empty headers.
949 if (h->l_text == 0)
950 return h;
951
952 uint32_t i, lnum = 0;
953 char *cp = h->text, last = '\n';
954 for (i = 0; i < h->l_text; i++) {
955 // NB: l_text excludes terminating nul. This finds early ones.
956 if (cp[i] == 0)
957 break;
958
959 // Error on \n[^@], including duplicate newlines
960 if (last == '\n') {
961 lnum++;
962 if (cp[i] != '@') {
963 hts_log_error("Malformed SAM header at line %u", lnum);
964 bam_hdr_destroy(h);
965 return NULL;
966 }
967 }
968
969 last = cp[i];
970 }
971
972 if (i < h->l_text) { // Early nul found. Complain if not just padding.
973 uint32_t j = i;
974 while (j < h->l_text && cp[j] == '\0') j++;
975 if (j < h->l_text)
976 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
977 }
978
979 // Add trailing newline and/or trailing nul if required.
980 if (last != '\n') {
981 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
982
983 if (h->l_text == UINT32_MAX) {
984 hts_log_error("No room for extra newline");
985 bam_hdr_destroy(h);
986 return NULL;
987 }
988
989 if (i >= h->l_text - 1) {
990 cp = realloc(h->text, (size_t) h->l_text+2);
991 if (!cp) {
992 bam_hdr_destroy(h);
993 return NULL;
994 }
995 h->text = cp;
996 }
997 cp[i++] = '\n';
998
999 // l_text may be larger already due to multiple nul padding
1000 if (h->l_text < i)
1001 h->l_text = i;
1002 cp[h->l_text] = '\0';
1003 }
1004
1005 return h;
1006 }
1007
sam_hdr_read(htsFile * fp)1008 bam_hdr_t *sam_hdr_read(htsFile *fp)
1009 {
1010 if (!fp) {
1011 errno = EINVAL;
1012 return NULL;
1013 }
1014
1015 switch (fp->format.format) {
1016 case bam:
1017 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1018
1019 case cram:
1020 return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
1021
1022 case sam: {
1023 kstring_t str = { 0, 0, NULL };
1024 bam_hdr_t *h = NULL;
1025 int ret, has_SQ = 0;
1026 while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1027 if (fp->line.s[0] != '@') break;
1028 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1029 kputsn(fp->line.s, fp->line.l, &str);
1030 kputc('\n', &str);
1031 }
1032 if (ret < -1) goto error;
1033 if (! has_SQ && fp->fn_aux) {
1034 kstring_t line = { 0, 0, NULL };
1035 hFILE *f = hopen(fp->fn_aux, "r");
1036 if (f == NULL) goto error;
1037 while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1038 char *tab = strchr(line.s, '\t');
1039 if (tab == NULL) continue;
1040 kputs("@SQ\tSN:", &str);
1041 kputsn(line.s, tab - line.s, &str);
1042 kputs("\tLN:", &str);
1043 kputl(atol(tab), &str);
1044 kputc('\n', &str);
1045 }
1046 free(line.s);
1047 if (hclose(f) != 0) {
1048 hts_log_warning("Failed to close %s", fp->fn_aux);
1049 }
1050 }
1051 if (str.l == 0) kputsn("", 0, &str);
1052 h = sam_hdr_parse(str.l, str.s);
1053 h->l_text = str.l; h->text = str.s;
1054 return sam_hdr_sanitise(h);
1055
1056 error:
1057 bam_hdr_destroy(h);
1058 free(str.s);
1059 return NULL;
1060 }
1061
1062 default:
1063 abort();
1064 }
1065 }
1066
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1067 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1068 {
1069 if (!fp || !h) {
1070 errno = EINVAL;
1071 return -1;
1072 }
1073
1074 switch (fp->format.format) {
1075 case binary_format:
1076 fp->format.category = sequence_data;
1077 fp->format.format = bam;
1078 /* fall-through */
1079 case bam:
1080 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1081 break;
1082
1083 case cram: {
1084 cram_fd *fd = fp->fp.cram;
1085 SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1086 if (! hdr) return -1;
1087 if (cram_set_header(fd, hdr) < 0) return -1;
1088 if (fp->fn_aux)
1089 cram_load_reference(fd, fp->fn_aux);
1090 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1091 }
1092 break;
1093
1094 case text_format:
1095 fp->format.category = sequence_data;
1096 fp->format.format = sam;
1097 /* fall-through */
1098 case sam: {
1099 char *p;
1100 hputs(h->text, fp->fp.hfile);
1101 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1102 if (p == 0) {
1103 int i;
1104 for (i = 0; i < h->n_targets; ++i) {
1105 fp->line.l = 0;
1106 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1107 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1108 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1109 }
1110 }
1111 if ( hflush(fp->fp.hfile) != 0 ) return -1;
1112 }
1113 break;
1114
1115 default:
1116 abort();
1117 }
1118 return 0;
1119 }
1120
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1121 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1122 {
1123 char *p, *q, *beg = NULL, *end = NULL, *newtext;
1124 if (!h || !key)
1125 return -1;
1126
1127 if (h->l_text > 3) {
1128 if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1129 if ((p = strchr(h->text, '\n')) == 0) return -1;
1130 *p = '\0'; // for strstr call
1131
1132 char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1133
1134 if ((q = strstr(h->text, tmp)) != 0) { // key exists
1135 *p = '\n'; // change back
1136
1137 // mark the key:val
1138 beg = q;
1139 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1140 end = q;
1141
1142 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1143 && strlen(val) == end - beg - 4)
1144 return 0; // val is the same, no need to change
1145
1146 } else {
1147 beg = end = p;
1148 *p = '\n';
1149 }
1150 }
1151 }
1152 if (beg == NULL) { // no @HD
1153 if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1154 return -1;
1155 h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1156 if (val) {
1157 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1158 return -1;
1159 h->l_text += strlen(val) + 4;
1160 }
1161 newtext = (char*)malloc(h->l_text + 1);
1162 if (!newtext) return -1;
1163
1164 if (val)
1165 snprintf(newtext, h->l_text + 1,
1166 "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1167 else
1168 snprintf(newtext, h->l_text + 1,
1169 "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1170 } else { // has @HD but different or no key
1171 h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1172 if (val) {
1173 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1174 return -1;
1175 h->l_text += strlen(val) + 4;
1176 }
1177 newtext = (char*)malloc(h->l_text + 1);
1178 if (!newtext) return -1;
1179
1180 if (val) {
1181 snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1182 (int) (beg - h->text), h->text, key, val, end);
1183 } else { //delete key
1184 snprintf(newtext, h->l_text + 1, "%.*s%s",
1185 (int) (beg - h->text), h->text, end);
1186 }
1187 }
1188 free(h->text);
1189 h->text = newtext;
1190 return 0;
1191 }
1192
1193 /**********************
1194 *** SAM record I/O ***
1195 **********************/
1196
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1197 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1198 {
1199 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1200 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1201 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1202 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1203 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1204 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1205
1206 uint8_t *t;
1207 char *p = s->s, *q;
1208 int i;
1209 kstring_t str;
1210 bam1_core_t *c = &b->core;
1211
1212 str.l = b->l_data = 0;
1213 str.s = (char*)b->data; str.m = b->m_data;
1214 memset(c, 0, 32);
1215 if (h->cigar_tab == 0) {
1216 h->cigar_tab = (int8_t*) malloc(128);
1217 for (i = 0; i < 128; ++i)
1218 h->cigar_tab[i] = -1;
1219 for (i = 0; BAM_CIGAR_STR[i]; ++i)
1220 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1221 }
1222 // qname
1223 q = _read_token(p);
1224 _parse_warn(p - q <= 1, "empty query name");
1225 _parse_err(p - q > 252, "query name too long");
1226 kputsn_(q, p - q, &str);
1227 for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1228 kputc_('\0', &str);
1229 c->l_qname = p - q + c->l_extranul;
1230 // flag
1231 c->flag = strtol(p, &p, 0);
1232 if (*p++ != '\t') goto err_ret; // malformated flag
1233 // chr
1234 q = _read_token(p);
1235 if (strcmp(q, "*")) {
1236 _parse_err(h->n_targets == 0, "missing SAM header");
1237 c->tid = bam_name2id(h, q);
1238 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1239 } else c->tid = -1;
1240 // pos
1241 c->pos = strtol(p, &p, 10) - 1;
1242 if (*p++ != '\t') goto err_ret;
1243 if (c->pos < 0 && c->tid >= 0) {
1244 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1245 c->tid = -1;
1246 }
1247 if (c->tid < 0) c->flag |= BAM_FUNMAP;
1248 // mapq
1249 c->qual = strtol(p, &p, 10);
1250 if (*p++ != '\t') goto err_ret;
1251 // cigar
1252 if (*p != '*') {
1253 uint32_t *cigar;
1254 size_t n_cigar = 0;
1255 for (q = p; *p && *p != '\t'; ++p)
1256 if (!isdigit_c(*p)) ++n_cigar;
1257 if (*p++ != '\t') goto err_ret;
1258 _parse_err(n_cigar == 0, "no CIGAR operations");
1259 _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1260 c->n_cigar = n_cigar;
1261 _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1262 for (i = 0; i < c->n_cigar; ++i, ++q) {
1263 int op;
1264 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1265 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1266 _parse_err(op < 0, "unrecognized CIGAR operator");
1267 cigar[i] |= op;
1268 }
1269 // can't use bam_endpos() directly as some fields not yet set up
1270 i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1271 } else {
1272 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1273 c->flag |= BAM_FUNMAP;
1274 q = _read_token(p);
1275 i = 1;
1276 }
1277 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1278 // mate chr
1279 q = _read_token(p);
1280 if (strcmp(q, "=") == 0) {
1281 c->mtid = c->tid;
1282 } else if (strcmp(q, "*") == 0) {
1283 c->mtid = -1;
1284 } else {
1285 c->mtid = bam_name2id(h, q);
1286 _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1287 }
1288 // mpos
1289 c->mpos = strtol(p, &p, 10) - 1;
1290 if (*p++ != '\t') goto err_ret;
1291 if (c->mpos < 0 && c->mtid >= 0) {
1292 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1293 c->mtid = -1;
1294 }
1295 // tlen
1296 c->isize = strtol(p, &p, 10);
1297 if (*p++ != '\t') goto err_ret;
1298 // seq
1299 q = _read_token(p);
1300 if (strcmp(q, "*")) {
1301 c->l_qseq = p - q - 1;
1302 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1303 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1304 i = (c->l_qseq + 1) >> 1;
1305 _get_mem(uint8_t, &t, &str, i);
1306 memset(t, 0, i);
1307 for (i = 0; i < c->l_qseq; ++i)
1308 t[i>>1] |= seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
1309 } else c->l_qseq = 0;
1310 // qual
1311 q = _read_token_aux(p);
1312 _get_mem(uint8_t, &t, &str, c->l_qseq);
1313 if (strcmp(q, "*")) {
1314 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1315 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1316 } else memset(t, 0xff, c->l_qseq);
1317 // aux
1318 while (p < s->s + s->l) {
1319 uint8_t type;
1320 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1321 _parse_err(p - q - 1 < 5, "incomplete aux field");
1322 kputsn_(q, 2, &str);
1323 q += 3; type = *q++; ++q; // q points to value
1324 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1325 _parse_err(p - q - 1 < 1, "incomplete aux field");
1326
1327 // Ensure str has enough space for a double + type allocated.
1328 // This is so we can stuff bigger integers and floats directly into
1329 // the kstring. Sorry.
1330 _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1331
1332 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1333 kputc_('A', &str);
1334 kputc_(*q, &str);
1335 } else if (type == 'i' || type == 'I') {
1336 if (*q == '-') {
1337 long x = strtol(q, &q, 10);
1338 if (x >= INT8_MIN) {
1339 kputc_('c', &str); kputc_(x, &str);
1340 } else if (x >= INT16_MIN) {
1341 str.s[str.l++] = 's';
1342 i16_to_le(x, (uint8_t *) str.s + str.l);
1343 str.l += 2;
1344 } else {
1345 str.s[str.l++] = 'i';
1346 i32_to_le(x, (uint8_t *) str.s + str.l);
1347 str.l += 4;
1348 }
1349 } else {
1350 unsigned long x = strtoul(q, &q, 10);
1351 if (x <= UINT8_MAX) {
1352 kputc_('C', &str); kputc_(x, &str);
1353 } else if (x <= UINT16_MAX) {
1354 str.s[str.l++] = 'S';
1355 u16_to_le(x, (uint8_t *) str.s + str.l);
1356 str.l += 2;
1357 } else {
1358 str.s[str.l++] = 'I';
1359 u32_to_le(x, (uint8_t *) str.s + str.l);
1360 str.l += 4;
1361 }
1362 }
1363 } else if (type == 'f') {
1364 str.s[str.l++] = 'f';
1365 float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1366 str.l += sizeof(float);
1367 } else if (type == 'd') {
1368 str.s[str.l++] = 'd';
1369 double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1370 str.l += sizeof(double);
1371 } else if (type == 'Z' || type == 'H') {
1372 _parse_err(type == 'H' && !((p-q)&1),
1373 "hex field does not have an even number of digits");
1374 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1375 } else if (type == 'B') {
1376 int32_t n, size;
1377 size_t bytes;
1378 char *r;
1379 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1380 type = *q++; // q points to the first ',' following the typing byte
1381
1382 size = aux_type2size(type);
1383 _parse_err_param(size <= 0 || size > 4,
1384 "unrecognized type B:%c", type);
1385 _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1386
1387 for (r = q, n = 0; *r; ++r)
1388 if (*r == ',') ++n;
1389
1390 // Ensure space for type + values
1391 bytes = (size_t) n * (size_t) size;
1392 _parse_err(bytes / size != n
1393 || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1394 "out of memory");
1395 str.s[str.l++] = 'B';
1396 str.s[str.l++] = type;
1397 i32_to_le(n, (uint8_t *) str.s + str.l);
1398 str.l += sizeof(uint32_t);
1399
1400 // This ensures that q always ends up at the next comma after
1401 // reading a number even if it's followed by junk. It
1402 // prevents the possibility of trying to read more than n items.
1403 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1404
1405 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1406 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1407 else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1408 else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1409 else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1410 else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1411 else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1412 else _parse_err_param(1, "unrecognized type B:%c", type);
1413
1414 #undef _skip_to_comma
1415
1416 } else _parse_err_param(1, "unrecognized type %c", type);
1417 }
1418 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1419 if (bam_tag2cigar(b, 1, 1) < 0)
1420 return -2;
1421 return 0;
1422
1423 #undef _parse_warn
1424 #undef _parse_err
1425 #undef _parse_err_param
1426 #undef _get_mem
1427 #undef _read_token_aux
1428 #undef _read_token
1429 err_ret:
1430 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1431 return -2;
1432 }
1433
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1434 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1435 {
1436 switch (fp->format.format) {
1437 case bam: {
1438 int r = bam_read1(fp->fp.bgzf, b);
1439 if (r >= 0) {
1440 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
1441 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1442 return -3;
1443 }
1444 return r;
1445 }
1446
1447 case cram: {
1448 int ret = cram_get_bam_seq(fp->fp.cram, &b);
1449 if (ret < 0)
1450 return cram_eof(fp->fp.cram) ? -1 : -2;
1451
1452 if (bam_tag2cigar(b, 1, 1) < 0)
1453 return -2;
1454 return ret;
1455 }
1456
1457 case sam: {
1458 int ret;
1459 err_recover:
1460 if (fp->line.l == 0) {
1461 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1462 if (ret < 0) return ret;
1463 }
1464 ret = sam_parse1(&fp->line, h, b);
1465 fp->line.l = 0;
1466 if (ret < 0) {
1467 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1468 if (h->ignore_sam_err) goto err_recover;
1469 }
1470 return ret;
1471 }
1472
1473 default:
1474 abort();
1475 }
1476 }
1477
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1478 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1479 {
1480 int i;
1481 uint8_t *s, *end;
1482 const bam1_core_t *c = &b->core;
1483
1484 str->l = 0;
1485 kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1486 kputw(c->flag, str); kputc('\t', str); // flag
1487 if (c->tid >= 0) { // chr
1488 kputs(h->target_name[c->tid] , str);
1489 kputc('\t', str);
1490 } else kputsn("*\t", 2, str);
1491 kputw(c->pos + 1, str); kputc('\t', str); // pos
1492 kputw(c->qual, str); kputc('\t', str); // qual
1493 if (c->n_cigar) { // cigar
1494 uint32_t *cigar = bam_get_cigar(b);
1495 for (i = 0; i < c->n_cigar; ++i) {
1496 kputw(bam_cigar_oplen(cigar[i]), str);
1497 kputc(bam_cigar_opchr(cigar[i]), str);
1498 }
1499 } else kputc('*', str);
1500 kputc('\t', str);
1501 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1502 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1503 else {
1504 kputs(h->target_name[c->mtid], str);
1505 kputc('\t', str);
1506 }
1507 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1508 kputw(c->isize, str); kputc('\t', str); // template len
1509 if (c->l_qseq) { // seq and qual
1510 uint8_t *s = bam_get_seq(b);
1511 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1512 kputc('\t', str);
1513 s = bam_get_qual(b);
1514 if (s[0] == 0xff) kputc('*', str);
1515 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1516 } else kputsn("*\t*", 3, str);
1517
1518 s = bam_get_aux(b); // aux
1519 end = b->data + b->l_data;
1520 while (end - s >= 4) {
1521 uint8_t type, key[2];
1522 key[0] = s[0]; key[1] = s[1];
1523 s += 2; type = *s++;
1524 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1525 if (type == 'A') {
1526 kputsn("A:", 2, str);
1527 kputc(*s, str);
1528 ++s;
1529 } else if (type == 'C') {
1530 kputsn("i:", 2, str);
1531 kputw(*s, str);
1532 ++s;
1533 } else if (type == 'c') {
1534 kputsn("i:", 2, str);
1535 kputw(*(int8_t*)s, str);
1536 ++s;
1537 } else if (type == 'S') {
1538 if (end - s >= 2) {
1539 kputsn("i:", 2, str);
1540 kputuw(le_to_u16(s), str);
1541 s += 2;
1542 } else goto bad_aux;
1543 } else if (type == 's') {
1544 if (end - s >= 2) {
1545 kputsn("i:", 2, str);
1546 kputw(le_to_i16(s), str);
1547 s += 2;
1548 } else goto bad_aux;
1549 } else if (type == 'I') {
1550 if (end - s >= 4) {
1551 kputsn("i:", 2, str);
1552 kputuw(le_to_u32(s), str);
1553 s += 4;
1554 } else goto bad_aux;
1555 } else if (type == 'i') {
1556 if (end - s >= 4) {
1557 kputsn("i:", 2, str);
1558 kputw(le_to_i32(s), str);
1559 s += 4;
1560 } else goto bad_aux;
1561 } else if (type == 'f') {
1562 if (end - s >= 4) {
1563 ksprintf(str, "f:%g", le_to_float(s));
1564 s += 4;
1565 } else goto bad_aux;
1566
1567 } else if (type == 'd') {
1568 if (end - s >= 8) {
1569 ksprintf(str, "d:%g", le_to_double(s));
1570 s += 8;
1571 } else goto bad_aux;
1572 } else if (type == 'Z' || type == 'H') {
1573 kputc(type, str); kputc(':', str);
1574 while (s < end && *s) kputc(*s++, str);
1575 if (s >= end)
1576 goto bad_aux;
1577 ++s;
1578 } else if (type == 'B') {
1579 uint8_t sub_type = *(s++);
1580 int sub_type_size = aux_type2size(sub_type);
1581 uint32_t n;
1582 if (sub_type_size == 0 || end - s < 4)
1583 goto bad_aux;
1584 n = le_to_u32(s);
1585 s += 4; // now points to the start of the array
1586 if ((end - s) / sub_type_size < n)
1587 goto bad_aux;
1588 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1589 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1590 kputc(',', str);
1591 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
1592 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1593 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1594 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1595 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1596 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1597 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1598 else goto bad_aux; // Unknown sub-type
1599 }
1600 } else { // Unknown type
1601 goto bad_aux;
1602 }
1603 }
1604 return str->l;
1605
1606 bad_aux:
1607 hts_log_error("Corrupted aux data for read %.*s",
1608 b->core.l_qname, bam_get_qname(b));
1609 errno = EINVAL;
1610 return -1;
1611 }
1612
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1613 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1614 {
1615 switch (fp->format.format) {
1616 case binary_format:
1617 fp->format.category = sequence_data;
1618 fp->format.format = bam;
1619 /* fall-through */
1620 case bam:
1621 return bam_write1(fp->fp.bgzf, b);
1622
1623 case cram:
1624 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1625
1626 case text_format:
1627 fp->format.category = sequence_data;
1628 fp->format.format = sam;
1629 /* fall-through */
1630 case sam:
1631 if (sam_format1(h, b, &fp->line) < 0) return -1;
1632 kputc('\n', &fp->line);
1633 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1634 return fp->line.l;
1635
1636 default:
1637 abort();
1638 }
1639 }
1640
1641 /************************
1642 *** Auxiliary fields ***
1643 ************************/
1644 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1645 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1646 int tsz = aux_type2size(type);
1647
1648 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1649
1650 switch (tsz) {
1651 case 'H': case 'Z': case 1: // Trivial
1652 memcpy(out, in, len);
1653 break;
1654
1655 #define aux_val_to_le(type_t, store_le) do { \
1656 type_t v; \
1657 size_t i; \
1658 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1659 memcpy(&v, in + i, sizeof(type_t)); \
1660 store_le(v, out); \
1661 } \
1662 } while (0)
1663
1664 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1665 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1666 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1667
1668 #undef aux_val_to_le
1669
1670 case 'B': { // Recurse!
1671 uint32_t n;
1672 if (len < 5) return -1;
1673 memcpy(&n, in + 1, 4);
1674 out[0] = in[0];
1675 u32_to_le(n, out + 1);
1676 return aux_to_le(in[0], out + 5, in + 5, len - 5);
1677 }
1678
1679 default: // Unknown type code
1680 return -1;
1681 }
1682
1683
1684
1685 return 0;
1686 }
1687 #endif
1688
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1689 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1690 {
1691 uint32_t new_len;
1692
1693 assert(b->l_data >= 0);
1694 new_len = b->l_data + 3 + len;
1695 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1696
1697 if (realloc_bam_data(b, new_len) < 0) return -1;
1698
1699 b->data[b->l_data] = tag[0];
1700 b->data[b->l_data + 1] = tag[1];
1701 b->data[b->l_data + 2] = type;
1702
1703 #ifdef HTS_LITTLE_ENDIAN
1704 memcpy(b->data + b->l_data + 3, data, len);
1705 #else
1706 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1707 errno = EINVAL;
1708 return -1;
1709 }
1710 #endif
1711
1712 b->l_data = new_len;
1713
1714 return 0;
1715
1716 nomem:
1717 errno = ENOMEM;
1718 return -1;
1719 }
1720
skip_aux(uint8_t * s,uint8_t * end)1721 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1722 {
1723 int size;
1724 uint32_t n;
1725 if (s >= end) return end;
1726 size = aux_type2size(*s); ++s; // skip type
1727 switch (size) {
1728 case 'Z':
1729 case 'H':
1730 while (*s && s < end) ++s;
1731 return s < end ? s + 1 : end;
1732 case 'B':
1733 if (end - s < 5) return NULL;
1734 size = aux_type2size(*s); ++s;
1735 n = le_to_u32(s);
1736 s += 4;
1737 if (size == 0 || end - s < size * n) return NULL;
1738 return s + size * n;
1739 case 0:
1740 return NULL;
1741 default:
1742 if (end - s < size) return NULL;
1743 return s + size;
1744 }
1745 }
1746
bam_aux_get(const bam1_t * b,const char tag[2])1747 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1748 {
1749 uint8_t *s, *end, *t = (uint8_t *) tag;
1750 uint16_t y = (uint16_t) t[0]<<8 | t[1];
1751 s = bam_get_aux(b);
1752 end = b->data + b->l_data;
1753 while (s != NULL && end - s >= 3) {
1754 uint16_t x = (uint16_t) s[0]<<8 | s[1];
1755 s += 2;
1756 if (x == y) {
1757 // Check the tag value is valid and complete
1758 uint8_t *e = skip_aux(s, end);
1759 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1760 goto bad_aux; // Unterminated string
1761 }
1762 if (e != NULL) {
1763 return s;
1764 } else {
1765 goto bad_aux;
1766 }
1767 }
1768 s = skip_aux(s, end);
1769 }
1770 if (s == NULL) goto bad_aux;
1771 errno = ENOENT;
1772 return NULL;
1773
1774 bad_aux:
1775 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1776 errno = EINVAL;
1777 return NULL;
1778 }
1779 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1780 int bam_aux_del(bam1_t *b, uint8_t *s)
1781 {
1782 uint8_t *p, *aux;
1783 int l_aux = bam_get_l_aux(b);
1784 aux = bam_get_aux(b);
1785 p = s - 2;
1786 s = skip_aux(s, aux + l_aux);
1787 if (s == NULL) goto bad_aux;
1788 memmove(p, s, l_aux - (s - aux));
1789 b->l_data -= s - p;
1790 return 0;
1791
1792 bad_aux:
1793 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1794 errno = EINVAL;
1795 return -1;
1796 }
1797
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1798 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1799 {
1800 // FIXME: This is not at all efficient!
1801 uint8_t *s = bam_aux_get(b,tag);
1802 if (!s) {
1803 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1804 return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1805 } else { // Invalid aux data, give up.
1806 return -1;
1807 }
1808 }
1809 char type = *s;
1810 if (type != 'Z') {
1811 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1812 errno = EINVAL;
1813 return -1;
1814 }
1815
1816 bam_aux_del(b,s);
1817 s -= 2;
1818 int l_aux = bam_get_l_aux(b);
1819
1820 ptrdiff_t s_offset = s - b->data;
1821 if (possibly_expand_bam_data(b, 3 + len) < 0) return -1;
1822 s = b->data + s_offset;
1823 b->l_data += 3 + len;
1824
1825 memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1826 s[0] = tag[0];
1827 s[1] = tag[1];
1828 s[2] = type;
1829 memmove(s+3,data,len);
1830 return 0;
1831 }
1832
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)1833 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
1834 {
1835 uint32_t sz, old_sz = 0, new = 0;
1836 uint8_t *s, type;
1837
1838 if (val < INT32_MIN || val > UINT32_MAX) {
1839 errno = EOVERFLOW;
1840 return -1;
1841 }
1842 if (val < INT16_MIN) { type = 'i'; sz = 4; }
1843 else if (val < INT8_MIN) { type = 's'; sz = 2; }
1844 else if (val < 0) { type = 'c'; sz = 1; }
1845 else if (val < UINT8_MAX) { type = 'C'; sz = 1; }
1846 else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
1847 else { type = 'I'; sz = 4; }
1848
1849 s = bam_aux_get(b, tag);
1850 if (s) { // Tag present - how big was the old one?
1851 switch (*s) {
1852 case 'c': case 'C': old_sz = 1; break;
1853 case 's': case 'S': old_sz = 2; break;
1854 case 'i': case 'I': old_sz = 4; break;
1855 default: errno = EINVAL; return -1; // Not an integer
1856 }
1857 } else {
1858 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1859 s = b->data + b->l_data;
1860 new = 1;
1861 } else { // Invalid aux data, give up.
1862 return -1;
1863 }
1864 }
1865
1866 if (new || old_sz < sz) {
1867 // Make room for new tag
1868 ptrdiff_t s_offset = s - b->data;
1869 if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
1870 return -1;
1871 s = b->data + s_offset;
1872 if (new) { // Add tag id
1873 *s++ = tag[0];
1874 *s++ = tag[1];
1875 } else { // Shift following data so we have space
1876 memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
1877 }
1878 } else {
1879 // Reuse old space. Data value may be bigger than necessary but
1880 // we avoid having to move everything else
1881 sz = old_sz;
1882 type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
1883 assert(type > 0);
1884 }
1885 *s++ = type;
1886 #ifdef HTS_LITTLE_ENDIAN
1887 memcpy(s, &val, sz);
1888 #else
1889 switch (sz) {
1890 case 4: u32_to_le(val, s); break;
1891 case 2: u16_to_le(val, s); break;
1892 default: *s = val; break;
1893 }
1894 #endif
1895 b->l_data += (new ? 3 : 0) + sz - old_sz;
1896 return 0;
1897 }
1898
bam_aux_update_float(bam1_t * b,const char tag[2],float val)1899 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
1900 {
1901 uint8_t *s = bam_aux_get(b, tag);
1902 int shrink = 0, new = 0;
1903
1904 if (s) { // Tag present - what was it?
1905 switch (*s) {
1906 case 'f': break;
1907 case 'd': shrink = 1; break;
1908 default: errno = EINVAL; return -1; // Not a float
1909 }
1910 } else {
1911 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1912 new = 1;
1913 } else { // Invalid aux data, give up.
1914 return -1;
1915 }
1916 }
1917
1918 if (new) { // Ensure there's room
1919 if (possibly_expand_bam_data(b, 3 + 4) < 0)
1920 return -1;
1921 s = b->data + b->l_data;
1922 *s++ = tag[0];
1923 *s++ = tag[1];
1924 } else if (shrink) { // Convert non-standard double tag to float
1925 memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
1926 b->l_data -= 4;
1927 }
1928 *s++ = 'f';
1929 float_to_le(val, s);
1930 if (new) b->l_data += 7;
1931
1932 return 0;
1933 }
1934
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)1935 int bam_aux_update_array(bam1_t *b, const char tag[2],
1936 uint8_t type, uint32_t items, void *data)
1937 {
1938 uint8_t *s = bam_aux_get(b, tag);
1939 size_t old_sz = 0, new_sz;
1940 int new = 0;
1941
1942 if (s) { // Tag present
1943 if (*s != 'B') { errno = EINVAL; return -1; }
1944 old_sz = aux_type2size(s[1]);
1945 if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
1946 old_sz *= le_to_u32(s + 2);
1947 } else {
1948 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1949 s = b->data + b->l_data;
1950 new = 1;
1951 } else { // Invalid aux data, give up.
1952 return -1;
1953 }
1954 }
1955
1956 new_sz = aux_type2size(type);
1957 if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
1958 if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
1959 new_sz *= items;
1960
1961 if (new || old_sz < new_sz) {
1962 // Make room for new tag
1963 ptrdiff_t s_offset = s - b->data;
1964 if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
1965 return -1;
1966 s = b->data + s_offset;
1967 }
1968 if (new) { // Add tag id and type
1969 *s++ = tag[0];
1970 *s++ = tag[1];
1971 *s = 'B';
1972 b->l_data += 8 + new_sz;
1973 } else if (old_sz != new_sz) { // shift following data if necessary
1974 memmove(s + 6 + new_sz, s + 6 + old_sz,
1975 b->l_data - ((s + 6 + old_sz) - b->data));
1976 b->l_data -= old_sz;
1977 b->l_data += new_sz;
1978 }
1979
1980 s[1] = type;
1981 u32_to_le(items, s + 2);
1982 #ifdef HTS_LITTLE_ENDIAN
1983 memcpy(s + 6, data, new_sz);
1984 return 0;
1985 #else
1986 return aux_to_le(type, s + 6, data, new_sz);
1987 #endif
1988 }
1989
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1990 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1991 uint32_t idx)
1992 {
1993 switch (type) {
1994 case 'c': return le_to_i8(s + idx);
1995 case 'C': return s[idx];
1996 case 's': return le_to_i16(s + 2 * idx);
1997 case 'S': return le_to_u16(s + 2 * idx);
1998 case 'i': return le_to_i32(s + 4 * idx);
1999 case 'I': return le_to_u32(s + 4 * idx);
2000 default:
2001 errno = EINVAL;
2002 return 0;
2003 }
2004 }
2005
bam_aux2i(const uint8_t * s)2006 int64_t bam_aux2i(const uint8_t *s)
2007 {
2008 int type;
2009 type = *s++;
2010 return get_int_aux_val(type, s, 0);
2011 }
2012
bam_aux2f(const uint8_t * s)2013 double bam_aux2f(const uint8_t *s)
2014 {
2015 int type;
2016 type = *s++;
2017 if (type == 'd') return le_to_double(s);
2018 else if (type == 'f') return le_to_float(s);
2019 else return get_int_aux_val(type, s, 0);
2020 }
2021
bam_aux2A(const uint8_t * s)2022 char bam_aux2A(const uint8_t *s)
2023 {
2024 int type;
2025 type = *s++;
2026 if (type == 'A') return *(char*)s;
2027 errno = EINVAL;
2028 return 0;
2029 }
2030
bam_aux2Z(const uint8_t * s)2031 char *bam_aux2Z(const uint8_t *s)
2032 {
2033 int type;
2034 type = *s++;
2035 if (type == 'Z' || type == 'H') return (char*)s;
2036 errno = EINVAL;
2037 return 0;
2038 }
2039
bam_auxB_len(const uint8_t * s)2040 uint32_t bam_auxB_len(const uint8_t *s)
2041 {
2042 if (s[0] != 'B') {
2043 errno = EINVAL;
2044 return 0;
2045 }
2046 return le_to_u32(s + 2);
2047 }
2048
bam_auxB2i(const uint8_t * s,uint32_t idx)2049 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
2050 {
2051 uint32_t len = bam_auxB_len(s);
2052 if (idx >= len) {
2053 errno = ERANGE;
2054 return 0;
2055 }
2056 return get_int_aux_val(s[1], s + 6, idx);
2057 }
2058
bam_auxB2f(const uint8_t * s,uint32_t idx)2059 double bam_auxB2f(const uint8_t *s, uint32_t idx)
2060 {
2061 uint32_t len = bam_auxB_len(s);
2062 if (idx >= len) {
2063 errno = ERANGE;
2064 return 0.0;
2065 }
2066 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
2067 else return get_int_aux_val(s[1], s + 6, idx);
2068 }
2069
sam_open_mode(char * mode,const char * fn,const char * format)2070 int sam_open_mode(char *mode, const char *fn, const char *format)
2071 {
2072 // TODO Parse "bam5" etc for compression level
2073 if (format == NULL) {
2074 // Try to pick a format based on the filename extension
2075 const char *ext = fn? strrchr(fn, '.') : NULL;
2076 if (ext == NULL || strchr(ext, '/')) return -1;
2077 return sam_open_mode(mode, fn, ext+1);
2078 }
2079 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
2080 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
2081 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
2082 else return -1;
2083
2084 return 0;
2085 }
2086
2087 // A version of sam_open_mode that can handle ,key=value options.
2088 // The format string is allocated and returned, to be freed by the caller.
2089 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)2090 char *sam_open_mode_opts(const char *fn,
2091 const char *mode,
2092 const char *format)
2093 {
2094 char *mode_opts = malloc((format ? strlen(format) : 1) +
2095 (mode ? strlen(mode) : 1) + 12);
2096 char *opts, *cp;
2097 int format_len;
2098
2099 if (!mode_opts)
2100 return NULL;
2101
2102 strcpy(mode_opts, mode ? mode : "r");
2103 cp = mode_opts + strlen(mode_opts);
2104
2105 if (format == NULL) {
2106 // Try to pick a format based on the filename extension
2107 const char *ext = fn? strrchr(fn, '.') : NULL;
2108 if (ext == NULL || strchr(ext, '/')) {
2109 free(mode_opts);
2110 return NULL;
2111 }
2112 if (sam_open_mode(cp, fn, ext+1) == 0) {
2113 return mode_opts;
2114 } else {
2115 free(mode_opts);
2116 return NULL;
2117 }
2118 }
2119
2120 if ((opts = strchr(format, ','))) {
2121 format_len = opts-format;
2122 } else {
2123 opts="";
2124 format_len = strlen(format);
2125 }
2126
2127 if (strncmp(format, "bam", format_len) == 0) {
2128 *cp++ = 'b';
2129 } else if (strncmp(format, "cram", format_len) == 0) {
2130 *cp++ = 'c';
2131 } else if (strncmp(format, "cram2", format_len) == 0) {
2132 *cp++ = 'c';
2133 strcpy(cp, ",VERSION=2.1");
2134 cp += 12;
2135 } else if (strncmp(format, "cram3", format_len) == 0) {
2136 *cp++ = 'c';
2137 strcpy(cp, ",VERSION=3.0");
2138 cp += 12;
2139 } else if (strncmp(format, "sam", format_len) == 0) {
2140 ; // format mode=""
2141 } else {
2142 free(mode_opts);
2143 return NULL;
2144 }
2145
2146 strcpy(cp, opts);
2147
2148 return mode_opts;
2149 }
2150
2151 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)2152 int bam_str2flag(const char *str)
2153 {
2154 char *end, *beg = (char*) str;
2155 long int flag = strtol(str, &end, 0);
2156 if ( end!=str ) return flag; // the conversion was successful
2157 flag = 0;
2158 while ( *str )
2159 {
2160 end = beg;
2161 while ( *end && *end!=',' ) end++;
2162 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
2163 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
2164 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
2165 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
2166 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
2167 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
2168 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
2169 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
2170 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
2171 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
2172 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
2173 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
2174 else return -1;
2175 if ( !*end ) break;
2176 beg = end + 1;
2177 }
2178 return flag;
2179 }
2180
bam_flag2str(int flag)2181 char *bam_flag2str(int flag)
2182 {
2183 kstring_t str = {0,0,0};
2184 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2185 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2186 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2187 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2188 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2189 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2190 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2191 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2192 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2193 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2194 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2195 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2196 if ( str.l == 0 ) kputsn("", 0, &str);
2197 return str.s;
2198 }
2199
2200
2201 /**************************
2202 *** Pileup and Mpileup ***
2203 **************************/
2204
2205 #if !defined(BAM_NO_PILEUP)
2206
2207 #include <assert.h>
2208
2209 /*******************
2210 *** Memory pool ***
2211 *******************/
2212
2213 typedef struct {
2214 int k, x, y, end;
2215 } cstate_t;
2216
2217 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2218
2219 typedef struct __linkbuf_t {
2220 bam1_t b;
2221 int32_t beg, end;
2222 cstate_t s;
2223 struct __linkbuf_t *next;
2224 bam_pileup_cd cd;
2225 } lbnode_t;
2226
2227 typedef struct {
2228 int cnt, n, max;
2229 lbnode_t **buf;
2230 } mempool_t;
2231
mp_init(void)2232 static mempool_t *mp_init(void)
2233 {
2234 mempool_t *mp;
2235 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2236 return mp;
2237 }
mp_destroy(mempool_t * mp)2238 static void mp_destroy(mempool_t *mp)
2239 {
2240 int k;
2241 for (k = 0; k < mp->n; ++k) {
2242 free(mp->buf[k]->b.data);
2243 free(mp->buf[k]);
2244 }
2245 free(mp->buf);
2246 free(mp);
2247 }
mp_alloc(mempool_t * mp)2248 static inline lbnode_t *mp_alloc(mempool_t *mp)
2249 {
2250 ++mp->cnt;
2251 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2252 else return mp->buf[--mp->n];
2253 }
mp_free(mempool_t * mp,lbnode_t * p)2254 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2255 {
2256 --mp->cnt; p->next = 0; // clear lbnode_t::next here
2257 if (mp->n == mp->max) {
2258 mp->max = mp->max? mp->max<<1 : 256;
2259 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2260 }
2261 mp->buf[mp->n++] = p;
2262 }
2263
2264 /**********************
2265 *** CIGAR resolver ***
2266 **********************/
2267
2268 /* s->k: the index of the CIGAR operator that has just been processed.
2269 s->x: the reference coordinate of the start of s->k
2270 s->y: the query coordiante of the start of s->k
2271 */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2272 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2273 {
2274 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2275 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2276
2277 bam1_t *b = p->b;
2278 bam1_core_t *c = &b->core;
2279 uint32_t *cigar = bam_get_cigar(b);
2280 int k;
2281 // determine the current CIGAR operation
2282 //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2283 if (s->k == -1) { // never processed
2284 if (c->n_cigar == 1) { // just one operation, save a loop
2285 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2286 } else { // find the first match or deletion
2287 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2288 int op = _cop(cigar[k]);
2289 int l = _cln(cigar[k]);
2290 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2291 else if (op == BAM_CREF_SKIP) s->x += l;
2292 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2293 }
2294 assert(k < c->n_cigar);
2295 s->k = k;
2296 }
2297 } else { // the read has been processed before
2298 int op, l = _cln(cigar[s->k]);
2299 if (pos - s->x >= l) { // jump to the next operation
2300 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2301 op = _cop(cigar[s->k+1]);
2302 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2303 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2304 s->x += l;
2305 ++s->k;
2306 } else { // find the next M/D/N/=/X
2307 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2308 s->x += l;
2309 for (k = s->k + 1; k < c->n_cigar; ++k) {
2310 op = _cop(cigar[k]), l = _cln(cigar[k]);
2311 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2312 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2313 }
2314 s->k = k;
2315 }
2316 assert(s->k < c->n_cigar); // otherwise a bug
2317 } // else, do nothing
2318 }
2319 { // collect pileup information
2320 int op, l;
2321 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2322 p->is_del = p->indel = p->is_refskip = 0;
2323 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2324 int op2 = _cop(cigar[s->k+1]);
2325 int l2 = _cln(cigar[s->k+1]);
2326 if (op2 == BAM_CDEL) p->indel = -(int)l2;
2327 else if (op2 == BAM_CINS) p->indel = l2;
2328 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2329 int l3 = 0;
2330 for (k = s->k + 2; k < c->n_cigar; ++k) {
2331 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2332 if (op2 == BAM_CINS) l3 += l2;
2333 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2334 }
2335 if (l3 > 0) p->indel = l3;
2336 }
2337 }
2338 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2339 p->qpos = s->y + (pos - s->x);
2340 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2341 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2342 p->is_refskip = (op == BAM_CREF_SKIP);
2343 } // cannot be other operations; otherwise a bug
2344 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2345 }
2346 return 1;
2347 }
2348
2349 /***********************
2350 *** Pileup iterator ***
2351 ***********************/
2352
2353 // Dictionary of overlapping reads
2354 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2355 typedef khash_t(olap_hash) olap_hash_t;
2356
2357 struct __bam_plp_t {
2358 mempool_t *mp;
2359 lbnode_t *head, *tail;
2360 int32_t tid, pos, max_tid, max_pos;
2361 int is_eof, max_plp, error, maxcnt;
2362 uint64_t id;
2363 bam_pileup1_t *plp;
2364 // for the "auto" interface only
2365 bam1_t *b;
2366 bam_plp_auto_f func;
2367 void *data;
2368 olap_hash_t *overlaps;
2369
2370 // For notification of creation and destruction events
2371 // and associated client-owned pointer.
2372 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2373 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2374 };
2375
bam_plp_init(bam_plp_auto_f func,void * data)2376 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2377 {
2378 bam_plp_t iter;
2379 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2380 iter->mp = mp_init();
2381 iter->head = iter->tail = mp_alloc(iter->mp);
2382 iter->max_tid = iter->max_pos = -1;
2383 iter->maxcnt = 8000;
2384 if (func) {
2385 iter->func = func;
2386 iter->data = data;
2387 iter->b = bam_init1();
2388 }
2389 return iter;
2390 }
2391
bam_plp_init_overlaps(bam_plp_t iter)2392 void bam_plp_init_overlaps(bam_plp_t iter)
2393 {
2394 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
2395 }
2396
bam_plp_destroy(bam_plp_t iter)2397 void bam_plp_destroy(bam_plp_t iter)
2398 {
2399 lbnode_t *p, *pnext;
2400 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2401 for (p = iter->head; p != NULL; p = pnext) {
2402 pnext = p->next;
2403 mp_free(iter->mp, p);
2404 }
2405 mp_destroy(iter->mp);
2406 if (iter->b) bam_destroy1(iter->b);
2407 free(iter->plp);
2408 free(iter);
2409 }
2410
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2411 void bam_plp_constructor(bam_plp_t plp,
2412 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2413 plp->plp_construct = func;
2414 }
2415
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2416 void bam_plp_destructor(bam_plp_t plp,
2417 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2418 plp->plp_destruct = func;
2419 }
2420
2421 //---------------------------------
2422 //--- Tweak overlapping reads
2423 //---------------------------------
2424
2425 /**
2426 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
2427 * cigar_iref2iseq_next() - get the next CMATCH base
2428 * @cigar: pointer to current cigar block (rw)
2429 * @cigar_max: pointer just beyond the last cigar block
2430 * @icig: position within the current cigar block (rw)
2431 * @iseq: position in the sequence (rw)
2432 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2433 *
2434 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2435 */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2436 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2437 {
2438 int pos = *iref;
2439 if ( pos < 0 ) return -1;
2440 *icig = 0;
2441 *iseq = 0;
2442 *iref = 0;
2443 while ( *cigar<cigar_max )
2444 {
2445 int cig = (**cigar) & BAM_CIGAR_MASK;
2446 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2447
2448 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2449 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2450 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2451 {
2452 pos -= ncig;
2453 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2454 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2455 continue;
2456 }
2457 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2458 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2459 {
2460 pos -= ncig;
2461 if ( pos<0 ) pos = 0;
2462 (*cigar)++; *icig = 0; *iref += ncig;
2463 continue;
2464 }
2465 hts_log_error("Unexpected cigar %d", cig);
2466 assert(0);
2467 }
2468 *iseq = -1;
2469 return -1;
2470 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2471 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2472 {
2473 while ( *cigar < cigar_max )
2474 {
2475 int cig = (**cigar) & BAM_CIGAR_MASK;
2476 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2477
2478 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2479 {
2480 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
2481 (*iseq)++; (*icig)++; (*iref)++;
2482 return BAM_CMATCH;
2483 }
2484 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2485 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2486 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2487 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2488 hts_log_error("Unexpected cigar %d", cig);
2489 assert(0);
2490 }
2491 *iseq = -1;
2492 *iref = -1;
2493 return -1;
2494 }
2495
tweak_overlap_quality(bam1_t * a,bam1_t * b)2496 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2497 {
2498 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2499 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2500 int a_icig = 0, a_iseq = 0;
2501 int b_icig = 0, b_iseq = 0;
2502 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2503 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
2504
2505 int iref = b->core.pos;
2506 int a_iref = iref - a->core.pos;
2507 int b_iref = iref - b->core.pos;
2508 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2509 if ( a_ret<0 ) return; // no overlap
2510 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2511 if ( b_ret<0 ) return; // no overlap
2512
2513 #if DBG
2514 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2515 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2516 #endif
2517
2518 while ( 1 )
2519 {
2520 // Increment reference position
2521 while ( a_iref>=0 && a_iref < iref - a->core.pos )
2522 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2523 if ( a_ret<0 ) break; // done
2524 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2525
2526 while ( b_iref>=0 && b_iref < iref - b->core.pos )
2527 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2528 if ( b_ret<0 ) break; // done
2529 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2530
2531 iref++;
2532 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
2533
2534 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2535 {
2536 #if DBG
2537 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2538 #endif
2539 // we are very confident about this base
2540 int qual = a_qual[a_iseq] + b_qual[b_iseq];
2541 a_qual[a_iseq] = qual>200 ? 200 : qual;
2542 b_qual[b_iseq] = 0;
2543 }
2544 else
2545 {
2546 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2547 {
2548 #if DBG
2549 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2550 #endif
2551 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
2552 b_qual[b_iseq] = 0;
2553 }
2554 else
2555 {
2556 #if DBG
2557 fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2558 #endif
2559 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2560 a_qual[a_iseq] = 0;
2561 }
2562 }
2563 }
2564 #if DBG
2565 fprintf(stderr,"\n");
2566 #endif
2567 }
2568
2569 // Fix overlapping reads. Simple soft-clipping did not give good results.
2570 // Lowering qualities of unwanted bases is more selective and works better.
2571 //
overlap_push(bam_plp_t iter,lbnode_t * node)2572 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2573 {
2574 if ( !iter->overlaps ) return;
2575
2576 // mapped mates and paired reads only
2577 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2578
2579 // no overlap possible, unless some wild cigar
2580 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2581
2582 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2583 if ( kitr==kh_end(iter->overlaps) )
2584 {
2585 int ret;
2586 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2587 kh_value(iter->overlaps, kitr) = node;
2588 }
2589 else
2590 {
2591 lbnode_t *a = kh_value(iter->overlaps, kitr);
2592 tweak_overlap_quality(&a->b, &node->b);
2593 kh_del(olap_hash, iter->overlaps, kitr);
2594 assert(a->end-1 == a->s.end);
2595 a->end = bam_endpos(&a->b);
2596 a->s.end = a->end - 1;
2597 }
2598 }
2599
overlap_remove(bam_plp_t iter,const bam1_t * b)2600 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2601 {
2602 if ( !iter->overlaps ) return;
2603
2604 khiter_t kitr;
2605 if ( b )
2606 {
2607 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2608 if ( kitr!=kh_end(iter->overlaps) )
2609 kh_del(olap_hash, iter->overlaps, kitr);
2610 }
2611 else
2612 {
2613 // remove all
2614 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2615 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2616 }
2617 }
2618
2619
2620
2621 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2622 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2623 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2624 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2625 {
2626 if (iter->error) { *_n_plp = -1; return NULL; }
2627 *_n_plp = 0;
2628 if (iter->is_eof && iter->head == iter->tail) return NULL;
2629 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2630 int n_plp = 0;
2631 // write iter->plp at iter->pos
2632 lbnode_t **pptr = &iter->head;
2633 while (*pptr != iter->tail) {
2634 lbnode_t *p = *pptr;
2635 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2636 overlap_remove(iter, &p->b);
2637 if (iter->plp_destruct)
2638 iter->plp_destruct(iter->data, &p->b, &p->cd);
2639 *pptr = p->next; mp_free(iter->mp, p);
2640 }
2641 else {
2642 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2643 if (n_plp == iter->max_plp) { // then double the capacity
2644 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2645 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2646 }
2647 iter->plp[n_plp].b = &p->b;
2648 iter->plp[n_plp].cd = p->cd;
2649 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2650 }
2651 pptr = &(*pptr)->next;
2652 }
2653 }
2654 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2655 // update iter->tid and iter->pos
2656 if (iter->head != iter->tail) {
2657 if (iter->tid > iter->head->b.core.tid) {
2658 hts_log_error("Unsorted input. Pileup aborts");
2659 iter->error = 1;
2660 *_n_plp = -1;
2661 return NULL;
2662 }
2663 }
2664 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2665 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2666 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2667 iter->pos = iter->head->beg; // jump to the next position
2668 } else ++iter->pos; // scan contiguously
2669 // return
2670 if (n_plp) return iter->plp;
2671 if (iter->is_eof && iter->head == iter->tail) break;
2672 }
2673 return NULL;
2674 }
2675
bam_plp_push(bam_plp_t iter,const bam1_t * b)2676 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2677 {
2678 if (iter->error) return -1;
2679 if (b) {
2680 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2681 // Skip only unmapped reads here, any additional filtering must be done in iter->func
2682 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2683 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2684 {
2685 overlap_remove(iter, b);
2686 return 0;
2687 }
2688 bam_copy1(&iter->tail->b, b);
2689 overlap_push(iter, iter->tail);
2690 #ifndef BAM_NO_ID
2691 iter->tail->b.id = iter->id++;
2692 #endif
2693 iter->tail->beg = b->core.pos;
2694 iter->tail->end = bam_endpos(b);
2695 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2696 if (b->core.tid < iter->max_tid) {
2697 hts_log_error("The input is not sorted (chromosomes out of order)");
2698 iter->error = 1;
2699 return -1;
2700 }
2701 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2702 hts_log_error("The input is not sorted (reads out of order)");
2703 iter->error = 1;
2704 return -1;
2705 }
2706 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2707 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2708 if (iter->plp_construct)
2709 iter->plp_construct(iter->data, b, &iter->tail->cd);
2710 iter->tail->next = mp_alloc(iter->mp);
2711 iter->tail = iter->tail->next;
2712 }
2713 } else iter->is_eof = 1;
2714 return 0;
2715 }
2716
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2717 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2718 {
2719 const bam_pileup1_t *plp;
2720 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2721 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2722 else { // no pileup line can be obtained; read alignments
2723 *_n_plp = 0;
2724 if (iter->is_eof) return 0;
2725 int ret;
2726 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2727 if (bam_plp_push(iter, iter->b) < 0) {
2728 *_n_plp = -1;
2729 return 0;
2730 }
2731 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2732 // otherwise no pileup line can be returned; read the next alignment.
2733 }
2734 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2735 bam_plp_push(iter, 0);
2736 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2737 return 0;
2738 }
2739 }
2740
bam_plp_reset(bam_plp_t iter)2741 void bam_plp_reset(bam_plp_t iter)
2742 {
2743 overlap_remove(iter, NULL);
2744 iter->max_tid = iter->max_pos = -1;
2745 iter->tid = iter->pos = 0;
2746 iter->is_eof = 0;
2747 while (iter->head != iter->tail) {
2748 lbnode_t *p = iter->head;
2749 iter->head = p->next;
2750 mp_free(iter->mp, p);
2751 }
2752 }
2753
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2754 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2755 {
2756 iter->maxcnt = maxcnt;
2757 }
2758
2759 /************************
2760 *** Mpileup iterator ***
2761 ************************/
2762
2763 struct __bam_mplp_t {
2764 int n;
2765 uint64_t min, *pos;
2766 bam_plp_t *iter;
2767 int *n_plp;
2768 const bam_pileup1_t **plp;
2769 };
2770
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2771 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2772 {
2773 int i;
2774 bam_mplp_t iter;
2775 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2776 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2777 iter->n_plp = (int*)calloc(n, sizeof(int));
2778 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2779 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2780 iter->n = n;
2781 iter->min = (uint64_t)-1;
2782 for (i = 0; i < n; ++i) {
2783 iter->iter[i] = bam_plp_init(func, data[i]);
2784 iter->pos[i] = iter->min;
2785 }
2786 return iter;
2787 }
2788
bam_mplp_init_overlaps(bam_mplp_t iter)2789 void bam_mplp_init_overlaps(bam_mplp_t iter)
2790 {
2791 int i;
2792 for (i = 0; i < iter->n; ++i)
2793 bam_plp_init_overlaps(iter->iter[i]);
2794 }
2795
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2796 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2797 {
2798 int i;
2799 for (i = 0; i < iter->n; ++i)
2800 iter->iter[i]->maxcnt = maxcnt;
2801 }
2802
bam_mplp_destroy(bam_mplp_t iter)2803 void bam_mplp_destroy(bam_mplp_t iter)
2804 {
2805 int i;
2806 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2807 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2808 free(iter);
2809 }
2810
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2811 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2812 {
2813 int i, ret = 0;
2814 uint64_t new_min = (uint64_t)-1;
2815 for (i = 0; i < iter->n; ++i) {
2816 if (iter->pos[i] == iter->min) {
2817 int tid, pos;
2818 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2819 if ( iter->iter[i]->error ) return -1;
2820 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2821 }
2822 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2823 }
2824 iter->min = new_min;
2825 if (new_min == (uint64_t)-1) return 0;
2826 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2827 for (i = 0; i < iter->n; ++i) {
2828 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2829 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2830 ++ret;
2831 } else n_plp[i] = 0, plp[i] = 0;
2832 }
2833 return ret;
2834 }
2835
bam_mplp_reset(bam_mplp_t iter)2836 void bam_mplp_reset(bam_mplp_t iter)
2837 {
2838 int i;
2839 iter->min = (uint64_t)-1;
2840 for (i = 0; i < iter->n; ++i) {
2841 bam_plp_reset(iter->iter[i]);
2842 iter->pos[i] = (uint64_t)-1;
2843 iter->n_plp[i] = 0;
2844 iter->plp[i] = NULL;
2845 }
2846 }
2847
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2848 void bam_mplp_constructor(bam_mplp_t iter,
2849 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2850 int i;
2851 for (i = 0; i < iter->n; ++i)
2852 bam_plp_constructor(iter->iter[i], func);
2853 }
2854
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2855 void bam_mplp_destructor(bam_mplp_t iter,
2856 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2857 int i;
2858 for (i = 0; i < iter->n; ++i)
2859 bam_plp_destructor(iter->iter[i], func);
2860 }
2861
2862 #endif // ~!defined(BAM_NO_PILEUP)
2863