1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45 typedef khash_t(s2i) sdict_t;
46
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50
51 /**********************
52 *** BAM header I/O ***
53 **********************/
54
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62 int32_t i;
63 if (h == NULL) return;
64 if (h->target_name) {
65 for (i = 0; i < h->n_targets; ++i)
66 free(h->target_name[i]);
67 free(h->target_name);
68 free(h->target_len);
69 }
70 free(h->text); free(h->cigar_tab);
71 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72 free(h);
73 }
74
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77 if (h0 == NULL) return NULL;
78 bam_hdr_t *h;
79 if ((h = bam_hdr_init()) == NULL) return NULL;
80 // copy the simple data
81 h->n_targets = h0->n_targets;
82 h->ignore_sam_err = h0->ignore_sam_err;
83 h->l_text = h0->l_text;
84 // Then the pointery stuff
85 h->cigar_tab = NULL;
86 h->sdict = NULL;
87 // TODO Check for memory allocation failures
88 h->text = (char*)calloc(h->l_text + 1, 1);
89 memcpy(h->text, h0->text, h->l_text);
90 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92 int i;
93 for (i = 0; i < h->n_targets; ++i) {
94 h->target_len[i] = h0->target_len[i];
95 h->target_name[i] = strdup(h0->target_name[i]);
96 }
97 return h;
98 }
99
100
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103 bam_hdr_t *h;
104 khint_t k;
105 h = bam_hdr_init();
106 h->sdict = d;
107 h->n_targets = kh_size(d);
108 // TODO Check for memory allocation failures
109 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111 for (k = kh_begin(d); k != kh_end(d); ++k) {
112 if (!kh_exist(d, k)) continue;
113 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114 h->target_len[kh_val(d, k)>>32] = kh_val(d, k) & 0xffffffffUL;
115 kh_val(d, k) >>= 32;
116 }
117 return h;
118 }
119
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122 bam_hdr_t *h;
123 char buf[4];
124 int magic_len, has_EOF;
125 int32_t i, name_len, num_names = 0;
126 size_t bufsize;
127 ssize_t bytes;
128 // check EOF
129 has_EOF = bgzf_check_EOF(fp);
130 if (has_EOF < 0) {
131 perror("[W::bam_hdr_read] bgzf_check_EOF");
132 } else if (has_EOF == 0) {
133 hts_log_warning("EOF marker is absent. The input is probably truncated");
134 }
135 // read "BAM1"
136 magic_len = bgzf_read(fp, buf, 4);
137 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138 hts_log_error("Invalid BAM binary header");
139 return 0;
140 }
141 h = bam_hdr_init();
142 if (!h) goto nomem;
143
144 // read plain text and the number of reference sequences
145 bytes = bgzf_read(fp, &h->l_text, 4);
146 if (bytes != 4) goto read_err;
147 if (fp->is_be) ed_swap_4p(&h->l_text);
148
149 bufsize = ((size_t) h->l_text) + 1;
150 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151 h->text = (char*)malloc(bufsize);
152 if (!h->text) goto nomem;
153 h->text[h->l_text] = 0; // make sure it is NULL terminated
154 bytes = bgzf_read(fp, h->text, h->l_text);
155 if (bytes != h->l_text) goto read_err;
156
157 bytes = bgzf_read(fp, &h->n_targets, 4);
158 if (bytes != 4) goto read_err;
159 if (fp->is_be) ed_swap_4p(&h->n_targets);
160
161 if (h->n_targets < 0) goto invalid;
162
163 // read reference sequence names and lengths
164 if (h->n_targets > 0) {
165 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166 if (!h->target_name) goto nomem;
167 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168 if (!h->target_len) goto nomem;
169 }
170 else {
171 h->target_name = NULL;
172 h->target_len = NULL;
173 }
174
175 for (i = 0; i != h->n_targets; ++i) {
176 bytes = bgzf_read(fp, &name_len, 4);
177 if (bytes != 4) goto read_err;
178 if (fp->is_be) ed_swap_4p(&name_len);
179 if (name_len <= 0) goto invalid;
180
181 h->target_name[i] = (char*)malloc(name_len);
182 if (!h->target_name[i]) goto nomem;
183 num_names++;
184
185 bytes = bgzf_read(fp, h->target_name[i], name_len);
186 if (bytes != name_len) goto read_err;
187
188 if (h->target_name[i][name_len - 1] != '\0') {
189 /* Fix missing NUL-termination. Is this being too nice?
190 We could alternatively bail out with an error. */
191 char *new_name;
192 if (name_len == INT32_MAX) goto invalid;
193 new_name = realloc(h->target_name[i], name_len + 1);
194 if (new_name == NULL) goto nomem;
195 h->target_name[i] = new_name;
196 h->target_name[i][name_len] = '\0';
197 }
198
199 bytes = bgzf_read(fp, &h->target_len[i], 4);
200 if (bytes != 4) goto read_err;
201 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202 }
203 return h;
204
205 nomem:
206 hts_log_error("Out of memory");
207 goto clean;
208
209 read_err:
210 if (bytes < 0) {
211 hts_log_error("Error reading BGZF stream");
212 } else {
213 hts_log_error("Truncated BAM header");
214 }
215 goto clean;
216
217 invalid:
218 hts_log_error("Invalid BAM binary header");
219
220 clean:
221 if (h != NULL) {
222 h->n_targets = num_names; // ensure we free only allocated target_names
223 bam_hdr_destroy(h);
224 }
225 return NULL;
226 }
227
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230 char buf[4];
231 int32_t i, name_len, x;
232 // write "BAM1"
233 strncpy(buf, "BAM\1", 4);
234 if (bgzf_write(fp, buf, 4) < 0) return -1;
235 // write plain text and the number of reference sequences
236 if (fp->is_be) {
237 x = ed_swap_4(h->l_text);
238 if (bgzf_write(fp, &x, 4) < 0) return -1;
239 if (h->l_text) {
240 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
241 }
242 x = ed_swap_4(h->n_targets);
243 if (bgzf_write(fp, &x, 4) < 0) return -1;
244 } else {
245 if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
246 if (h->l_text) {
247 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
248 }
249 if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
250 }
251 // write sequence names and lengths
252 for (i = 0; i != h->n_targets; ++i) {
253 char *p = h->target_name[i];
254 name_len = strlen(p) + 1;
255 if (fp->is_be) {
256 x = ed_swap_4(name_len);
257 if (bgzf_write(fp, &x, 4) < 0) return -1;
258 } else {
259 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
260 }
261 if (bgzf_write(fp, p, name_len) < 0) return -1;
262 if (fp->is_be) {
263 x = ed_swap_4(h->target_len[i]);
264 if (bgzf_write(fp, &x, 4) < 0) return -1;
265 } else {
266 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
267 }
268 }
269 if (bgzf_flush(fp) < 0) return -1;
270 return 0;
271 }
272
bam_name2id(bam_hdr_t * h,const char * ref)273 int bam_name2id(bam_hdr_t *h, const char *ref)
274 {
275 sdict_t *d = (sdict_t*)h->sdict;
276 khint_t k;
277 if (h->sdict == 0) {
278 int i, absent;
279 d = kh_init(s2i);
280 for (i = 0; i < h->n_targets; ++i) {
281 k = kh_put(s2i, d, h->target_name[i], &absent);
282 kh_val(d, k) = i;
283 }
284 h->sdict = d;
285 }
286 k = kh_get(s2i, d, ref);
287 return k == kh_end(d)? -1 : kh_val(d, k);
288 }
289
290 /*************************
291 *** BAM alignment I/O ***
292 *************************/
293
bam_init1()294 bam1_t *bam_init1()
295 {
296 return (bam1_t*)calloc(1, sizeof(bam1_t));
297 }
298
bam_destroy1(bam1_t * b)299 void bam_destroy1(bam1_t *b)
300 {
301 if (b == 0) return;
302 free(b->data); free(b);
303 }
304
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)305 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
306 {
307 uint8_t *data = bdst->data;
308 int m_data = bdst->m_data; // backup data and m_data
309 if (m_data < bsrc->l_data) { // double the capacity
310 m_data = bsrc->l_data; kroundup32(m_data);
311 data = (uint8_t*)realloc(data, m_data);
312 }
313 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
314 *bdst = *bsrc; // copy the rest
315 // restore the backup
316 bdst->m_data = m_data;
317 bdst->data = data;
318 return bdst;
319 }
320
bam_dup1(const bam1_t * bsrc)321 bam1_t *bam_dup1(const bam1_t *bsrc)
322 {
323 if (bsrc == NULL) return NULL;
324 bam1_t *bdst = bam_init1();
325 if (bdst == NULL) return NULL;
326 return bam_copy1(bdst, bsrc);
327 }
328
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)329 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
330 {
331 int k;
332 *rlen = *qlen = 0;
333 for (k = 0; k < n_cigar; ++k) {
334 int type = bam_cigar_type(bam_cigar_op(cigar[k]));
335 int len = bam_cigar_oplen(cigar[k]);
336 if (type & 1) *qlen += len;
337 if (type & 2) *rlen += len;
338 }
339 }
340
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)341 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
342 {
343 int k, l;
344 for (k = l = 0; k < n_cigar; ++k)
345 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
346 l += bam_cigar_oplen(cigar[k]);
347 return l;
348 }
349
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)350 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
351 {
352 int k, l;
353 for (k = l = 0; k < n_cigar; ++k)
354 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
355 l += bam_cigar_oplen(cigar[k]);
356 return l;
357 }
358
bam_endpos(const bam1_t * b)359 int32_t bam_endpos(const bam1_t *b)
360 {
361 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
362 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
363 else
364 return b->core.pos + 1;
365 }
366
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)367 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
368 {
369 bam1_core_t *c = &b->core;
370 uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
371 uint8_t *CG;
372
373 // test where there is a real CIGAR in the CG tag to move
374 if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
375 cigar0 = bam_get_cigar(b);
376 if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
377 fake_bytes = c->n_cigar * 4;
378 if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
379 if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
380 CG_len = le_to_u32(CG + 2);
381 if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
382
383 // move from the CG tag to the right position
384 cigar_st = (uint8_t*)cigar0 - b->data;
385 c->n_cigar = CG_len;
386 n_cigar4 = c->n_cigar * 4;
387 CG_st = CG - b->data - 2;
388 CG_en = CG_st + 8 + n_cigar4;
389 b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
390 if (b->m_data < b->l_data) {
391 uint8_t *new_data;
392 uint32_t new_max = b->l_data;
393 kroundup32(new_max);
394 new_data = (uint8_t*)realloc(b->data, new_max);
395 if (new_data == 0) return -1;
396 b->m_data = new_max, b->data = new_data;
397 }
398 memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
399 memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
400 if (ori_len > CG_en) // move data after the CG tag
401 memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
402 b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
403 if (recal_bin)
404 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
405 if (give_warning)
406 hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
407 return 1;
408 }
409
aux_type2size(uint8_t type)410 static inline int aux_type2size(uint8_t type)
411 {
412 switch (type) {
413 case 'A': case 'c': case 'C':
414 return 1;
415 case 's': case 'S':
416 return 2;
417 case 'i': case 'I': case 'f':
418 return 4;
419 case 'd':
420 return 8;
421 case 'Z': case 'H': case 'B':
422 return type;
423 default:
424 return 0;
425 }
426 }
427
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)428 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
429 {
430 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
431 uint32_t i;
432 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
433 }
434
bam_read1(BGZF * fp,bam1_t * b)435 int bam_read1(BGZF *fp, bam1_t *b)
436 {
437 bam1_core_t *c = &b->core;
438 int32_t block_len, ret, i;
439 uint32_t x[8];
440 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
441 if (ret == 0) return -1; // normal end-of-file
442 else return -2; // truncated
443 }
444 if (fp->is_be)
445 ed_swap_4p(&block_len);
446 if (block_len < 32) return -4; // block_len includes core data
447 if (bgzf_read(fp, x, 32) != 32) return -3;
448 if (fp->is_be) {
449 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
450 }
451 c->tid = x[0]; c->pos = x[1];
452 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
453 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
454 if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
455 return -4;
456 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
457 c->l_qseq = x[4];
458 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
459 b->l_data = block_len - 32 + c->l_extranul;
460 if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
461 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
462 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) b->l_data)
463 return -4;
464 if (b->m_data < b->l_data) {
465 uint8_t *new_data;
466 uint32_t new_m = b->l_data;
467 kroundup32(new_m);
468 new_data = (uint8_t*)realloc(b->data, new_m);
469 if (!new_data)
470 return -4;
471 b->data = new_data;
472 b->m_data = new_m;
473 }
474 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
475 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
476 c->l_qname += c->l_extranul;
477 if (b->l_data < c->l_qname ||
478 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
479 return -4;
480 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
481 if (bam_tag2cigar(b, 0, 0) < 0)
482 return -4;
483
484 if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
485 int rlen, qlen;
486 bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
487 if ((b->core.flag & BAM_FUNMAP)) rlen=1;
488 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
489 // Sanity check for broken CIGAR alignments
490 if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
491 hts_log_error("CIGAR and query sequence lengths differ for %s",
492 bam_get_qname(b));
493 return -4;
494 }
495 }
496
497 return 4 + block_len;
498 }
499
bam_write1(BGZF * fp,const bam1_t * b)500 int bam_write1(BGZF *fp, const bam1_t *b)
501 {
502 const bam1_core_t *c = &b->core;
503 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
504 int i, ok;
505 if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
506 x[0] = c->tid;
507 x[1] = c->pos;
508 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
509 if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
510 else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
511 x[4] = c->l_qseq;
512 x[5] = c->mtid;
513 x[6] = c->mpos;
514 x[7] = c->isize;
515 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
516 if (fp->is_be) {
517 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
518 y = block_len;
519 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
520 swap_data(c, b->l_data, b->data, 1);
521 } else {
522 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
523 }
524 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
525 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
526 if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
527 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
528 } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
529 uint8_t buf[8];
530 uint32_t cigar_st, cigar_en, cigar[2];
531 cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
532 cigar_en = cigar_st + c->n_cigar * 4;
533 cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
534 cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
535 u32_to_le(cigar[0], buf);
536 u32_to_le(cigar[1], buf + 4);
537 if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
538 if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
539 if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
540 u32_to_le(c->n_cigar, buf);
541 if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
542 if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
543 }
544 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
545 return ok? 4 + block_len : -1;
546 }
547
548 /********************
549 *** BAM indexing ***
550 ********************/
551
bam_index(BGZF * fp,int min_shift)552 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
553 {
554 int n_lvls, i, fmt, ret;
555 bam1_t *b;
556 hts_idx_t *idx;
557 bam_hdr_t *h;
558 h = bam_hdr_read(fp);
559 if (h == NULL) return NULL;
560 if (min_shift > 0) {
561 int64_t max_len = 0, s;
562 for (i = 0; i < h->n_targets; ++i)
563 if (max_len < h->target_len[i]) max_len = h->target_len[i];
564 max_len += 256;
565 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
566 fmt = HTS_FMT_CSI;
567 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
568 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
569 bam_hdr_destroy(h);
570 b = bam_init1();
571 while ((ret = bam_read1(fp, b)) >= 0) {
572 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
573 if (ret < 0) goto err; // unsorted
574 }
575 if (ret < -1) goto err; // corrupted BAM file
576
577 hts_idx_finish(idx, bgzf_tell(fp));
578 bam_destroy1(b);
579 return idx;
580
581 err:
582 bam_destroy1(b);
583 hts_idx_destroy(idx);
584 return NULL;
585 }
586
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)587 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
588 {
589 hts_idx_t *idx;
590 htsFile *fp;
591 int ret = 0;
592
593 if ((fp = hts_open(fn, "r")) == 0) return -2;
594 if (nthreads)
595 hts_set_threads(fp, nthreads);
596
597 switch (fp->format.format) {
598 case cram:
599 ret = cram_index_build(fp->fp.cram, fn, fnidx);
600 break;
601
602 case bam:
603 idx = bam_index(fp->fp.bgzf, min_shift);
604 if (idx) {
605 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
606 if (ret < 0) ret = -4;
607 hts_idx_destroy(idx);
608 }
609 else ret = -1;
610 break;
611
612 default:
613 ret = -3;
614 break;
615 }
616 hts_close(fp);
617
618 return ret;
619 }
620
sam_index_build2(const char * fn,const char * fnidx,int min_shift)621 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
622 {
623 return sam_index_build3(fn, fnidx, min_shift, 0);
624 }
625
sam_index_build(const char * fn,int min_shift)626 int sam_index_build(const char *fn, int min_shift)
627 {
628 return sam_index_build3(fn, NULL, min_shift, 0);
629 }
630
631 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
632 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)633 int bam_index_build(const char *fn, int min_shift)
634 {
635 return sam_index_build2(fn, NULL, min_shift);
636 }
637
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)638 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
639 {
640 bam1_t *b = bv;
641 int ret;
642 if ((ret = bam_read1(fp, b)) >= 0) {
643 *tid = b->core.tid;
644 *beg = b->core.pos;
645 *end = bam_endpos(b);
646 }
647 return ret;
648 }
649
650 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)651 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
652 {
653 htsFile *fp = fpv;
654 bam1_t *b = bv;
655 int ret = cram_get_bam_seq(fp->fp.cram, &b);
656 if (ret < 0)
657 return cram_eof(fp->fp.cram) ? -1 : -2;
658
659 if (bam_tag2cigar(b, 1, 1) < 0)
660 return -2;
661
662 *tid = b->core.tid;
663 *beg = b->core.pos;
664 *end = bam_endpos(b);
665
666 return ret;
667 }
668
cram_pseek(void * fp,int64_t offset,int whence)669 static int cram_pseek(void *fp, int64_t offset, int whence)
670 {
671 cram_fd *fd = (cram_fd *)fp;
672
673 if ((0 != cram_seek(fd, offset, SEEK_SET))
674 && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
675 return -1;
676
677 if (fd->ctr) {
678 cram_free_container(fd->ctr);
679 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
680 cram_free_container(fd->ctr_mt);
681
682 fd->ctr = NULL;
683 fd->ctr_mt = NULL;
684 fd->ooc = 0;
685 }
686
687 return 0;
688 }
689
690 /*
691 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
692 * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
693 * container previously fetched. It was designed like this to integrate with the functionality
694 * of the iterator stepping logic.
695 */
696
cram_ptell(void * fp)697 static int64_t cram_ptell(void *fp)
698 {
699 cram_fd *fd = (cram_fd *)fp;
700 cram_container *c;
701 int64_t ret = -1L;
702
703 if (fd && fd->fp) {
704 ret = htell(fd->fp);
705 if ((c = fd->ctr) != NULL) {
706 ret -= ((c->curr_slice < c->max_slice || c->curr_rec < c->num_records) ? c->offset + 1 : 0);
707 }
708 }
709
710 return ret;
711 }
712
bam_pseek(void * fp,int64_t offset,int whence)713 static int bam_pseek(void *fp, int64_t offset, int whence)
714 {
715 BGZF *fd = (BGZF *)fp;
716
717 return bgzf_seek(fd, offset, whence);
718 }
719
bam_ptell(void * fp)720 static int64_t bam_ptell(void *fp)
721 {
722 BGZF *fd = (BGZF *)fp;
723 if (!fd)
724 return -1L;
725
726 return bgzf_tell(fd);
727 }
728
729 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)730 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
731 {
732 htsFile *fp = fpv;
733 bam1_t *b = bv;
734 switch (fp->format.format) {
735 case bam: return bam_read1(bgzfp, b);
736 case cram: {
737 int ret = cram_get_bam_seq(fp->fp.cram, &b);
738 if (ret < 0)
739 return cram_eof(fp->fp.cram) ? -1 : -2;
740
741 if (bam_tag2cigar(b, 1, 1) < 0)
742 return -2;
743 return ret;
744 }
745 default:
746 // TODO Need headers available to implement this for SAM files
747 hts_log_error("Not implemented for SAM files");
748 abort();
749 }
750 }
751
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)752 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
753 {
754 switch (fp->format.format) {
755 case bam:
756 return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
757
758 case cram: {
759 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
760 // Cons up a fake "index" just pointing at the associated cram_fd:
761 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
762 if (idx == NULL) return NULL;
763 idx->fmt = HTS_FMT_CRAI;
764 idx->cram = fp->fp.cram;
765 return (hts_idx_t *) idx;
766 }
767
768 default:
769 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
770 }
771 }
772
sam_index_load(htsFile * fp,const char * fn)773 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
774 {
775 return sam_index_load2(fp, fn, NULL);
776 }
777
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)778 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
779 {
780 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
781 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
782 if (iter == NULL) return NULL;
783
784 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
785 // the readrec function:
786 iter->is_cram = 1;
787 iter->read_rest = 1;
788 iter->off = NULL;
789 iter->bins.a = NULL;
790 iter->readrec = readrec;
791
792 if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
793 cram_range r = { tid, beg+1, end };
794 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
795
796 iter->curr_off = 0;
797 // The following fields are not required by hts_itr_next(), but are
798 // filled in in case user code wants to look at them.
799 iter->tid = tid;
800 iter->beg = beg;
801 iter->end = end;
802
803 switch (ret) {
804 case 0:
805 break;
806
807 case -2:
808 // No data vs this ref, so mark iterator as completed.
809 // Same as HTS_IDX_NONE.
810 iter->finished = 1;
811 break;
812
813 default:
814 free(iter);
815 return NULL;
816 }
817 }
818 else switch (tid) {
819 case HTS_IDX_REST:
820 iter->curr_off = 0;
821 break;
822 case HTS_IDX_NONE:
823 iter->curr_off = 0;
824 iter->finished = 1;
825 break;
826 default:
827 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
828 abort();
829 break;
830 }
831
832 return iter;
833 }
834
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)835 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
836 {
837 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
838 if (idx == NULL)
839 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
840 else if (cidx->fmt == HTS_FMT_CRAI)
841 return cram_itr_query(idx, tid, beg, end, cram_readrec);
842 else
843 return hts_itr_query(idx, tid, beg, end, bam_readrec);
844 }
845
cram_name2id(void * fdv,const char * ref)846 static int cram_name2id(void *fdv, const char *ref)
847 {
848 cram_fd *fd = (cram_fd *) fdv;
849 return sam_hdr_name2ref(fd->header, ref);
850 }
851
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)852 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
853 {
854 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
855 if (cidx->fmt == HTS_FMT_CRAI)
856 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
857 else
858 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
859 }
860
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)861 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
862 {
863 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
864 if (cidx->fmt == HTS_FMT_CRAI)
865 return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
866 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
867 else
868 return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
869 hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
870 }
871
872 /**********************
873 *** SAM header I/O ***
874 **********************/
875
876 #include "htslib/kseq.h"
877 #include "htslib/kstring.h"
878
sam_hdr_parse(int l_text,const char * text)879 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
880 {
881 const char *q, *r, *p;
882 khash_t(s2i) *d;
883 d = kh_init(s2i);
884 for (p = text; *p; ++p) {
885 if (strncmp(p, "@SQ\t", 4) == 0) {
886 char *sn = 0;
887 int ln = -1;
888 for (q = p + 4;; ++q) {
889 if (strncmp(q, "SN:", 3) == 0) {
890 q += 3;
891 for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
892 sn = (char*)calloc(r - q + 1, 1);
893 strncpy(sn, q, r - q);
894 q = r;
895 } else if (strncmp(q, "LN:", 3) == 0)
896 ln = strtol(q + 3, (char**)&q, 10);
897 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
898 if (*q == '\0' || *q == '\n') break;
899 }
900 p = q;
901 if (sn && ln >= 0) {
902 khint_t k;
903 int absent;
904 k = kh_put(s2i, d, sn, &absent);
905 if (!absent) {
906 hts_log_warning("Duplicated sequence '%s'", sn);
907 free(sn);
908 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
909 }
910 }
911 while (*p != '\0' && *p != '\n') ++p;
912 }
913 return hdr_from_dict(d);
914 }
915
916 // Minimal sanitisation of a header to ensure.
917 // - null terminated string.
918 // - all lines start with @ (also implies no blank lines).
919 //
920 // Much more could be done, but currently is not, including:
921 // - checking header types are known (HD, SQ, etc).
922 // - syntax (eg checking tab separated fields).
923 // - validating n_targets matches @SQ records.
924 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)925 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
926 if (!h)
927 return NULL;
928
929 // Special case for empty headers.
930 if (h->l_text == 0)
931 return h;
932
933 uint32_t i, lnum = 0;
934 char *cp = h->text, last = '\n';
935 for (i = 0; i < h->l_text; i++) {
936 // NB: l_text excludes terminating nul. This finds early ones.
937 if (cp[i] == 0)
938 break;
939
940 // Error on \n[^@], including duplicate newlines
941 if (last == '\n') {
942 lnum++;
943 if (cp[i] != '@') {
944 hts_log_error("Malformed SAM header at line %u", lnum);
945 bam_hdr_destroy(h);
946 return NULL;
947 }
948 }
949
950 last = cp[i];
951 }
952
953 if (i < h->l_text) { // Early nul found. Complain if not just padding.
954 uint32_t j = i;
955 while (j < h->l_text && cp[j] == '\0') j++;
956 if (j < h->l_text)
957 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
958 }
959
960 // Add trailing newline and/or trailing nul if required.
961 if (last != '\n') {
962 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
963
964 if (h->l_text == UINT32_MAX) {
965 hts_log_error("No room for extra newline");
966 bam_hdr_destroy(h);
967 return NULL;
968 }
969
970 if (i >= h->l_text - 1) {
971 cp = realloc(h->text, (size_t) h->l_text+2);
972 if (!cp) {
973 bam_hdr_destroy(h);
974 return NULL;
975 }
976 h->text = cp;
977 }
978 cp[i++] = '\n';
979
980 // l_text may be larger already due to multiple nul padding
981 if (h->l_text < i)
982 h->l_text = i;
983 cp[h->l_text] = '\0';
984 }
985
986 return h;
987 }
988
sam_hdr_read(htsFile * fp)989 bam_hdr_t *sam_hdr_read(htsFile *fp)
990 {
991 switch (fp->format.format) {
992 case bam:
993 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
994
995 case cram:
996 return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
997
998 case sam: {
999 kstring_t str = { 0, 0, NULL };
1000 bam_hdr_t *h = NULL;
1001 int ret, has_SQ = 0;
1002 while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1003 if (fp->line.s[0] != '@') break;
1004 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1005 kputsn(fp->line.s, fp->line.l, &str);
1006 kputc('\n', &str);
1007 }
1008 if (ret < -1) goto error;
1009 if (! has_SQ && fp->fn_aux) {
1010 kstring_t line = { 0, 0, NULL };
1011 hFILE *f = hopen(fp->fn_aux, "r");
1012 if (f == NULL) goto error;
1013 while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1014 char *tab = strchr(line.s, '\t');
1015 if (tab == NULL) continue;
1016 kputs("@SQ\tSN:", &str);
1017 kputsn(line.s, tab - line.s, &str);
1018 kputs("\tLN:", &str);
1019 kputl(atol(tab), &str);
1020 kputc('\n', &str);
1021 }
1022 free(line.s);
1023 if (hclose(f) != 0) {
1024 hts_log_warning("Failed to close %s", fp->fn_aux);
1025 }
1026 }
1027 if (str.l == 0) kputsn("", 0, &str);
1028 h = sam_hdr_parse(str.l, str.s);
1029 h->l_text = str.l; h->text = str.s;
1030 return sam_hdr_sanitise(h);
1031
1032 error:
1033 bam_hdr_destroy(h);
1034 free(str.s);
1035 return NULL;
1036 }
1037
1038 default:
1039 abort();
1040 }
1041 }
1042
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1043 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1044 {
1045 if (!h) {
1046 errno = EINVAL;
1047 return -1;
1048 }
1049
1050 switch (fp->format.format) {
1051 case binary_format:
1052 fp->format.category = sequence_data;
1053 fp->format.format = bam;
1054 /* fall-through */
1055 case bam:
1056 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1057 break;
1058
1059 case cram: {
1060 cram_fd *fd = fp->fp.cram;
1061 SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1062 if (! hdr) return -1;
1063 if (cram_set_header(fd, hdr) < 0) return -1;
1064 if (fp->fn_aux)
1065 cram_load_reference(fd, fp->fn_aux);
1066 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1067 }
1068 break;
1069
1070 case text_format:
1071 fp->format.category = sequence_data;
1072 fp->format.format = sam;
1073 /* fall-through */
1074 case sam: {
1075 char *p;
1076 hputs(h->text, fp->fp.hfile);
1077 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1078 if (p == 0) {
1079 int i;
1080 for (i = 0; i < h->n_targets; ++i) {
1081 fp->line.l = 0;
1082 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1083 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1084 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1085 }
1086 }
1087 if ( hflush(fp->fp.hfile) != 0 ) return -1;
1088 }
1089 break;
1090
1091 default:
1092 abort();
1093 }
1094 return 0;
1095 }
1096
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1097 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1098 {
1099 char *p, *q, *beg = NULL, *end = NULL, *newtext;
1100 if (!h || !key)
1101 return -1;
1102
1103 if (h->l_text > 3) {
1104 if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1105 if ((p = strchr(h->text, '\n')) == 0) return -1;
1106 *p = '\0'; // for strstr call
1107
1108 char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1109
1110 if ((q = strstr(h->text, tmp)) != 0) { // key exists
1111 *p = '\n'; // change back
1112
1113 // mark the key:val
1114 beg = q;
1115 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1116 end = q;
1117
1118 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1119 && strlen(val) == end - beg - 4)
1120 return 0; // val is the same, no need to change
1121
1122 } else {
1123 beg = end = p;
1124 *p = '\n';
1125 }
1126 }
1127 }
1128 if (beg == NULL) { // no @HD
1129 if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1130 return -1;
1131 h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1132 if (val) {
1133 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1134 return -1;
1135 h->l_text += strlen(val) + 4;
1136 }
1137 newtext = (char*)malloc(h->l_text + 1);
1138 if (!newtext) return -1;
1139
1140 if (val)
1141 snprintf(newtext, h->l_text + 1,
1142 "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1143 else
1144 snprintf(newtext, h->l_text + 1,
1145 "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1146 } else { // has @HD but different or no key
1147 h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1148 if (val) {
1149 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1150 return -1;
1151 h->l_text += strlen(val) + 4;
1152 }
1153 newtext = (char*)malloc(h->l_text + 1);
1154 if (!newtext) return -1;
1155
1156 if (val) {
1157 snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1158 (int) (beg - h->text), h->text, key, val, end);
1159 } else { //delete key
1160 snprintf(newtext, h->l_text + 1, "%.*s%s",
1161 (int) (beg - h->text), h->text, end);
1162 }
1163 }
1164 free(h->text);
1165 h->text = newtext;
1166 return 0;
1167 }
1168
1169 /**********************
1170 *** SAM record I/O ***
1171 **********************/
1172
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1173 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1174 {
1175 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1176 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1177 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1178 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1179 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1180 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1181
1182 uint8_t *t;
1183 char *p = s->s, *q;
1184 int i;
1185 kstring_t str;
1186 bam1_core_t *c = &b->core;
1187
1188 str.l = b->l_data = 0;
1189 str.s = (char*)b->data; str.m = b->m_data;
1190 memset(c, 0, 32);
1191 if (h->cigar_tab == 0) {
1192 h->cigar_tab = (int8_t*) malloc(128);
1193 for (i = 0; i < 128; ++i)
1194 h->cigar_tab[i] = -1;
1195 for (i = 0; BAM_CIGAR_STR[i]; ++i)
1196 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1197 }
1198 // qname
1199 q = _read_token(p);
1200 _parse_warn(p - q <= 1, "empty query name");
1201 _parse_err(p - q > 252, "query name too long");
1202 kputsn_(q, p - q, &str);
1203 for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1204 kputc_('\0', &str);
1205 c->l_qname = p - q + c->l_extranul;
1206 // flag
1207 c->flag = strtol(p, &p, 0);
1208 if (*p++ != '\t') goto err_ret; // malformated flag
1209 // chr
1210 q = _read_token(p);
1211 if (strcmp(q, "*")) {
1212 _parse_err(h->n_targets == 0, "missing SAM header");
1213 c->tid = bam_name2id(h, q);
1214 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1215 } else c->tid = -1;
1216 // pos
1217 c->pos = strtol(p, &p, 10) - 1;
1218 if (*p++ != '\t') goto err_ret;
1219 if (c->pos < 0 && c->tid >= 0) {
1220 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1221 c->tid = -1;
1222 }
1223 if (c->tid < 0) c->flag |= BAM_FUNMAP;
1224 // mapq
1225 c->qual = strtol(p, &p, 10);
1226 if (*p++ != '\t') goto err_ret;
1227 // cigar
1228 if (*p != '*') {
1229 uint32_t *cigar;
1230 size_t n_cigar = 0;
1231 for (q = p; *p && *p != '\t'; ++p)
1232 if (!isdigit_c(*p)) ++n_cigar;
1233 if (*p++ != '\t') goto err_ret;
1234 _parse_err(n_cigar == 0, "no CIGAR operations");
1235 _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1236 c->n_cigar = n_cigar;
1237 _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1238 for (i = 0; i < c->n_cigar; ++i, ++q) {
1239 int op;
1240 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1241 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1242 _parse_err(op < 0, "unrecognized CIGAR operator");
1243 cigar[i] |= op;
1244 }
1245 // can't use bam_endpos() directly as some fields not yet set up
1246 i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1247 } else {
1248 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1249 c->flag |= BAM_FUNMAP;
1250 q = _read_token(p);
1251 i = 1;
1252 }
1253 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1254 // mate chr
1255 q = _read_token(p);
1256 if (strcmp(q, "=") == 0) {
1257 c->mtid = c->tid;
1258 } else if (strcmp(q, "*") == 0) {
1259 c->mtid = -1;
1260 } else {
1261 c->mtid = bam_name2id(h, q);
1262 _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1263 }
1264 // mpos
1265 c->mpos = strtol(p, &p, 10) - 1;
1266 if (*p++ != '\t') goto err_ret;
1267 if (c->mpos < 0 && c->mtid >= 0) {
1268 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1269 c->mtid = -1;
1270 }
1271 // tlen
1272 c->isize = strtol(p, &p, 10);
1273 if (*p++ != '\t') goto err_ret;
1274 // seq
1275 q = _read_token(p);
1276 if (strcmp(q, "*")) {
1277 c->l_qseq = p - q - 1;
1278 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1279 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1280 i = (c->l_qseq + 1) >> 1;
1281 _get_mem(uint8_t, &t, &str, i);
1282 memset(t, 0, i);
1283 for (i = 0; i < c->l_qseq; ++i)
1284 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
1285 } else c->l_qseq = 0;
1286 // qual
1287 q = _read_token_aux(p);
1288 _get_mem(uint8_t, &t, &str, c->l_qseq);
1289 if (strcmp(q, "*")) {
1290 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1291 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1292 } else memset(t, 0xff, c->l_qseq);
1293 // aux
1294 while (p < s->s + s->l) {
1295 uint8_t type;
1296 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1297 _parse_err(p - q - 1 < 5, "incomplete aux field");
1298 kputsn_(q, 2, &str);
1299 q += 3; type = *q++; ++q; // q points to value
1300 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1301 _parse_err(p - q - 1 < 1, "incomplete aux field");
1302
1303 // Ensure str has enough space for a double + type allocated.
1304 // This is so we can stuff bigger integers and floats directly into
1305 // the kstring. Sorry.
1306 _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1307
1308 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1309 kputc_('A', &str);
1310 kputc_(*q, &str);
1311 } else if (type == 'i' || type == 'I') {
1312 if (*q == '-') {
1313 long x = strtol(q, &q, 10);
1314 if (x >= INT8_MIN) {
1315 kputc_('c', &str); kputc_(x, &str);
1316 } else if (x >= INT16_MIN) {
1317 str.s[str.l++] = 's';
1318 i16_to_le(x, (uint8_t *) str.s + str.l);
1319 str.l += 2;
1320 } else {
1321 str.s[str.l++] = 'i';
1322 i32_to_le(x, (uint8_t *) str.s + str.l);
1323 str.l += 4;
1324 }
1325 } else {
1326 unsigned long x = strtoul(q, &q, 10);
1327 if (x <= UINT8_MAX) {
1328 kputc_('C', &str); kputc_(x, &str);
1329 } else if (x <= UINT16_MAX) {
1330 str.s[str.l++] = 'S';
1331 u16_to_le(x, (uint8_t *) str.s + str.l);
1332 str.l += 2;
1333 } else {
1334 str.s[str.l++] = 'I';
1335 u32_to_le(x, (uint8_t *) str.s + str.l);
1336 str.l += 4;
1337 }
1338 }
1339 } else if (type == 'f') {
1340 str.s[str.l++] = 'f';
1341 float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1342 str.l += sizeof(float);
1343 } else if (type == 'd') {
1344 str.s[str.l++] = 'd';
1345 double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1346 str.l += sizeof(double);
1347 } else if (type == 'Z' || type == 'H') {
1348 _parse_err(type == 'H' && !((p-q)&1),
1349 "hex field does not have an even number of digits");
1350 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1351 } else if (type == 'B') {
1352 int32_t n, size;
1353 size_t bytes;
1354 char *r;
1355 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1356 type = *q++; // q points to the first ',' following the typing byte
1357
1358 size = aux_type2size(type);
1359 _parse_err_param(size <= 0 || size > 4,
1360 "unrecognized type B:%c", type);
1361 _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1362
1363 for (r = q, n = 0; *r; ++r)
1364 if (*r == ',') ++n;
1365
1366 // Ensure space for type + values
1367 bytes = (size_t) n * (size_t) size;
1368 _parse_err(bytes / size != n
1369 || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1370 "out of memory");
1371 str.s[str.l++] = 'B';
1372 str.s[str.l++] = type;
1373 i32_to_le(n, (uint8_t *) str.s + str.l);
1374 str.l += sizeof(uint32_t);
1375
1376 // This ensures that q always ends up at the next comma after
1377 // reading a number even if it's followed by junk. It
1378 // prevents the possibility of trying to read more than n items.
1379 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1380
1381 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1382 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1383 else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1384 else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1385 else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1386 else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1387 else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1388 else _parse_err_param(1, "unrecognized type B:%c", type);
1389
1390 #undef _skip_to_comma
1391
1392 } else _parse_err_param(1, "unrecognized type %c", type);
1393 }
1394 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1395 if (bam_tag2cigar(b, 1, 1) < 0)
1396 return -2;
1397 return 0;
1398
1399 #undef _parse_warn
1400 #undef _parse_err
1401 #undef _parse_err_param
1402 #undef _get_mem
1403 #undef _read_token_aux
1404 #undef _read_token
1405 err_ret:
1406 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1407 return -2;
1408 }
1409
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1410 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1411 {
1412 switch (fp->format.format) {
1413 case bam: {
1414 int r = bam_read1(fp->fp.bgzf, b);
1415 if (r >= 0) {
1416 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
1417 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1418 return -3;
1419 }
1420 return r;
1421 }
1422
1423 case cram: {
1424 int ret = cram_get_bam_seq(fp->fp.cram, &b);
1425 if (ret < 0)
1426 return cram_eof(fp->fp.cram) ? -1 : -2;
1427
1428 if (bam_tag2cigar(b, 1, 1) < 0)
1429 return -2;
1430 return ret;
1431 }
1432
1433 case sam: {
1434 int ret;
1435 err_recover:
1436 if (fp->line.l == 0) {
1437 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1438 if (ret < 0) return ret;
1439 }
1440 ret = sam_parse1(&fp->line, h, b);
1441 fp->line.l = 0;
1442 if (ret < 0) {
1443 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1444 if (h->ignore_sam_err) goto err_recover;
1445 }
1446 return ret;
1447 }
1448
1449 default:
1450 abort();
1451 }
1452 }
1453
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1454 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1455 {
1456 int i;
1457 uint8_t *s, *end;
1458 const bam1_core_t *c = &b->core;
1459
1460 str->l = 0;
1461 kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1462 kputw(c->flag, str); kputc('\t', str); // flag
1463 if (c->tid >= 0) { // chr
1464 kputs(h->target_name[c->tid] , str);
1465 kputc('\t', str);
1466 } else kputsn("*\t", 2, str);
1467 kputw(c->pos + 1, str); kputc('\t', str); // pos
1468 kputw(c->qual, str); kputc('\t', str); // qual
1469 if (c->n_cigar) { // cigar
1470 uint32_t *cigar = bam_get_cigar(b);
1471 for (i = 0; i < c->n_cigar; ++i) {
1472 kputw(bam_cigar_oplen(cigar[i]), str);
1473 kputc(bam_cigar_opchr(cigar[i]), str);
1474 }
1475 } else kputc('*', str);
1476 kputc('\t', str);
1477 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1478 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1479 else {
1480 kputs(h->target_name[c->mtid], str);
1481 kputc('\t', str);
1482 }
1483 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1484 kputw(c->isize, str); kputc('\t', str); // template len
1485 if (c->l_qseq) { // seq and qual
1486 uint8_t *s = bam_get_seq(b);
1487 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1488 kputc('\t', str);
1489 s = bam_get_qual(b);
1490 if (s[0] == 0xff) kputc('*', str);
1491 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1492 } else kputsn("*\t*", 3, str);
1493
1494 s = bam_get_aux(b); // aux
1495 end = b->data + b->l_data;
1496 while (end - s >= 4) {
1497 uint8_t type, key[2];
1498 key[0] = s[0]; key[1] = s[1];
1499 s += 2; type = *s++;
1500 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1501 if (type == 'A') {
1502 kputsn("A:", 2, str);
1503 kputc(*s, str);
1504 ++s;
1505 } else if (type == 'C') {
1506 kputsn("i:", 2, str);
1507 kputw(*s, str);
1508 ++s;
1509 } else if (type == 'c') {
1510 kputsn("i:", 2, str);
1511 kputw(*(int8_t*)s, str);
1512 ++s;
1513 } else if (type == 'S') {
1514 if (end - s >= 2) {
1515 kputsn("i:", 2, str);
1516 kputuw(le_to_u16(s), str);
1517 s += 2;
1518 } else goto bad_aux;
1519 } else if (type == 's') {
1520 if (end - s >= 2) {
1521 kputsn("i:", 2, str);
1522 kputw(le_to_i16(s), str);
1523 s += 2;
1524 } else goto bad_aux;
1525 } else if (type == 'I') {
1526 if (end - s >= 4) {
1527 kputsn("i:", 2, str);
1528 kputuw(le_to_u32(s), str);
1529 s += 4;
1530 } else goto bad_aux;
1531 } else if (type == 'i') {
1532 if (end - s >= 4) {
1533 kputsn("i:", 2, str);
1534 kputw(le_to_i32(s), str);
1535 s += 4;
1536 } else goto bad_aux;
1537 } else if (type == 'f') {
1538 if (end - s >= 4) {
1539 ksprintf(str, "f:%g", le_to_float(s));
1540 s += 4;
1541 } else goto bad_aux;
1542
1543 } else if (type == 'd') {
1544 if (end - s >= 8) {
1545 ksprintf(str, "d:%g", le_to_double(s));
1546 s += 8;
1547 } else goto bad_aux;
1548 } else if (type == 'Z' || type == 'H') {
1549 kputc(type, str); kputc(':', str);
1550 while (s < end && *s) kputc(*s++, str);
1551 if (s >= end)
1552 goto bad_aux;
1553 ++s;
1554 } else if (type == 'B') {
1555 uint8_t sub_type = *(s++);
1556 int sub_type_size = aux_type2size(sub_type);
1557 uint32_t n;
1558 if (sub_type_size == 0 || end - s < 4)
1559 goto bad_aux;
1560 n = le_to_u32(s);
1561 s += 4; // now points to the start of the array
1562 if ((end - s) / sub_type_size < n)
1563 goto bad_aux;
1564 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1565 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1566 kputc(',', str);
1567 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
1568 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1569 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1570 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1571 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1572 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1573 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1574 else goto bad_aux; // Unknown sub-type
1575 }
1576 } else { // Unknown type
1577 goto bad_aux;
1578 }
1579 }
1580 return str->l;
1581
1582 bad_aux:
1583 hts_log_error("Corrupted aux data for read %.*s",
1584 b->core.l_qname, bam_get_qname(b));
1585 errno = EINVAL;
1586 return -1;
1587 }
1588
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1589 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1590 {
1591 switch (fp->format.format) {
1592 case binary_format:
1593 fp->format.category = sequence_data;
1594 fp->format.format = bam;
1595 /* fall-through */
1596 case bam:
1597 return bam_write1(fp->fp.bgzf, b);
1598
1599 case cram:
1600 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1601
1602 case text_format:
1603 fp->format.category = sequence_data;
1604 fp->format.format = sam;
1605 /* fall-through */
1606 case sam:
1607 if (sam_format1(h, b, &fp->line) < 0) return -1;
1608 kputc('\n', &fp->line);
1609 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1610 return fp->line.l;
1611
1612 default:
1613 abort();
1614 }
1615 }
1616
1617 /************************
1618 *** Auxiliary fields ***
1619 ************************/
1620 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1621 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1622 int tsz = aux_type2size(type);
1623
1624 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1625
1626 switch (tsz) {
1627 case 'H': case 'Z': case 1: // Trivial
1628 memcpy(out, in, len);
1629 break;
1630
1631 #define aux_val_to_le(type_t, store_le) do { \
1632 type_t v; \
1633 size_t i; \
1634 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1635 memcpy(&v, in + i, sizeof(type_t)); \
1636 store_le(v, out); \
1637 } \
1638 } while (0)
1639
1640 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1641 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1642 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1643
1644 #undef aux_val_to_le
1645
1646 case 'B': { // Recurse!
1647 uint32_t n;
1648 if (len < 5) return -1;
1649 memcpy(&n, in + 1, 4);
1650 out[0] = in[0];
1651 u32_to_le(n, out + 1);
1652 return aux_to_le(in[0], out + 5, in + 5, len - 5);
1653 }
1654
1655 default: // Unknown type code
1656 return -1;
1657 }
1658
1659
1660
1661 return 0;
1662 }
1663 #endif
1664
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1665 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1666 {
1667 uint32_t new_len;
1668
1669 assert(b->l_data >= 0);
1670 new_len = b->l_data + 3 + len;
1671 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1672
1673 if (b->m_data < new_len) {
1674 uint32_t new_size = new_len;
1675 uint8_t *new_data;
1676 kroundup32(new_size);
1677 new_data = realloc(b->data, new_size);
1678 if (new_data == NULL) goto nomem;
1679 b->m_data = new_size;
1680 b->data = new_data;
1681 }
1682
1683 b->data[b->l_data] = tag[0];
1684 b->data[b->l_data + 1] = tag[1];
1685 b->data[b->l_data + 2] = type;
1686
1687 #ifdef HTS_LITTLE_ENDIAN
1688 memcpy(b->data + b->l_data + 3, data, len);
1689 #else
1690 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1691 errno = EINVAL;
1692 return -1;
1693 }
1694 #endif
1695
1696 b->l_data = new_len;
1697
1698 return 0;
1699
1700 nomem:
1701 errno = ENOMEM;
1702 return -1;
1703 }
1704
skip_aux(uint8_t * s,uint8_t * end)1705 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1706 {
1707 int size;
1708 uint32_t n;
1709 if (s >= end) return end;
1710 size = aux_type2size(*s); ++s; // skip type
1711 switch (size) {
1712 case 'Z':
1713 case 'H':
1714 while (*s && s < end) ++s;
1715 return s < end ? s + 1 : end;
1716 case 'B':
1717 if (end - s < 5) return NULL;
1718 size = aux_type2size(*s); ++s;
1719 n = le_to_u32(s);
1720 s += 4;
1721 if (size == 0 || end - s < size * n) return NULL;
1722 return s + size * n;
1723 case 0:
1724 return NULL;
1725 default:
1726 if (end - s < size) return NULL;
1727 return s + size;
1728 }
1729 }
1730
bam_aux_get(const bam1_t * b,const char tag[2])1731 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1732 {
1733 uint8_t *s, *end, *t = (uint8_t *) tag;
1734 uint16_t y = (uint16_t) t[0]<<8 | t[1];
1735 s = bam_get_aux(b);
1736 end = b->data + b->l_data;
1737 while (s != NULL && end - s >= 3) {
1738 uint16_t x = (uint16_t) s[0]<<8 | s[1];
1739 s += 2;
1740 if (x == y) {
1741 // Check the tag value is valid and complete
1742 uint8_t *e = skip_aux(s, end);
1743 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1744 goto bad_aux; // Unterminated string
1745 }
1746 if (e != NULL) {
1747 return s;
1748 } else {
1749 goto bad_aux;
1750 }
1751 }
1752 s = skip_aux(s, end);
1753 }
1754 if (s == NULL) goto bad_aux;
1755 errno = ENOENT;
1756 return NULL;
1757
1758 bad_aux:
1759 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1760 errno = EINVAL;
1761 return NULL;
1762 }
1763 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1764 int bam_aux_del(bam1_t *b, uint8_t *s)
1765 {
1766 uint8_t *p, *aux;
1767 int l_aux = bam_get_l_aux(b);
1768 aux = bam_get_aux(b);
1769 p = s - 2;
1770 s = skip_aux(s, aux + l_aux);
1771 if (s == NULL) goto bad_aux;
1772 memmove(p, s, l_aux - (s - aux));
1773 b->l_data -= s - p;
1774 return 0;
1775
1776 bad_aux:
1777 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1778 errno = EINVAL;
1779 return -1;
1780 }
1781
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1782 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1783 {
1784 // FIXME: This is not at all efficient!
1785 uint8_t *s = bam_aux_get(b,tag);
1786 if (!s) {
1787 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1788 return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1789 } else { // Invalid aux data, give up.
1790 return -1;
1791 }
1792 }
1793 char type = *s;
1794 if (type != 'Z') {
1795 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1796 errno = EINVAL;
1797 return -1;
1798 }
1799
1800 bam_aux_del(b,s);
1801 s -= 2;
1802 int l_aux = bam_get_l_aux(b);
1803
1804 b->l_data += 3 + len;
1805 if (b->m_data < b->l_data) {
1806 ptrdiff_t s_offset = s - b->data;
1807 b->m_data = b->l_data;
1808 kroundup32(b->m_data);
1809 b->data = (uint8_t*)realloc(b->data, b->m_data);
1810 s = b->data + s_offset;
1811 }
1812 memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1813 s[0] = tag[0];
1814 s[1] = tag[1];
1815 s[2] = type;
1816 memmove(s+3,data,len);
1817 return 0;
1818 }
1819
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1820 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1821 uint32_t idx)
1822 {
1823 switch (type) {
1824 case 'c': return le_to_i8(s + idx);
1825 case 'C': return s[idx];
1826 case 's': return le_to_i16(s + 2 * idx);
1827 case 'S': return le_to_u16(s + 2 * idx);
1828 case 'i': return le_to_i32(s + 4 * idx);
1829 case 'I': return le_to_u32(s + 4 * idx);
1830 default:
1831 errno = EINVAL;
1832 return 0;
1833 }
1834 }
1835
bam_aux2i(const uint8_t * s)1836 int64_t bam_aux2i(const uint8_t *s)
1837 {
1838 int type;
1839 type = *s++;
1840 return get_int_aux_val(type, s, 0);
1841 }
1842
bam_aux2f(const uint8_t * s)1843 double bam_aux2f(const uint8_t *s)
1844 {
1845 int type;
1846 type = *s++;
1847 if (type == 'd') return le_to_double(s);
1848 else if (type == 'f') return le_to_float(s);
1849 else return get_int_aux_val(type, s, 0);
1850 }
1851
bam_aux2A(const uint8_t * s)1852 char bam_aux2A(const uint8_t *s)
1853 {
1854 int type;
1855 type = *s++;
1856 if (type == 'A') return *(char*)s;
1857 errno = EINVAL;
1858 return 0;
1859 }
1860
bam_aux2Z(const uint8_t * s)1861 char *bam_aux2Z(const uint8_t *s)
1862 {
1863 int type;
1864 type = *s++;
1865 if (type == 'Z' || type == 'H') return (char*)s;
1866 errno = EINVAL;
1867 return 0;
1868 }
1869
bam_auxB_len(const uint8_t * s)1870 uint32_t bam_auxB_len(const uint8_t *s)
1871 {
1872 if (s[0] != 'B') {
1873 errno = EINVAL;
1874 return 0;
1875 }
1876 return le_to_u32(s + 2);
1877 }
1878
bam_auxB2i(const uint8_t * s,uint32_t idx)1879 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
1880 {
1881 uint32_t len = bam_auxB_len(s);
1882 if (idx >= len) {
1883 errno = ERANGE;
1884 return 0;
1885 }
1886 return get_int_aux_val(s[1], s + 6, idx);
1887 }
1888
bam_auxB2f(const uint8_t * s,uint32_t idx)1889 double bam_auxB2f(const uint8_t *s, uint32_t idx)
1890 {
1891 uint32_t len = bam_auxB_len(s);
1892 if (idx >= len) {
1893 errno = ERANGE;
1894 return 0.0;
1895 }
1896 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
1897 else return get_int_aux_val(s[1], s + 6, idx);
1898 }
1899
sam_open_mode(char * mode,const char * fn,const char * format)1900 int sam_open_mode(char *mode, const char *fn, const char *format)
1901 {
1902 // TODO Parse "bam5" etc for compression level
1903 if (format == NULL) {
1904 // Try to pick a format based on the filename extension
1905 const char *ext = fn? strrchr(fn, '.') : NULL;
1906 if (ext == NULL || strchr(ext, '/')) return -1;
1907 return sam_open_mode(mode, fn, ext+1);
1908 }
1909 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1910 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1911 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1912 else return -1;
1913
1914 return 0;
1915 }
1916
1917 // A version of sam_open_mode that can handle ,key=value options.
1918 // The format string is allocated and returned, to be freed by the caller.
1919 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)1920 char *sam_open_mode_opts(const char *fn,
1921 const char *mode,
1922 const char *format)
1923 {
1924 char *mode_opts = malloc((format ? strlen(format) : 1) +
1925 (mode ? strlen(mode) : 1) + 12);
1926 char *opts, *cp;
1927 int format_len;
1928
1929 if (!mode_opts)
1930 return NULL;
1931
1932 strcpy(mode_opts, mode ? mode : "r");
1933 cp = mode_opts + strlen(mode_opts);
1934
1935 if (format == NULL) {
1936 // Try to pick a format based on the filename extension
1937 const char *ext = fn? strrchr(fn, '.') : NULL;
1938 if (ext == NULL || strchr(ext, '/')) {
1939 free(mode_opts);
1940 return NULL;
1941 }
1942 if (sam_open_mode(cp, fn, ext+1) == 0) {
1943 return mode_opts;
1944 } else {
1945 free(mode_opts);
1946 return NULL;
1947 }
1948 }
1949
1950 if ((opts = strchr(format, ','))) {
1951 format_len = opts-format;
1952 } else {
1953 opts="";
1954 format_len = strlen(format);
1955 }
1956
1957 if (strncmp(format, "bam", format_len) == 0) {
1958 *cp++ = 'b';
1959 } else if (strncmp(format, "cram", format_len) == 0) {
1960 *cp++ = 'c';
1961 } else if (strncmp(format, "cram2", format_len) == 0) {
1962 *cp++ = 'c';
1963 strcpy(cp, ",VERSION=2.1");
1964 cp += 12;
1965 } else if (strncmp(format, "cram3", format_len) == 0) {
1966 *cp++ = 'c';
1967 strcpy(cp, ",VERSION=3.0");
1968 cp += 12;
1969 } else if (strncmp(format, "sam", format_len) == 0) {
1970 ; // format mode=""
1971 } else {
1972 free(mode_opts);
1973 return NULL;
1974 }
1975
1976 strcpy(cp, opts);
1977
1978 return mode_opts;
1979 }
1980
1981 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)1982 int bam_str2flag(const char *str)
1983 {
1984 char *end, *beg = (char*) str;
1985 long int flag = strtol(str, &end, 0);
1986 if ( end!=str ) return flag; // the conversion was successful
1987 flag = 0;
1988 while ( *str )
1989 {
1990 end = beg;
1991 while ( *end && *end!=',' ) end++;
1992 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1993 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1994 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1995 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1996 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1997 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1998 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1999 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
2000 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
2001 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
2002 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
2003 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
2004 else return -1;
2005 if ( !*end ) break;
2006 beg = end + 1;
2007 }
2008 return flag;
2009 }
2010
bam_flag2str(int flag)2011 char *bam_flag2str(int flag)
2012 {
2013 kstring_t str = {0,0,0};
2014 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2015 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2016 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2017 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2018 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2019 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2020 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2021 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2022 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2023 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2024 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2025 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2026 if ( str.l == 0 ) kputsn("", 0, &str);
2027 return str.s;
2028 }
2029
2030
2031 /**************************
2032 *** Pileup and Mpileup ***
2033 **************************/
2034
2035 #if !defined(BAM_NO_PILEUP)
2036
2037 #include <assert.h>
2038
2039 /*******************
2040 *** Memory pool ***
2041 *******************/
2042
2043 typedef struct {
2044 int k, x, y, end;
2045 } cstate_t;
2046
2047 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2048
2049 typedef struct __linkbuf_t {
2050 bam1_t b;
2051 int32_t beg, end;
2052 cstate_t s;
2053 struct __linkbuf_t *next;
2054 bam_pileup_cd cd;
2055 } lbnode_t;
2056
2057 typedef struct {
2058 int cnt, n, max;
2059 lbnode_t **buf;
2060 } mempool_t;
2061
mp_init(void)2062 static mempool_t *mp_init(void)
2063 {
2064 mempool_t *mp;
2065 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2066 return mp;
2067 }
mp_destroy(mempool_t * mp)2068 static void mp_destroy(mempool_t *mp)
2069 {
2070 int k;
2071 for (k = 0; k < mp->n; ++k) {
2072 free(mp->buf[k]->b.data);
2073 free(mp->buf[k]);
2074 }
2075 free(mp->buf);
2076 free(mp);
2077 }
mp_alloc(mempool_t * mp)2078 static inline lbnode_t *mp_alloc(mempool_t *mp)
2079 {
2080 ++mp->cnt;
2081 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2082 else return mp->buf[--mp->n];
2083 }
mp_free(mempool_t * mp,lbnode_t * p)2084 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2085 {
2086 --mp->cnt; p->next = 0; // clear lbnode_t::next here
2087 if (mp->n == mp->max) {
2088 mp->max = mp->max? mp->max<<1 : 256;
2089 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2090 }
2091 mp->buf[mp->n++] = p;
2092 }
2093
2094 /**********************
2095 *** CIGAR resolver ***
2096 **********************/
2097
2098 /* s->k: the index of the CIGAR operator that has just been processed.
2099 s->x: the reference coordinate of the start of s->k
2100 s->y: the query coordiante of the start of s->k
2101 */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2102 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2103 {
2104 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2105 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2106
2107 bam1_t *b = p->b;
2108 bam1_core_t *c = &b->core;
2109 uint32_t *cigar = bam_get_cigar(b);
2110 int k;
2111 // determine the current CIGAR operation
2112 //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2113 if (s->k == -1) { // never processed
2114 if (c->n_cigar == 1) { // just one operation, save a loop
2115 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2116 } else { // find the first match or deletion
2117 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2118 int op = _cop(cigar[k]);
2119 int l = _cln(cigar[k]);
2120 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2121 else if (op == BAM_CREF_SKIP) s->x += l;
2122 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2123 }
2124 assert(k < c->n_cigar);
2125 s->k = k;
2126 }
2127 } else { // the read has been processed before
2128 int op, l = _cln(cigar[s->k]);
2129 if (pos - s->x >= l) { // jump to the next operation
2130 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2131 op = _cop(cigar[s->k+1]);
2132 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2133 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2134 s->x += l;
2135 ++s->k;
2136 } else { // find the next M/D/N/=/X
2137 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2138 s->x += l;
2139 for (k = s->k + 1; k < c->n_cigar; ++k) {
2140 op = _cop(cigar[k]), l = _cln(cigar[k]);
2141 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2142 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2143 }
2144 s->k = k;
2145 }
2146 assert(s->k < c->n_cigar); // otherwise a bug
2147 } // else, do nothing
2148 }
2149 { // collect pileup information
2150 int op, l;
2151 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2152 p->is_del = p->indel = p->is_refskip = 0;
2153 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2154 int op2 = _cop(cigar[s->k+1]);
2155 int l2 = _cln(cigar[s->k+1]);
2156 if (op2 == BAM_CDEL) p->indel = -(int)l2;
2157 else if (op2 == BAM_CINS) p->indel = l2;
2158 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2159 int l3 = 0;
2160 for (k = s->k + 2; k < c->n_cigar; ++k) {
2161 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2162 if (op2 == BAM_CINS) l3 += l2;
2163 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2164 }
2165 if (l3 > 0) p->indel = l3;
2166 }
2167 }
2168 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2169 p->qpos = s->y + (pos - s->x);
2170 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2171 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2172 p->is_refskip = (op == BAM_CREF_SKIP);
2173 } // cannot be other operations; otherwise a bug
2174 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2175 }
2176 return 1;
2177 }
2178
2179 /***********************
2180 *** Pileup iterator ***
2181 ***********************/
2182
2183 // Dictionary of overlapping reads
2184 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2185 typedef khash_t(olap_hash) olap_hash_t;
2186
2187 struct __bam_plp_t {
2188 mempool_t *mp;
2189 lbnode_t *head, *tail;
2190 int32_t tid, pos, max_tid, max_pos;
2191 int is_eof, max_plp, error, maxcnt;
2192 uint64_t id;
2193 bam_pileup1_t *plp;
2194 // for the "auto" interface only
2195 bam1_t *b;
2196 bam_plp_auto_f func;
2197 void *data;
2198 olap_hash_t *overlaps;
2199
2200 // For notification of creation and destruction events
2201 // and associated client-owned pointer.
2202 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2203 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2204 };
2205
bam_plp_init(bam_plp_auto_f func,void * data)2206 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2207 {
2208 bam_plp_t iter;
2209 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2210 iter->mp = mp_init();
2211 iter->head = iter->tail = mp_alloc(iter->mp);
2212 iter->max_tid = iter->max_pos = -1;
2213 iter->maxcnt = 8000;
2214 if (func) {
2215 iter->func = func;
2216 iter->data = data;
2217 iter->b = bam_init1();
2218 }
2219 return iter;
2220 }
2221
bam_plp_init_overlaps(bam_plp_t iter)2222 void bam_plp_init_overlaps(bam_plp_t iter)
2223 {
2224 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
2225 }
2226
bam_plp_destroy(bam_plp_t iter)2227 void bam_plp_destroy(bam_plp_t iter)
2228 {
2229 lbnode_t *p, *pnext;
2230 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2231 for (p = iter->head; p != NULL; p = pnext) {
2232 pnext = p->next;
2233 mp_free(iter->mp, p);
2234 }
2235 mp_destroy(iter->mp);
2236 if (iter->b) bam_destroy1(iter->b);
2237 free(iter->plp);
2238 free(iter);
2239 }
2240
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2241 void bam_plp_constructor(bam_plp_t plp,
2242 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2243 plp->plp_construct = func;
2244 }
2245
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2246 void bam_plp_destructor(bam_plp_t plp,
2247 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2248 plp->plp_destruct = func;
2249 }
2250
2251 //---------------------------------
2252 //--- Tweak overlapping reads
2253 //---------------------------------
2254
2255 /**
2256 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
2257 * cigar_iref2iseq_next() - get the next CMATCH base
2258 * @cigar: pointer to current cigar block (rw)
2259 * @cigar_max: pointer just beyond the last cigar block
2260 * @icig: position within the current cigar block (rw)
2261 * @iseq: position in the sequence (rw)
2262 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2263 *
2264 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2265 */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2266 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2267 {
2268 int pos = *iref;
2269 if ( pos < 0 ) return -1;
2270 *icig = 0;
2271 *iseq = 0;
2272 *iref = 0;
2273 while ( *cigar<cigar_max )
2274 {
2275 int cig = (**cigar) & BAM_CIGAR_MASK;
2276 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2277
2278 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2279 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2280 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2281 {
2282 pos -= ncig;
2283 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2284 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2285 continue;
2286 }
2287 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2288 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2289 {
2290 pos -= ncig;
2291 if ( pos<0 ) pos = 0;
2292 (*cigar)++; *icig = 0; *iref += ncig;
2293 continue;
2294 }
2295 hts_log_error("Unexpected cigar %d", cig);
2296 assert(0);
2297 }
2298 *iseq = -1;
2299 return -1;
2300 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2301 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2302 {
2303 while ( *cigar < cigar_max )
2304 {
2305 int cig = (**cigar) & BAM_CIGAR_MASK;
2306 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2307
2308 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2309 {
2310 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
2311 (*iseq)++; (*icig)++; (*iref)++;
2312 return BAM_CMATCH;
2313 }
2314 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2315 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2316 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2317 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2318 hts_log_error("Unexpected cigar %d", cig);
2319 assert(0);
2320 }
2321 *iseq = -1;
2322 *iref = -1;
2323 return -1;
2324 }
2325
tweak_overlap_quality(bam1_t * a,bam1_t * b)2326 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2327 {
2328 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2329 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2330 int a_icig = 0, a_iseq = 0;
2331 int b_icig = 0, b_iseq = 0;
2332 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2333 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
2334
2335 int iref = b->core.pos;
2336 int a_iref = iref - a->core.pos;
2337 int b_iref = iref - b->core.pos;
2338 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2339 if ( a_ret<0 ) return; // no overlap
2340 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2341 if ( b_ret<0 ) return; // no overlap
2342
2343 #if DBG
2344 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2345 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2346 #endif
2347
2348 while ( 1 )
2349 {
2350 // Increment reference position
2351 while ( a_iref>=0 && a_iref < iref - a->core.pos )
2352 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2353 if ( a_ret<0 ) break; // done
2354 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2355
2356 while ( b_iref>=0 && b_iref < iref - b->core.pos )
2357 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2358 if ( b_ret<0 ) break; // done
2359 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2360
2361 iref++;
2362 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
2363
2364 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2365 {
2366 #if DBG
2367 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2368 #endif
2369 // we are very confident about this base
2370 int qual = a_qual[a_iseq] + b_qual[b_iseq];
2371 a_qual[a_iseq] = qual>200 ? 200 : qual;
2372 b_qual[b_iseq] = 0;
2373 }
2374 else
2375 {
2376 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2377 {
2378 #if DBG
2379 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2380 #endif
2381 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
2382 b_qual[b_iseq] = 0;
2383 }
2384 else
2385 {
2386 #if DBG
2387 fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2388 #endif
2389 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2390 a_qual[a_iseq] = 0;
2391 }
2392 }
2393 }
2394 #if DBG
2395 fprintf(stderr,"\n");
2396 #endif
2397 }
2398
2399 // Fix overlapping reads. Simple soft-clipping did not give good results.
2400 // Lowering qualities of unwanted bases is more selective and works better.
2401 //
overlap_push(bam_plp_t iter,lbnode_t * node)2402 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2403 {
2404 if ( !iter->overlaps ) return;
2405
2406 // mapped mates and paired reads only
2407 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2408
2409 // no overlap possible, unless some wild cigar
2410 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2411
2412 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2413 if ( kitr==kh_end(iter->overlaps) )
2414 {
2415 int ret;
2416 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2417 kh_value(iter->overlaps, kitr) = node;
2418 }
2419 else
2420 {
2421 lbnode_t *a = kh_value(iter->overlaps, kitr);
2422 tweak_overlap_quality(&a->b, &node->b);
2423 kh_del(olap_hash, iter->overlaps, kitr);
2424 assert(a->end-1 == a->s.end);
2425 a->end = bam_endpos(&a->b);
2426 a->s.end = a->end - 1;
2427 }
2428 }
2429
overlap_remove(bam_plp_t iter,const bam1_t * b)2430 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2431 {
2432 if ( !iter->overlaps ) return;
2433
2434 khiter_t kitr;
2435 if ( b )
2436 {
2437 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2438 if ( kitr!=kh_end(iter->overlaps) )
2439 kh_del(olap_hash, iter->overlaps, kitr);
2440 }
2441 else
2442 {
2443 // remove all
2444 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2445 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2446 }
2447 }
2448
2449
2450
2451 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2452 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2453 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2454 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2455 {
2456 if (iter->error) { *_n_plp = -1; return NULL; }
2457 *_n_plp = 0;
2458 if (iter->is_eof && iter->head == iter->tail) return NULL;
2459 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2460 int n_plp = 0;
2461 // write iter->plp at iter->pos
2462 lbnode_t **pptr = &iter->head;
2463 while (*pptr != iter->tail) {
2464 lbnode_t *p = *pptr;
2465 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2466 overlap_remove(iter, &p->b);
2467 if (iter->plp_destruct)
2468 iter->plp_destruct(iter->data, &p->b, &p->cd);
2469 *pptr = p->next; mp_free(iter->mp, p);
2470 }
2471 else {
2472 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2473 if (n_plp == iter->max_plp) { // then double the capacity
2474 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2475 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2476 }
2477 iter->plp[n_plp].b = &p->b;
2478 iter->plp[n_plp].cd = p->cd;
2479 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2480 }
2481 pptr = &(*pptr)->next;
2482 }
2483 }
2484 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2485 // update iter->tid and iter->pos
2486 if (iter->head != iter->tail) {
2487 if (iter->tid > iter->head->b.core.tid) {
2488 hts_log_error("Unsorted input. Pileup aborts");
2489 iter->error = 1;
2490 *_n_plp = -1;
2491 return NULL;
2492 }
2493 }
2494 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2495 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2496 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2497 iter->pos = iter->head->beg; // jump to the next position
2498 } else ++iter->pos; // scan contiguously
2499 // return
2500 if (n_plp) return iter->plp;
2501 if (iter->is_eof && iter->head == iter->tail) break;
2502 }
2503 return NULL;
2504 }
2505
bam_plp_push(bam_plp_t iter,const bam1_t * b)2506 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2507 {
2508 if (iter->error) return -1;
2509 if (b) {
2510 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2511 // Skip only unmapped reads here, any additional filtering must be done in iter->func
2512 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2513 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2514 {
2515 overlap_remove(iter, b);
2516 return 0;
2517 }
2518 bam_copy1(&iter->tail->b, b);
2519 overlap_push(iter, iter->tail);
2520 #ifndef BAM_NO_ID
2521 iter->tail->b.id = iter->id++;
2522 #endif
2523 iter->tail->beg = b->core.pos;
2524 iter->tail->end = bam_endpos(b);
2525 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2526 if (b->core.tid < iter->max_tid) {
2527 hts_log_error("The input is not sorted (chromosomes out of order)");
2528 iter->error = 1;
2529 return -1;
2530 }
2531 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2532 hts_log_error("The input is not sorted (reads out of order)");
2533 iter->error = 1;
2534 return -1;
2535 }
2536 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2537 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2538 if (iter->plp_construct)
2539 iter->plp_construct(iter->data, b, &iter->tail->cd);
2540 iter->tail->next = mp_alloc(iter->mp);
2541 iter->tail = iter->tail->next;
2542 }
2543 } else iter->is_eof = 1;
2544 return 0;
2545 }
2546
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2547 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2548 {
2549 const bam_pileup1_t *plp;
2550 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2551 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2552 else { // no pileup line can be obtained; read alignments
2553 *_n_plp = 0;
2554 if (iter->is_eof) return 0;
2555 int ret;
2556 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2557 if (bam_plp_push(iter, iter->b) < 0) {
2558 *_n_plp = -1;
2559 return 0;
2560 }
2561 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2562 // otherwise no pileup line can be returned; read the next alignment.
2563 }
2564 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2565 bam_plp_push(iter, 0);
2566 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2567 return 0;
2568 }
2569 }
2570
bam_plp_reset(bam_plp_t iter)2571 void bam_plp_reset(bam_plp_t iter)
2572 {
2573 overlap_remove(iter, NULL);
2574 iter->max_tid = iter->max_pos = -1;
2575 iter->tid = iter->pos = 0;
2576 iter->is_eof = 0;
2577 while (iter->head != iter->tail) {
2578 lbnode_t *p = iter->head;
2579 iter->head = p->next;
2580 mp_free(iter->mp, p);
2581 }
2582 }
2583
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2584 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2585 {
2586 iter->maxcnt = maxcnt;
2587 }
2588
2589 /************************
2590 *** Mpileup iterator ***
2591 ************************/
2592
2593 struct __bam_mplp_t {
2594 int n;
2595 uint64_t min, *pos;
2596 bam_plp_t *iter;
2597 int *n_plp;
2598 const bam_pileup1_t **plp;
2599 };
2600
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2601 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2602 {
2603 int i;
2604 bam_mplp_t iter;
2605 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2606 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2607 iter->n_plp = (int*)calloc(n, sizeof(int));
2608 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2609 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2610 iter->n = n;
2611 iter->min = (uint64_t)-1;
2612 for (i = 0; i < n; ++i) {
2613 iter->iter[i] = bam_plp_init(func, data[i]);
2614 iter->pos[i] = iter->min;
2615 }
2616 return iter;
2617 }
2618
bam_mplp_init_overlaps(bam_mplp_t iter)2619 void bam_mplp_init_overlaps(bam_mplp_t iter)
2620 {
2621 int i;
2622 for (i = 0; i < iter->n; ++i)
2623 bam_plp_init_overlaps(iter->iter[i]);
2624 }
2625
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2626 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2627 {
2628 int i;
2629 for (i = 0; i < iter->n; ++i)
2630 iter->iter[i]->maxcnt = maxcnt;
2631 }
2632
bam_mplp_destroy(bam_mplp_t iter)2633 void bam_mplp_destroy(bam_mplp_t iter)
2634 {
2635 int i;
2636 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2637 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2638 free(iter);
2639 }
2640
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2641 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2642 {
2643 int i, ret = 0;
2644 uint64_t new_min = (uint64_t)-1;
2645 for (i = 0; i < iter->n; ++i) {
2646 if (iter->pos[i] == iter->min) {
2647 int tid, pos;
2648 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2649 if ( iter->iter[i]->error ) return -1;
2650 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2651 }
2652 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2653 }
2654 iter->min = new_min;
2655 if (new_min == (uint64_t)-1) return 0;
2656 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2657 for (i = 0; i < iter->n; ++i) {
2658 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2659 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2660 ++ret;
2661 } else n_plp[i] = 0, plp[i] = 0;
2662 }
2663 return ret;
2664 }
2665
bam_mplp_reset(bam_mplp_t iter)2666 void bam_mplp_reset(bam_mplp_t iter)
2667 {
2668 int i;
2669 iter->min = (uint64_t)-1;
2670 for (i = 0; i < iter->n; ++i) {
2671 bam_plp_reset(iter->iter[i]);
2672 iter->pos[i] = (uint64_t)-1;
2673 iter->n_plp[i] = 0;
2674 iter->plp[i] = NULL;
2675 }
2676 }
2677
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2678 void bam_mplp_constructor(bam_mplp_t iter,
2679 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2680 int i;
2681 for (i = 0; i < iter->n; ++i)
2682 bam_plp_constructor(iter->iter[i], func);
2683 }
2684
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2685 void bam_mplp_destructor(bam_mplp_t iter,
2686 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2687 int i;
2688 for (i = 0; i < iter->n; ++i)
2689 bam_plp_destructor(iter->iter[i], func);
2690 }
2691
2692 #endif // ~!defined(BAM_NO_PILEUP)
2693