1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45 typedef khash_t(s2i) sdict_t;
46
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50
51 /**********************
52 *** BAM header I/O ***
53 **********************/
54
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62 int32_t i;
63 if (h == NULL) return;
64 if (h->target_name) {
65 for (i = 0; i < h->n_targets; ++i)
66 free(h->target_name[i]);
67 free(h->target_name);
68 free(h->target_len);
69 }
70 free(h->text); free(h->cigar_tab);
71 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72 free(h);
73 }
74
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77 if (h0 == NULL) return NULL;
78 bam_hdr_t *h;
79 if ((h = bam_hdr_init()) == NULL) return NULL;
80 // copy the simple data
81 h->n_targets = h0->n_targets;
82 h->ignore_sam_err = h0->ignore_sam_err;
83 h->l_text = h0->l_text;
84 // Then the pointery stuff
85 h->cigar_tab = NULL;
86 h->sdict = NULL;
87 // TODO Check for memory allocation failures
88 h->text = (char*)calloc(h->l_text + 1, 1);
89 memcpy(h->text, h0->text, h->l_text);
90 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92 int i;
93 for (i = 0; i < h->n_targets; ++i) {
94 h->target_len[i] = h0->target_len[i];
95 h->target_name[i] = strdup(h0->target_name[i]);
96 }
97 return h;
98 }
99
100
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103 bam_hdr_t *h;
104 khint_t k;
105 h = bam_hdr_init();
106 h->sdict = d;
107 h->n_targets = kh_size(d);
108 // TODO Check for memory allocation failures
109 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111 for (k = kh_begin(d); k != kh_end(d); ++k) {
112 if (!kh_exist(d, k)) continue;
113 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114 h->target_len[kh_val(d, k)>>32] = kh_val(d, k) & 0xffffffffUL;
115 kh_val(d, k) >>= 32;
116 }
117 return h;
118 }
119
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122 bam_hdr_t *h;
123 char buf[4];
124 int magic_len, has_EOF;
125 int32_t i, name_len, num_names = 0;
126 size_t bufsize;
127 ssize_t bytes;
128 // check EOF
129 has_EOF = bgzf_check_EOF(fp);
130 if (has_EOF < 0) {
131 perror("[W::bam_hdr_read] bgzf_check_EOF");
132 } else if (has_EOF == 0) {
133 hts_log_warning("EOF marker is absent. The input is probably truncated");
134 }
135 // read "BAM1"
136 magic_len = bgzf_read(fp, buf, 4);
137 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138 hts_log_error("Invalid BAM binary header");
139 return 0;
140 }
141 h = bam_hdr_init();
142 if (!h) goto nomem;
143
144 // read plain text and the number of reference sequences
145 bytes = bgzf_read(fp, &h->l_text, 4);
146 if (bytes != 4) goto read_err;
147 if (fp->is_be) ed_swap_4p(&h->l_text);
148
149 bufsize = ((size_t) h->l_text) + 1;
150 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151 h->text = (char*)malloc(bufsize);
152 if (!h->text) goto nomem;
153 h->text[h->l_text] = 0; // make sure it is NULL terminated
154 bytes = bgzf_read(fp, h->text, h->l_text);
155 if (bytes != h->l_text) goto read_err;
156
157 bytes = bgzf_read(fp, &h->n_targets, 4);
158 if (bytes != 4) goto read_err;
159 if (fp->is_be) ed_swap_4p(&h->n_targets);
160
161 if (h->n_targets < 0) goto invalid;
162
163 // read reference sequence names and lengths
164 if (h->n_targets > 0) {
165 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166 if (!h->target_name) goto nomem;
167 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168 if (!h->target_len) goto nomem;
169 }
170 else {
171 h->target_name = NULL;
172 h->target_len = NULL;
173 }
174
175 for (i = 0; i != h->n_targets; ++i) {
176 bytes = bgzf_read(fp, &name_len, 4);
177 if (bytes != 4) goto read_err;
178 if (fp->is_be) ed_swap_4p(&name_len);
179 if (name_len <= 0) goto invalid;
180
181 h->target_name[i] = (char*)malloc(name_len);
182 if (!h->target_name[i]) goto nomem;
183 num_names++;
184
185 bytes = bgzf_read(fp, h->target_name[i], name_len);
186 if (bytes != name_len) goto read_err;
187
188 if (h->target_name[i][name_len - 1] != '\0') {
189 /* Fix missing NUL-termination. Is this being too nice?
190 We could alternatively bail out with an error. */
191 char *new_name;
192 if (name_len == INT32_MAX) goto invalid;
193 new_name = realloc(h->target_name[i], name_len + 1);
194 if (new_name == NULL) goto nomem;
195 h->target_name[i] = new_name;
196 h->target_name[i][name_len] = '\0';
197 }
198
199 bytes = bgzf_read(fp, &h->target_len[i], 4);
200 if (bytes != 4) goto read_err;
201 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202 }
203 return h;
204
205 nomem:
206 hts_log_error("Out of memory");
207 goto clean;
208
209 read_err:
210 if (bytes < 0) {
211 hts_log_error("Error reading BGZF stream");
212 } else {
213 hts_log_error("Truncated BAM header");
214 }
215 goto clean;
216
217 invalid:
218 hts_log_error("Invalid BAM binary header");
219
220 clean:
221 if (h != NULL) {
222 h->n_targets = num_names; // ensure we free only allocated target_names
223 bam_hdr_destroy(h);
224 }
225 return NULL;
226 }
227
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230 char buf[4];
231 int32_t i, name_len, x;
232 // write "BAM1"
233 strncpy(buf, "BAM\1", 4);
234 if (bgzf_write(fp, buf, 4) < 0) return -1;
235 // write plain text and the number of reference sequences
236 if (fp->is_be) {
237 x = ed_swap_4(h->l_text);
238 if (bgzf_write(fp, &x, 4) < 0) return -1;
239 if (h->l_text) {
240 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
241 }
242 x = ed_swap_4(h->n_targets);
243 if (bgzf_write(fp, &x, 4) < 0) return -1;
244 } else {
245 if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
246 if (h->l_text) {
247 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
248 }
249 if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
250 }
251 // write sequence names and lengths
252 for (i = 0; i != h->n_targets; ++i) {
253 char *p = h->target_name[i];
254 name_len = strlen(p) + 1;
255 if (fp->is_be) {
256 x = ed_swap_4(name_len);
257 if (bgzf_write(fp, &x, 4) < 0) return -1;
258 } else {
259 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
260 }
261 if (bgzf_write(fp, p, name_len) < 0) return -1;
262 if (fp->is_be) {
263 x = ed_swap_4(h->target_len[i]);
264 if (bgzf_write(fp, &x, 4) < 0) return -1;
265 } else {
266 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
267 }
268 }
269 if (bgzf_flush(fp) < 0) return -1;
270 return 0;
271 }
272
bam_name2id(bam_hdr_t * h,const char * ref)273 int bam_name2id(bam_hdr_t *h, const char *ref)
274 {
275 sdict_t *d = (sdict_t*)h->sdict;
276 khint_t k;
277 if (h->sdict == 0) {
278 int i, absent;
279 d = kh_init(s2i);
280 for (i = 0; i < h->n_targets; ++i) {
281 k = kh_put(s2i, d, h->target_name[i], &absent);
282 kh_val(d, k) = i;
283 }
284 h->sdict = d;
285 }
286 k = kh_get(s2i, d, ref);
287 return k == kh_end(d)? -1 : kh_val(d, k);
288 }
289
290 /*************************
291 *** BAM alignment I/O ***
292 *************************/
293
bam_init1()294 bam1_t *bam_init1()
295 {
296 return (bam1_t*)calloc(1, sizeof(bam1_t));
297 }
298
bam_destroy1(bam1_t * b)299 void bam_destroy1(bam1_t *b)
300 {
301 if (b == 0) return;
302 free(b->data); free(b);
303 }
304
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)305 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
306 {
307 uint8_t *data = bdst->data;
308 int m_data = bdst->m_data; // backup data and m_data
309 if (m_data < bsrc->l_data) { // double the capacity
310 m_data = bsrc->l_data; kroundup32(m_data);
311 data = (uint8_t*)realloc(data, m_data);
312 }
313 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
314 *bdst = *bsrc; // copy the rest
315 // restore the backup
316 bdst->m_data = m_data;
317 bdst->data = data;
318 return bdst;
319 }
320
bam_dup1(const bam1_t * bsrc)321 bam1_t *bam_dup1(const bam1_t *bsrc)
322 {
323 if (bsrc == NULL) return NULL;
324 bam1_t *bdst = bam_init1();
325 if (bdst == NULL) return NULL;
326 return bam_copy1(bdst, bsrc);
327 }
328
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)329 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
330 {
331 int k;
332 *rlen = *qlen = 0;
333 for (k = 0; k < n_cigar; ++k) {
334 int type = bam_cigar_type(bam_cigar_op(cigar[k]));
335 int len = bam_cigar_oplen(cigar[k]);
336 if (type & 1) *qlen += len;
337 if (type & 2) *rlen += len;
338 }
339 }
340
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)341 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
342 {
343 int k, l;
344 for (k = l = 0; k < n_cigar; ++k)
345 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
346 l += bam_cigar_oplen(cigar[k]);
347 return l;
348 }
349
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)350 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
351 {
352 int k, l;
353 for (k = l = 0; k < n_cigar; ++k)
354 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
355 l += bam_cigar_oplen(cigar[k]);
356 return l;
357 }
358
bam_endpos(const bam1_t * b)359 int32_t bam_endpos(const bam1_t *b)
360 {
361 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
362 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
363 else
364 return b->core.pos + 1;
365 }
366
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)367 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
368 {
369 bam1_core_t *c = &b->core;
370 uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
371 uint8_t *CG;
372
373 // test where there is a real CIGAR in the CG tag to move
374 if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
375 cigar0 = bam_get_cigar(b);
376 if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
377 fake_bytes = c->n_cigar * 4;
378 if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
379 if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
380 CG_len = le_to_u32(CG + 2);
381 if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
382
383 // move from the CG tag to the right position
384 cigar_st = (uint8_t*)cigar0 - b->data;
385 c->n_cigar = CG_len;
386 n_cigar4 = c->n_cigar * 4;
387 CG_st = CG - b->data - 2;
388 CG_en = CG_st + 8 + n_cigar4;
389 b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
390 if (b->m_data < b->l_data) {
391 uint8_t *new_data;
392 uint32_t new_max = b->l_data;
393 kroundup32(new_max);
394 new_data = (uint8_t*)realloc(b->data, new_max);
395 if (new_data == 0) return -1;
396 b->m_data = new_max, b->data = new_data;
397 }
398 memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
399 memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
400 if (ori_len > CG_en) // move data after the CG tag
401 memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
402 b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
403 if (recal_bin)
404 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
405 if (give_warning)
406 hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
407 return 1;
408 }
409
aux_type2size(uint8_t type)410 static inline int aux_type2size(uint8_t type)
411 {
412 switch (type) {
413 case 'A': case 'c': case 'C':
414 return 1;
415 case 's': case 'S':
416 return 2;
417 case 'i': case 'I': case 'f':
418 return 4;
419 case 'd':
420 return 8;
421 case 'Z': case 'H': case 'B':
422 return type;
423 default:
424 return 0;
425 }
426 }
427
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)428 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
429 {
430 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
431 uint32_t i;
432 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
433 }
434
bam_read1(BGZF * fp,bam1_t * b)435 int bam_read1(BGZF *fp, bam1_t *b)
436 {
437 bam1_core_t *c = &b->core;
438 int32_t block_len, ret, i;
439 uint32_t x[8];
440 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
441 if (ret == 0) return -1; // normal end-of-file
442 else return -2; // truncated
443 }
444 if (fp->is_be)
445 ed_swap_4p(&block_len);
446 if (block_len < 32) return -4; // block_len includes core data
447 if (bgzf_read(fp, x, 32) != 32) return -3;
448 if (fp->is_be) {
449 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
450 }
451 c->tid = x[0]; c->pos = x[1];
452 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
453 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
454 if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
455 return -4;
456 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
457 c->l_qseq = x[4];
458 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
459 b->l_data = block_len - 32 + c->l_extranul;
460 if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
461 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
462 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) b->l_data)
463 return -4;
464 if (b->m_data < b->l_data) {
465 uint8_t *new_data;
466 uint32_t new_m = b->l_data;
467 kroundup32(new_m);
468 new_data = (uint8_t*)realloc(b->data, new_m);
469 if (!new_data)
470 return -4;
471 b->data = new_data;
472 b->m_data = new_m;
473 }
474 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
475 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
476 c->l_qname += c->l_extranul;
477 if (b->l_data < c->l_qname ||
478 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
479 return -4;
480 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
481 if (bam_tag2cigar(b, 0, 0) < 0)
482 return -4;
483
484 if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
485 int rlen, qlen;
486 bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
487 if ((b->core.flag & BAM_FUNMAP)) rlen=1;
488 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
489 // Sanity check for broken CIGAR alignments
490 if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
491 hts_log_error("CIGAR and query sequence lengths differ for %s",
492 bam_get_qname(b));
493 return -4;
494 }
495 }
496
497 return 4 + block_len;
498 }
499
bam_write1(BGZF * fp,const bam1_t * b)500 int bam_write1(BGZF *fp, const bam1_t *b)
501 {
502 const bam1_core_t *c = &b->core;
503 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
504 int i, ok;
505 if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
506 x[0] = c->tid;
507 x[1] = c->pos;
508 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
509 if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
510 else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
511 x[4] = c->l_qseq;
512 x[5] = c->mtid;
513 x[6] = c->mpos;
514 x[7] = c->isize;
515 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
516 if (fp->is_be) {
517 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
518 y = block_len;
519 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
520 swap_data(c, b->l_data, b->data, 1);
521 } else {
522 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
523 }
524 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
525 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
526 if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
527 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
528 } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
529 uint8_t buf[8];
530 uint32_t cigar_st, cigar_en, cigar[2];
531 cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
532 cigar_en = cigar_st + c->n_cigar * 4;
533 cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
534 cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
535 u32_to_le(cigar[0], buf);
536 u32_to_le(cigar[1], buf + 4);
537 if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
538 if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
539 if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
540 u32_to_le(c->n_cigar, buf);
541 if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
542 if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
543 }
544 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
545 return ok? 4 + block_len : -1;
546 }
547
548 /********************
549 *** BAM indexing ***
550 ********************/
551
bam_index(BGZF * fp,int min_shift)552 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
553 {
554 int n_lvls, i, fmt, ret;
555 bam1_t *b;
556 hts_idx_t *idx;
557 bam_hdr_t *h;
558 h = bam_hdr_read(fp);
559 if (h == NULL) return NULL;
560 if (min_shift > 0) {
561 int64_t max_len = 0, s;
562 for (i = 0; i < h->n_targets; ++i)
563 if (max_len < h->target_len[i]) max_len = h->target_len[i];
564 max_len += 256;
565 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
566 fmt = HTS_FMT_CSI;
567 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
568 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
569 bam_hdr_destroy(h);
570 b = bam_init1();
571 while ((ret = bam_read1(fp, b)) >= 0) {
572 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
573 if (ret < 0) goto err; // unsorted
574 }
575 if (ret < -1) goto err; // corrupted BAM file
576
577 hts_idx_finish(idx, bgzf_tell(fp));
578 bam_destroy1(b);
579 return idx;
580
581 err:
582 bam_destroy1(b);
583 hts_idx_destroy(idx);
584 return NULL;
585 }
586
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)587 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
588 {
589 hts_idx_t *idx;
590 htsFile *fp;
591 int ret = 0;
592
593 if ((fp = hts_open(fn, "r")) == 0) return -2;
594 if (nthreads)
595 hts_set_threads(fp, nthreads);
596
597 switch (fp->format.format) {
598 case cram:
599 ret = cram_index_build(fp->fp.cram, fn, fnidx);
600 break;
601
602 case bam:
603 idx = bam_index(fp->fp.bgzf, min_shift);
604 if (idx) {
605 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
606 if (ret < 0) ret = -4;
607 hts_idx_destroy(idx);
608 }
609 else ret = -1;
610 break;
611
612 default:
613 ret = -3;
614 break;
615 }
616 hts_close(fp);
617
618 return ret;
619 }
620
sam_index_build2(const char * fn,const char * fnidx,int min_shift)621 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
622 {
623 return sam_index_build3(fn, fnidx, min_shift, 0);
624 }
625
sam_index_build(const char * fn,int min_shift)626 int sam_index_build(const char *fn, int min_shift)
627 {
628 return sam_index_build3(fn, NULL, min_shift, 0);
629 }
630
631 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
632 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)633 int bam_index_build(const char *fn, int min_shift)
634 {
635 return sam_index_build2(fn, NULL, min_shift);
636 }
637
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)638 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
639 {
640 bam1_t *b = bv;
641 int ret;
642 if ((ret = bam_read1(fp, b)) >= 0) {
643 *tid = b->core.tid;
644 *beg = b->core.pos;
645 *end = bam_endpos(b);
646 }
647 return ret;
648 }
649
650 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)651 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
652 {
653 htsFile *fp = fpv;
654 bam1_t *b = bv;
655 int ret = cram_get_bam_seq(fp->fp.cram, &b);
656 if (ret < 0)
657 return cram_eof(fp->fp.cram) ? -1 : -2;
658
659 if (bam_tag2cigar(b, 1, 1) < 0)
660 return -2;
661
662 *tid = b->core.tid;
663 *beg = b->core.pos;
664 *end = bam_endpos(b);
665
666 return ret;
667 }
668
cram_pseek(void * fp,int64_t offset,int whence)669 static int cram_pseek(void *fp, int64_t offset, int whence)
670 {
671 cram_fd *fd = (cram_fd *)fp;
672
673 if ((0 != cram_seek(fd, offset, SEEK_SET))
674 && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
675 return -1;
676
677 if (fd->ctr) {
678 cram_free_container(fd->ctr);
679 fd->ctr = NULL;
680 fd->ooc = 0;
681 }
682
683 return 0;
684 }
685
686 /*
687 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
688 * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
689 * container previously fetched. It was designed like this to integrate with the functionality
690 * of the iterator stepping logic.
691 */
692
cram_ptell(void * fp)693 static int64_t cram_ptell(void *fp)
694 {
695 cram_fd *fd = (cram_fd *)fp;
696 cram_container *c;
697 int64_t ret = -1L;
698
699 if (fd && fd->fp) {
700 ret = htell(fd->fp);
701 if ((c = fd->ctr) != NULL) {
702 ret -= ((c->curr_slice != c->max_slice || c->curr_rec != c->max_rec) ? c->offset + 1 : 0);
703 }
704 }
705
706 return ret;
707 }
708
bam_pseek(void * fp,int64_t offset,int whence)709 static int bam_pseek(void *fp, int64_t offset, int whence)
710 {
711 BGZF *fd = (BGZF *)fp;
712
713 return bgzf_seek(fd, offset, whence);
714 }
715
bam_ptell(void * fp)716 static int64_t bam_ptell(void *fp)
717 {
718 BGZF *fd = (BGZF *)fp;
719 if (!fd)
720 return -1L;
721
722 return bgzf_tell(fd);
723 }
724
725 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)726 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
727 {
728 htsFile *fp = fpv;
729 bam1_t *b = bv;
730 switch (fp->format.format) {
731 case bam: return bam_read1(bgzfp, b);
732 case cram: {
733 int ret = cram_get_bam_seq(fp->fp.cram, &b);
734 if (ret < 0)
735 return cram_eof(fp->fp.cram) ? -1 : -2;
736
737 if (bam_tag2cigar(b, 1, 1) < 0)
738 return -2;
739 return ret;
740 }
741 default:
742 // TODO Need headers available to implement this for SAM files
743 hts_log_error("Not implemented for SAM files");
744 abort();
745 }
746 }
747
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)748 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
749 {
750 switch (fp->format.format) {
751 case bam:
752 return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
753
754 case cram: {
755 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
756 // Cons up a fake "index" just pointing at the associated cram_fd:
757 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
758 if (idx == NULL) return NULL;
759 idx->fmt = HTS_FMT_CRAI;
760 idx->cram = fp->fp.cram;
761 return (hts_idx_t *) idx;
762 }
763
764 default:
765 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
766 }
767 }
768
sam_index_load(htsFile * fp,const char * fn)769 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
770 {
771 return sam_index_load2(fp, fn, NULL);
772 }
773
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)774 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
775 {
776 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
777 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
778 if (iter == NULL) return NULL;
779
780 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
781 // the readrec function:
782 iter->is_cram = 1;
783 iter->read_rest = 1;
784 iter->off = NULL;
785 iter->bins.a = NULL;
786 iter->readrec = readrec;
787
788 if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
789 cram_range r = { tid, beg+1, end };
790 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
791
792 iter->curr_off = 0;
793 // The following fields are not required by hts_itr_next(), but are
794 // filled in in case user code wants to look at them.
795 iter->tid = tid;
796 iter->beg = beg;
797 iter->end = end;
798
799 switch (ret) {
800 case 0:
801 break;
802
803 case -2:
804 // No data vs this ref, so mark iterator as completed.
805 // Same as HTS_IDX_NONE.
806 iter->finished = 1;
807 break;
808
809 default:
810 free(iter);
811 return NULL;
812 }
813 }
814 else switch (tid) {
815 case HTS_IDX_REST:
816 iter->curr_off = 0;
817 break;
818 case HTS_IDX_NONE:
819 iter->curr_off = 0;
820 iter->finished = 1;
821 break;
822 default:
823 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
824 abort();
825 break;
826 }
827
828 return iter;
829 }
830
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)831 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
832 {
833 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
834 if (idx == NULL)
835 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
836 else if (cidx->fmt == HTS_FMT_CRAI)
837 return cram_itr_query(idx, tid, beg, end, cram_readrec);
838 else
839 return hts_itr_query(idx, tid, beg, end, bam_readrec);
840 }
841
cram_name2id(void * fdv,const char * ref)842 static int cram_name2id(void *fdv, const char *ref)
843 {
844 cram_fd *fd = (cram_fd *) fdv;
845 return sam_hdr_name2ref(fd->header, ref);
846 }
847
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)848 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
849 {
850 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
851 if (cidx->fmt == HTS_FMT_CRAI)
852 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
853 else
854 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
855 }
856
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)857 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
858 {
859 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
860 if (cidx->fmt == HTS_FMT_CRAI)
861 return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
862 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
863 else
864 return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
865 hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
866 }
867
868 /**********************
869 *** SAM header I/O ***
870 **********************/
871
872 #include "htslib/kseq.h"
873 #include "htslib/kstring.h"
874
sam_hdr_parse(int l_text,const char * text)875 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
876 {
877 const char *q, *r, *p;
878 khash_t(s2i) *d;
879 d = kh_init(s2i);
880 for (p = text; *p; ++p) {
881 if (strncmp(p, "@SQ\t", 4) == 0) {
882 char *sn = 0;
883 int ln = -1;
884 for (q = p + 4;; ++q) {
885 if (strncmp(q, "SN:", 3) == 0) {
886 q += 3;
887 for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
888 sn = (char*)calloc(r - q + 1, 1);
889 strncpy(sn, q, r - q);
890 q = r;
891 } else if (strncmp(q, "LN:", 3) == 0)
892 ln = strtol(q + 3, (char**)&q, 10);
893 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
894 if (*q == '\0' || *q == '\n') break;
895 }
896 p = q;
897 if (sn && ln >= 0) {
898 khint_t k;
899 int absent;
900 k = kh_put(s2i, d, sn, &absent);
901 if (!absent) {
902 hts_log_warning("Duplicated sequence '%s'", sn);
903 free(sn);
904 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
905 }
906 }
907 while (*p != '\0' && *p != '\n') ++p;
908 }
909 return hdr_from_dict(d);
910 }
911
912 // Minimal sanitisation of a header to ensure.
913 // - null terminated string.
914 // - all lines start with @ (also implies no blank lines).
915 //
916 // Much more could be done, but currently is not, including:
917 // - checking header types are known (HD, SQ, etc).
918 // - syntax (eg checking tab separated fields).
919 // - validating n_targets matches @SQ records.
920 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)921 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
922 if (!h)
923 return NULL;
924
925 // Special case for empty headers.
926 if (h->l_text == 0)
927 return h;
928
929 uint32_t i, lnum = 0;
930 char *cp = h->text, last = '\n';
931 for (i = 0; i < h->l_text; i++) {
932 // NB: l_text excludes terminating nul. This finds early ones.
933 if (cp[i] == 0)
934 break;
935
936 // Error on \n[^@], including duplicate newlines
937 if (last == '\n') {
938 lnum++;
939 if (cp[i] != '@') {
940 hts_log_error("Malformed SAM header at line %u", lnum);
941 bam_hdr_destroy(h);
942 return NULL;
943 }
944 }
945
946 last = cp[i];
947 }
948
949 if (i < h->l_text) { // Early nul found. Complain if not just padding.
950 uint32_t j = i;
951 while (j < h->l_text && cp[j] == '\0') j++;
952 if (j < h->l_text)
953 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
954 }
955
956 // Add trailing newline and/or trailing nul if required.
957 if (last != '\n') {
958 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
959
960 if (h->l_text == UINT32_MAX) {
961 hts_log_error("No room for extra newline");
962 bam_hdr_destroy(h);
963 return NULL;
964 }
965
966 if (i >= h->l_text - 1) {
967 cp = realloc(h->text, (size_t) h->l_text+2);
968 if (!cp) {
969 bam_hdr_destroy(h);
970 return NULL;
971 }
972 h->text = cp;
973 }
974 cp[i++] = '\n';
975
976 // l_text may be larger already due to multiple nul padding
977 if (h->l_text < i)
978 h->l_text = i;
979 cp[h->l_text] = '\0';
980 }
981
982 return h;
983 }
984
sam_hdr_read(htsFile * fp)985 bam_hdr_t *sam_hdr_read(htsFile *fp)
986 {
987 switch (fp->format.format) {
988 case bam:
989 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
990
991 case cram:
992 return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
993
994 case sam: {
995 kstring_t str = { 0, 0, NULL };
996 bam_hdr_t *h = NULL;
997 int ret, has_SQ = 0;
998 while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
999 if (fp->line.s[0] != '@') break;
1000 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1001 kputsn(fp->line.s, fp->line.l, &str);
1002 kputc('\n', &str);
1003 }
1004 if (ret < -1) goto error;
1005 if (! has_SQ && fp->fn_aux) {
1006 kstring_t line = { 0, 0, NULL };
1007 hFILE *f = hopen(fp->fn_aux, "r");
1008 if (f == NULL) goto error;
1009 while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1010 char *tab = strchr(line.s, '\t');
1011 if (tab == NULL) continue;
1012 kputs("@SQ\tSN:", &str);
1013 kputsn(line.s, tab - line.s, &str);
1014 kputs("\tLN:", &str);
1015 kputl(atol(tab), &str);
1016 kputc('\n', &str);
1017 }
1018 free(line.s);
1019 if (hclose(f) != 0) {
1020 hts_log_warning("Failed to close %s", fp->fn_aux);
1021 }
1022 }
1023 if (str.l == 0) kputsn("", 0, &str);
1024 h = sam_hdr_parse(str.l, str.s);
1025 h->l_text = str.l; h->text = str.s;
1026 return sam_hdr_sanitise(h);
1027
1028 error:
1029 bam_hdr_destroy(h);
1030 free(str.s);
1031 return NULL;
1032 }
1033
1034 default:
1035 abort();
1036 }
1037 }
1038
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1039 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1040 {
1041 if (!h) {
1042 errno = EINVAL;
1043 return -1;
1044 }
1045
1046 switch (fp->format.format) {
1047 case binary_format:
1048 fp->format.category = sequence_data;
1049 fp->format.format = bam;
1050 /* fall-through */
1051 case bam:
1052 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1053 break;
1054
1055 case cram: {
1056 cram_fd *fd = fp->fp.cram;
1057 SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1058 if (! hdr) return -1;
1059 if (cram_set_header(fd, hdr) < 0) return -1;
1060 if (fp->fn_aux)
1061 cram_load_reference(fd, fp->fn_aux);
1062 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1063 }
1064 break;
1065
1066 case text_format:
1067 fp->format.category = sequence_data;
1068 fp->format.format = sam;
1069 /* fall-through */
1070 case sam: {
1071 char *p;
1072 hputs(h->text, fp->fp.hfile);
1073 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1074 if (p == 0) {
1075 int i;
1076 for (i = 0; i < h->n_targets; ++i) {
1077 fp->line.l = 0;
1078 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1079 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1080 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1081 }
1082 }
1083 if ( hflush(fp->fp.hfile) != 0 ) return -1;
1084 }
1085 break;
1086
1087 default:
1088 abort();
1089 }
1090 return 0;
1091 }
1092
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1093 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1094 {
1095 char *p, *q, *beg = NULL, *end = NULL, *newtext;
1096 if (!h || !key)
1097 return -1;
1098
1099 if (h->l_text > 3) {
1100 if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1101 if ((p = strchr(h->text, '\n')) == 0) return -1;
1102 *p = '\0'; // for strstr call
1103
1104 char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1105
1106 if ((q = strstr(h->text, tmp)) != 0) { // key exists
1107 *p = '\n'; // change back
1108
1109 // mark the key:val
1110 beg = q;
1111 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1112 end = q;
1113
1114 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1115 && strlen(val) == end - beg - 4)
1116 return 0; // val is the same, no need to change
1117
1118 } else {
1119 beg = end = p;
1120 *p = '\n';
1121 }
1122 }
1123 }
1124 if (beg == NULL) { // no @HD
1125 if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1126 return -1;
1127 h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1128 if (val) {
1129 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1130 return -1;
1131 h->l_text += strlen(val) + 4;
1132 }
1133 newtext = (char*)malloc(h->l_text + 1);
1134 if (!newtext) return -1;
1135
1136 if (val)
1137 snprintf(newtext, h->l_text + 1,
1138 "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1139 else
1140 snprintf(newtext, h->l_text + 1,
1141 "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1142 } else { // has @HD but different or no key
1143 h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1144 if (val) {
1145 if (h->l_text > UINT32_MAX - strlen(val) - 5)
1146 return -1;
1147 h->l_text += strlen(val) + 4;
1148 }
1149 newtext = (char*)malloc(h->l_text + 1);
1150 if (!newtext) return -1;
1151
1152 if (val) {
1153 snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1154 (int) (beg - h->text), h->text, key, val, end);
1155 } else { //delete key
1156 snprintf(newtext, h->l_text + 1, "%.*s%s",
1157 (int) (beg - h->text), h->text, end);
1158 }
1159 }
1160 free(h->text);
1161 h->text = newtext;
1162 return 0;
1163 }
1164
1165 /**********************
1166 *** SAM record I/O ***
1167 **********************/
1168
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1169 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1170 {
1171 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1172 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1173 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1174 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1175 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1176 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1177
1178 uint8_t *t;
1179 char *p = s->s, *q;
1180 int i;
1181 kstring_t str;
1182 bam1_core_t *c = &b->core;
1183
1184 str.l = b->l_data = 0;
1185 str.s = (char*)b->data; str.m = b->m_data;
1186 memset(c, 0, 32);
1187 if (h->cigar_tab == 0) {
1188 h->cigar_tab = (int8_t*) malloc(128);
1189 for (i = 0; i < 128; ++i)
1190 h->cigar_tab[i] = -1;
1191 for (i = 0; BAM_CIGAR_STR[i]; ++i)
1192 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1193 }
1194 // qname
1195 q = _read_token(p);
1196 _parse_warn(p - q <= 1, "empty query name");
1197 _parse_err(p - q > 252, "query name too long");
1198 kputsn_(q, p - q, &str);
1199 for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1200 kputc_('\0', &str);
1201 c->l_qname = p - q + c->l_extranul;
1202 // flag
1203 c->flag = strtol(p, &p, 0);
1204 if (*p++ != '\t') goto err_ret; // malformated flag
1205 // chr
1206 q = _read_token(p);
1207 if (strcmp(q, "*")) {
1208 _parse_err(h->n_targets == 0, "missing SAM header");
1209 c->tid = bam_name2id(h, q);
1210 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1211 } else c->tid = -1;
1212 // pos
1213 c->pos = strtol(p, &p, 10) - 1;
1214 if (*p++ != '\t') goto err_ret;
1215 if (c->pos < 0 && c->tid >= 0) {
1216 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1217 c->tid = -1;
1218 }
1219 if (c->tid < 0) c->flag |= BAM_FUNMAP;
1220 // mapq
1221 c->qual = strtol(p, &p, 10);
1222 if (*p++ != '\t') goto err_ret;
1223 // cigar
1224 if (*p != '*') {
1225 uint32_t *cigar;
1226 size_t n_cigar = 0;
1227 for (q = p; *p && *p != '\t'; ++p)
1228 if (!isdigit_c(*p)) ++n_cigar;
1229 if (*p++ != '\t') goto err_ret;
1230 _parse_err(n_cigar == 0, "no CIGAR operations");
1231 _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1232 c->n_cigar = n_cigar;
1233 _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1234 for (i = 0; i < c->n_cigar; ++i, ++q) {
1235 int op;
1236 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1237 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1238 _parse_err(op < 0, "unrecognized CIGAR operator");
1239 cigar[i] |= op;
1240 }
1241 // can't use bam_endpos() directly as some fields not yet set up
1242 i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1243 } else {
1244 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1245 c->flag |= BAM_FUNMAP;
1246 q = _read_token(p);
1247 i = 1;
1248 }
1249 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1250 // mate chr
1251 q = _read_token(p);
1252 if (strcmp(q, "=") == 0) {
1253 c->mtid = c->tid;
1254 } else if (strcmp(q, "*") == 0) {
1255 c->mtid = -1;
1256 } else {
1257 c->mtid = bam_name2id(h, q);
1258 _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1259 }
1260 // mpos
1261 c->mpos = strtol(p, &p, 10) - 1;
1262 if (*p++ != '\t') goto err_ret;
1263 if (c->mpos < 0 && c->mtid >= 0) {
1264 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1265 c->mtid = -1;
1266 }
1267 // tlen
1268 c->isize = strtol(p, &p, 10);
1269 if (*p++ != '\t') goto err_ret;
1270 // seq
1271 q = _read_token(p);
1272 if (strcmp(q, "*")) {
1273 c->l_qseq = p - q - 1;
1274 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1275 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1276 i = (c->l_qseq + 1) >> 1;
1277 _get_mem(uint8_t, &t, &str, i);
1278 memset(t, 0, i);
1279 for (i = 0; i < c->l_qseq; ++i)
1280 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
1281 } else c->l_qseq = 0;
1282 // qual
1283 q = _read_token_aux(p);
1284 _get_mem(uint8_t, &t, &str, c->l_qseq);
1285 if (strcmp(q, "*")) {
1286 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1287 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1288 } else memset(t, 0xff, c->l_qseq);
1289 // aux
1290 while (p < s->s + s->l) {
1291 uint8_t type;
1292 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1293 _parse_err(p - q - 1 < 5, "incomplete aux field");
1294 kputsn_(q, 2, &str);
1295 q += 3; type = *q++; ++q; // q points to value
1296 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1297 _parse_err(p - q - 1 < 1, "incomplete aux field");
1298
1299 // Ensure str has enough space for a double + type allocated.
1300 // This is so we can stuff bigger integers and floats directly into
1301 // the kstring. Sorry.
1302 _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1303
1304 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1305 kputc_('A', &str);
1306 kputc_(*q, &str);
1307 } else if (type == 'i' || type == 'I') {
1308 if (*q == '-') {
1309 long x = strtol(q, &q, 10);
1310 if (x >= INT8_MIN) {
1311 kputc_('c', &str); kputc_(x, &str);
1312 } else if (x >= INT16_MIN) {
1313 str.s[str.l++] = 's';
1314 i16_to_le(x, (uint8_t *) str.s + str.l);
1315 str.l += 2;
1316 } else {
1317 str.s[str.l++] = 'i';
1318 i32_to_le(x, (uint8_t *) str.s + str.l);
1319 str.l += 4;
1320 }
1321 } else {
1322 unsigned long x = strtoul(q, &q, 10);
1323 if (x <= UINT8_MAX) {
1324 kputc_('C', &str); kputc_(x, &str);
1325 } else if (x <= UINT16_MAX) {
1326 str.s[str.l++] = 'S';
1327 u16_to_le(x, (uint8_t *) str.s + str.l);
1328 str.l += 2;
1329 } else {
1330 str.s[str.l++] = 'I';
1331 u32_to_le(x, (uint8_t *) str.s + str.l);
1332 str.l += 4;
1333 }
1334 }
1335 } else if (type == 'f') {
1336 str.s[str.l++] = 'f';
1337 float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1338 str.l += sizeof(float);
1339 } else if (type == 'd') {
1340 str.s[str.l++] = 'd';
1341 double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1342 str.l += sizeof(double);
1343 } else if (type == 'Z' || type == 'H') {
1344 _parse_err(type == 'H' && !((p-q)&1),
1345 "hex field does not have an even number of digits");
1346 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1347 } else if (type == 'B') {
1348 int32_t n, size;
1349 size_t bytes;
1350 char *r;
1351 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1352 type = *q++; // q points to the first ',' following the typing byte
1353
1354 size = aux_type2size(type);
1355 _parse_err_param(size <= 0 || size > 4,
1356 "unrecognized type B:%c", type);
1357 _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1358
1359 for (r = q, n = 0; *r; ++r)
1360 if (*r == ',') ++n;
1361
1362 // Ensure space for type + values
1363 bytes = (size_t) n * (size_t) size;
1364 _parse_err(bytes / size != n
1365 || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1366 "out of memory");
1367 str.s[str.l++] = 'B';
1368 str.s[str.l++] = type;
1369 i32_to_le(n, (uint8_t *) str.s + str.l);
1370 str.l += sizeof(uint32_t);
1371
1372 // This ensures that q always ends up at the next comma after
1373 // reading a number even if it's followed by junk. It
1374 // prevents the possibility of trying to read more than n items.
1375 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1376
1377 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1378 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1379 else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1380 else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1381 else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1382 else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1383 else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1384 else _parse_err_param(1, "unrecognized type B:%c", type);
1385
1386 #undef _skip_to_comma
1387
1388 } else _parse_err_param(1, "unrecognized type %c", type);
1389 }
1390 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1391 if (bam_tag2cigar(b, 1, 1) < 0)
1392 return -2;
1393 return 0;
1394
1395 #undef _parse_warn
1396 #undef _parse_err
1397 #undef _parse_err_param
1398 #undef _get_mem
1399 #undef _read_token_aux
1400 #undef _read_token
1401 err_ret:
1402 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1403 return -2;
1404 }
1405
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1406 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1407 {
1408 switch (fp->format.format) {
1409 case bam: {
1410 int r = bam_read1(fp->fp.bgzf, b);
1411 if (r >= 0) {
1412 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
1413 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1414 return -3;
1415 }
1416 return r;
1417 }
1418
1419 case cram: {
1420 int ret = cram_get_bam_seq(fp->fp.cram, &b);
1421 if (ret < 0)
1422 return cram_eof(fp->fp.cram) ? -1 : -2;
1423
1424 if (bam_tag2cigar(b, 1, 1) < 0)
1425 return -2;
1426 return ret;
1427 }
1428
1429 case sam: {
1430 int ret;
1431 err_recover:
1432 if (fp->line.l == 0) {
1433 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1434 if (ret < 0) return ret;
1435 }
1436 ret = sam_parse1(&fp->line, h, b);
1437 fp->line.l = 0;
1438 if (ret < 0) {
1439 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1440 if (h->ignore_sam_err) goto err_recover;
1441 }
1442 return ret;
1443 }
1444
1445 default:
1446 abort();
1447 }
1448 }
1449
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1450 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1451 {
1452 int i;
1453 uint8_t *s, *end;
1454 const bam1_core_t *c = &b->core;
1455
1456 str->l = 0;
1457 kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1458 kputw(c->flag, str); kputc('\t', str); // flag
1459 if (c->tid >= 0) { // chr
1460 kputs(h->target_name[c->tid] , str);
1461 kputc('\t', str);
1462 } else kputsn("*\t", 2, str);
1463 kputw(c->pos + 1, str); kputc('\t', str); // pos
1464 kputw(c->qual, str); kputc('\t', str); // qual
1465 if (c->n_cigar) { // cigar
1466 uint32_t *cigar = bam_get_cigar(b);
1467 for (i = 0; i < c->n_cigar; ++i) {
1468 kputw(bam_cigar_oplen(cigar[i]), str);
1469 kputc(bam_cigar_opchr(cigar[i]), str);
1470 }
1471 } else kputc('*', str);
1472 kputc('\t', str);
1473 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1474 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1475 else {
1476 kputs(h->target_name[c->mtid], str);
1477 kputc('\t', str);
1478 }
1479 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1480 kputw(c->isize, str); kputc('\t', str); // template len
1481 if (c->l_qseq) { // seq and qual
1482 uint8_t *s = bam_get_seq(b);
1483 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1484 kputc('\t', str);
1485 s = bam_get_qual(b);
1486 if (s[0] == 0xff) kputc('*', str);
1487 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1488 } else kputsn("*\t*", 3, str);
1489
1490 s = bam_get_aux(b); // aux
1491 end = b->data + b->l_data;
1492 while (end - s >= 4) {
1493 uint8_t type, key[2];
1494 key[0] = s[0]; key[1] = s[1];
1495 s += 2; type = *s++;
1496 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1497 if (type == 'A') {
1498 kputsn("A:", 2, str);
1499 kputc(*s, str);
1500 ++s;
1501 } else if (type == 'C') {
1502 kputsn("i:", 2, str);
1503 kputw(*s, str);
1504 ++s;
1505 } else if (type == 'c') {
1506 kputsn("i:", 2, str);
1507 kputw(*(int8_t*)s, str);
1508 ++s;
1509 } else if (type == 'S') {
1510 if (end - s >= 2) {
1511 kputsn("i:", 2, str);
1512 kputuw(le_to_u16(s), str);
1513 s += 2;
1514 } else goto bad_aux;
1515 } else if (type == 's') {
1516 if (end - s >= 2) {
1517 kputsn("i:", 2, str);
1518 kputw(le_to_i16(s), str);
1519 s += 2;
1520 } else goto bad_aux;
1521 } else if (type == 'I') {
1522 if (end - s >= 4) {
1523 kputsn("i:", 2, str);
1524 kputuw(le_to_u32(s), str);
1525 s += 4;
1526 } else goto bad_aux;
1527 } else if (type == 'i') {
1528 if (end - s >= 4) {
1529 kputsn("i:", 2, str);
1530 kputw(le_to_i32(s), str);
1531 s += 4;
1532 } else goto bad_aux;
1533 } else if (type == 'f') {
1534 if (end - s >= 4) {
1535 ksprintf(str, "f:%g", le_to_float(s));
1536 s += 4;
1537 } else goto bad_aux;
1538
1539 } else if (type == 'd') {
1540 if (end - s >= 8) {
1541 ksprintf(str, "d:%g", le_to_double(s));
1542 s += 8;
1543 } else goto bad_aux;
1544 } else if (type == 'Z' || type == 'H') {
1545 kputc(type, str); kputc(':', str);
1546 while (s < end && *s) kputc(*s++, str);
1547 if (s >= end)
1548 goto bad_aux;
1549 ++s;
1550 } else if (type == 'B') {
1551 uint8_t sub_type = *(s++);
1552 int sub_type_size = aux_type2size(sub_type);
1553 uint32_t n;
1554 if (sub_type_size == 0 || end - s < 4)
1555 goto bad_aux;
1556 n = le_to_u32(s);
1557 s += 4; // now points to the start of the array
1558 if ((end - s) / sub_type_size < n)
1559 goto bad_aux;
1560 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1561 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1562 kputc(',', str);
1563 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
1564 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1565 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1566 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1567 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1568 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1569 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1570 else goto bad_aux; // Unknown sub-type
1571 }
1572 } else { // Unknown type
1573 goto bad_aux;
1574 }
1575 }
1576 return str->l;
1577
1578 bad_aux:
1579 hts_log_error("Corrupted aux data for read %.*s",
1580 b->core.l_qname, bam_get_qname(b));
1581 errno = EINVAL;
1582 return -1;
1583 }
1584
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1585 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1586 {
1587 switch (fp->format.format) {
1588 case binary_format:
1589 fp->format.category = sequence_data;
1590 fp->format.format = bam;
1591 /* fall-through */
1592 case bam:
1593 return bam_write1(fp->fp.bgzf, b);
1594
1595 case cram:
1596 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1597
1598 case text_format:
1599 fp->format.category = sequence_data;
1600 fp->format.format = sam;
1601 /* fall-through */
1602 case sam:
1603 if (sam_format1(h, b, &fp->line) < 0) return -1;
1604 kputc('\n', &fp->line);
1605 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1606 return fp->line.l;
1607
1608 default:
1609 abort();
1610 }
1611 }
1612
1613 /************************
1614 *** Auxiliary fields ***
1615 ************************/
1616 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1617 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1618 int tsz = aux_type2size(type);
1619
1620 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1621
1622 switch (tsz) {
1623 case 'H': case 'Z': case 1: // Trivial
1624 memcpy(out, in, len);
1625 break;
1626
1627 #define aux_val_to_le(type_t, store_le) do { \
1628 type_t v; \
1629 size_t i; \
1630 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1631 memcpy(&v, in + i, sizeof(type_t)); \
1632 store_le(v, out); \
1633 } \
1634 } while (0)
1635
1636 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1637 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1638 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1639
1640 #undef aux_val_to_le
1641
1642 case 'B': { // Recurse!
1643 uint32_t n;
1644 if (len < 5) return -1;
1645 memcpy(&n, in + 1, 4);
1646 out[0] = in[0];
1647 u32_to_le(n, out + 1);
1648 return aux_to_le(in[0], out + 5, in + 5, len - 5);
1649 }
1650
1651 default: // Unknown type code
1652 return -1;
1653 }
1654
1655
1656
1657 return 0;
1658 }
1659 #endif
1660
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1661 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1662 {
1663 uint32_t new_len;
1664
1665 assert(b->l_data >= 0);
1666 new_len = b->l_data + 3 + len;
1667 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1668
1669 if (b->m_data < new_len) {
1670 uint32_t new_size = new_len;
1671 uint8_t *new_data;
1672 kroundup32(new_size);
1673 new_data = realloc(b->data, new_size);
1674 if (new_data == NULL) goto nomem;
1675 b->m_data = new_size;
1676 b->data = new_data;
1677 }
1678
1679 b->data[b->l_data] = tag[0];
1680 b->data[b->l_data + 1] = tag[1];
1681 b->data[b->l_data + 2] = type;
1682
1683 #ifdef HTS_LITTLE_ENDIAN
1684 memcpy(b->data + b->l_data + 3, data, len);
1685 #else
1686 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1687 errno = EINVAL;
1688 return -1;
1689 }
1690 #endif
1691
1692 b->l_data = new_len;
1693
1694 return 0;
1695
1696 nomem:
1697 errno = ENOMEM;
1698 return -1;
1699 }
1700
skip_aux(uint8_t * s,uint8_t * end)1701 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1702 {
1703 int size;
1704 uint32_t n;
1705 if (s >= end) return end;
1706 size = aux_type2size(*s); ++s; // skip type
1707 switch (size) {
1708 case 'Z':
1709 case 'H':
1710 while (*s && s < end) ++s;
1711 return s < end ? s + 1 : end;
1712 case 'B':
1713 if (end - s < 5) return NULL;
1714 size = aux_type2size(*s); ++s;
1715 n = le_to_u32(s);
1716 s += 4;
1717 if (size == 0 || end - s < size * n) return NULL;
1718 return s + size * n;
1719 case 0:
1720 return NULL;
1721 default:
1722 if (end - s < size) return NULL;
1723 return s + size;
1724 }
1725 }
1726
bam_aux_get(const bam1_t * b,const char tag[2])1727 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1728 {
1729 uint8_t *s, *end, *t = (uint8_t *) tag;
1730 uint16_t y = (uint16_t) t[0]<<8 | t[1];
1731 s = bam_get_aux(b);
1732 end = b->data + b->l_data;
1733 while (s != NULL && end - s >= 3) {
1734 uint16_t x = (uint16_t) s[0]<<8 | s[1];
1735 s += 2;
1736 if (x == y) {
1737 // Check the tag value is valid and complete
1738 uint8_t *e = skip_aux(s, end);
1739 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1740 goto bad_aux; // Unterminated string
1741 }
1742 if (e != NULL) {
1743 return s;
1744 } else {
1745 goto bad_aux;
1746 }
1747 }
1748 s = skip_aux(s, end);
1749 }
1750 if (s == NULL) goto bad_aux;
1751 errno = ENOENT;
1752 return NULL;
1753
1754 bad_aux:
1755 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1756 errno = EINVAL;
1757 return NULL;
1758 }
1759 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1760 int bam_aux_del(bam1_t *b, uint8_t *s)
1761 {
1762 uint8_t *p, *aux;
1763 int l_aux = bam_get_l_aux(b);
1764 aux = bam_get_aux(b);
1765 p = s - 2;
1766 s = skip_aux(s, aux + l_aux);
1767 if (s == NULL) goto bad_aux;
1768 memmove(p, s, l_aux - (s - aux));
1769 b->l_data -= s - p;
1770 return 0;
1771
1772 bad_aux:
1773 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1774 errno = EINVAL;
1775 return -1;
1776 }
1777
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1778 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1779 {
1780 // FIXME: This is not at all efficient!
1781 uint8_t *s = bam_aux_get(b,tag);
1782 if (!s) {
1783 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1784 return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1785 } else { // Invalid aux data, give up.
1786 return -1;
1787 }
1788 }
1789 char type = *s;
1790 if (type != 'Z') {
1791 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1792 errno = EINVAL;
1793 return -1;
1794 }
1795
1796 bam_aux_del(b,s);
1797 s -= 2;
1798 int l_aux = bam_get_l_aux(b);
1799
1800 b->l_data += 3 + len;
1801 if (b->m_data < b->l_data) {
1802 ptrdiff_t s_offset = s - b->data;
1803 b->m_data = b->l_data;
1804 kroundup32(b->m_data);
1805 b->data = (uint8_t*)realloc(b->data, b->m_data);
1806 s = b->data + s_offset;
1807 }
1808 memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1809 s[0] = tag[0];
1810 s[1] = tag[1];
1811 s[2] = type;
1812 memmove(s+3,data,len);
1813 return 0;
1814 }
1815
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1816 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1817 uint32_t idx)
1818 {
1819 switch (type) {
1820 case 'c': return le_to_i8(s + idx);
1821 case 'C': return s[idx];
1822 case 's': return le_to_i16(s + 2 * idx);
1823 case 'S': return le_to_u16(s + 2 * idx);
1824 case 'i': return le_to_i32(s + 4 * idx);
1825 case 'I': return le_to_u32(s + 4 * idx);
1826 default:
1827 errno = EINVAL;
1828 return 0;
1829 }
1830 }
1831
bam_aux2i(const uint8_t * s)1832 int64_t bam_aux2i(const uint8_t *s)
1833 {
1834 int type;
1835 type = *s++;
1836 return get_int_aux_val(type, s, 0);
1837 }
1838
bam_aux2f(const uint8_t * s)1839 double bam_aux2f(const uint8_t *s)
1840 {
1841 int type;
1842 type = *s++;
1843 if (type == 'd') return le_to_double(s);
1844 else if (type == 'f') return le_to_float(s);
1845 else return get_int_aux_val(type, s, 0);
1846 }
1847
bam_aux2A(const uint8_t * s)1848 char bam_aux2A(const uint8_t *s)
1849 {
1850 int type;
1851 type = *s++;
1852 if (type == 'A') return *(char*)s;
1853 errno = EINVAL;
1854 return 0;
1855 }
1856
bam_aux2Z(const uint8_t * s)1857 char *bam_aux2Z(const uint8_t *s)
1858 {
1859 int type;
1860 type = *s++;
1861 if (type == 'Z' || type == 'H') return (char*)s;
1862 errno = EINVAL;
1863 return 0;
1864 }
1865
bam_auxB_len(const uint8_t * s)1866 uint32_t bam_auxB_len(const uint8_t *s)
1867 {
1868 if (s[0] != 'B') {
1869 errno = EINVAL;
1870 return 0;
1871 }
1872 return le_to_u32(s + 2);
1873 }
1874
bam_auxB2i(const uint8_t * s,uint32_t idx)1875 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
1876 {
1877 uint32_t len = bam_auxB_len(s);
1878 if (idx >= len) {
1879 errno = ERANGE;
1880 return 0;
1881 }
1882 return get_int_aux_val(s[1], s + 6, idx);
1883 }
1884
bam_auxB2f(const uint8_t * s,uint32_t idx)1885 double bam_auxB2f(const uint8_t *s, uint32_t idx)
1886 {
1887 uint32_t len = bam_auxB_len(s);
1888 if (idx >= len) {
1889 errno = ERANGE;
1890 return 0.0;
1891 }
1892 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
1893 else return get_int_aux_val(s[1], s + 6, idx);
1894 }
1895
sam_open_mode(char * mode,const char * fn,const char * format)1896 int sam_open_mode(char *mode, const char *fn, const char *format)
1897 {
1898 // TODO Parse "bam5" etc for compression level
1899 if (format == NULL) {
1900 // Try to pick a format based on the filename extension
1901 const char *ext = fn? strrchr(fn, '.') : NULL;
1902 if (ext == NULL || strchr(ext, '/')) return -1;
1903 return sam_open_mode(mode, fn, ext+1);
1904 }
1905 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1906 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1907 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1908 else return -1;
1909
1910 return 0;
1911 }
1912
1913 // A version of sam_open_mode that can handle ,key=value options.
1914 // The format string is allocated and returned, to be freed by the caller.
1915 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)1916 char *sam_open_mode_opts(const char *fn,
1917 const char *mode,
1918 const char *format)
1919 {
1920 char *mode_opts = malloc((format ? strlen(format) : 1) +
1921 (mode ? strlen(mode) : 1) + 12);
1922 char *opts, *cp;
1923 int format_len;
1924
1925 if (!mode_opts)
1926 return NULL;
1927
1928 strcpy(mode_opts, mode ? mode : "r");
1929 cp = mode_opts + strlen(mode_opts);
1930
1931 if (format == NULL) {
1932 // Try to pick a format based on the filename extension
1933 const char *ext = fn? strrchr(fn, '.') : NULL;
1934 if (ext == NULL || strchr(ext, '/')) {
1935 free(mode_opts);
1936 return NULL;
1937 }
1938 return sam_open_mode(cp, fn, ext+1)
1939 ? (free(mode_opts), NULL)
1940 : mode_opts;
1941 }
1942
1943 if ((opts = strchr(format, ','))) {
1944 format_len = opts-format;
1945 } else {
1946 opts="";
1947 format_len = strlen(format);
1948 }
1949
1950 if (strncmp(format, "bam", format_len) == 0) {
1951 *cp++ = 'b';
1952 } else if (strncmp(format, "cram", format_len) == 0) {
1953 *cp++ = 'c';
1954 } else if (strncmp(format, "cram2", format_len) == 0) {
1955 *cp++ = 'c';
1956 strcpy(cp, ",VERSION=2.1");
1957 cp += 12;
1958 } else if (strncmp(format, "cram3", format_len) == 0) {
1959 *cp++ = 'c';
1960 strcpy(cp, ",VERSION=3.0");
1961 cp += 12;
1962 } else if (strncmp(format, "sam", format_len) == 0) {
1963 ; // format mode=""
1964 } else {
1965 free(mode_opts);
1966 return NULL;
1967 }
1968
1969 strcpy(cp, opts);
1970
1971 return mode_opts;
1972 }
1973
1974 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)1975 int bam_str2flag(const char *str)
1976 {
1977 char *end, *beg = (char*) str;
1978 long int flag = strtol(str, &end, 0);
1979 if ( end!=str ) return flag; // the conversion was successful
1980 flag = 0;
1981 while ( *str )
1982 {
1983 end = beg;
1984 while ( *end && *end!=',' ) end++;
1985 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1986 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1987 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1988 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1989 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1990 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1991 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1992 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
1993 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
1994 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
1995 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
1996 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
1997 else return -1;
1998 if ( !*end ) break;
1999 beg = end + 1;
2000 }
2001 return flag;
2002 }
2003
bam_flag2str(int flag)2004 char *bam_flag2str(int flag)
2005 {
2006 kstring_t str = {0,0,0};
2007 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2008 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2009 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2010 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2011 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2012 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2013 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2014 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2015 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2016 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2017 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2018 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2019 if ( str.l == 0 ) kputsn("", 0, &str);
2020 return str.s;
2021 }
2022
2023
2024 /**************************
2025 *** Pileup and Mpileup ***
2026 **************************/
2027
2028 #if !defined(BAM_NO_PILEUP)
2029
2030 #include <assert.h>
2031
2032 /*******************
2033 *** Memory pool ***
2034 *******************/
2035
2036 typedef struct {
2037 int k, x, y, end;
2038 } cstate_t;
2039
2040 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2041
2042 typedef struct __linkbuf_t {
2043 bam1_t b;
2044 int32_t beg, end;
2045 cstate_t s;
2046 struct __linkbuf_t *next;
2047 bam_pileup_cd cd;
2048 } lbnode_t;
2049
2050 typedef struct {
2051 int cnt, n, max;
2052 lbnode_t **buf;
2053 } mempool_t;
2054
mp_init(void)2055 static mempool_t *mp_init(void)
2056 {
2057 mempool_t *mp;
2058 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2059 return mp;
2060 }
mp_destroy(mempool_t * mp)2061 static void mp_destroy(mempool_t *mp)
2062 {
2063 int k;
2064 for (k = 0; k < mp->n; ++k) {
2065 free(mp->buf[k]->b.data);
2066 free(mp->buf[k]);
2067 }
2068 free(mp->buf);
2069 free(mp);
2070 }
mp_alloc(mempool_t * mp)2071 static inline lbnode_t *mp_alloc(mempool_t *mp)
2072 {
2073 ++mp->cnt;
2074 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2075 else return mp->buf[--mp->n];
2076 }
mp_free(mempool_t * mp,lbnode_t * p)2077 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2078 {
2079 --mp->cnt; p->next = 0; // clear lbnode_t::next here
2080 if (mp->n == mp->max) {
2081 mp->max = mp->max? mp->max<<1 : 256;
2082 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2083 }
2084 mp->buf[mp->n++] = p;
2085 }
2086
2087 /**********************
2088 *** CIGAR resolver ***
2089 **********************/
2090
2091 /* s->k: the index of the CIGAR operator that has just been processed.
2092 s->x: the reference coordinate of the start of s->k
2093 s->y: the query coordiante of the start of s->k
2094 */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2095 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2096 {
2097 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2098 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2099
2100 bam1_t *b = p->b;
2101 bam1_core_t *c = &b->core;
2102 uint32_t *cigar = bam_get_cigar(b);
2103 int k;
2104 // determine the current CIGAR operation
2105 //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2106 if (s->k == -1) { // never processed
2107 if (c->n_cigar == 1) { // just one operation, save a loop
2108 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2109 } else { // find the first match or deletion
2110 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2111 int op = _cop(cigar[k]);
2112 int l = _cln(cigar[k]);
2113 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2114 else if (op == BAM_CREF_SKIP) s->x += l;
2115 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2116 }
2117 assert(k < c->n_cigar);
2118 s->k = k;
2119 }
2120 } else { // the read has been processed before
2121 int op, l = _cln(cigar[s->k]);
2122 if (pos - s->x >= l) { // jump to the next operation
2123 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2124 op = _cop(cigar[s->k+1]);
2125 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2126 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2127 s->x += l;
2128 ++s->k;
2129 } else { // find the next M/D/N/=/X
2130 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2131 s->x += l;
2132 for (k = s->k + 1; k < c->n_cigar; ++k) {
2133 op = _cop(cigar[k]), l = _cln(cigar[k]);
2134 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2135 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2136 }
2137 s->k = k;
2138 }
2139 assert(s->k < c->n_cigar); // otherwise a bug
2140 } // else, do nothing
2141 }
2142 { // collect pileup information
2143 int op, l;
2144 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2145 p->is_del = p->indel = p->is_refskip = 0;
2146 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2147 int op2 = _cop(cigar[s->k+1]);
2148 int l2 = _cln(cigar[s->k+1]);
2149 if (op2 == BAM_CDEL) p->indel = -(int)l2;
2150 else if (op2 == BAM_CINS) p->indel = l2;
2151 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2152 int l3 = 0;
2153 for (k = s->k + 2; k < c->n_cigar; ++k) {
2154 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2155 if (op2 == BAM_CINS) l3 += l2;
2156 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2157 }
2158 if (l3 > 0) p->indel = l3;
2159 }
2160 }
2161 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2162 p->qpos = s->y + (pos - s->x);
2163 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2164 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2165 p->is_refskip = (op == BAM_CREF_SKIP);
2166 } // cannot be other operations; otherwise a bug
2167 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2168 }
2169 return 1;
2170 }
2171
2172 /***********************
2173 *** Pileup iterator ***
2174 ***********************/
2175
2176 // Dictionary of overlapping reads
2177 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2178 typedef khash_t(olap_hash) olap_hash_t;
2179
2180 struct __bam_plp_t {
2181 mempool_t *mp;
2182 lbnode_t *head, *tail;
2183 int32_t tid, pos, max_tid, max_pos;
2184 int is_eof, max_plp, error, maxcnt;
2185 uint64_t id;
2186 bam_pileup1_t *plp;
2187 // for the "auto" interface only
2188 bam1_t *b;
2189 bam_plp_auto_f func;
2190 void *data;
2191 olap_hash_t *overlaps;
2192
2193 // For notification of creation and destruction events
2194 // and associated client-owned pointer.
2195 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2196 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2197 };
2198
bam_plp_init(bam_plp_auto_f func,void * data)2199 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2200 {
2201 bam_plp_t iter;
2202 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2203 iter->mp = mp_init();
2204 iter->head = iter->tail = mp_alloc(iter->mp);
2205 iter->max_tid = iter->max_pos = -1;
2206 iter->maxcnt = 8000;
2207 if (func) {
2208 iter->func = func;
2209 iter->data = data;
2210 iter->b = bam_init1();
2211 }
2212 return iter;
2213 }
2214
bam_plp_init_overlaps(bam_plp_t iter)2215 void bam_plp_init_overlaps(bam_plp_t iter)
2216 {
2217 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
2218 }
2219
bam_plp_destroy(bam_plp_t iter)2220 void bam_plp_destroy(bam_plp_t iter)
2221 {
2222 lbnode_t *p, *pnext;
2223 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2224 for (p = iter->head; p != NULL; p = pnext) {
2225 pnext = p->next;
2226 mp_free(iter->mp, p);
2227 }
2228 mp_destroy(iter->mp);
2229 if (iter->b) bam_destroy1(iter->b);
2230 free(iter->plp);
2231 free(iter);
2232 }
2233
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2234 void bam_plp_constructor(bam_plp_t plp,
2235 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2236 plp->plp_construct = func;
2237 }
2238
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2239 void bam_plp_destructor(bam_plp_t plp,
2240 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2241 plp->plp_destruct = func;
2242 }
2243
2244 //---------------------------------
2245 //--- Tweak overlapping reads
2246 //---------------------------------
2247
2248 /**
2249 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
2250 * cigar_iref2iseq_next() - get the next CMATCH base
2251 * @cigar: pointer to current cigar block (rw)
2252 * @cigar_max: pointer just beyond the last cigar block
2253 * @icig: position within the current cigar block (rw)
2254 * @iseq: position in the sequence (rw)
2255 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2256 *
2257 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2258 */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2259 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2260 {
2261 int pos = *iref;
2262 if ( pos < 0 ) return -1;
2263 *icig = 0;
2264 *iseq = 0;
2265 *iref = 0;
2266 while ( *cigar<cigar_max )
2267 {
2268 int cig = (**cigar) & BAM_CIGAR_MASK;
2269 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2270
2271 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2272 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2273 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2274 {
2275 pos -= ncig;
2276 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2277 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2278 continue;
2279 }
2280 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2281 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2282 {
2283 pos -= ncig;
2284 if ( pos<0 ) pos = 0;
2285 (*cigar)++; *icig = 0; *iref += ncig;
2286 continue;
2287 }
2288 hts_log_error("Unexpected cigar %d", cig);
2289 assert(0);
2290 }
2291 *iseq = -1;
2292 return -1;
2293 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2294 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2295 {
2296 while ( *cigar < cigar_max )
2297 {
2298 int cig = (**cigar) & BAM_CIGAR_MASK;
2299 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2300
2301 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2302 {
2303 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
2304 (*iseq)++; (*icig)++; (*iref)++;
2305 return BAM_CMATCH;
2306 }
2307 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2308 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2309 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2310 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2311 hts_log_error("Unexpected cigar %d", cig);
2312 assert(0);
2313 }
2314 *iseq = -1;
2315 *iref = -1;
2316 return -1;
2317 }
2318
tweak_overlap_quality(bam1_t * a,bam1_t * b)2319 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2320 {
2321 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2322 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2323 int a_icig = 0, a_iseq = 0;
2324 int b_icig = 0, b_iseq = 0;
2325 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2326 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
2327
2328 int iref = b->core.pos;
2329 int a_iref = iref - a->core.pos;
2330 int b_iref = iref - b->core.pos;
2331 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2332 if ( a_ret<0 ) return; // no overlap
2333 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2334 if ( b_ret<0 ) return; // no overlap
2335
2336 #if DBG
2337 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2338 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2339 #endif
2340
2341 while ( 1 )
2342 {
2343 // Increment reference position
2344 while ( a_iref>=0 && a_iref < iref - a->core.pos )
2345 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2346 if ( a_ret<0 ) break; // done
2347 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2348
2349 while ( b_iref>=0 && b_iref < iref - b->core.pos )
2350 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2351 if ( b_ret<0 ) break; // done
2352 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2353
2354 iref++;
2355 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
2356
2357 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2358 {
2359 #if DBG
2360 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2361 #endif
2362 // we are very confident about this base
2363 int qual = a_qual[a_iseq] + b_qual[b_iseq];
2364 a_qual[a_iseq] = qual>200 ? 200 : qual;
2365 b_qual[b_iseq] = 0;
2366 }
2367 else
2368 {
2369 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2370 {
2371 #if DBG
2372 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2373 #endif
2374 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
2375 b_qual[b_iseq] = 0;
2376 }
2377 else
2378 {
2379 #if DBG
2380 fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2381 #endif
2382 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2383 a_qual[a_iseq] = 0;
2384 }
2385 }
2386 }
2387 #if DBG
2388 fprintf(stderr,"\n");
2389 #endif
2390 }
2391
2392 // Fix overlapping reads. Simple soft-clipping did not give good results.
2393 // Lowering qualities of unwanted bases is more selective and works better.
2394 //
overlap_push(bam_plp_t iter,lbnode_t * node)2395 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2396 {
2397 if ( !iter->overlaps ) return;
2398
2399 // mapped mates and paired reads only
2400 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2401
2402 // no overlap possible, unless some wild cigar
2403 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2404
2405 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2406 if ( kitr==kh_end(iter->overlaps) )
2407 {
2408 int ret;
2409 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2410 kh_value(iter->overlaps, kitr) = node;
2411 }
2412 else
2413 {
2414 lbnode_t *a = kh_value(iter->overlaps, kitr);
2415 tweak_overlap_quality(&a->b, &node->b);
2416 kh_del(olap_hash, iter->overlaps, kitr);
2417 assert(a->end-1 == a->s.end);
2418 a->end = bam_endpos(&a->b);
2419 a->s.end = a->end - 1;
2420 }
2421 }
2422
overlap_remove(bam_plp_t iter,const bam1_t * b)2423 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2424 {
2425 if ( !iter->overlaps ) return;
2426
2427 khiter_t kitr;
2428 if ( b )
2429 {
2430 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2431 if ( kitr!=kh_end(iter->overlaps) )
2432 kh_del(olap_hash, iter->overlaps, kitr);
2433 }
2434 else
2435 {
2436 // remove all
2437 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2438 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2439 }
2440 }
2441
2442
2443
2444 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2445 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2446 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2447 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2448 {
2449 if (iter->error) { *_n_plp = -1; return NULL; }
2450 *_n_plp = 0;
2451 if (iter->is_eof && iter->head == iter->tail) return NULL;
2452 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2453 int n_plp = 0;
2454 // write iter->plp at iter->pos
2455 lbnode_t **pptr = &iter->head;
2456 while (*pptr != iter->tail) {
2457 lbnode_t *p = *pptr;
2458 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2459 overlap_remove(iter, &p->b);
2460 if (iter->plp_destruct)
2461 iter->plp_destruct(iter->data, &p->b, &p->cd);
2462 *pptr = p->next; mp_free(iter->mp, p);
2463 }
2464 else {
2465 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2466 if (n_plp == iter->max_plp) { // then double the capacity
2467 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2468 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2469 }
2470 iter->plp[n_plp].b = &p->b;
2471 iter->plp[n_plp].cd = p->cd;
2472 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2473 }
2474 pptr = &(*pptr)->next;
2475 }
2476 }
2477 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2478 // update iter->tid and iter->pos
2479 if (iter->head != iter->tail) {
2480 if (iter->tid > iter->head->b.core.tid) {
2481 hts_log_error("Unsorted input. Pileup aborts");
2482 iter->error = 1;
2483 *_n_plp = -1;
2484 return NULL;
2485 }
2486 }
2487 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2488 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2489 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2490 iter->pos = iter->head->beg; // jump to the next position
2491 } else ++iter->pos; // scan contiguously
2492 // return
2493 if (n_plp) return iter->plp;
2494 if (iter->is_eof && iter->head == iter->tail) break;
2495 }
2496 return NULL;
2497 }
2498
bam_plp_push(bam_plp_t iter,const bam1_t * b)2499 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2500 {
2501 if (iter->error) return -1;
2502 if (b) {
2503 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2504 // Skip only unmapped reads here, any additional filtering must be done in iter->func
2505 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2506 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2507 {
2508 overlap_remove(iter, b);
2509 return 0;
2510 }
2511 bam_copy1(&iter->tail->b, b);
2512 overlap_push(iter, iter->tail);
2513 #ifndef BAM_NO_ID
2514 iter->tail->b.id = iter->id++;
2515 #endif
2516 iter->tail->beg = b->core.pos;
2517 iter->tail->end = bam_endpos(b);
2518 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2519 if (b->core.tid < iter->max_tid) {
2520 hts_log_error("The input is not sorted (chromosomes out of order)");
2521 iter->error = 1;
2522 return -1;
2523 }
2524 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2525 hts_log_error("The input is not sorted (reads out of order)");
2526 iter->error = 1;
2527 return -1;
2528 }
2529 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2530 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2531 if (iter->plp_construct)
2532 iter->plp_construct(iter->data, b, &iter->tail->cd);
2533 iter->tail->next = mp_alloc(iter->mp);
2534 iter->tail = iter->tail->next;
2535 }
2536 } else iter->is_eof = 1;
2537 return 0;
2538 }
2539
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2540 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2541 {
2542 const bam_pileup1_t *plp;
2543 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2544 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2545 else { // no pileup line can be obtained; read alignments
2546 *_n_plp = 0;
2547 if (iter->is_eof) return 0;
2548 int ret;
2549 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2550 if (bam_plp_push(iter, iter->b) < 0) {
2551 *_n_plp = -1;
2552 return 0;
2553 }
2554 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2555 // otherwise no pileup line can be returned; read the next alignment.
2556 }
2557 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2558 bam_plp_push(iter, 0);
2559 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2560 return 0;
2561 }
2562 }
2563
bam_plp_reset(bam_plp_t iter)2564 void bam_plp_reset(bam_plp_t iter)
2565 {
2566 overlap_remove(iter, NULL);
2567 iter->max_tid = iter->max_pos = -1;
2568 iter->tid = iter->pos = 0;
2569 iter->is_eof = 0;
2570 while (iter->head != iter->tail) {
2571 lbnode_t *p = iter->head;
2572 iter->head = p->next;
2573 mp_free(iter->mp, p);
2574 }
2575 }
2576
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2577 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2578 {
2579 iter->maxcnt = maxcnt;
2580 }
2581
2582 /************************
2583 *** Mpileup iterator ***
2584 ************************/
2585
2586 struct __bam_mplp_t {
2587 int n;
2588 uint64_t min, *pos;
2589 bam_plp_t *iter;
2590 int *n_plp;
2591 const bam_pileup1_t **plp;
2592 };
2593
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2594 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2595 {
2596 int i;
2597 bam_mplp_t iter;
2598 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2599 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2600 iter->n_plp = (int*)calloc(n, sizeof(int));
2601 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2602 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2603 iter->n = n;
2604 iter->min = (uint64_t)-1;
2605 for (i = 0; i < n; ++i) {
2606 iter->iter[i] = bam_plp_init(func, data[i]);
2607 iter->pos[i] = iter->min;
2608 }
2609 return iter;
2610 }
2611
bam_mplp_init_overlaps(bam_mplp_t iter)2612 void bam_mplp_init_overlaps(bam_mplp_t iter)
2613 {
2614 int i;
2615 for (i = 0; i < iter->n; ++i)
2616 bam_plp_init_overlaps(iter->iter[i]);
2617 }
2618
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2619 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2620 {
2621 int i;
2622 for (i = 0; i < iter->n; ++i)
2623 iter->iter[i]->maxcnt = maxcnt;
2624 }
2625
bam_mplp_destroy(bam_mplp_t iter)2626 void bam_mplp_destroy(bam_mplp_t iter)
2627 {
2628 int i;
2629 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2630 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2631 free(iter);
2632 }
2633
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2634 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2635 {
2636 int i, ret = 0;
2637 uint64_t new_min = (uint64_t)-1;
2638 for (i = 0; i < iter->n; ++i) {
2639 if (iter->pos[i] == iter->min) {
2640 int tid, pos;
2641 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2642 if ( iter->iter[i]->error ) return -1;
2643 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2644 }
2645 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2646 }
2647 iter->min = new_min;
2648 if (new_min == (uint64_t)-1) return 0;
2649 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2650 for (i = 0; i < iter->n; ++i) {
2651 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2652 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2653 ++ret;
2654 } else n_plp[i] = 0, plp[i] = 0;
2655 }
2656 return ret;
2657 }
2658
bam_mplp_reset(bam_mplp_t iter)2659 void bam_mplp_reset(bam_mplp_t iter)
2660 {
2661 int i;
2662 iter->min = (uint64_t)-1;
2663 for (i = 0; i < iter->n; ++i) {
2664 bam_plp_reset(iter->iter[i]);
2665 iter->pos[i] = (uint64_t)-1;
2666 iter->n_plp[i] = 0;
2667 iter->plp[i] = NULL;
2668 }
2669 }
2670
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2671 void bam_mplp_constructor(bam_mplp_t iter,
2672 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2673 int i;
2674 for (i = 0; i < iter->n; ++i)
2675 bam_plp_constructor(iter->iter[i], func);
2676 }
2677
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2678 void bam_mplp_destructor(bam_mplp_t iter,
2679 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2680 int i;
2681 for (i = 0; i < iter->n; ++i)
2682 bam_plp_destructor(iter->iter[i], func);
2683 }
2684
2685 #endif // ~!defined(BAM_NO_PILEUP)
2686