1 /*  sam.c -- SAM and BAM file I/O and manipulation.
2 
3     Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4     Copyright (C) 2010, 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41 
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44 
45 typedef khash_t(s2i) sdict_t;
46 
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50 
51 /**********************
52  *** BAM header I/O ***
53  **********************/
54 
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57     return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59 
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62     int32_t i;
63     if (h == NULL) return;
64     if (h->target_name) {
65         for (i = 0; i < h->n_targets; ++i)
66             free(h->target_name[i]);
67         free(h->target_name);
68         free(h->target_len);
69     }
70     free(h->text); free(h->cigar_tab);
71     if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72     free(h);
73 }
74 
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77     if (h0 == NULL) return NULL;
78     bam_hdr_t *h;
79     if ((h = bam_hdr_init()) == NULL) return NULL;
80     // copy the simple data
81     h->n_targets = h0->n_targets;
82     h->ignore_sam_err = h0->ignore_sam_err;
83     h->l_text = h0->l_text;
84     // Then the pointery stuff
85     h->cigar_tab = NULL;
86     h->sdict = NULL;
87     // TODO Check for memory allocation failures
88     h->text = (char*)calloc(h->l_text + 1, 1);
89     memcpy(h->text, h0->text, h->l_text);
90     h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91     h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92     int i;
93     for (i = 0; i < h->n_targets; ++i) {
94         h->target_len[i] = h0->target_len[i];
95         h->target_name[i] = strdup(h0->target_name[i]);
96     }
97     return h;
98 }
99 
100 
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103     bam_hdr_t *h;
104     khint_t k;
105     h = bam_hdr_init();
106     h->sdict = d;
107     h->n_targets = kh_size(d);
108     // TODO Check for memory allocation failures
109     h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110     h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111     for (k = kh_begin(d); k != kh_end(d); ++k) {
112         if (!kh_exist(d, k)) continue;
113         h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114         h->target_len[kh_val(d, k)>>32]  = kh_val(d, k) & 0xffffffffUL;
115         kh_val(d, k) >>= 32;
116     }
117     return h;
118 }
119 
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122     bam_hdr_t *h;
123     char buf[4];
124     int magic_len, has_EOF;
125     int32_t i, name_len, num_names = 0;
126     size_t bufsize;
127     ssize_t bytes;
128     // check EOF
129     has_EOF = bgzf_check_EOF(fp);
130     if (has_EOF < 0) {
131         perror("[W::bam_hdr_read] bgzf_check_EOF");
132     } else if (has_EOF == 0) {
133         hts_log_warning("EOF marker is absent. The input is probably truncated");
134     }
135     // read "BAM1"
136     magic_len = bgzf_read(fp, buf, 4);
137     if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138         hts_log_error("Invalid BAM binary header");
139         return 0;
140     }
141     h = bam_hdr_init();
142     if (!h) goto nomem;
143 
144     // read plain text and the number of reference sequences
145     bytes = bgzf_read(fp, &h->l_text, 4);
146     if (bytes != 4) goto read_err;
147     if (fp->is_be) ed_swap_4p(&h->l_text);
148 
149     bufsize = ((size_t) h->l_text) + 1;
150     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151     h->text = (char*)malloc(bufsize);
152     if (!h->text) goto nomem;
153     h->text[h->l_text] = 0; // make sure it is NULL terminated
154     bytes = bgzf_read(fp, h->text, h->l_text);
155     if (bytes != h->l_text) goto read_err;
156 
157     bytes = bgzf_read(fp, &h->n_targets, 4);
158     if (bytes != 4) goto read_err;
159     if (fp->is_be) ed_swap_4p(&h->n_targets);
160 
161     if (h->n_targets < 0) goto invalid;
162 
163     // read reference sequence names and lengths
164     if (h->n_targets > 0) {
165         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166         if (!h->target_name) goto nomem;
167         h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168         if (!h->target_len) goto nomem;
169     }
170     else {
171         h->target_name = NULL;
172         h->target_len = NULL;
173     }
174 
175     for (i = 0; i != h->n_targets; ++i) {
176         bytes = bgzf_read(fp, &name_len, 4);
177         if (bytes != 4) goto read_err;
178         if (fp->is_be) ed_swap_4p(&name_len);
179         if (name_len <= 0) goto invalid;
180 
181         h->target_name[i] = (char*)malloc(name_len);
182         if (!h->target_name[i]) goto nomem;
183         num_names++;
184 
185         bytes = bgzf_read(fp, h->target_name[i], name_len);
186         if (bytes != name_len) goto read_err;
187 
188         if (h->target_name[i][name_len - 1] != '\0') {
189             /* Fix missing NUL-termination.  Is this being too nice?
190                We could alternatively bail out with an error. */
191             char *new_name;
192             if (name_len == INT32_MAX) goto invalid;
193             new_name = realloc(h->target_name[i], name_len + 1);
194             if (new_name == NULL) goto nomem;
195             h->target_name[i] = new_name;
196             h->target_name[i][name_len] = '\0';
197         }
198 
199         bytes = bgzf_read(fp, &h->target_len[i], 4);
200         if (bytes != 4) goto read_err;
201         if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202     }
203     return h;
204 
205  nomem:
206     hts_log_error("Out of memory");
207     goto clean;
208 
209  read_err:
210     if (bytes < 0) {
211         hts_log_error("Error reading BGZF stream");
212     } else {
213         hts_log_error("Truncated BAM header");
214     }
215     goto clean;
216 
217  invalid:
218     hts_log_error("Invalid BAM binary header");
219 
220  clean:
221     if (h != NULL) {
222         h->n_targets = num_names; // ensure we free only allocated target_names
223         bam_hdr_destroy(h);
224     }
225     return NULL;
226 }
227 
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230     char buf[4];
231     int32_t i, name_len, x;
232     // write "BAM1"
233     strncpy(buf, "BAM\1", 4);
234     if (bgzf_write(fp, buf, 4) < 0) return -1;
235     // write plain text and the number of reference sequences
236     if (fp->is_be) {
237         x = ed_swap_4(h->l_text);
238         if (bgzf_write(fp, &x, 4) < 0) return -1;
239         if (h->l_text) {
240             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
241         }
242         x = ed_swap_4(h->n_targets);
243         if (bgzf_write(fp, &x, 4) < 0) return -1;
244     } else {
245         if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
246         if (h->l_text) {
247             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
248         }
249         if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
250     }
251     // write sequence names and lengths
252     for (i = 0; i != h->n_targets; ++i) {
253         char *p = h->target_name[i];
254         name_len = strlen(p) + 1;
255         if (fp->is_be) {
256             x = ed_swap_4(name_len);
257             if (bgzf_write(fp, &x, 4) < 0) return -1;
258         } else {
259             if (bgzf_write(fp, &name_len, 4) < 0) return -1;
260         }
261         if (bgzf_write(fp, p, name_len) < 0) return -1;
262         if (fp->is_be) {
263             x = ed_swap_4(h->target_len[i]);
264             if (bgzf_write(fp, &x, 4) < 0) return -1;
265         } else {
266             if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
267         }
268     }
269     if (bgzf_flush(fp) < 0) return -1;
270     return 0;
271 }
272 
bam_name2id(bam_hdr_t * h,const char * ref)273 int bam_name2id(bam_hdr_t *h, const char *ref)
274 {
275     sdict_t *d = (sdict_t*)h->sdict;
276     khint_t k;
277     if (h->sdict == 0) {
278         int i, absent;
279         d = kh_init(s2i);
280         for (i = 0; i < h->n_targets; ++i) {
281             k = kh_put(s2i, d, h->target_name[i], &absent);
282             kh_val(d, k) = i;
283         }
284         h->sdict = d;
285     }
286     k = kh_get(s2i, d, ref);
287     return k == kh_end(d)? -1 : kh_val(d, k);
288 }
289 
290 /*************************
291  *** BAM alignment I/O ***
292  *************************/
293 
bam_init1()294 bam1_t *bam_init1()
295 {
296     return (bam1_t*)calloc(1, sizeof(bam1_t));
297 }
298 
bam_destroy1(bam1_t * b)299 void bam_destroy1(bam1_t *b)
300 {
301     if (b == 0) return;
302     free(b->data); free(b);
303 }
304 
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)305 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
306 {
307     uint8_t *data = bdst->data;
308     int m_data = bdst->m_data;   // backup data and m_data
309     if (m_data < bsrc->l_data) { // double the capacity
310         m_data = bsrc->l_data; kroundup32(m_data);
311         data = (uint8_t*)realloc(data, m_data);
312     }
313     memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
314     *bdst = *bsrc; // copy the rest
315     // restore the backup
316     bdst->m_data = m_data;
317     bdst->data = data;
318     return bdst;
319 }
320 
bam_dup1(const bam1_t * bsrc)321 bam1_t *bam_dup1(const bam1_t *bsrc)
322 {
323     if (bsrc == NULL) return NULL;
324     bam1_t *bdst = bam_init1();
325     if (bdst == NULL) return NULL;
326     return bam_copy1(bdst, bsrc);
327 }
328 
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)329 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
330 {
331     int k;
332     *rlen = *qlen = 0;
333     for (k = 0; k < n_cigar; ++k) {
334         int type = bam_cigar_type(bam_cigar_op(cigar[k]));
335         int len = bam_cigar_oplen(cigar[k]);
336         if (type & 1) *qlen += len;
337         if (type & 2) *rlen += len;
338     }
339 }
340 
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)341 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
342 {
343     int k, l;
344     for (k = l = 0; k < n_cigar; ++k)
345         if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
346             l += bam_cigar_oplen(cigar[k]);
347     return l;
348 }
349 
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)350 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
351 {
352     int k, l;
353     for (k = l = 0; k < n_cigar; ++k)
354         if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
355             l += bam_cigar_oplen(cigar[k]);
356     return l;
357 }
358 
bam_endpos(const bam1_t * b)359 int32_t bam_endpos(const bam1_t *b)
360 {
361     if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
362         return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
363     else
364         return b->core.pos + 1;
365 }
366 
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)367 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
368 {
369     bam1_core_t *c = &b->core;
370     uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
371     uint8_t *CG;
372 
373     // test where there is a real CIGAR in the CG tag to move
374     if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
375     cigar0 = bam_get_cigar(b);
376     if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
377     fake_bytes = c->n_cigar * 4;
378     if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
379     if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
380     CG_len = le_to_u32(CG + 2);
381     if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
382 
383     // move from the CG tag to the right position
384     cigar_st = (uint8_t*)cigar0 - b->data;
385     c->n_cigar = CG_len;
386     n_cigar4 = c->n_cigar * 4;
387     CG_st = CG - b->data - 2;
388     CG_en = CG_st + 8 + n_cigar4;
389     b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
390     if (b->m_data < b->l_data) {
391         uint8_t *new_data;
392         uint32_t new_max = b->l_data;
393         kroundup32(new_max);
394         new_data = (uint8_t*)realloc(b->data, new_max);
395         if (new_data == 0) return -1;
396         b->m_data = new_max, b->data = new_data;
397     }
398     memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
399     memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
400     if (ori_len > CG_en) // move data after the CG tag
401         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
402     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
403     if (recal_bin)
404         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
405     if (give_warning)
406         hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
407     return 1;
408 }
409 
aux_type2size(uint8_t type)410 static inline int aux_type2size(uint8_t type)
411 {
412     switch (type) {
413     case 'A': case 'c': case 'C':
414         return 1;
415     case 's': case 'S':
416         return 2;
417     case 'i': case 'I': case 'f':
418         return 4;
419     case 'd':
420         return 8;
421     case 'Z': case 'H': case 'B':
422         return type;
423     default:
424         return 0;
425     }
426 }
427 
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)428 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
429 {
430     uint32_t *cigar = (uint32_t*)(data + c->l_qname);
431     uint32_t i;
432     for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
433 }
434 
bam_read1(BGZF * fp,bam1_t * b)435 int bam_read1(BGZF *fp, bam1_t *b)
436 {
437     bam1_core_t *c = &b->core;
438     int32_t block_len, ret, i;
439     uint32_t x[8];
440     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
441         if (ret == 0) return -1; // normal end-of-file
442         else return -2; // truncated
443     }
444     if (fp->is_be)
445         ed_swap_4p(&block_len);
446     if (block_len < 32) return -4;  // block_len includes core data
447     if (bgzf_read(fp, x, 32) != 32) return -3;
448     if (fp->is_be) {
449         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
450     }
451     c->tid = x[0]; c->pos = x[1];
452     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
453     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
454     if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
455         return -4;
456     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
457     c->l_qseq = x[4];
458     c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
459     b->l_data = block_len - 32 + c->l_extranul;
460     if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
461     if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
462         + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) b->l_data)
463         return -4;
464     if (b->m_data < b->l_data) {
465         uint8_t *new_data;
466         uint32_t new_m = b->l_data;
467         kroundup32(new_m);
468         new_data = (uint8_t*)realloc(b->data, new_m);
469         if (!new_data)
470             return -4;
471         b->data = new_data;
472         b->m_data = new_m;
473     }
474     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
475     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
476     c->l_qname += c->l_extranul;
477     if (b->l_data < c->l_qname ||
478         bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
479         return -4;
480     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
481     if (bam_tag2cigar(b, 0, 0) < 0)
482         return -4;
483 
484     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
485         int rlen, qlen;
486         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
487         if ((b->core.flag & BAM_FUNMAP)) rlen=1;
488         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
489         // Sanity check for broken CIGAR alignments
490         if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
491             hts_log_error("CIGAR and query sequence lengths differ for %s",
492                     bam_get_qname(b));
493             return -4;
494         }
495     }
496 
497     return 4 + block_len;
498 }
499 
bam_write1(BGZF * fp,const bam1_t * b)500 int bam_write1(BGZF *fp, const bam1_t *b)
501 {
502     const bam1_core_t *c = &b->core;
503     uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
504     int i, ok;
505     if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
506     x[0] = c->tid;
507     x[1] = c->pos;
508     x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
509     if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
510     else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
511     x[4] = c->l_qseq;
512     x[5] = c->mtid;
513     x[6] = c->mpos;
514     x[7] = c->isize;
515     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
516     if (fp->is_be) {
517         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
518         y = block_len;
519         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
520         swap_data(c, b->l_data, b->data, 1);
521     } else {
522         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
523     }
524     if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
525     if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
526     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
527         if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
528     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
529         uint8_t buf[8];
530         uint32_t cigar_st, cigar_en, cigar[2];
531         cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
532         cigar_en = cigar_st + c->n_cigar * 4;
533         cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
534         cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
535         u32_to_le(cigar[0], buf);
536         u32_to_le(cigar[1], buf + 4);
537         if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
538         if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
539         if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
540         u32_to_le(c->n_cigar, buf);
541         if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
542         if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
543     }
544     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
545     return ok? 4 + block_len : -1;
546 }
547 
548 /********************
549  *** BAM indexing ***
550  ********************/
551 
bam_index(BGZF * fp,int min_shift)552 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
553 {
554     int n_lvls, i, fmt, ret;
555     bam1_t *b;
556     hts_idx_t *idx;
557     bam_hdr_t *h;
558     h = bam_hdr_read(fp);
559     if (h == NULL) return NULL;
560     if (min_shift > 0) {
561         int64_t max_len = 0, s;
562         for (i = 0; i < h->n_targets; ++i)
563             if (max_len < h->target_len[i]) max_len = h->target_len[i];
564         max_len += 256;
565         for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
566         fmt = HTS_FMT_CSI;
567     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
568     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
569     bam_hdr_destroy(h);
570     b = bam_init1();
571     while ((ret = bam_read1(fp, b)) >= 0) {
572         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
573         if (ret < 0) goto err; // unsorted
574     }
575     if (ret < -1) goto err; // corrupted BAM file
576 
577     hts_idx_finish(idx, bgzf_tell(fp));
578     bam_destroy1(b);
579     return idx;
580 
581 err:
582     bam_destroy1(b);
583     hts_idx_destroy(idx);
584     return NULL;
585 }
586 
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)587 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
588 {
589     hts_idx_t *idx;
590     htsFile *fp;
591     int ret = 0;
592 
593     if ((fp = hts_open(fn, "r")) == 0) return -2;
594     if (nthreads)
595         hts_set_threads(fp, nthreads);
596 
597     switch (fp->format.format) {
598     case cram:
599         ret = cram_index_build(fp->fp.cram, fn, fnidx);
600         break;
601 
602     case bam:
603         idx = bam_index(fp->fp.bgzf, min_shift);
604         if (idx) {
605             ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
606             if (ret < 0) ret = -4;
607             hts_idx_destroy(idx);
608         }
609         else ret = -1;
610         break;
611 
612     default:
613         ret = -3;
614         break;
615     }
616     hts_close(fp);
617 
618     return ret;
619 }
620 
sam_index_build2(const char * fn,const char * fnidx,int min_shift)621 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
622 {
623     return sam_index_build3(fn, fnidx, min_shift, 0);
624 }
625 
sam_index_build(const char * fn,int min_shift)626 int sam_index_build(const char *fn, int min_shift)
627 {
628     return sam_index_build3(fn, NULL, min_shift, 0);
629 }
630 
631 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
632 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)633 int bam_index_build(const char *fn, int min_shift)
634 {
635     return sam_index_build2(fn, NULL, min_shift);
636 }
637 
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)638 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
639 {
640     bam1_t *b = bv;
641     int ret;
642     if ((ret = bam_read1(fp, b)) >= 0) {
643         *tid = b->core.tid;
644         *beg = b->core.pos;
645         *end = bam_endpos(b);
646     }
647     return ret;
648 }
649 
650 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)651 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
652 {
653     htsFile *fp = fpv;
654     bam1_t *b = bv;
655     int ret = cram_get_bam_seq(fp->fp.cram, &b);
656     if (ret < 0)
657         return cram_eof(fp->fp.cram) ? -1 : -2;
658 
659     if (bam_tag2cigar(b, 1, 1) < 0)
660         return -2;
661 
662     *tid = b->core.tid;
663     *beg = b->core.pos;
664     *end = bam_endpos(b);
665 
666     return ret;
667 }
668 
cram_pseek(void * fp,int64_t offset,int whence)669 static int cram_pseek(void *fp, int64_t offset, int whence)
670 {
671     cram_fd *fd =  (cram_fd *)fp;
672 
673     if ((0 != cram_seek(fd, offset, SEEK_SET))
674      && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
675         return -1;
676 
677     if (fd->ctr) {
678         cram_free_container(fd->ctr);
679         if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
680             cram_free_container(fd->ctr_mt);
681 
682         fd->ctr = NULL;
683         fd->ctr_mt = NULL;
684         fd->ooc = 0;
685     }
686 
687     return 0;
688 }
689 
690 /*
691  * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
692  *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
693  *   container previously fetched. It was designed like this to integrate with the functionality
694  *   of the iterator stepping logic.
695  */
696 
cram_ptell(void * fp)697 static int64_t cram_ptell(void *fp)
698 {
699     cram_fd *fd = (cram_fd *)fp;
700     cram_container *c;
701     int64_t ret = -1L;
702 
703     if (fd && fd->fp) {
704         ret = htell(fd->fp);
705         if ((c = fd->ctr) != NULL) {
706             ret -= ((c->curr_slice < c->max_slice || c->curr_rec < c->num_records) ? c->offset + 1 : 0);
707         }
708     }
709 
710     return ret;
711 }
712 
bam_pseek(void * fp,int64_t offset,int whence)713 static int bam_pseek(void *fp, int64_t offset, int whence)
714 {
715     BGZF *fd = (BGZF *)fp;
716 
717     return bgzf_seek(fd, offset, whence);
718 }
719 
bam_ptell(void * fp)720 static int64_t bam_ptell(void *fp)
721 {
722     BGZF *fd = (BGZF *)fp;
723     if (!fd)
724         return -1L;
725 
726     return bgzf_tell(fd);
727 }
728 
729 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)730 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
731 {
732     htsFile *fp = fpv;
733     bam1_t *b = bv;
734     switch (fp->format.format) {
735     case bam:   return bam_read1(bgzfp, b);
736     case cram: {
737         int ret = cram_get_bam_seq(fp->fp.cram, &b);
738         if (ret < 0)
739             return cram_eof(fp->fp.cram) ? -1 : -2;
740 
741         if (bam_tag2cigar(b, 1, 1) < 0)
742             return -2;
743         return ret;
744     }
745     default:
746         // TODO Need headers available to implement this for SAM files
747         hts_log_error("Not implemented for SAM files");
748         abort();
749     }
750 }
751 
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)752 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
753 {
754     switch (fp->format.format) {
755     case bam:
756         return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
757 
758     case cram: {
759         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
760         // Cons up a fake "index" just pointing at the associated cram_fd:
761         hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
762         if (idx == NULL) return NULL;
763         idx->fmt = HTS_FMT_CRAI;
764         idx->cram = fp->fp.cram;
765         return (hts_idx_t *) idx;
766         }
767 
768     default:
769         return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
770     }
771 }
772 
sam_index_load(htsFile * fp,const char * fn)773 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
774 {
775     return sam_index_load2(fp, fn, NULL);
776 }
777 
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)778 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
779 {
780     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
781     hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
782     if (iter == NULL) return NULL;
783 
784     // Cons up a dummy iterator for which hts_itr_next() will simply invoke
785     // the readrec function:
786     iter->is_cram = 1;
787     iter->read_rest = 1;
788     iter->off = NULL;
789     iter->bins.a = NULL;
790     iter->readrec = readrec;
791 
792     if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
793         cram_range r = { tid, beg+1, end };
794         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
795 
796         iter->curr_off = 0;
797         // The following fields are not required by hts_itr_next(), but are
798         // filled in in case user code wants to look at them.
799         iter->tid = tid;
800         iter->beg = beg;
801         iter->end = end;
802 
803         switch (ret) {
804         case 0:
805             break;
806 
807         case -2:
808             // No data vs this ref, so mark iterator as completed.
809             // Same as HTS_IDX_NONE.
810             iter->finished = 1;
811             break;
812 
813         default:
814             free(iter);
815             return NULL;
816         }
817     }
818     else switch (tid) {
819     case HTS_IDX_REST:
820         iter->curr_off = 0;
821         break;
822     case HTS_IDX_NONE:
823         iter->curr_off = 0;
824         iter->finished = 1;
825         break;
826     default:
827         hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
828         abort();
829         break;
830     }
831 
832     return iter;
833 }
834 
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)835 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
836 {
837     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
838     if (idx == NULL)
839         return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
840     else if (cidx->fmt == HTS_FMT_CRAI)
841         return cram_itr_query(idx, tid, beg, end, cram_readrec);
842     else
843         return hts_itr_query(idx, tid, beg, end, bam_readrec);
844 }
845 
cram_name2id(void * fdv,const char * ref)846 static int cram_name2id(void *fdv, const char *ref)
847 {
848     cram_fd *fd = (cram_fd *) fdv;
849     return sam_hdr_name2ref(fd->header, ref);
850 }
851 
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)852 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
853 {
854     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
855     if (cidx->fmt == HTS_FMT_CRAI)
856         return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
857     else
858         return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
859 }
860 
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)861 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
862 {
863     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
864     if (cidx->fmt == HTS_FMT_CRAI)
865         return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
866                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
867     else
868         return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
869                    hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
870 }
871 
872 /**********************
873  *** SAM header I/O ***
874  **********************/
875 
876 #include "htslib/kseq.h"
877 #include "htslib/kstring.h"
878 
sam_hdr_parse(int l_text,const char * text)879 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
880 {
881     const char *q, *r, *p;
882     khash_t(s2i) *d;
883     d = kh_init(s2i);
884     for (p = text; *p; ++p) {
885         if (strncmp(p, "@SQ\t", 4) == 0) {
886             char *sn = 0;
887             int ln = -1;
888             for (q = p + 4;; ++q) {
889                 if (strncmp(q, "SN:", 3) == 0) {
890                     q += 3;
891                     for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
892                     sn = (char*)calloc(r - q + 1, 1);
893                     strncpy(sn, q, r - q);
894                     q = r;
895                 } else if (strncmp(q, "LN:", 3) == 0)
896                     ln = strtol(q + 3, (char**)&q, 10);
897                 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
898                 if (*q == '\0' || *q == '\n') break;
899             }
900             p = q;
901             if (sn && ln >= 0) {
902                 khint_t k;
903                 int absent;
904                 k = kh_put(s2i, d, sn, &absent);
905                 if (!absent) {
906                     hts_log_warning("Duplicated sequence '%s'", sn);
907                     free(sn);
908                 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
909             }
910         }
911         while (*p != '\0' && *p != '\n') ++p;
912     }
913     return hdr_from_dict(d);
914 }
915 
916 // Minimal sanitisation of a header to ensure.
917 // - null terminated string.
918 // - all lines start with @ (also implies no blank lines).
919 //
920 // Much more could be done, but currently is not, including:
921 // - checking header types are known (HD, SQ, etc).
922 // - syntax (eg checking tab separated fields).
923 // - validating n_targets matches @SQ records.
924 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)925 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
926     if (!h)
927         return NULL;
928 
929     // Special case for empty headers.
930     if (h->l_text == 0)
931         return h;
932 
933     uint32_t i, lnum = 0;
934     char *cp = h->text, last = '\n';
935     for (i = 0; i < h->l_text; i++) {
936         // NB: l_text excludes terminating nul.  This finds early ones.
937         if (cp[i] == 0)
938             break;
939 
940         // Error on \n[^@], including duplicate newlines
941         if (last == '\n') {
942             lnum++;
943             if (cp[i] != '@') {
944                 hts_log_error("Malformed SAM header at line %u", lnum);
945                 bam_hdr_destroy(h);
946                 return NULL;
947             }
948         }
949 
950         last = cp[i];
951     }
952 
953     if (i < h->l_text) { // Early nul found.  Complain if not just padding.
954         uint32_t j = i;
955         while (j < h->l_text && cp[j] == '\0') j++;
956         if (j < h->l_text)
957             hts_log_warning("Unexpected NUL character in header. Possibly truncated");
958     }
959 
960     // Add trailing newline and/or trailing nul if required.
961     if (last != '\n') {
962         hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
963 
964         if (h->l_text == UINT32_MAX) {
965             hts_log_error("No room for extra newline");
966             bam_hdr_destroy(h);
967             return NULL;
968         }
969 
970         if (i >= h->l_text - 1) {
971             cp = realloc(h->text, (size_t) h->l_text+2);
972             if (!cp) {
973                 bam_hdr_destroy(h);
974                 return NULL;
975             }
976             h->text = cp;
977         }
978         cp[i++] = '\n';
979 
980         // l_text may be larger already due to multiple nul padding
981         if (h->l_text < i)
982             h->l_text = i;
983         cp[h->l_text] = '\0';
984     }
985 
986     return h;
987 }
988 
sam_hdr_read(htsFile * fp)989 bam_hdr_t *sam_hdr_read(htsFile *fp)
990 {
991     switch (fp->format.format) {
992     case bam:
993         return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
994 
995     case cram:
996         return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
997 
998     case sam: {
999         kstring_t str = { 0, 0, NULL };
1000         bam_hdr_t *h = NULL;
1001         int ret, has_SQ = 0;
1002         while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1003             if (fp->line.s[0] != '@') break;
1004             if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1005             kputsn(fp->line.s, fp->line.l, &str);
1006             kputc('\n', &str);
1007         }
1008         if (ret < -1) goto error;
1009         if (! has_SQ && fp->fn_aux) {
1010             kstring_t line = { 0, 0, NULL };
1011             hFILE *f = hopen(fp->fn_aux, "r");
1012             if (f == NULL) goto error;
1013             while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1014                 char *tab = strchr(line.s, '\t');
1015                 if (tab == NULL) continue;
1016                 kputs("@SQ\tSN:", &str);
1017                 kputsn(line.s, tab - line.s, &str);
1018                 kputs("\tLN:", &str);
1019                 kputl(atol(tab), &str);
1020                 kputc('\n', &str);
1021             }
1022             free(line.s);
1023             if (hclose(f) != 0) {
1024                 hts_log_warning("Failed to close %s", fp->fn_aux);
1025             }
1026         }
1027         if (str.l == 0) kputsn("", 0, &str);
1028         h = sam_hdr_parse(str.l, str.s);
1029         h->l_text = str.l; h->text = str.s;
1030         return sam_hdr_sanitise(h);
1031 
1032      error:
1033         bam_hdr_destroy(h);
1034         free(str.s);
1035         return NULL;
1036         }
1037 
1038     default:
1039         abort();
1040     }
1041 }
1042 
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1043 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1044 {
1045     if (!h) {
1046         errno = EINVAL;
1047         return -1;
1048     }
1049 
1050     switch (fp->format.format) {
1051     case binary_format:
1052         fp->format.category = sequence_data;
1053         fp->format.format = bam;
1054         /* fall-through */
1055     case bam:
1056         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1057         break;
1058 
1059     case cram: {
1060         cram_fd *fd = fp->fp.cram;
1061         SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1062         if (! hdr) return -1;
1063         if (cram_set_header(fd, hdr) < 0) return -1;
1064         if (fp->fn_aux)
1065             cram_load_reference(fd, fp->fn_aux);
1066         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1067         }
1068         break;
1069 
1070     case text_format:
1071         fp->format.category = sequence_data;
1072         fp->format.format = sam;
1073         /* fall-through */
1074     case sam: {
1075         char *p;
1076         hputs(h->text, fp->fp.hfile);
1077         p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1078         if (p == 0) {
1079             int i;
1080             for (i = 0; i < h->n_targets; ++i) {
1081                 fp->line.l = 0;
1082                 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1083                 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1084                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1085             }
1086         }
1087         if ( hflush(fp->fp.hfile) != 0 ) return -1;
1088         }
1089         break;
1090 
1091     default:
1092         abort();
1093     }
1094     return 0;
1095 }
1096 
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1097 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1098 {
1099     char *p, *q, *beg = NULL, *end = NULL, *newtext;
1100     if (!h || !key)
1101         return -1;
1102 
1103     if (h->l_text > 3) {
1104         if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1105             if ((p = strchr(h->text, '\n')) == 0) return -1;
1106             *p = '\0'; // for strstr call
1107 
1108             char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1109 
1110             if ((q = strstr(h->text, tmp)) != 0) { // key exists
1111                 *p = '\n'; // change back
1112 
1113                 // mark the key:val
1114                 beg = q;
1115                 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1116                 end = q;
1117 
1118                 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1119                     && strlen(val) == end - beg - 4)
1120                      return 0; // val is the same, no need to change
1121 
1122             } else {
1123                 beg = end = p;
1124                 *p = '\n';
1125             }
1126         }
1127     }
1128     if (beg == NULL) { // no @HD
1129         if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1130             return -1;
1131         h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1132         if (val) {
1133             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1134                 return -1;
1135             h->l_text += strlen(val) + 4;
1136         }
1137         newtext = (char*)malloc(h->l_text + 1);
1138         if (!newtext) return -1;
1139 
1140         if (val)
1141             snprintf(newtext, h->l_text + 1,
1142                     "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1143         else
1144             snprintf(newtext, h->l_text + 1,
1145                     "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1146     } else { // has @HD but different or no key
1147         h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1148         if (val) {
1149             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1150                 return -1;
1151             h->l_text += strlen(val) + 4;
1152         }
1153         newtext = (char*)malloc(h->l_text + 1);
1154         if (!newtext) return -1;
1155 
1156         if (val) {
1157             snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1158                     (int) (beg - h->text), h->text, key, val, end);
1159         } else { //delete key
1160             snprintf(newtext, h->l_text + 1, "%.*s%s",
1161                     (int) (beg - h->text), h->text, end);
1162         }
1163     }
1164     free(h->text);
1165     h->text = newtext;
1166     return 0;
1167 }
1168 
1169 /**********************
1170  *** SAM record I/O ***
1171  **********************/
1172 
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1173 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1174 {
1175 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1176 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1177 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1178 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1179 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1180 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1181 
1182     uint8_t *t;
1183     char *p = s->s, *q;
1184     int i;
1185     kstring_t str;
1186     bam1_core_t *c = &b->core;
1187 
1188     str.l = b->l_data = 0;
1189     str.s = (char*)b->data; str.m = b->m_data;
1190     memset(c, 0, 32);
1191     if (h->cigar_tab == 0) {
1192         h->cigar_tab = (int8_t*) malloc(128);
1193         for (i = 0; i < 128; ++i)
1194             h->cigar_tab[i] = -1;
1195         for (i = 0; BAM_CIGAR_STR[i]; ++i)
1196             h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1197     }
1198     // qname
1199     q = _read_token(p);
1200     _parse_warn(p - q <= 1, "empty query name");
1201     _parse_err(p - q > 252, "query name too long");
1202     kputsn_(q, p - q, &str);
1203     for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1204         kputc_('\0', &str);
1205     c->l_qname = p - q + c->l_extranul;
1206     // flag
1207     c->flag = strtol(p, &p, 0);
1208     if (*p++ != '\t') goto err_ret; // malformated flag
1209     // chr
1210     q = _read_token(p);
1211     if (strcmp(q, "*")) {
1212         _parse_err(h->n_targets == 0, "missing SAM header");
1213         c->tid = bam_name2id(h, q);
1214         _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1215     } else c->tid = -1;
1216     // pos
1217     c->pos = strtol(p, &p, 10) - 1;
1218     if (*p++ != '\t') goto err_ret;
1219     if (c->pos < 0 && c->tid >= 0) {
1220         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1221         c->tid = -1;
1222     }
1223     if (c->tid < 0) c->flag |= BAM_FUNMAP;
1224     // mapq
1225     c->qual = strtol(p, &p, 10);
1226     if (*p++ != '\t') goto err_ret;
1227     // cigar
1228     if (*p != '*') {
1229         uint32_t *cigar;
1230         size_t n_cigar = 0;
1231         for (q = p; *p && *p != '\t'; ++p)
1232             if (!isdigit_c(*p)) ++n_cigar;
1233         if (*p++ != '\t') goto err_ret;
1234         _parse_err(n_cigar == 0, "no CIGAR operations");
1235         _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1236         c->n_cigar = n_cigar;
1237         _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1238         for (i = 0; i < c->n_cigar; ++i, ++q) {
1239             int op;
1240             cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1241             op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1242             _parse_err(op < 0, "unrecognized CIGAR operator");
1243             cigar[i] |= op;
1244         }
1245         // can't use bam_endpos() directly as some fields not yet set up
1246         i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1247     } else {
1248         _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1249         c->flag |= BAM_FUNMAP;
1250         q = _read_token(p);
1251         i = 1;
1252     }
1253     c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1254     // mate chr
1255     q = _read_token(p);
1256     if (strcmp(q, "=") == 0) {
1257         c->mtid = c->tid;
1258     } else if (strcmp(q, "*") == 0) {
1259         c->mtid = -1;
1260     } else {
1261         c->mtid = bam_name2id(h, q);
1262         _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1263     }
1264     // mpos
1265     c->mpos = strtol(p, &p, 10) - 1;
1266     if (*p++ != '\t') goto err_ret;
1267     if (c->mpos < 0 && c->mtid >= 0) {
1268         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1269         c->mtid = -1;
1270     }
1271     // tlen
1272     c->isize = strtol(p, &p, 10);
1273     if (*p++ != '\t') goto err_ret;
1274     // seq
1275     q = _read_token(p);
1276     if (strcmp(q, "*")) {
1277         c->l_qseq = p - q - 1;
1278         i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1279         _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1280         i = (c->l_qseq + 1) >> 1;
1281         _get_mem(uint8_t, &t, &str, i);
1282         memset(t, 0, i);
1283         for (i = 0; i < c->l_qseq; ++i)
1284             t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
1285     } else c->l_qseq = 0;
1286     // qual
1287     q = _read_token_aux(p);
1288     _get_mem(uint8_t, &t, &str, c->l_qseq);
1289     if (strcmp(q, "*")) {
1290         _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1291         for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1292     } else memset(t, 0xff, c->l_qseq);
1293     // aux
1294     while (p < s->s + s->l) {
1295         uint8_t type;
1296         q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1297         _parse_err(p - q - 1 < 5, "incomplete aux field");
1298         kputsn_(q, 2, &str);
1299         q += 3; type = *q++; ++q; // q points to value
1300         if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1301             _parse_err(p - q - 1 < 1, "incomplete aux field");
1302 
1303         // Ensure str has enough space for a double + type allocated.
1304         // This is so we can stuff bigger integers and floats directly into
1305         // the kstring.  Sorry.
1306         _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1307 
1308         if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1309             kputc_('A', &str);
1310             kputc_(*q, &str);
1311         } else if (type == 'i' || type == 'I') {
1312             if (*q == '-') {
1313                 long x = strtol(q, &q, 10);
1314                 if (x >= INT8_MIN) {
1315                     kputc_('c', &str); kputc_(x, &str);
1316                 } else if (x >= INT16_MIN) {
1317                     str.s[str.l++] = 's';
1318                     i16_to_le(x, (uint8_t *) str.s + str.l);
1319                     str.l += 2;
1320                 } else {
1321                     str.s[str.l++] = 'i';
1322                     i32_to_le(x, (uint8_t *) str.s + str.l);
1323                     str.l += 4;
1324                 }
1325             } else {
1326                 unsigned long x = strtoul(q, &q, 10);
1327                 if (x <= UINT8_MAX) {
1328                     kputc_('C', &str); kputc_(x, &str);
1329                 } else if (x <= UINT16_MAX) {
1330                     str.s[str.l++] = 'S';
1331                     u16_to_le(x, (uint8_t *) str.s + str.l);
1332                     str.l += 2;
1333                 } else {
1334                     str.s[str.l++] = 'I';
1335                     u32_to_le(x, (uint8_t *) str.s + str.l);
1336                     str.l += 4;
1337                 }
1338             }
1339         } else if (type == 'f') {
1340             str.s[str.l++] = 'f';
1341             float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1342             str.l += sizeof(float);
1343         } else if (type == 'd') {
1344             str.s[str.l++] = 'd';
1345             double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1346             str.l += sizeof(double);
1347         } else if (type == 'Z' || type == 'H') {
1348             _parse_err(type == 'H' && !((p-q)&1),
1349                        "hex field does not have an even number of digits");
1350             kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1351         } else if (type == 'B') {
1352             int32_t n, size;
1353             size_t bytes;
1354             char *r;
1355             _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1356             type = *q++; // q points to the first ',' following the typing byte
1357 
1358             size = aux_type2size(type);
1359             _parse_err_param(size <= 0 || size > 4,
1360                              "unrecognized type B:%c", type);
1361             _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1362 
1363             for (r = q, n = 0; *r; ++r)
1364                 if (*r == ',') ++n;
1365 
1366             // Ensure space for type + values
1367             bytes = (size_t) n * (size_t) size;
1368             _parse_err(bytes / size != n
1369                        || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1370                        "out of memory");
1371             str.s[str.l++] = 'B';
1372             str.s[str.l++] = type;
1373             i32_to_le(n, (uint8_t *) str.s + str.l);
1374             str.l += sizeof(uint32_t);
1375 
1376             // This ensures that q always ends up at the next comma after
1377             // reading a number even if it's followed by junk.  It
1378             // prevents the possibility of trying to read more than n items.
1379 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1380 
1381             if (type == 'c')      while (q + 1 < p) { int8_t   x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1382             else if (type == 'C') while (q + 1 < p) { uint8_t  x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1383             else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1384             else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1385             else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1386             else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1387             else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1388             else _parse_err_param(1, "unrecognized type B:%c", type);
1389 
1390 #undef _skip_to_comma
1391 
1392         } else _parse_err_param(1, "unrecognized type %c", type);
1393     }
1394     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1395     if (bam_tag2cigar(b, 1, 1) < 0)
1396         return -2;
1397     return 0;
1398 
1399 #undef _parse_warn
1400 #undef _parse_err
1401 #undef _parse_err_param
1402 #undef _get_mem
1403 #undef _read_token_aux
1404 #undef _read_token
1405 err_ret:
1406     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1407     return -2;
1408 }
1409 
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1410 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1411 {
1412     switch (fp->format.format) {
1413     case bam: {
1414         int r = bam_read1(fp->fp.bgzf, b);
1415         if (r >= 0) {
1416             if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
1417                 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1418                 return -3;
1419         }
1420         return r;
1421         }
1422 
1423     case cram: {
1424         int ret = cram_get_bam_seq(fp->fp.cram, &b);
1425         if (ret < 0)
1426             return cram_eof(fp->fp.cram) ? -1 : -2;
1427 
1428         if (bam_tag2cigar(b, 1, 1) < 0)
1429             return -2;
1430         return ret;
1431     }
1432 
1433     case sam: {
1434         int ret;
1435 err_recover:
1436         if (fp->line.l == 0) {
1437             ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1438             if (ret < 0) return ret;
1439         }
1440         ret = sam_parse1(&fp->line, h, b);
1441         fp->line.l = 0;
1442         if (ret < 0) {
1443             hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1444             if (h->ignore_sam_err) goto err_recover;
1445         }
1446         return ret;
1447     }
1448 
1449     default:
1450         abort();
1451     }
1452 }
1453 
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1454 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1455 {
1456     int i;
1457     uint8_t *s, *end;
1458     const bam1_core_t *c = &b->core;
1459 
1460     str->l = 0;
1461     kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1462     kputw(c->flag, str); kputc('\t', str); // flag
1463     if (c->tid >= 0) { // chr
1464         kputs(h->target_name[c->tid] , str);
1465         kputc('\t', str);
1466     } else kputsn("*\t", 2, str);
1467     kputw(c->pos + 1, str); kputc('\t', str); // pos
1468     kputw(c->qual, str); kputc('\t', str); // qual
1469     if (c->n_cigar) { // cigar
1470         uint32_t *cigar = bam_get_cigar(b);
1471         for (i = 0; i < c->n_cigar; ++i) {
1472             kputw(bam_cigar_oplen(cigar[i]), str);
1473             kputc(bam_cigar_opchr(cigar[i]), str);
1474         }
1475     } else kputc('*', str);
1476     kputc('\t', str);
1477     if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1478     else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1479     else {
1480         kputs(h->target_name[c->mtid], str);
1481         kputc('\t', str);
1482     }
1483     kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1484     kputw(c->isize, str); kputc('\t', str); // template len
1485     if (c->l_qseq) { // seq and qual
1486         uint8_t *s = bam_get_seq(b);
1487         for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1488         kputc('\t', str);
1489         s = bam_get_qual(b);
1490         if (s[0] == 0xff) kputc('*', str);
1491         else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1492     } else kputsn("*\t*", 3, str);
1493 
1494     s = bam_get_aux(b); // aux
1495     end = b->data + b->l_data;
1496     while (end - s >= 4) {
1497         uint8_t type, key[2];
1498         key[0] = s[0]; key[1] = s[1];
1499         s += 2; type = *s++;
1500         kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1501         if (type == 'A') {
1502             kputsn("A:", 2, str);
1503             kputc(*s, str);
1504             ++s;
1505         } else if (type == 'C') {
1506             kputsn("i:", 2, str);
1507             kputw(*s, str);
1508             ++s;
1509         } else if (type == 'c') {
1510             kputsn("i:", 2, str);
1511             kputw(*(int8_t*)s, str);
1512             ++s;
1513         } else if (type == 'S') {
1514             if (end - s >= 2) {
1515                 kputsn("i:", 2, str);
1516                 kputuw(le_to_u16(s), str);
1517                 s += 2;
1518             } else goto bad_aux;
1519         } else if (type == 's') {
1520             if (end - s >= 2) {
1521                 kputsn("i:", 2, str);
1522                 kputw(le_to_i16(s), str);
1523                 s += 2;
1524             } else goto bad_aux;
1525         } else if (type == 'I') {
1526             if (end - s >= 4) {
1527                 kputsn("i:", 2, str);
1528                 kputuw(le_to_u32(s), str);
1529                 s += 4;
1530             } else goto bad_aux;
1531         } else if (type == 'i') {
1532             if (end - s >= 4) {
1533                 kputsn("i:", 2, str);
1534                 kputw(le_to_i32(s), str);
1535                 s += 4;
1536             } else goto bad_aux;
1537         } else if (type == 'f') {
1538             if (end - s >= 4) {
1539                 ksprintf(str, "f:%g", le_to_float(s));
1540                 s += 4;
1541             } else goto bad_aux;
1542 
1543         } else if (type == 'd') {
1544             if (end - s >= 8) {
1545                 ksprintf(str, "d:%g", le_to_double(s));
1546                 s += 8;
1547             } else goto bad_aux;
1548         } else if (type == 'Z' || type == 'H') {
1549             kputc(type, str); kputc(':', str);
1550             while (s < end && *s) kputc(*s++, str);
1551             if (s >= end)
1552                 goto bad_aux;
1553             ++s;
1554         } else if (type == 'B') {
1555             uint8_t sub_type = *(s++);
1556             int sub_type_size = aux_type2size(sub_type);
1557             uint32_t n;
1558             if (sub_type_size == 0 || end - s < 4)
1559                 goto bad_aux;
1560             n = le_to_u32(s);
1561             s += 4; // now points to the start of the array
1562             if ((end - s) / sub_type_size < n)
1563                 goto bad_aux;
1564             kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1565             for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1566                 kputc(',', str);
1567                 if ('c' == sub_type)      { kputw(*(int8_t*)s, str); ++s; }
1568                 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1569                 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1570                 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1571                 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1572                 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1573                 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1574                 else goto bad_aux;  // Unknown sub-type
1575             }
1576         } else { // Unknown type
1577             goto bad_aux;
1578         }
1579     }
1580     return str->l;
1581 
1582  bad_aux:
1583     hts_log_error("Corrupted aux data for read %.*s",
1584                   b->core.l_qname, bam_get_qname(b));
1585     errno = EINVAL;
1586     return -1;
1587 }
1588 
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1589 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1590 {
1591     switch (fp->format.format) {
1592     case binary_format:
1593         fp->format.category = sequence_data;
1594         fp->format.format = bam;
1595         /* fall-through */
1596     case bam:
1597         return bam_write1(fp->fp.bgzf, b);
1598 
1599     case cram:
1600         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1601 
1602     case text_format:
1603         fp->format.category = sequence_data;
1604         fp->format.format = sam;
1605         /* fall-through */
1606     case sam:
1607         if (sam_format1(h, b, &fp->line) < 0) return -1;
1608         kputc('\n', &fp->line);
1609         if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1610         return fp->line.l;
1611 
1612     default:
1613         abort();
1614     }
1615 }
1616 
1617 /************************
1618  *** Auxiliary fields ***
1619  ************************/
1620 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1621 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1622     int tsz = aux_type2size(type);
1623 
1624     if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1625 
1626     switch (tsz) {
1627         case 'H': case 'Z': case 1:  // Trivial
1628             memcpy(out, in, len);
1629             break;
1630 
1631 #define aux_val_to_le(type_t, store_le) do {                            \
1632         type_t v;                                                       \
1633         size_t i;                                                       \
1634         for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1635             memcpy(&v, in + i, sizeof(type_t));                         \
1636             store_le(v, out);                                           \
1637         }                                                               \
1638     } while (0)
1639 
1640         case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1641         case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1642         case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1643 
1644 #undef aux_val_to_le
1645 
1646         case 'B': { // Recurse!
1647             uint32_t n;
1648             if (len < 5) return -1;
1649             memcpy(&n, in + 1, 4);
1650             out[0] = in[0];
1651             u32_to_le(n, out + 1);
1652             return aux_to_le(in[0], out + 5, in + 5, len - 5);
1653         }
1654 
1655         default: // Unknown type code
1656             return -1;
1657     }
1658 
1659 
1660 
1661     return 0;
1662 }
1663 #endif
1664 
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1665 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1666 {
1667     uint32_t new_len;
1668 
1669     assert(b->l_data >= 0);
1670     new_len = b->l_data + 3 + len;
1671     if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1672 
1673     if (b->m_data < new_len) {
1674         uint32_t new_size = new_len;
1675         uint8_t *new_data;
1676         kroundup32(new_size);
1677         new_data = realloc(b->data, new_size);
1678         if (new_data == NULL) goto nomem;
1679         b->m_data = new_size;
1680         b->data = new_data;
1681     }
1682 
1683     b->data[b->l_data] = tag[0];
1684     b->data[b->l_data + 1] = tag[1];
1685     b->data[b->l_data + 2] = type;
1686 
1687 #ifdef HTS_LITTLE_ENDIAN
1688     memcpy(b->data + b->l_data + 3, data, len);
1689 #else
1690     if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1691         errno = EINVAL;
1692         return -1;
1693     }
1694 #endif
1695 
1696     b->l_data = new_len;
1697 
1698     return 0;
1699 
1700  nomem:
1701     errno = ENOMEM;
1702     return -1;
1703 }
1704 
skip_aux(uint8_t * s,uint8_t * end)1705 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1706 {
1707     int size;
1708     uint32_t n;
1709     if (s >= end) return end;
1710     size = aux_type2size(*s); ++s; // skip type
1711     switch (size) {
1712     case 'Z':
1713     case 'H':
1714         while (*s && s < end) ++s;
1715         return s < end ? s + 1 : end;
1716     case 'B':
1717         if (end - s < 5) return NULL;
1718         size = aux_type2size(*s); ++s;
1719         n = le_to_u32(s);
1720         s += 4;
1721         if (size == 0 || end - s < size * n) return NULL;
1722         return s + size * n;
1723     case 0:
1724         return NULL;
1725     default:
1726         if (end - s < size) return NULL;
1727         return s + size;
1728     }
1729 }
1730 
bam_aux_get(const bam1_t * b,const char tag[2])1731 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1732 {
1733     uint8_t *s, *end, *t = (uint8_t *) tag;
1734     uint16_t y = (uint16_t) t[0]<<8 | t[1];
1735     s = bam_get_aux(b);
1736     end = b->data + b->l_data;
1737     while (s != NULL && end - s >= 3) {
1738         uint16_t x = (uint16_t) s[0]<<8 | s[1];
1739         s += 2;
1740         if (x == y) {
1741             // Check the tag value is valid and complete
1742             uint8_t *e = skip_aux(s, end);
1743             if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1744                 goto bad_aux;  // Unterminated string
1745             }
1746             if (e != NULL) {
1747                 return s;
1748             } else {
1749                 goto bad_aux;
1750             }
1751         }
1752         s = skip_aux(s, end);
1753     }
1754     if (s == NULL) goto bad_aux;
1755     errno = ENOENT;
1756     return NULL;
1757 
1758  bad_aux:
1759     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1760     errno = EINVAL;
1761     return NULL;
1762 }
1763 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1764 int bam_aux_del(bam1_t *b, uint8_t *s)
1765 {
1766     uint8_t *p, *aux;
1767     int l_aux = bam_get_l_aux(b);
1768     aux = bam_get_aux(b);
1769     p = s - 2;
1770     s = skip_aux(s, aux + l_aux);
1771     if (s == NULL) goto bad_aux;
1772     memmove(p, s, l_aux - (s - aux));
1773     b->l_data -= s - p;
1774     return 0;
1775 
1776  bad_aux:
1777     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1778     errno = EINVAL;
1779     return -1;
1780 }
1781 
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1782 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1783 {
1784     // FIXME: This is not at all efficient!
1785     uint8_t *s = bam_aux_get(b,tag);
1786     if (!s) {
1787         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1788             return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1789         } else { // Invalid aux data, give up.
1790             return -1;
1791         }
1792     }
1793     char type = *s;
1794     if (type != 'Z') {
1795         hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1796         errno = EINVAL;
1797         return -1;
1798     }
1799 
1800     bam_aux_del(b,s);
1801     s -= 2;
1802     int l_aux = bam_get_l_aux(b);
1803 
1804     b->l_data += 3 + len;
1805     if (b->m_data < b->l_data) {
1806         ptrdiff_t s_offset = s - b->data;
1807         b->m_data = b->l_data;
1808         kroundup32(b->m_data);
1809         b->data = (uint8_t*)realloc(b->data, b->m_data);
1810         s = b->data + s_offset;
1811     }
1812     memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1813     s[0] = tag[0];
1814     s[1] = tag[1];
1815     s[2] = type;
1816     memmove(s+3,data,len);
1817     return 0;
1818 }
1819 
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1820 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1821                                       uint32_t idx)
1822 {
1823     switch (type) {
1824         case 'c': return le_to_i8(s + idx);
1825         case 'C': return s[idx];
1826         case 's': return le_to_i16(s + 2 * idx);
1827         case 'S': return le_to_u16(s + 2 * idx);
1828         case 'i': return le_to_i32(s + 4 * idx);
1829         case 'I': return le_to_u32(s + 4 * idx);
1830         default:
1831             errno = EINVAL;
1832             return 0;
1833     }
1834 }
1835 
bam_aux2i(const uint8_t * s)1836 int64_t bam_aux2i(const uint8_t *s)
1837 {
1838     int type;
1839     type = *s++;
1840     return get_int_aux_val(type, s, 0);
1841 }
1842 
bam_aux2f(const uint8_t * s)1843 double bam_aux2f(const uint8_t *s)
1844 {
1845     int type;
1846     type = *s++;
1847     if (type == 'd') return le_to_double(s);
1848     else if (type == 'f') return le_to_float(s);
1849     else return get_int_aux_val(type, s, 0);
1850 }
1851 
bam_aux2A(const uint8_t * s)1852 char bam_aux2A(const uint8_t *s)
1853 {
1854     int type;
1855     type = *s++;
1856     if (type == 'A') return *(char*)s;
1857     errno = EINVAL;
1858     return 0;
1859 }
1860 
bam_aux2Z(const uint8_t * s)1861 char *bam_aux2Z(const uint8_t *s)
1862 {
1863     int type;
1864     type = *s++;
1865     if (type == 'Z' || type == 'H') return (char*)s;
1866     errno = EINVAL;
1867     return 0;
1868 }
1869 
bam_auxB_len(const uint8_t * s)1870 uint32_t bam_auxB_len(const uint8_t *s)
1871 {
1872     if (s[0] != 'B') {
1873         errno = EINVAL;
1874         return 0;
1875     }
1876     return le_to_u32(s + 2);
1877 }
1878 
bam_auxB2i(const uint8_t * s,uint32_t idx)1879 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
1880 {
1881     uint32_t len = bam_auxB_len(s);
1882     if (idx >= len) {
1883         errno = ERANGE;
1884         return 0;
1885     }
1886     return get_int_aux_val(s[1], s + 6, idx);
1887 }
1888 
bam_auxB2f(const uint8_t * s,uint32_t idx)1889 double bam_auxB2f(const uint8_t *s, uint32_t idx)
1890 {
1891     uint32_t len = bam_auxB_len(s);
1892     if (idx >= len) {
1893         errno = ERANGE;
1894         return 0.0;
1895     }
1896     if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
1897     else return get_int_aux_val(s[1], s + 6, idx);
1898 }
1899 
sam_open_mode(char * mode,const char * fn,const char * format)1900 int sam_open_mode(char *mode, const char *fn, const char *format)
1901 {
1902     // TODO Parse "bam5" etc for compression level
1903     if (format == NULL) {
1904         // Try to pick a format based on the filename extension
1905         const char *ext = fn? strrchr(fn, '.') : NULL;
1906         if (ext == NULL || strchr(ext, '/')) return -1;
1907         return sam_open_mode(mode, fn, ext+1);
1908     }
1909     else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1910     else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1911     else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1912     else return -1;
1913 
1914     return 0;
1915 }
1916 
1917 // A version of sam_open_mode that can handle ,key=value options.
1918 // The format string is allocated and returned, to be freed by the caller.
1919 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)1920 char *sam_open_mode_opts(const char *fn,
1921                          const char *mode,
1922                          const char *format)
1923 {
1924     char *mode_opts = malloc((format ? strlen(format) : 1) +
1925                              (mode   ? strlen(mode)   : 1) + 12);
1926     char *opts, *cp;
1927     int format_len;
1928 
1929     if (!mode_opts)
1930         return NULL;
1931 
1932     strcpy(mode_opts, mode ? mode : "r");
1933     cp = mode_opts + strlen(mode_opts);
1934 
1935     if (format == NULL) {
1936         // Try to pick a format based on the filename extension
1937         const char *ext = fn? strrchr(fn, '.') : NULL;
1938         if (ext == NULL || strchr(ext, '/')) {
1939             free(mode_opts);
1940             return NULL;
1941         }
1942         if (sam_open_mode(cp, fn, ext+1) == 0) {
1943             return mode_opts;
1944         } else {
1945             free(mode_opts);
1946             return NULL;
1947         }
1948     }
1949 
1950     if ((opts = strchr(format, ','))) {
1951         format_len = opts-format;
1952     } else {
1953         opts="";
1954         format_len = strlen(format);
1955     }
1956 
1957     if (strncmp(format, "bam", format_len) == 0) {
1958         *cp++ = 'b';
1959     } else if (strncmp(format, "cram", format_len) == 0) {
1960         *cp++ = 'c';
1961     } else if (strncmp(format, "cram2", format_len) == 0) {
1962         *cp++ = 'c';
1963         strcpy(cp, ",VERSION=2.1");
1964         cp += 12;
1965     } else if (strncmp(format, "cram3", format_len) == 0) {
1966         *cp++ = 'c';
1967         strcpy(cp, ",VERSION=3.0");
1968         cp += 12;
1969     } else if (strncmp(format, "sam", format_len) == 0) {
1970         ; // format mode=""
1971     } else {
1972         free(mode_opts);
1973         return NULL;
1974     }
1975 
1976     strcpy(cp, opts);
1977 
1978     return mode_opts;
1979 }
1980 
1981 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)1982 int bam_str2flag(const char *str)
1983 {
1984     char *end, *beg = (char*) str;
1985     long int flag = strtol(str, &end, 0);
1986     if ( end!=str ) return flag;    // the conversion was successful
1987     flag = 0;
1988     while ( *str )
1989     {
1990         end = beg;
1991         while ( *end && *end!=',' ) end++;
1992         if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1993         else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1994         else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1995         else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1996         else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1997         else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1998         else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1999         else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
2000         else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
2001         else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
2002         else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
2003         else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
2004         else return -1;
2005         if ( !*end ) break;
2006         beg = end + 1;
2007     }
2008     return flag;
2009 }
2010 
bam_flag2str(int flag)2011 char *bam_flag2str(int flag)
2012 {
2013     kstring_t str = {0,0,0};
2014     if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2015     if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2016     if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2017     if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2018     if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2019     if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2020     if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2021     if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2022     if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2023     if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2024     if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2025     if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2026     if ( str.l == 0 ) kputsn("", 0, &str);
2027     return str.s;
2028 }
2029 
2030 
2031 /**************************
2032  *** Pileup and Mpileup ***
2033  **************************/
2034 
2035 #if !defined(BAM_NO_PILEUP)
2036 
2037 #include <assert.h>
2038 
2039 /*******************
2040  *** Memory pool ***
2041  *******************/
2042 
2043 typedef struct {
2044     int k, x, y, end;
2045 } cstate_t;
2046 
2047 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2048 
2049 typedef struct __linkbuf_t {
2050     bam1_t b;
2051     int32_t beg, end;
2052     cstate_t s;
2053     struct __linkbuf_t *next;
2054     bam_pileup_cd cd;
2055 } lbnode_t;
2056 
2057 typedef struct {
2058     int cnt, n, max;
2059     lbnode_t **buf;
2060 } mempool_t;
2061 
mp_init(void)2062 static mempool_t *mp_init(void)
2063 {
2064     mempool_t *mp;
2065     mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2066     return mp;
2067 }
mp_destroy(mempool_t * mp)2068 static void mp_destroy(mempool_t *mp)
2069 {
2070     int k;
2071     for (k = 0; k < mp->n; ++k) {
2072         free(mp->buf[k]->b.data);
2073         free(mp->buf[k]);
2074     }
2075     free(mp->buf);
2076     free(mp);
2077 }
mp_alloc(mempool_t * mp)2078 static inline lbnode_t *mp_alloc(mempool_t *mp)
2079 {
2080     ++mp->cnt;
2081     if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2082     else return mp->buf[--mp->n];
2083 }
mp_free(mempool_t * mp,lbnode_t * p)2084 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2085 {
2086     --mp->cnt; p->next = 0; // clear lbnode_t::next here
2087     if (mp->n == mp->max) {
2088         mp->max = mp->max? mp->max<<1 : 256;
2089         mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2090     }
2091     mp->buf[mp->n++] = p;
2092 }
2093 
2094 /**********************
2095  *** CIGAR resolver ***
2096  **********************/
2097 
2098 /* s->k: the index of the CIGAR operator that has just been processed.
2099    s->x: the reference coordinate of the start of s->k
2100    s->y: the query coordiante of the start of s->k
2101  */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2102 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2103 {
2104 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2105 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2106 
2107     bam1_t *b = p->b;
2108     bam1_core_t *c = &b->core;
2109     uint32_t *cigar = bam_get_cigar(b);
2110     int k;
2111     // determine the current CIGAR operation
2112     //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2113     if (s->k == -1) { // never processed
2114         if (c->n_cigar == 1) { // just one operation, save a loop
2115           if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2116         } else { // find the first match or deletion
2117             for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2118                 int op = _cop(cigar[k]);
2119                 int l = _cln(cigar[k]);
2120                 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2121                 else if (op == BAM_CREF_SKIP) s->x += l;
2122                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2123             }
2124             assert(k < c->n_cigar);
2125             s->k = k;
2126         }
2127     } else { // the read has been processed before
2128         int op, l = _cln(cigar[s->k]);
2129         if (pos - s->x >= l) { // jump to the next operation
2130             assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2131             op = _cop(cigar[s->k+1]);
2132             if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2133               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2134                 s->x += l;
2135                 ++s->k;
2136             } else { // find the next M/D/N/=/X
2137               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2138                 s->x += l;
2139                 for (k = s->k + 1; k < c->n_cigar; ++k) {
2140                     op = _cop(cigar[k]), l = _cln(cigar[k]);
2141                     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2142                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2143                 }
2144                 s->k = k;
2145             }
2146             assert(s->k < c->n_cigar); // otherwise a bug
2147         } // else, do nothing
2148     }
2149     { // collect pileup information
2150         int op, l;
2151         op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2152         p->is_del = p->indel = p->is_refskip = 0;
2153         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2154             int op2 = _cop(cigar[s->k+1]);
2155             int l2 = _cln(cigar[s->k+1]);
2156             if (op2 == BAM_CDEL) p->indel = -(int)l2;
2157             else if (op2 == BAM_CINS) p->indel = l2;
2158             else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2159                 int l3 = 0;
2160                 for (k = s->k + 2; k < c->n_cigar; ++k) {
2161                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2162                     if (op2 == BAM_CINS) l3 += l2;
2163                     else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2164                 }
2165                 if (l3 > 0) p->indel = l3;
2166             }
2167         }
2168         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2169             p->qpos = s->y + (pos - s->x);
2170         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2171             p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2172             p->is_refskip = (op == BAM_CREF_SKIP);
2173         } // cannot be other operations; otherwise a bug
2174         p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2175     }
2176     return 1;
2177 }
2178 
2179 /***********************
2180  *** Pileup iterator ***
2181  ***********************/
2182 
2183 // Dictionary of overlapping reads
2184 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2185 typedef khash_t(olap_hash) olap_hash_t;
2186 
2187 struct __bam_plp_t {
2188     mempool_t *mp;
2189     lbnode_t *head, *tail;
2190     int32_t tid, pos, max_tid, max_pos;
2191     int is_eof, max_plp, error, maxcnt;
2192     uint64_t id;
2193     bam_pileup1_t *plp;
2194     // for the "auto" interface only
2195     bam1_t *b;
2196     bam_plp_auto_f func;
2197     void *data;
2198     olap_hash_t *overlaps;
2199 
2200     // For notification of creation and destruction events
2201     // and associated client-owned pointer.
2202     int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2203     int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2204 };
2205 
bam_plp_init(bam_plp_auto_f func,void * data)2206 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2207 {
2208     bam_plp_t iter;
2209     iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2210     iter->mp = mp_init();
2211     iter->head = iter->tail = mp_alloc(iter->mp);
2212     iter->max_tid = iter->max_pos = -1;
2213     iter->maxcnt = 8000;
2214     if (func) {
2215         iter->func = func;
2216         iter->data = data;
2217         iter->b = bam_init1();
2218     }
2219     return iter;
2220 }
2221 
bam_plp_init_overlaps(bam_plp_t iter)2222 void bam_plp_init_overlaps(bam_plp_t iter)
2223 {
2224     iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
2225 }
2226 
bam_plp_destroy(bam_plp_t iter)2227 void bam_plp_destroy(bam_plp_t iter)
2228 {
2229     lbnode_t *p, *pnext;
2230     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2231     for (p = iter->head; p != NULL; p = pnext) {
2232         pnext = p->next;
2233         mp_free(iter->mp, p);
2234     }
2235     mp_destroy(iter->mp);
2236     if (iter->b) bam_destroy1(iter->b);
2237     free(iter->plp);
2238     free(iter);
2239 }
2240 
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2241 void bam_plp_constructor(bam_plp_t plp,
2242                          int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2243     plp->plp_construct = func;
2244 }
2245 
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2246 void bam_plp_destructor(bam_plp_t plp,
2247                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2248     plp->plp_destruct = func;
2249 }
2250 
2251 //---------------------------------
2252 //---  Tweak overlapping reads
2253 //---------------------------------
2254 
2255 /**
2256  *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
2257  *  cigar_iref2iseq_next() - get the next CMATCH base
2258  *  @cigar:       pointer to current cigar block (rw)
2259  *  @cigar_max:   pointer just beyond the last cigar block
2260  *  @icig:        position within the current cigar block (rw)
2261  *  @iseq:        position in the sequence (rw)
2262  *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2263  *
2264  *  Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2265  */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2266 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2267 {
2268     int pos = *iref;
2269     if ( pos < 0 ) return -1;
2270     *icig = 0;
2271     *iseq = 0;
2272     *iref = 0;
2273     while ( *cigar<cigar_max )
2274     {
2275         int cig  = (**cigar) & BAM_CIGAR_MASK;
2276         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2277 
2278         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2279         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2280         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2281         {
2282             pos -= ncig;
2283             if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2284             (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2285             continue;
2286         }
2287         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2288         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2289         {
2290             pos -= ncig;
2291             if ( pos<0 ) pos = 0;
2292             (*cigar)++; *icig = 0; *iref += ncig;
2293             continue;
2294         }
2295         hts_log_error("Unexpected cigar %d", cig);
2296         assert(0);
2297     }
2298     *iseq = -1;
2299     return -1;
2300 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2301 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2302 {
2303     while ( *cigar < cigar_max )
2304     {
2305         int cig  = (**cigar) & BAM_CIGAR_MASK;
2306         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2307 
2308         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2309         {
2310             if ( *icig >= ncig - 1 ) { *icig = 0;  (*cigar)++; continue; }
2311             (*iseq)++; (*icig)++; (*iref)++;
2312             return BAM_CMATCH;
2313         }
2314         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2315         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2316         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2317         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2318         hts_log_error("Unexpected cigar %d", cig);
2319         assert(0);
2320     }
2321     *iseq = -1;
2322     *iref = -1;
2323     return -1;
2324 }
2325 
tweak_overlap_quality(bam1_t * a,bam1_t * b)2326 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2327 {
2328     uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2329     uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2330     int a_icig = 0, a_iseq = 0;
2331     int b_icig = 0, b_iseq = 0;
2332     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2333     uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
2334 
2335     int iref   = b->core.pos;
2336     int a_iref = iref - a->core.pos;
2337     int b_iref = iref - b->core.pos;
2338     int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2339     if ( a_ret<0 ) return;  // no overlap
2340     int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2341     if ( b_ret<0 ) return;  // no overlap
2342 
2343     #if DBG
2344         fprintf(stderr,"tweak %s  n_cigar=%d %d  .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2345             a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2346     #endif
2347 
2348     while ( 1 )
2349     {
2350         // Increment reference position
2351         while ( a_iref>=0 && a_iref < iref - a->core.pos )
2352             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2353         if ( a_ret<0 ) break;   // done
2354         if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2355 
2356         while ( b_iref>=0 && b_iref < iref - b->core.pos )
2357             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2358         if ( b_ret<0 ) break;   // done
2359         if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2360 
2361         iref++;
2362         if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue;   // only CMATCH positions, don't know what to do with indels
2363 
2364         if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2365         {
2366             #if DBG
2367                 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2368             #endif
2369             // we are very confident about this base
2370             int qual = a_qual[a_iseq] + b_qual[b_iseq];
2371             a_qual[a_iseq] = qual>200 ? 200 : qual;
2372             b_qual[b_iseq] = 0;
2373         }
2374         else
2375         {
2376             if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2377             {
2378                 #if DBG
2379                     fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2380                 #endif
2381                 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];  // not so confident about a_qual anymore given the mismatch
2382                 b_qual[b_iseq] = 0;
2383             }
2384             else
2385             {
2386                 #if DBG
2387                     fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2388                 #endif
2389                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2390                 a_qual[a_iseq] = 0;
2391             }
2392         }
2393     }
2394     #if DBG
2395         fprintf(stderr,"\n");
2396     #endif
2397 }
2398 
2399 // Fix overlapping reads. Simple soft-clipping did not give good results.
2400 // Lowering qualities of unwanted bases is more selective and works better.
2401 //
overlap_push(bam_plp_t iter,lbnode_t * node)2402 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2403 {
2404     if ( !iter->overlaps ) return;
2405 
2406     // mapped mates and paired reads only
2407     if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2408 
2409     // no overlap possible, unless some wild cigar
2410     if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2411 
2412     khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2413     if ( kitr==kh_end(iter->overlaps) )
2414     {
2415         int ret;
2416         kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2417         kh_value(iter->overlaps, kitr) = node;
2418     }
2419     else
2420     {
2421         lbnode_t *a = kh_value(iter->overlaps, kitr);
2422         tweak_overlap_quality(&a->b, &node->b);
2423         kh_del(olap_hash, iter->overlaps, kitr);
2424         assert(a->end-1 == a->s.end);
2425         a->end = bam_endpos(&a->b);
2426         a->s.end = a->end - 1;
2427     }
2428 }
2429 
overlap_remove(bam_plp_t iter,const bam1_t * b)2430 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2431 {
2432     if ( !iter->overlaps ) return;
2433 
2434     khiter_t kitr;
2435     if ( b )
2436     {
2437         kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2438         if ( kitr!=kh_end(iter->overlaps) )
2439             kh_del(olap_hash, iter->overlaps, kitr);
2440     }
2441     else
2442     {
2443         // remove all
2444         for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2445             if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2446     }
2447 }
2448 
2449 
2450 
2451 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2452 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2453 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2454 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2455 {
2456     if (iter->error) { *_n_plp = -1; return NULL; }
2457     *_n_plp = 0;
2458     if (iter->is_eof && iter->head == iter->tail) return NULL;
2459     while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2460         int n_plp = 0;
2461         // write iter->plp at iter->pos
2462         lbnode_t **pptr = &iter->head;
2463         while (*pptr != iter->tail) {
2464             lbnode_t *p = *pptr;
2465             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2466                 overlap_remove(iter, &p->b);
2467                 if (iter->plp_destruct)
2468                     iter->plp_destruct(iter->data, &p->b, &p->cd);
2469                 *pptr = p->next; mp_free(iter->mp, p);
2470             }
2471             else {
2472                 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2473                     if (n_plp == iter->max_plp) { // then double the capacity
2474                         iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2475                         iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2476                     }
2477                     iter->plp[n_plp].b = &p->b;
2478                     iter->plp[n_plp].cd = p->cd;
2479                     if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2480                 }
2481                 pptr = &(*pptr)->next;
2482             }
2483         }
2484         *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2485         // update iter->tid and iter->pos
2486         if (iter->head != iter->tail) {
2487             if (iter->tid > iter->head->b.core.tid) {
2488                 hts_log_error("Unsorted input. Pileup aborts");
2489                 iter->error = 1;
2490                 *_n_plp = -1;
2491                 return NULL;
2492             }
2493         }
2494         if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2495             iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2496         } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2497             iter->pos = iter->head->beg; // jump to the next position
2498         } else ++iter->pos; // scan contiguously
2499         // return
2500         if (n_plp) return iter->plp;
2501         if (iter->is_eof && iter->head == iter->tail) break;
2502     }
2503     return NULL;
2504 }
2505 
bam_plp_push(bam_plp_t iter,const bam1_t * b)2506 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2507 {
2508     if (iter->error) return -1;
2509     if (b) {
2510         if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2511         // Skip only unmapped reads here, any additional filtering must be done in iter->func
2512         if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2513         if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2514         {
2515             overlap_remove(iter, b);
2516             return 0;
2517         }
2518         bam_copy1(&iter->tail->b, b);
2519         overlap_push(iter, iter->tail);
2520 #ifndef BAM_NO_ID
2521         iter->tail->b.id = iter->id++;
2522 #endif
2523         iter->tail->beg = b->core.pos;
2524         iter->tail->end = bam_endpos(b);
2525         iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2526         if (b->core.tid < iter->max_tid) {
2527             hts_log_error("The input is not sorted (chromosomes out of order)");
2528             iter->error = 1;
2529             return -1;
2530         }
2531         if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2532             hts_log_error("The input is not sorted (reads out of order)");
2533             iter->error = 1;
2534             return -1;
2535         }
2536         iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2537         if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2538             if (iter->plp_construct)
2539                 iter->plp_construct(iter->data, b, &iter->tail->cd);
2540             iter->tail->next = mp_alloc(iter->mp);
2541             iter->tail = iter->tail->next;
2542         }
2543     } else iter->is_eof = 1;
2544     return 0;
2545 }
2546 
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2547 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2548 {
2549     const bam_pileup1_t *plp;
2550     if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2551     if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2552     else { // no pileup line can be obtained; read alignments
2553         *_n_plp = 0;
2554         if (iter->is_eof) return 0;
2555         int ret;
2556         while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2557             if (bam_plp_push(iter, iter->b) < 0) {
2558                 *_n_plp = -1;
2559                 return 0;
2560             }
2561             if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2562             // otherwise no pileup line can be returned; read the next alignment.
2563         }
2564         if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2565         bam_plp_push(iter, 0);
2566         if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2567         return 0;
2568     }
2569 }
2570 
bam_plp_reset(bam_plp_t iter)2571 void bam_plp_reset(bam_plp_t iter)
2572 {
2573     overlap_remove(iter, NULL);
2574     iter->max_tid = iter->max_pos = -1;
2575     iter->tid = iter->pos = 0;
2576     iter->is_eof = 0;
2577     while (iter->head != iter->tail) {
2578         lbnode_t *p = iter->head;
2579         iter->head = p->next;
2580         mp_free(iter->mp, p);
2581     }
2582 }
2583 
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2584 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2585 {
2586     iter->maxcnt = maxcnt;
2587 }
2588 
2589 /************************
2590  *** Mpileup iterator ***
2591  ************************/
2592 
2593 struct __bam_mplp_t {
2594     int n;
2595     uint64_t min, *pos;
2596     bam_plp_t *iter;
2597     int *n_plp;
2598     const bam_pileup1_t **plp;
2599 };
2600 
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2601 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2602 {
2603     int i;
2604     bam_mplp_t iter;
2605     iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2606     iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2607     iter->n_plp = (int*)calloc(n, sizeof(int));
2608     iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2609     iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2610     iter->n = n;
2611     iter->min = (uint64_t)-1;
2612     for (i = 0; i < n; ++i) {
2613         iter->iter[i] = bam_plp_init(func, data[i]);
2614         iter->pos[i] = iter->min;
2615     }
2616     return iter;
2617 }
2618 
bam_mplp_init_overlaps(bam_mplp_t iter)2619 void bam_mplp_init_overlaps(bam_mplp_t iter)
2620 {
2621     int i;
2622     for (i = 0; i < iter->n; ++i)
2623         bam_plp_init_overlaps(iter->iter[i]);
2624 }
2625 
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2626 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2627 {
2628     int i;
2629     for (i = 0; i < iter->n; ++i)
2630         iter->iter[i]->maxcnt = maxcnt;
2631 }
2632 
bam_mplp_destroy(bam_mplp_t iter)2633 void bam_mplp_destroy(bam_mplp_t iter)
2634 {
2635     int i;
2636     for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2637     free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2638     free(iter);
2639 }
2640 
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2641 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2642 {
2643     int i, ret = 0;
2644     uint64_t new_min = (uint64_t)-1;
2645     for (i = 0; i < iter->n; ++i) {
2646         if (iter->pos[i] == iter->min) {
2647             int tid, pos;
2648             iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2649             if ( iter->iter[i]->error ) return -1;
2650             iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2651         }
2652         if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2653     }
2654     iter->min = new_min;
2655     if (new_min == (uint64_t)-1) return 0;
2656     *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2657     for (i = 0; i < iter->n; ++i) {
2658         if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2659             n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2660             ++ret;
2661         } else n_plp[i] = 0, plp[i] = 0;
2662     }
2663     return ret;
2664 }
2665 
bam_mplp_reset(bam_mplp_t iter)2666 void bam_mplp_reset(bam_mplp_t iter)
2667 {
2668     int i;
2669     iter->min = (uint64_t)-1;
2670     for (i = 0; i < iter->n; ++i) {
2671         bam_plp_reset(iter->iter[i]);
2672         iter->pos[i] = (uint64_t)-1;
2673         iter->n_plp[i] = 0;
2674         iter->plp[i] = NULL;
2675     }
2676 }
2677 
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2678 void bam_mplp_constructor(bam_mplp_t iter,
2679                           int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2680     int i;
2681     for (i = 0; i < iter->n; ++i)
2682         bam_plp_constructor(iter->iter[i], func);
2683 }
2684 
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2685 void bam_mplp_destructor(bam_mplp_t iter,
2686                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2687     int i;
2688     for (i = 0; i < iter->n; ++i)
2689         bam_plp_destructor(iter->iter[i], func);
2690 }
2691 
2692 #endif // ~!defined(BAM_NO_PILEUP)
2693