1 /*  sam.c -- SAM and BAM file I/O and manipulation.
2 
3     Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4     Copyright (C) 2010, 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41 
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44 
45 typedef khash_t(s2i) sdict_t;
46 
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50 
51 /**********************
52  *** BAM header I/O ***
53  **********************/
54 
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57     return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59 
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62     int32_t i;
63     if (h == NULL) return;
64     if (h->target_name) {
65         for (i = 0; i < h->n_targets; ++i)
66             free(h->target_name[i]);
67         free(h->target_name);
68         free(h->target_len);
69     }
70     free(h->text); free(h->cigar_tab);
71     if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72     free(h);
73 }
74 
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77     if (h0 == NULL) return NULL;
78     bam_hdr_t *h;
79     if ((h = bam_hdr_init()) == NULL) return NULL;
80     // copy the simple data
81     h->n_targets = h0->n_targets;
82     h->ignore_sam_err = h0->ignore_sam_err;
83     h->l_text = h0->l_text;
84     // Then the pointery stuff
85     h->cigar_tab = NULL;
86     h->sdict = NULL;
87     // TODO Check for memory allocation failures
88     h->text = (char*)calloc(h->l_text + 1, 1);
89     memcpy(h->text, h0->text, h->l_text);
90     h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91     h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92     int i;
93     for (i = 0; i < h->n_targets; ++i) {
94         h->target_len[i] = h0->target_len[i];
95         h->target_name[i] = strdup(h0->target_name[i]);
96     }
97     return h;
98 }
99 
100 
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103     bam_hdr_t *h;
104     khint_t k;
105     h = bam_hdr_init();
106     h->sdict = d;
107     h->n_targets = kh_size(d);
108     // TODO Check for memory allocation failures
109     h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110     h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111     for (k = kh_begin(d); k != kh_end(d); ++k) {
112         if (!kh_exist(d, k)) continue;
113         h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114         h->target_len[kh_val(d, k)>>32]  = kh_val(d, k) & 0xffffffffUL;
115         kh_val(d, k) >>= 32;
116     }
117     return h;
118 }
119 
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122     bam_hdr_t *h;
123     char buf[4];
124     int magic_len, has_EOF;
125     int32_t i, name_len, num_names = 0;
126     size_t bufsize;
127     ssize_t bytes;
128     // check EOF
129     has_EOF = bgzf_check_EOF(fp);
130     if (has_EOF < 0) {
131         perror("[W::bam_hdr_read] bgzf_check_EOF");
132     } else if (has_EOF == 0) {
133         hts_log_warning("EOF marker is absent. The input is probably truncated");
134     }
135     // read "BAM1"
136     magic_len = bgzf_read(fp, buf, 4);
137     if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138         hts_log_error("Invalid BAM binary header");
139         return 0;
140     }
141     h = bam_hdr_init();
142     if (!h) goto nomem;
143 
144     // read plain text and the number of reference sequences
145     bytes = bgzf_read(fp, &h->l_text, 4);
146     if (bytes != 4) goto read_err;
147     if (fp->is_be) ed_swap_4p(&h->l_text);
148 
149     bufsize = ((size_t) h->l_text) + 1;
150     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151     h->text = (char*)malloc(bufsize);
152     if (!h->text) goto nomem;
153     h->text[h->l_text] = 0; // make sure it is NULL terminated
154     bytes = bgzf_read(fp, h->text, h->l_text);
155     if (bytes != h->l_text) goto read_err;
156 
157     bytes = bgzf_read(fp, &h->n_targets, 4);
158     if (bytes != 4) goto read_err;
159     if (fp->is_be) ed_swap_4p(&h->n_targets);
160 
161     if (h->n_targets < 0) goto invalid;
162 
163     // read reference sequence names and lengths
164     if (h->n_targets > 0) {
165         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166         if (!h->target_name) goto nomem;
167         h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168         if (!h->target_len) goto nomem;
169     }
170     else {
171         h->target_name = NULL;
172         h->target_len = NULL;
173     }
174 
175     for (i = 0; i != h->n_targets; ++i) {
176         bytes = bgzf_read(fp, &name_len, 4);
177         if (bytes != 4) goto read_err;
178         if (fp->is_be) ed_swap_4p(&name_len);
179         if (name_len <= 0) goto invalid;
180 
181         h->target_name[i] = (char*)malloc(name_len);
182         if (!h->target_name[i]) goto nomem;
183         num_names++;
184 
185         bytes = bgzf_read(fp, h->target_name[i], name_len);
186         if (bytes != name_len) goto read_err;
187 
188         if (h->target_name[i][name_len - 1] != '\0') {
189             /* Fix missing NUL-termination.  Is this being too nice?
190                We could alternatively bail out with an error. */
191             char *new_name;
192             if (name_len == INT32_MAX) goto invalid;
193             new_name = realloc(h->target_name[i], name_len + 1);
194             if (new_name == NULL) goto nomem;
195             h->target_name[i] = new_name;
196             h->target_name[i][name_len] = '\0';
197         }
198 
199         bytes = bgzf_read(fp, &h->target_len[i], 4);
200         if (bytes != 4) goto read_err;
201         if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202     }
203     return h;
204 
205  nomem:
206     hts_log_error("Out of memory");
207     goto clean;
208 
209  read_err:
210     if (bytes < 0) {
211         hts_log_error("Error reading BGZF stream");
212     } else {
213         hts_log_error("Truncated BAM header");
214     }
215     goto clean;
216 
217  invalid:
218     hts_log_error("Invalid BAM binary header");
219 
220  clean:
221     if (h != NULL) {
222         h->n_targets = num_names; // ensure we free only allocated target_names
223         bam_hdr_destroy(h);
224     }
225     return NULL;
226 }
227 
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230     char buf[4];
231     int32_t i, name_len, x;
232     // write "BAM1"
233     strncpy(buf, "BAM\1", 4);
234     if (bgzf_write(fp, buf, 4) < 0) return -1;
235     // write plain text and the number of reference sequences
236     if (fp->is_be) {
237         x = ed_swap_4(h->l_text);
238         if (bgzf_write(fp, &x, 4) < 0) return -1;
239         if (h->l_text) {
240             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
241         }
242         x = ed_swap_4(h->n_targets);
243         if (bgzf_write(fp, &x, 4) < 0) return -1;
244     } else {
245         if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
246         if (h->l_text) {
247             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
248         }
249         if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
250     }
251     // write sequence names and lengths
252     for (i = 0; i != h->n_targets; ++i) {
253         char *p = h->target_name[i];
254         name_len = strlen(p) + 1;
255         if (fp->is_be) {
256             x = ed_swap_4(name_len);
257             if (bgzf_write(fp, &x, 4) < 0) return -1;
258         } else {
259             if (bgzf_write(fp, &name_len, 4) < 0) return -1;
260         }
261         if (bgzf_write(fp, p, name_len) < 0) return -1;
262         if (fp->is_be) {
263             x = ed_swap_4(h->target_len[i]);
264             if (bgzf_write(fp, &x, 4) < 0) return -1;
265         } else {
266             if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
267         }
268     }
269     if (bgzf_flush(fp) < 0) return -1;
270     return 0;
271 }
272 
bam_name2id(bam_hdr_t * h,const char * ref)273 int bam_name2id(bam_hdr_t *h, const char *ref)
274 {
275     sdict_t *d = (sdict_t*)h->sdict;
276     khint_t k;
277     if (h->sdict == 0) {
278         int i, absent;
279         d = kh_init(s2i);
280         for (i = 0; i < h->n_targets; ++i) {
281             k = kh_put(s2i, d, h->target_name[i], &absent);
282             kh_val(d, k) = i;
283         }
284         h->sdict = d;
285     }
286     k = kh_get(s2i, d, ref);
287     return k == kh_end(d)? -1 : kh_val(d, k);
288 }
289 
290 /*************************
291  *** BAM alignment I/O ***
292  *************************/
293 
bam_init1()294 bam1_t *bam_init1()
295 {
296     return (bam1_t*)calloc(1, sizeof(bam1_t));
297 }
298 
bam_destroy1(bam1_t * b)299 void bam_destroy1(bam1_t *b)
300 {
301     if (b == 0) return;
302     free(b->data); free(b);
303 }
304 
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)305 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
306 {
307     uint8_t *data = bdst->data;
308     int m_data = bdst->m_data;   // backup data and m_data
309     if (m_data < bsrc->l_data) { // double the capacity
310         m_data = bsrc->l_data; kroundup32(m_data);
311         data = (uint8_t*)realloc(data, m_data);
312     }
313     memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
314     *bdst = *bsrc; // copy the rest
315     // restore the backup
316     bdst->m_data = m_data;
317     bdst->data = data;
318     return bdst;
319 }
320 
bam_dup1(const bam1_t * bsrc)321 bam1_t *bam_dup1(const bam1_t *bsrc)
322 {
323     if (bsrc == NULL) return NULL;
324     bam1_t *bdst = bam_init1();
325     if (bdst == NULL) return NULL;
326     return bam_copy1(bdst, bsrc);
327 }
328 
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)329 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
330 {
331     int k;
332     *rlen = *qlen = 0;
333     for (k = 0; k < n_cigar; ++k) {
334         int type = bam_cigar_type(bam_cigar_op(cigar[k]));
335         int len = bam_cigar_oplen(cigar[k]);
336         if (type & 1) *qlen += len;
337         if (type & 2) *rlen += len;
338     }
339 }
340 
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)341 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
342 {
343     int k, l;
344     for (k = l = 0; k < n_cigar; ++k)
345         if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
346             l += bam_cigar_oplen(cigar[k]);
347     return l;
348 }
349 
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)350 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
351 {
352     int k, l;
353     for (k = l = 0; k < n_cigar; ++k)
354         if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
355             l += bam_cigar_oplen(cigar[k]);
356     return l;
357 }
358 
bam_endpos(const bam1_t * b)359 int32_t bam_endpos(const bam1_t *b)
360 {
361     if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
362         return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
363     else
364         return b->core.pos + 1;
365 }
366 
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)367 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
368 {
369     bam1_core_t *c = &b->core;
370     uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
371     uint8_t *CG;
372 
373     // test where there is a real CIGAR in the CG tag to move
374     if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
375     cigar0 = bam_get_cigar(b);
376     if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
377     fake_bytes = c->n_cigar * 4;
378     if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
379     if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
380     CG_len = le_to_u32(CG + 2);
381     if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
382 
383     // move from the CG tag to the right position
384     cigar_st = (uint8_t*)cigar0 - b->data;
385     c->n_cigar = CG_len;
386     n_cigar4 = c->n_cigar * 4;
387     CG_st = CG - b->data - 2;
388     CG_en = CG_st + 8 + n_cigar4;
389     b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
390     if (b->m_data < b->l_data) {
391         uint8_t *new_data;
392         uint32_t new_max = b->l_data;
393         kroundup32(new_max);
394         new_data = (uint8_t*)realloc(b->data, new_max);
395         if (new_data == 0) return -1;
396         b->m_data = new_max, b->data = new_data;
397     }
398     memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
399     memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
400     if (ori_len > CG_en) // move data after the CG tag
401         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
402     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
403     if (recal_bin)
404         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
405     if (give_warning)
406         hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
407     return 1;
408 }
409 
aux_type2size(uint8_t type)410 static inline int aux_type2size(uint8_t type)
411 {
412     switch (type) {
413     case 'A': case 'c': case 'C':
414         return 1;
415     case 's': case 'S':
416         return 2;
417     case 'i': case 'I': case 'f':
418         return 4;
419     case 'd':
420         return 8;
421     case 'Z': case 'H': case 'B':
422         return type;
423     default:
424         return 0;
425     }
426 }
427 
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)428 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
429 {
430     uint32_t *cigar = (uint32_t*)(data + c->l_qname);
431     uint32_t i;
432     for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
433 }
434 
bam_read1(BGZF * fp,bam1_t * b)435 int bam_read1(BGZF *fp, bam1_t *b)
436 {
437     bam1_core_t *c = &b->core;
438     int32_t block_len, ret, i;
439     uint32_t x[8];
440     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
441         if (ret == 0) return -1; // normal end-of-file
442         else return -2; // truncated
443     }
444     if (fp->is_be)
445         ed_swap_4p(&block_len);
446     if (block_len < 32) return -4;  // block_len includes core data
447     if (bgzf_read(fp, x, 32) != 32) return -3;
448     if (fp->is_be) {
449         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
450     }
451     c->tid = x[0]; c->pos = x[1];
452     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
453     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
454     if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
455         return -4;
456     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
457     c->l_qseq = x[4];
458     c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
459     b->l_data = block_len - 32 + c->l_extranul;
460     if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
461     if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
462         + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) b->l_data)
463         return -4;
464     if (b->m_data < b->l_data) {
465         uint8_t *new_data;
466         uint32_t new_m = b->l_data;
467         kroundup32(new_m);
468         new_data = (uint8_t*)realloc(b->data, new_m);
469         if (!new_data)
470             return -4;
471         b->data = new_data;
472         b->m_data = new_m;
473     }
474     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
475     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
476     c->l_qname += c->l_extranul;
477     if (b->l_data < c->l_qname ||
478         bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
479         return -4;
480     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
481     if (bam_tag2cigar(b, 0, 0) < 0)
482         return -4;
483 
484     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
485         int rlen, qlen;
486         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
487         if ((b->core.flag & BAM_FUNMAP)) rlen=1;
488         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
489         // Sanity check for broken CIGAR alignments
490         if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
491             hts_log_error("CIGAR and query sequence lengths differ for %s",
492                     bam_get_qname(b));
493             return -4;
494         }
495     }
496 
497     return 4 + block_len;
498 }
499 
bam_write1(BGZF * fp,const bam1_t * b)500 int bam_write1(BGZF *fp, const bam1_t *b)
501 {
502     const bam1_core_t *c = &b->core;
503     uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
504     int i, ok;
505     if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
506     x[0] = c->tid;
507     x[1] = c->pos;
508     x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
509     if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
510     else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
511     x[4] = c->l_qseq;
512     x[5] = c->mtid;
513     x[6] = c->mpos;
514     x[7] = c->isize;
515     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
516     if (fp->is_be) {
517         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
518         y = block_len;
519         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
520         swap_data(c, b->l_data, b->data, 1);
521     } else {
522         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
523     }
524     if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
525     if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
526     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
527         if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
528     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
529         uint8_t buf[8];
530         uint32_t cigar_st, cigar_en, cigar[2];
531         cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
532         cigar_en = cigar_st + c->n_cigar * 4;
533         cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
534         cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
535         u32_to_le(cigar[0], buf);
536         u32_to_le(cigar[1], buf + 4);
537         if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
538         if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
539         if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
540         u32_to_le(c->n_cigar, buf);
541         if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
542         if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
543     }
544     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
545     return ok? 4 + block_len : -1;
546 }
547 
548 /********************
549  *** BAM indexing ***
550  ********************/
551 
bam_index(BGZF * fp,int min_shift)552 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
553 {
554     int n_lvls, i, fmt, ret;
555     bam1_t *b;
556     hts_idx_t *idx;
557     bam_hdr_t *h;
558     h = bam_hdr_read(fp);
559     if (h == NULL) return NULL;
560     if (min_shift > 0) {
561         int64_t max_len = 0, s;
562         for (i = 0; i < h->n_targets; ++i)
563             if (max_len < h->target_len[i]) max_len = h->target_len[i];
564         max_len += 256;
565         for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
566         fmt = HTS_FMT_CSI;
567     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
568     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
569     bam_hdr_destroy(h);
570     b = bam_init1();
571     while ((ret = bam_read1(fp, b)) >= 0) {
572         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
573         if (ret < 0) goto err; // unsorted
574     }
575     if (ret < -1) goto err; // corrupted BAM file
576 
577     hts_idx_finish(idx, bgzf_tell(fp));
578     bam_destroy1(b);
579     return idx;
580 
581 err:
582     bam_destroy1(b);
583     hts_idx_destroy(idx);
584     return NULL;
585 }
586 
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)587 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
588 {
589     hts_idx_t *idx;
590     htsFile *fp;
591     int ret = 0;
592 
593     if ((fp = hts_open(fn, "r")) == 0) return -2;
594     if (nthreads)
595         hts_set_threads(fp, nthreads);
596 
597     switch (fp->format.format) {
598     case cram:
599         ret = cram_index_build(fp->fp.cram, fn, fnidx);
600         break;
601 
602     case bam:
603         idx = bam_index(fp->fp.bgzf, min_shift);
604         if (idx) {
605             ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
606             if (ret < 0) ret = -4;
607             hts_idx_destroy(idx);
608         }
609         else ret = -1;
610         break;
611 
612     default:
613         ret = -3;
614         break;
615     }
616     hts_close(fp);
617 
618     return ret;
619 }
620 
sam_index_build2(const char * fn,const char * fnidx,int min_shift)621 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
622 {
623     return sam_index_build3(fn, fnidx, min_shift, 0);
624 }
625 
sam_index_build(const char * fn,int min_shift)626 int sam_index_build(const char *fn, int min_shift)
627 {
628     return sam_index_build3(fn, NULL, min_shift, 0);
629 }
630 
631 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
632 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)633 int bam_index_build(const char *fn, int min_shift)
634 {
635     return sam_index_build2(fn, NULL, min_shift);
636 }
637 
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)638 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
639 {
640     bam1_t *b = bv;
641     int ret;
642     if ((ret = bam_read1(fp, b)) >= 0) {
643         *tid = b->core.tid;
644         *beg = b->core.pos;
645         *end = bam_endpos(b);
646     }
647     return ret;
648 }
649 
650 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)651 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
652 {
653     htsFile *fp = fpv;
654     bam1_t *b = bv;
655     int ret = cram_get_bam_seq(fp->fp.cram, &b);
656     if (ret < 0)
657         return cram_eof(fp->fp.cram) ? -1 : -2;
658 
659     if (bam_tag2cigar(b, 1, 1) < 0)
660         return -2;
661 
662     *tid = b->core.tid;
663     *beg = b->core.pos;
664     *end = bam_endpos(b);
665 
666     return ret;
667 }
668 
cram_pseek(void * fp,int64_t offset,int whence)669 static int cram_pseek(void *fp, int64_t offset, int whence)
670 {
671     cram_fd *fd =  (cram_fd *)fp;
672 
673     if ((0 != cram_seek(fd, offset, SEEK_SET))
674      && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
675         return -1;
676 
677     if (fd->ctr) {
678         cram_free_container(fd->ctr);
679         fd->ctr = NULL;
680         fd->ooc = 0;
681     }
682 
683     return 0;
684 }
685 
686 /*
687  * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
688  *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
689  *   container previously fetched. It was designed like this to integrate with the functionality
690  *   of the iterator stepping logic.
691  */
692 
cram_ptell(void * fp)693 static int64_t cram_ptell(void *fp)
694 {
695     cram_fd *fd = (cram_fd *)fp;
696     cram_container *c;
697     int64_t ret = -1L;
698 
699     if (fd && fd->fp) {
700         ret = htell(fd->fp);
701         if ((c = fd->ctr) != NULL) {
702             ret -= ((c->curr_slice != c->max_slice || c->curr_rec != c->max_rec) ? c->offset + 1 : 0);
703         }
704     }
705 
706     return ret;
707 }
708 
bam_pseek(void * fp,int64_t offset,int whence)709 static int bam_pseek(void *fp, int64_t offset, int whence)
710 {
711     BGZF *fd = (BGZF *)fp;
712 
713     return bgzf_seek(fd, offset, whence);
714 }
715 
bam_ptell(void * fp)716 static int64_t bam_ptell(void *fp)
717 {
718     BGZF *fd = (BGZF *)fp;
719     if (!fd)
720         return -1L;
721 
722     return bgzf_tell(fd);
723 }
724 
725 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)726 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
727 {
728     htsFile *fp = fpv;
729     bam1_t *b = bv;
730     switch (fp->format.format) {
731     case bam:   return bam_read1(bgzfp, b);
732     case cram: {
733         int ret = cram_get_bam_seq(fp->fp.cram, &b);
734         if (ret < 0)
735             return cram_eof(fp->fp.cram) ? -1 : -2;
736 
737         if (bam_tag2cigar(b, 1, 1) < 0)
738             return -2;
739         return ret;
740     }
741     default:
742         // TODO Need headers available to implement this for SAM files
743         hts_log_error("Not implemented for SAM files");
744         abort();
745     }
746 }
747 
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)748 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
749 {
750     switch (fp->format.format) {
751     case bam:
752         return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
753 
754     case cram: {
755         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
756         // Cons up a fake "index" just pointing at the associated cram_fd:
757         hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
758         if (idx == NULL) return NULL;
759         idx->fmt = HTS_FMT_CRAI;
760         idx->cram = fp->fp.cram;
761         return (hts_idx_t *) idx;
762         }
763 
764     default:
765         return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
766     }
767 }
768 
sam_index_load(htsFile * fp,const char * fn)769 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
770 {
771     return sam_index_load2(fp, fn, NULL);
772 }
773 
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)774 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
775 {
776     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
777     hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
778     if (iter == NULL) return NULL;
779 
780     // Cons up a dummy iterator for which hts_itr_next() will simply invoke
781     // the readrec function:
782     iter->is_cram = 1;
783     iter->read_rest = 1;
784     iter->off = NULL;
785     iter->bins.a = NULL;
786     iter->readrec = readrec;
787 
788     if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
789         cram_range r = { tid, beg+1, end };
790         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
791 
792         iter->curr_off = 0;
793         // The following fields are not required by hts_itr_next(), but are
794         // filled in in case user code wants to look at them.
795         iter->tid = tid;
796         iter->beg = beg;
797         iter->end = end;
798 
799         switch (ret) {
800         case 0:
801             break;
802 
803         case -2:
804             // No data vs this ref, so mark iterator as completed.
805             // Same as HTS_IDX_NONE.
806             iter->finished = 1;
807             break;
808 
809         default:
810             free(iter);
811             return NULL;
812         }
813     }
814     else switch (tid) {
815     case HTS_IDX_REST:
816         iter->curr_off = 0;
817         break;
818     case HTS_IDX_NONE:
819         iter->curr_off = 0;
820         iter->finished = 1;
821         break;
822     default:
823         hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
824         abort();
825         break;
826     }
827 
828     return iter;
829 }
830 
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)831 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
832 {
833     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
834     if (idx == NULL)
835         return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
836     else if (cidx->fmt == HTS_FMT_CRAI)
837         return cram_itr_query(idx, tid, beg, end, cram_readrec);
838     else
839         return hts_itr_query(idx, tid, beg, end, bam_readrec);
840 }
841 
cram_name2id(void * fdv,const char * ref)842 static int cram_name2id(void *fdv, const char *ref)
843 {
844     cram_fd *fd = (cram_fd *) fdv;
845     return sam_hdr_name2ref(fd->header, ref);
846 }
847 
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)848 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
849 {
850     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
851     if (cidx->fmt == HTS_FMT_CRAI)
852         return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
853     else
854         return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
855 }
856 
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)857 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
858 {
859     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
860     if (cidx->fmt == HTS_FMT_CRAI)
861         return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
862                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
863     else
864         return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
865                    hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
866 }
867 
868 /**********************
869  *** SAM header I/O ***
870  **********************/
871 
872 #include "htslib/kseq.h"
873 #include "htslib/kstring.h"
874 
sam_hdr_parse(int l_text,const char * text)875 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
876 {
877     const char *q, *r, *p;
878     khash_t(s2i) *d;
879     d = kh_init(s2i);
880     for (p = text; *p; ++p) {
881         if (strncmp(p, "@SQ\t", 4) == 0) {
882             char *sn = 0;
883             int ln = -1;
884             for (q = p + 4;; ++q) {
885                 if (strncmp(q, "SN:", 3) == 0) {
886                     q += 3;
887                     for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
888                     sn = (char*)calloc(r - q + 1, 1);
889                     strncpy(sn, q, r - q);
890                     q = r;
891                 } else if (strncmp(q, "LN:", 3) == 0)
892                     ln = strtol(q + 3, (char**)&q, 10);
893                 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
894                 if (*q == '\0' || *q == '\n') break;
895             }
896             p = q;
897             if (sn && ln >= 0) {
898                 khint_t k;
899                 int absent;
900                 k = kh_put(s2i, d, sn, &absent);
901                 if (!absent) {
902                     hts_log_warning("Duplicated sequence '%s'", sn);
903                     free(sn);
904                 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
905             }
906         }
907         while (*p != '\0' && *p != '\n') ++p;
908     }
909     return hdr_from_dict(d);
910 }
911 
912 // Minimal sanitisation of a header to ensure.
913 // - null terminated string.
914 // - all lines start with @ (also implies no blank lines).
915 //
916 // Much more could be done, but currently is not, including:
917 // - checking header types are known (HD, SQ, etc).
918 // - syntax (eg checking tab separated fields).
919 // - validating n_targets matches @SQ records.
920 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)921 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
922     if (!h)
923         return NULL;
924 
925     // Special case for empty headers.
926     if (h->l_text == 0)
927         return h;
928 
929     uint32_t i, lnum = 0;
930     char *cp = h->text, last = '\n';
931     for (i = 0; i < h->l_text; i++) {
932         // NB: l_text excludes terminating nul.  This finds early ones.
933         if (cp[i] == 0)
934             break;
935 
936         // Error on \n[^@], including duplicate newlines
937         if (last == '\n') {
938             lnum++;
939             if (cp[i] != '@') {
940                 hts_log_error("Malformed SAM header at line %u", lnum);
941                 bam_hdr_destroy(h);
942                 return NULL;
943             }
944         }
945 
946         last = cp[i];
947     }
948 
949     if (i < h->l_text) { // Early nul found.  Complain if not just padding.
950         uint32_t j = i;
951         while (j < h->l_text && cp[j] == '\0') j++;
952         if (j < h->l_text)
953             hts_log_warning("Unexpected NUL character in header. Possibly truncated");
954     }
955 
956     // Add trailing newline and/or trailing nul if required.
957     if (last != '\n') {
958         hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
959 
960         if (h->l_text == UINT32_MAX) {
961             hts_log_error("No room for extra newline");
962             bam_hdr_destroy(h);
963             return NULL;
964         }
965 
966         if (i >= h->l_text - 1) {
967             cp = realloc(h->text, (size_t) h->l_text+2);
968             if (!cp) {
969                 bam_hdr_destroy(h);
970                 return NULL;
971             }
972             h->text = cp;
973         }
974         cp[i++] = '\n';
975 
976         // l_text may be larger already due to multiple nul padding
977         if (h->l_text < i)
978             h->l_text = i;
979         cp[h->l_text] = '\0';
980     }
981 
982     return h;
983 }
984 
sam_hdr_read(htsFile * fp)985 bam_hdr_t *sam_hdr_read(htsFile *fp)
986 {
987     switch (fp->format.format) {
988     case bam:
989         return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
990 
991     case cram:
992         return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
993 
994     case sam: {
995         kstring_t str = { 0, 0, NULL };
996         bam_hdr_t *h = NULL;
997         int ret, has_SQ = 0;
998         while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
999             if (fp->line.s[0] != '@') break;
1000             if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1001             kputsn(fp->line.s, fp->line.l, &str);
1002             kputc('\n', &str);
1003         }
1004         if (ret < -1) goto error;
1005         if (! has_SQ && fp->fn_aux) {
1006             kstring_t line = { 0, 0, NULL };
1007             hFILE *f = hopen(fp->fn_aux, "r");
1008             if (f == NULL) goto error;
1009             while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1010                 char *tab = strchr(line.s, '\t');
1011                 if (tab == NULL) continue;
1012                 kputs("@SQ\tSN:", &str);
1013                 kputsn(line.s, tab - line.s, &str);
1014                 kputs("\tLN:", &str);
1015                 kputl(atol(tab), &str);
1016                 kputc('\n', &str);
1017             }
1018             free(line.s);
1019             if (hclose(f) != 0) {
1020                 hts_log_warning("Failed to close %s", fp->fn_aux);
1021             }
1022         }
1023         if (str.l == 0) kputsn("", 0, &str);
1024         h = sam_hdr_parse(str.l, str.s);
1025         h->l_text = str.l; h->text = str.s;
1026         return sam_hdr_sanitise(h);
1027 
1028      error:
1029         bam_hdr_destroy(h);
1030         free(str.s);
1031         return NULL;
1032         }
1033 
1034     default:
1035         abort();
1036     }
1037 }
1038 
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1039 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1040 {
1041     if (!h) {
1042         errno = EINVAL;
1043         return -1;
1044     }
1045 
1046     switch (fp->format.format) {
1047     case binary_format:
1048         fp->format.category = sequence_data;
1049         fp->format.format = bam;
1050         /* fall-through */
1051     case bam:
1052         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1053         break;
1054 
1055     case cram: {
1056         cram_fd *fd = fp->fp.cram;
1057         SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1058         if (! hdr) return -1;
1059         if (cram_set_header(fd, hdr) < 0) return -1;
1060         if (fp->fn_aux)
1061             cram_load_reference(fd, fp->fn_aux);
1062         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1063         }
1064         break;
1065 
1066     case text_format:
1067         fp->format.category = sequence_data;
1068         fp->format.format = sam;
1069         /* fall-through */
1070     case sam: {
1071         char *p;
1072         hputs(h->text, fp->fp.hfile);
1073         p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1074         if (p == 0) {
1075             int i;
1076             for (i = 0; i < h->n_targets; ++i) {
1077                 fp->line.l = 0;
1078                 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1079                 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1080                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1081             }
1082         }
1083         if ( hflush(fp->fp.hfile) != 0 ) return -1;
1084         }
1085         break;
1086 
1087     default:
1088         abort();
1089     }
1090     return 0;
1091 }
1092 
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1093 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1094 {
1095     char *p, *q, *beg = NULL, *end = NULL, *newtext;
1096     if (!h || !key)
1097         return -1;
1098 
1099     if (h->l_text > 3) {
1100         if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1101             if ((p = strchr(h->text, '\n')) == 0) return -1;
1102             *p = '\0'; // for strstr call
1103 
1104             char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1105 
1106             if ((q = strstr(h->text, tmp)) != 0) { // key exists
1107                 *p = '\n'; // change back
1108 
1109                 // mark the key:val
1110                 beg = q;
1111                 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1112                 end = q;
1113 
1114                 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1115                     && strlen(val) == end - beg - 4)
1116                      return 0; // val is the same, no need to change
1117 
1118             } else {
1119                 beg = end = p;
1120                 *p = '\n';
1121             }
1122         }
1123     }
1124     if (beg == NULL) { // no @HD
1125         if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1126             return -1;
1127         h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1128         if (val) {
1129             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1130                 return -1;
1131             h->l_text += strlen(val) + 4;
1132         }
1133         newtext = (char*)malloc(h->l_text + 1);
1134         if (!newtext) return -1;
1135 
1136         if (val)
1137             snprintf(newtext, h->l_text + 1,
1138                     "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1139         else
1140             snprintf(newtext, h->l_text + 1,
1141                     "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1142     } else { // has @HD but different or no key
1143         h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1144         if (val) {
1145             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1146                 return -1;
1147             h->l_text += strlen(val) + 4;
1148         }
1149         newtext = (char*)malloc(h->l_text + 1);
1150         if (!newtext) return -1;
1151 
1152         if (val) {
1153             snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1154                     (int) (beg - h->text), h->text, key, val, end);
1155         } else { //delete key
1156             snprintf(newtext, h->l_text + 1, "%.*s%s",
1157                     (int) (beg - h->text), h->text, end);
1158         }
1159     }
1160     free(h->text);
1161     h->text = newtext;
1162     return 0;
1163 }
1164 
1165 /**********************
1166  *** SAM record I/O ***
1167  **********************/
1168 
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1169 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1170 {
1171 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1172 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1173 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1174 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1175 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1176 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1177 
1178     uint8_t *t;
1179     char *p = s->s, *q;
1180     int i;
1181     kstring_t str;
1182     bam1_core_t *c = &b->core;
1183 
1184     str.l = b->l_data = 0;
1185     str.s = (char*)b->data; str.m = b->m_data;
1186     memset(c, 0, 32);
1187     if (h->cigar_tab == 0) {
1188         h->cigar_tab = (int8_t*) malloc(128);
1189         for (i = 0; i < 128; ++i)
1190             h->cigar_tab[i] = -1;
1191         for (i = 0; BAM_CIGAR_STR[i]; ++i)
1192             h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1193     }
1194     // qname
1195     q = _read_token(p);
1196     _parse_warn(p - q <= 1, "empty query name");
1197     _parse_err(p - q > 252, "query name too long");
1198     kputsn_(q, p - q, &str);
1199     for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1200         kputc_('\0', &str);
1201     c->l_qname = p - q + c->l_extranul;
1202     // flag
1203     c->flag = strtol(p, &p, 0);
1204     if (*p++ != '\t') goto err_ret; // malformated flag
1205     // chr
1206     q = _read_token(p);
1207     if (strcmp(q, "*")) {
1208         _parse_err(h->n_targets == 0, "missing SAM header");
1209         c->tid = bam_name2id(h, q);
1210         _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1211     } else c->tid = -1;
1212     // pos
1213     c->pos = strtol(p, &p, 10) - 1;
1214     if (*p++ != '\t') goto err_ret;
1215     if (c->pos < 0 && c->tid >= 0) {
1216         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1217         c->tid = -1;
1218     }
1219     if (c->tid < 0) c->flag |= BAM_FUNMAP;
1220     // mapq
1221     c->qual = strtol(p, &p, 10);
1222     if (*p++ != '\t') goto err_ret;
1223     // cigar
1224     if (*p != '*') {
1225         uint32_t *cigar;
1226         size_t n_cigar = 0;
1227         for (q = p; *p && *p != '\t'; ++p)
1228             if (!isdigit_c(*p)) ++n_cigar;
1229         if (*p++ != '\t') goto err_ret;
1230         _parse_err(n_cigar == 0, "no CIGAR operations");
1231         _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1232         c->n_cigar = n_cigar;
1233         _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1234         for (i = 0; i < c->n_cigar; ++i, ++q) {
1235             int op;
1236             cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1237             op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1238             _parse_err(op < 0, "unrecognized CIGAR operator");
1239             cigar[i] |= op;
1240         }
1241         // can't use bam_endpos() directly as some fields not yet set up
1242         i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1243     } else {
1244         _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1245         c->flag |= BAM_FUNMAP;
1246         q = _read_token(p);
1247         i = 1;
1248     }
1249     c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1250     // mate chr
1251     q = _read_token(p);
1252     if (strcmp(q, "=") == 0) {
1253         c->mtid = c->tid;
1254     } else if (strcmp(q, "*") == 0) {
1255         c->mtid = -1;
1256     } else {
1257         c->mtid = bam_name2id(h, q);
1258         _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1259     }
1260     // mpos
1261     c->mpos = strtol(p, &p, 10) - 1;
1262     if (*p++ != '\t') goto err_ret;
1263     if (c->mpos < 0 && c->mtid >= 0) {
1264         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1265         c->mtid = -1;
1266     }
1267     // tlen
1268     c->isize = strtol(p, &p, 10);
1269     if (*p++ != '\t') goto err_ret;
1270     // seq
1271     q = _read_token(p);
1272     if (strcmp(q, "*")) {
1273         c->l_qseq = p - q - 1;
1274         i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1275         _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1276         i = (c->l_qseq + 1) >> 1;
1277         _get_mem(uint8_t, &t, &str, i);
1278         memset(t, 0, i);
1279         for (i = 0; i < c->l_qseq; ++i)
1280             t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
1281     } else c->l_qseq = 0;
1282     // qual
1283     q = _read_token_aux(p);
1284     _get_mem(uint8_t, &t, &str, c->l_qseq);
1285     if (strcmp(q, "*")) {
1286         _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1287         for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1288     } else memset(t, 0xff, c->l_qseq);
1289     // aux
1290     while (p < s->s + s->l) {
1291         uint8_t type;
1292         q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1293         _parse_err(p - q - 1 < 5, "incomplete aux field");
1294         kputsn_(q, 2, &str);
1295         q += 3; type = *q++; ++q; // q points to value
1296         if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1297             _parse_err(p - q - 1 < 1, "incomplete aux field");
1298 
1299         // Ensure str has enough space for a double + type allocated.
1300         // This is so we can stuff bigger integers and floats directly into
1301         // the kstring.  Sorry.
1302         _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1303 
1304         if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1305             kputc_('A', &str);
1306             kputc_(*q, &str);
1307         } else if (type == 'i' || type == 'I') {
1308             if (*q == '-') {
1309                 long x = strtol(q, &q, 10);
1310                 if (x >= INT8_MIN) {
1311                     kputc_('c', &str); kputc_(x, &str);
1312                 } else if (x >= INT16_MIN) {
1313                     str.s[str.l++] = 's';
1314                     i16_to_le(x, (uint8_t *) str.s + str.l);
1315                     str.l += 2;
1316                 } else {
1317                     str.s[str.l++] = 'i';
1318                     i32_to_le(x, (uint8_t *) str.s + str.l);
1319                     str.l += 4;
1320                 }
1321             } else {
1322                 unsigned long x = strtoul(q, &q, 10);
1323                 if (x <= UINT8_MAX) {
1324                     kputc_('C', &str); kputc_(x, &str);
1325                 } else if (x <= UINT16_MAX) {
1326                     str.s[str.l++] = 'S';
1327                     u16_to_le(x, (uint8_t *) str.s + str.l);
1328                     str.l += 2;
1329                 } else {
1330                     str.s[str.l++] = 'I';
1331                     u32_to_le(x, (uint8_t *) str.s + str.l);
1332                     str.l += 4;
1333                 }
1334             }
1335         } else if (type == 'f') {
1336             str.s[str.l++] = 'f';
1337             float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1338             str.l += sizeof(float);
1339         } else if (type == 'd') {
1340             str.s[str.l++] = 'd';
1341             double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1342             str.l += sizeof(double);
1343         } else if (type == 'Z' || type == 'H') {
1344             _parse_err(type == 'H' && !((p-q)&1),
1345                        "hex field does not have an even number of digits");
1346             kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1347         } else if (type == 'B') {
1348             int32_t n, size;
1349             size_t bytes;
1350             char *r;
1351             _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1352             type = *q++; // q points to the first ',' following the typing byte
1353 
1354             size = aux_type2size(type);
1355             _parse_err_param(size <= 0 || size > 4,
1356                              "unrecognized type B:%c", type);
1357             _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1358 
1359             for (r = q, n = 0; *r; ++r)
1360                 if (*r == ',') ++n;
1361 
1362             // Ensure space for type + values
1363             bytes = (size_t) n * (size_t) size;
1364             _parse_err(bytes / size != n
1365                        || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1366                        "out of memory");
1367             str.s[str.l++] = 'B';
1368             str.s[str.l++] = type;
1369             i32_to_le(n, (uint8_t *) str.s + str.l);
1370             str.l += sizeof(uint32_t);
1371 
1372             // This ensures that q always ends up at the next comma after
1373             // reading a number even if it's followed by junk.  It
1374             // prevents the possibility of trying to read more than n items.
1375 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1376 
1377             if (type == 'c')      while (q + 1 < p) { int8_t   x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1378             else if (type == 'C') while (q + 1 < p) { uint8_t  x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1379             else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1380             else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1381             else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1382             else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1383             else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1384             else _parse_err_param(1, "unrecognized type B:%c", type);
1385 
1386 #undef _skip_to_comma
1387 
1388         } else _parse_err_param(1, "unrecognized type %c", type);
1389     }
1390     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1391     if (bam_tag2cigar(b, 1, 1) < 0)
1392         return -2;
1393     return 0;
1394 
1395 #undef _parse_warn
1396 #undef _parse_err
1397 #undef _parse_err_param
1398 #undef _get_mem
1399 #undef _read_token_aux
1400 #undef _read_token
1401 err_ret:
1402     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1403     return -2;
1404 }
1405 
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1406 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1407 {
1408     switch (fp->format.format) {
1409     case bam: {
1410         int r = bam_read1(fp->fp.bgzf, b);
1411         if (r >= 0) {
1412             if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
1413                 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1414                 return -3;
1415         }
1416         return r;
1417         }
1418 
1419     case cram: {
1420         int ret = cram_get_bam_seq(fp->fp.cram, &b);
1421         if (ret < 0)
1422             return cram_eof(fp->fp.cram) ? -1 : -2;
1423 
1424         if (bam_tag2cigar(b, 1, 1) < 0)
1425             return -2;
1426         return ret;
1427     }
1428 
1429     case sam: {
1430         int ret;
1431 err_recover:
1432         if (fp->line.l == 0) {
1433             ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1434             if (ret < 0) return ret;
1435         }
1436         ret = sam_parse1(&fp->line, h, b);
1437         fp->line.l = 0;
1438         if (ret < 0) {
1439             hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1440             if (h->ignore_sam_err) goto err_recover;
1441         }
1442         return ret;
1443     }
1444 
1445     default:
1446         abort();
1447     }
1448 }
1449 
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1450 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1451 {
1452     int i;
1453     uint8_t *s, *end;
1454     const bam1_core_t *c = &b->core;
1455 
1456     str->l = 0;
1457     kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1458     kputw(c->flag, str); kputc('\t', str); // flag
1459     if (c->tid >= 0) { // chr
1460         kputs(h->target_name[c->tid] , str);
1461         kputc('\t', str);
1462     } else kputsn("*\t", 2, str);
1463     kputw(c->pos + 1, str); kputc('\t', str); // pos
1464     kputw(c->qual, str); kputc('\t', str); // qual
1465     if (c->n_cigar) { // cigar
1466         uint32_t *cigar = bam_get_cigar(b);
1467         for (i = 0; i < c->n_cigar; ++i) {
1468             kputw(bam_cigar_oplen(cigar[i]), str);
1469             kputc(bam_cigar_opchr(cigar[i]), str);
1470         }
1471     } else kputc('*', str);
1472     kputc('\t', str);
1473     if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1474     else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1475     else {
1476         kputs(h->target_name[c->mtid], str);
1477         kputc('\t', str);
1478     }
1479     kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1480     kputw(c->isize, str); kputc('\t', str); // template len
1481     if (c->l_qseq) { // seq and qual
1482         uint8_t *s = bam_get_seq(b);
1483         for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1484         kputc('\t', str);
1485         s = bam_get_qual(b);
1486         if (s[0] == 0xff) kputc('*', str);
1487         else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1488     } else kputsn("*\t*", 3, str);
1489 
1490     s = bam_get_aux(b); // aux
1491     end = b->data + b->l_data;
1492     while (end - s >= 4) {
1493         uint8_t type, key[2];
1494         key[0] = s[0]; key[1] = s[1];
1495         s += 2; type = *s++;
1496         kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1497         if (type == 'A') {
1498             kputsn("A:", 2, str);
1499             kputc(*s, str);
1500             ++s;
1501         } else if (type == 'C') {
1502             kputsn("i:", 2, str);
1503             kputw(*s, str);
1504             ++s;
1505         } else if (type == 'c') {
1506             kputsn("i:", 2, str);
1507             kputw(*(int8_t*)s, str);
1508             ++s;
1509         } else if (type == 'S') {
1510             if (end - s >= 2) {
1511                 kputsn("i:", 2, str);
1512                 kputuw(le_to_u16(s), str);
1513                 s += 2;
1514             } else goto bad_aux;
1515         } else if (type == 's') {
1516             if (end - s >= 2) {
1517                 kputsn("i:", 2, str);
1518                 kputw(le_to_i16(s), str);
1519                 s += 2;
1520             } else goto bad_aux;
1521         } else if (type == 'I') {
1522             if (end - s >= 4) {
1523                 kputsn("i:", 2, str);
1524                 kputuw(le_to_u32(s), str);
1525                 s += 4;
1526             } else goto bad_aux;
1527         } else if (type == 'i') {
1528             if (end - s >= 4) {
1529                 kputsn("i:", 2, str);
1530                 kputw(le_to_i32(s), str);
1531                 s += 4;
1532             } else goto bad_aux;
1533         } else if (type == 'f') {
1534             if (end - s >= 4) {
1535                 ksprintf(str, "f:%g", le_to_float(s));
1536                 s += 4;
1537             } else goto bad_aux;
1538 
1539         } else if (type == 'd') {
1540             if (end - s >= 8) {
1541                 ksprintf(str, "d:%g", le_to_double(s));
1542                 s += 8;
1543             } else goto bad_aux;
1544         } else if (type == 'Z' || type == 'H') {
1545             kputc(type, str); kputc(':', str);
1546             while (s < end && *s) kputc(*s++, str);
1547             if (s >= end)
1548                 goto bad_aux;
1549             ++s;
1550         } else if (type == 'B') {
1551             uint8_t sub_type = *(s++);
1552             int sub_type_size = aux_type2size(sub_type);
1553             uint32_t n;
1554             if (sub_type_size == 0 || end - s < 4)
1555                 goto bad_aux;
1556             n = le_to_u32(s);
1557             s += 4; // now points to the start of the array
1558             if ((end - s) / sub_type_size < n)
1559                 goto bad_aux;
1560             kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1561             for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1562                 kputc(',', str);
1563                 if ('c' == sub_type)      { kputw(*(int8_t*)s, str); ++s; }
1564                 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1565                 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1566                 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1567                 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1568                 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1569                 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1570                 else goto bad_aux;  // Unknown sub-type
1571             }
1572         } else { // Unknown type
1573             goto bad_aux;
1574         }
1575     }
1576     return str->l;
1577 
1578  bad_aux:
1579     hts_log_error("Corrupted aux data for read %.*s",
1580                   b->core.l_qname, bam_get_qname(b));
1581     errno = EINVAL;
1582     return -1;
1583 }
1584 
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1585 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1586 {
1587     switch (fp->format.format) {
1588     case binary_format:
1589         fp->format.category = sequence_data;
1590         fp->format.format = bam;
1591         /* fall-through */
1592     case bam:
1593         return bam_write1(fp->fp.bgzf, b);
1594 
1595     case cram:
1596         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1597 
1598     case text_format:
1599         fp->format.category = sequence_data;
1600         fp->format.format = sam;
1601         /* fall-through */
1602     case sam:
1603         if (sam_format1(h, b, &fp->line) < 0) return -1;
1604         kputc('\n', &fp->line);
1605         if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1606         return fp->line.l;
1607 
1608     default:
1609         abort();
1610     }
1611 }
1612 
1613 /************************
1614  *** Auxiliary fields ***
1615  ************************/
1616 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1617 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1618     int tsz = aux_type2size(type);
1619 
1620     if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1621 
1622     switch (tsz) {
1623         case 'H': case 'Z': case 1:  // Trivial
1624             memcpy(out, in, len);
1625             break;
1626 
1627 #define aux_val_to_le(type_t, store_le) do {                            \
1628         type_t v;                                                       \
1629         size_t i;                                                       \
1630         for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1631             memcpy(&v, in + i, sizeof(type_t));                         \
1632             store_le(v, out);                                           \
1633         }                                                               \
1634     } while (0)
1635 
1636         case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1637         case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1638         case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1639 
1640 #undef aux_val_to_le
1641 
1642         case 'B': { // Recurse!
1643             uint32_t n;
1644             if (len < 5) return -1;
1645             memcpy(&n, in + 1, 4);
1646             out[0] = in[0];
1647             u32_to_le(n, out + 1);
1648             return aux_to_le(in[0], out + 5, in + 5, len - 5);
1649         }
1650 
1651         default: // Unknown type code
1652             return -1;
1653     }
1654 
1655 
1656 
1657     return 0;
1658 }
1659 #endif
1660 
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1661 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1662 {
1663     uint32_t new_len;
1664 
1665     assert(b->l_data >= 0);
1666     new_len = b->l_data + 3 + len;
1667     if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1668 
1669     if (b->m_data < new_len) {
1670         uint32_t new_size = new_len;
1671         uint8_t *new_data;
1672         kroundup32(new_size);
1673         new_data = realloc(b->data, new_size);
1674         if (new_data == NULL) goto nomem;
1675         b->m_data = new_size;
1676         b->data = new_data;
1677     }
1678 
1679     b->data[b->l_data] = tag[0];
1680     b->data[b->l_data + 1] = tag[1];
1681     b->data[b->l_data + 2] = type;
1682 
1683 #ifdef HTS_LITTLE_ENDIAN
1684     memcpy(b->data + b->l_data + 3, data, len);
1685 #else
1686     if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1687         errno = EINVAL;
1688         return -1;
1689     }
1690 #endif
1691 
1692     b->l_data = new_len;
1693 
1694     return 0;
1695 
1696  nomem:
1697     errno = ENOMEM;
1698     return -1;
1699 }
1700 
skip_aux(uint8_t * s,uint8_t * end)1701 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1702 {
1703     int size;
1704     uint32_t n;
1705     if (s >= end) return end;
1706     size = aux_type2size(*s); ++s; // skip type
1707     switch (size) {
1708     case 'Z':
1709     case 'H':
1710         while (*s && s < end) ++s;
1711         return s < end ? s + 1 : end;
1712     case 'B':
1713         if (end - s < 5) return NULL;
1714         size = aux_type2size(*s); ++s;
1715         n = le_to_u32(s);
1716         s += 4;
1717         if (size == 0 || end - s < size * n) return NULL;
1718         return s + size * n;
1719     case 0:
1720         return NULL;
1721     default:
1722         if (end - s < size) return NULL;
1723         return s + size;
1724     }
1725 }
1726 
bam_aux_get(const bam1_t * b,const char tag[2])1727 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1728 {
1729     uint8_t *s, *end, *t = (uint8_t *) tag;
1730     uint16_t y = (uint16_t) t[0]<<8 | t[1];
1731     s = bam_get_aux(b);
1732     end = b->data + b->l_data;
1733     while (s != NULL && end - s >= 3) {
1734         uint16_t x = (uint16_t) s[0]<<8 | s[1];
1735         s += 2;
1736         if (x == y) {
1737             // Check the tag value is valid and complete
1738             uint8_t *e = skip_aux(s, end);
1739             if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1740                 goto bad_aux;  // Unterminated string
1741             }
1742             if (e != NULL) {
1743                 return s;
1744             } else {
1745                 goto bad_aux;
1746             }
1747         }
1748         s = skip_aux(s, end);
1749     }
1750     if (s == NULL) goto bad_aux;
1751     errno = ENOENT;
1752     return NULL;
1753 
1754  bad_aux:
1755     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1756     errno = EINVAL;
1757     return NULL;
1758 }
1759 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1760 int bam_aux_del(bam1_t *b, uint8_t *s)
1761 {
1762     uint8_t *p, *aux;
1763     int l_aux = bam_get_l_aux(b);
1764     aux = bam_get_aux(b);
1765     p = s - 2;
1766     s = skip_aux(s, aux + l_aux);
1767     if (s == NULL) goto bad_aux;
1768     memmove(p, s, l_aux - (s - aux));
1769     b->l_data -= s - p;
1770     return 0;
1771 
1772  bad_aux:
1773     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1774     errno = EINVAL;
1775     return -1;
1776 }
1777 
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1778 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1779 {
1780     // FIXME: This is not at all efficient!
1781     uint8_t *s = bam_aux_get(b,tag);
1782     if (!s) {
1783         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1784             return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1785         } else { // Invalid aux data, give up.
1786             return -1;
1787         }
1788     }
1789     char type = *s;
1790     if (type != 'Z') {
1791         hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1792         errno = EINVAL;
1793         return -1;
1794     }
1795 
1796     bam_aux_del(b,s);
1797     s -= 2;
1798     int l_aux = bam_get_l_aux(b);
1799 
1800     b->l_data += 3 + len;
1801     if (b->m_data < b->l_data) {
1802         ptrdiff_t s_offset = s - b->data;
1803         b->m_data = b->l_data;
1804         kroundup32(b->m_data);
1805         b->data = (uint8_t*)realloc(b->data, b->m_data);
1806         s = b->data + s_offset;
1807     }
1808     memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1809     s[0] = tag[0];
1810     s[1] = tag[1];
1811     s[2] = type;
1812     memmove(s+3,data,len);
1813     return 0;
1814 }
1815 
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1816 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1817                                       uint32_t idx)
1818 {
1819     switch (type) {
1820         case 'c': return le_to_i8(s + idx);
1821         case 'C': return s[idx];
1822         case 's': return le_to_i16(s + 2 * idx);
1823         case 'S': return le_to_u16(s + 2 * idx);
1824         case 'i': return le_to_i32(s + 4 * idx);
1825         case 'I': return le_to_u32(s + 4 * idx);
1826         default:
1827             errno = EINVAL;
1828             return 0;
1829     }
1830 }
1831 
bam_aux2i(const uint8_t * s)1832 int64_t bam_aux2i(const uint8_t *s)
1833 {
1834     int type;
1835     type = *s++;
1836     return get_int_aux_val(type, s, 0);
1837 }
1838 
bam_aux2f(const uint8_t * s)1839 double bam_aux2f(const uint8_t *s)
1840 {
1841     int type;
1842     type = *s++;
1843     if (type == 'd') return le_to_double(s);
1844     else if (type == 'f') return le_to_float(s);
1845     else return get_int_aux_val(type, s, 0);
1846 }
1847 
bam_aux2A(const uint8_t * s)1848 char bam_aux2A(const uint8_t *s)
1849 {
1850     int type;
1851     type = *s++;
1852     if (type == 'A') return *(char*)s;
1853     errno = EINVAL;
1854     return 0;
1855 }
1856 
bam_aux2Z(const uint8_t * s)1857 char *bam_aux2Z(const uint8_t *s)
1858 {
1859     int type;
1860     type = *s++;
1861     if (type == 'Z' || type == 'H') return (char*)s;
1862     errno = EINVAL;
1863     return 0;
1864 }
1865 
bam_auxB_len(const uint8_t * s)1866 uint32_t bam_auxB_len(const uint8_t *s)
1867 {
1868     if (s[0] != 'B') {
1869         errno = EINVAL;
1870         return 0;
1871     }
1872     return le_to_u32(s + 2);
1873 }
1874 
bam_auxB2i(const uint8_t * s,uint32_t idx)1875 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
1876 {
1877     uint32_t len = bam_auxB_len(s);
1878     if (idx >= len) {
1879         errno = ERANGE;
1880         return 0;
1881     }
1882     return get_int_aux_val(s[1], s + 6, idx);
1883 }
1884 
bam_auxB2f(const uint8_t * s,uint32_t idx)1885 double bam_auxB2f(const uint8_t *s, uint32_t idx)
1886 {
1887     uint32_t len = bam_auxB_len(s);
1888     if (idx >= len) {
1889         errno = ERANGE;
1890         return 0.0;
1891     }
1892     if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
1893     else return get_int_aux_val(s[1], s + 6, idx);
1894 }
1895 
sam_open_mode(char * mode,const char * fn,const char * format)1896 int sam_open_mode(char *mode, const char *fn, const char *format)
1897 {
1898     // TODO Parse "bam5" etc for compression level
1899     if (format == NULL) {
1900         // Try to pick a format based on the filename extension
1901         const char *ext = fn? strrchr(fn, '.') : NULL;
1902         if (ext == NULL || strchr(ext, '/')) return -1;
1903         return sam_open_mode(mode, fn, ext+1);
1904     }
1905     else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1906     else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1907     else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1908     else return -1;
1909 
1910     return 0;
1911 }
1912 
1913 // A version of sam_open_mode that can handle ,key=value options.
1914 // The format string is allocated and returned, to be freed by the caller.
1915 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)1916 char *sam_open_mode_opts(const char *fn,
1917                          const char *mode,
1918                          const char *format)
1919 {
1920     char *mode_opts = malloc((format ? strlen(format) : 1) +
1921                              (mode   ? strlen(mode)   : 1) + 12);
1922     char *opts, *cp;
1923     int format_len;
1924 
1925     if (!mode_opts)
1926         return NULL;
1927 
1928     strcpy(mode_opts, mode ? mode : "r");
1929     cp = mode_opts + strlen(mode_opts);
1930 
1931     if (format == NULL) {
1932         // Try to pick a format based on the filename extension
1933         const char *ext = fn? strrchr(fn, '.') : NULL;
1934         if (ext == NULL || strchr(ext, '/')) {
1935             free(mode_opts);
1936             return NULL;
1937         }
1938         return sam_open_mode(cp, fn, ext+1)
1939             ? (free(mode_opts), NULL)
1940             : mode_opts;
1941     }
1942 
1943     if ((opts = strchr(format, ','))) {
1944         format_len = opts-format;
1945     } else {
1946         opts="";
1947         format_len = strlen(format);
1948     }
1949 
1950     if (strncmp(format, "bam", format_len) == 0) {
1951         *cp++ = 'b';
1952     } else if (strncmp(format, "cram", format_len) == 0) {
1953         *cp++ = 'c';
1954     } else if (strncmp(format, "cram2", format_len) == 0) {
1955         *cp++ = 'c';
1956         strcpy(cp, ",VERSION=2.1");
1957         cp += 12;
1958     } else if (strncmp(format, "cram3", format_len) == 0) {
1959         *cp++ = 'c';
1960         strcpy(cp, ",VERSION=3.0");
1961         cp += 12;
1962     } else if (strncmp(format, "sam", format_len) == 0) {
1963         ; // format mode=""
1964     } else {
1965         free(mode_opts);
1966         return NULL;
1967     }
1968 
1969     strcpy(cp, opts);
1970 
1971     return mode_opts;
1972 }
1973 
1974 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)1975 int bam_str2flag(const char *str)
1976 {
1977     char *end, *beg = (char*) str;
1978     long int flag = strtol(str, &end, 0);
1979     if ( end!=str ) return flag;    // the conversion was successful
1980     flag = 0;
1981     while ( *str )
1982     {
1983         end = beg;
1984         while ( *end && *end!=',' ) end++;
1985         if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1986         else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1987         else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1988         else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1989         else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1990         else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1991         else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1992         else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
1993         else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
1994         else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
1995         else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
1996         else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
1997         else return -1;
1998         if ( !*end ) break;
1999         beg = end + 1;
2000     }
2001     return flag;
2002 }
2003 
bam_flag2str(int flag)2004 char *bam_flag2str(int flag)
2005 {
2006     kstring_t str = {0,0,0};
2007     if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2008     if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2009     if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2010     if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2011     if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2012     if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2013     if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2014     if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2015     if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2016     if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2017     if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2018     if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2019     if ( str.l == 0 ) kputsn("", 0, &str);
2020     return str.s;
2021 }
2022 
2023 
2024 /**************************
2025  *** Pileup and Mpileup ***
2026  **************************/
2027 
2028 #if !defined(BAM_NO_PILEUP)
2029 
2030 #include <assert.h>
2031 
2032 /*******************
2033  *** Memory pool ***
2034  *******************/
2035 
2036 typedef struct {
2037     int k, x, y, end;
2038 } cstate_t;
2039 
2040 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2041 
2042 typedef struct __linkbuf_t {
2043     bam1_t b;
2044     int32_t beg, end;
2045     cstate_t s;
2046     struct __linkbuf_t *next;
2047     bam_pileup_cd cd;
2048 } lbnode_t;
2049 
2050 typedef struct {
2051     int cnt, n, max;
2052     lbnode_t **buf;
2053 } mempool_t;
2054 
mp_init(void)2055 static mempool_t *mp_init(void)
2056 {
2057     mempool_t *mp;
2058     mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2059     return mp;
2060 }
mp_destroy(mempool_t * mp)2061 static void mp_destroy(mempool_t *mp)
2062 {
2063     int k;
2064     for (k = 0; k < mp->n; ++k) {
2065         free(mp->buf[k]->b.data);
2066         free(mp->buf[k]);
2067     }
2068     free(mp->buf);
2069     free(mp);
2070 }
mp_alloc(mempool_t * mp)2071 static inline lbnode_t *mp_alloc(mempool_t *mp)
2072 {
2073     ++mp->cnt;
2074     if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2075     else return mp->buf[--mp->n];
2076 }
mp_free(mempool_t * mp,lbnode_t * p)2077 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2078 {
2079     --mp->cnt; p->next = 0; // clear lbnode_t::next here
2080     if (mp->n == mp->max) {
2081         mp->max = mp->max? mp->max<<1 : 256;
2082         mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2083     }
2084     mp->buf[mp->n++] = p;
2085 }
2086 
2087 /**********************
2088  *** CIGAR resolver ***
2089  **********************/
2090 
2091 /* s->k: the index of the CIGAR operator that has just been processed.
2092    s->x: the reference coordinate of the start of s->k
2093    s->y: the query coordiante of the start of s->k
2094  */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2095 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2096 {
2097 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2098 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2099 
2100     bam1_t *b = p->b;
2101     bam1_core_t *c = &b->core;
2102     uint32_t *cigar = bam_get_cigar(b);
2103     int k;
2104     // determine the current CIGAR operation
2105     //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2106     if (s->k == -1) { // never processed
2107         if (c->n_cigar == 1) { // just one operation, save a loop
2108           if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2109         } else { // find the first match or deletion
2110             for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2111                 int op = _cop(cigar[k]);
2112                 int l = _cln(cigar[k]);
2113                 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2114                 else if (op == BAM_CREF_SKIP) s->x += l;
2115                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2116             }
2117             assert(k < c->n_cigar);
2118             s->k = k;
2119         }
2120     } else { // the read has been processed before
2121         int op, l = _cln(cigar[s->k]);
2122         if (pos - s->x >= l) { // jump to the next operation
2123             assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2124             op = _cop(cigar[s->k+1]);
2125             if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2126               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2127                 s->x += l;
2128                 ++s->k;
2129             } else { // find the next M/D/N/=/X
2130               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2131                 s->x += l;
2132                 for (k = s->k + 1; k < c->n_cigar; ++k) {
2133                     op = _cop(cigar[k]), l = _cln(cigar[k]);
2134                     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2135                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2136                 }
2137                 s->k = k;
2138             }
2139             assert(s->k < c->n_cigar); // otherwise a bug
2140         } // else, do nothing
2141     }
2142     { // collect pileup information
2143         int op, l;
2144         op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2145         p->is_del = p->indel = p->is_refskip = 0;
2146         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2147             int op2 = _cop(cigar[s->k+1]);
2148             int l2 = _cln(cigar[s->k+1]);
2149             if (op2 == BAM_CDEL) p->indel = -(int)l2;
2150             else if (op2 == BAM_CINS) p->indel = l2;
2151             else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2152                 int l3 = 0;
2153                 for (k = s->k + 2; k < c->n_cigar; ++k) {
2154                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2155                     if (op2 == BAM_CINS) l3 += l2;
2156                     else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2157                 }
2158                 if (l3 > 0) p->indel = l3;
2159             }
2160         }
2161         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2162             p->qpos = s->y + (pos - s->x);
2163         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2164             p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2165             p->is_refskip = (op == BAM_CREF_SKIP);
2166         } // cannot be other operations; otherwise a bug
2167         p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2168     }
2169     return 1;
2170 }
2171 
2172 /***********************
2173  *** Pileup iterator ***
2174  ***********************/
2175 
2176 // Dictionary of overlapping reads
2177 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2178 typedef khash_t(olap_hash) olap_hash_t;
2179 
2180 struct __bam_plp_t {
2181     mempool_t *mp;
2182     lbnode_t *head, *tail;
2183     int32_t tid, pos, max_tid, max_pos;
2184     int is_eof, max_plp, error, maxcnt;
2185     uint64_t id;
2186     bam_pileup1_t *plp;
2187     // for the "auto" interface only
2188     bam1_t *b;
2189     bam_plp_auto_f func;
2190     void *data;
2191     olap_hash_t *overlaps;
2192 
2193     // For notification of creation and destruction events
2194     // and associated client-owned pointer.
2195     int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2196     int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2197 };
2198 
bam_plp_init(bam_plp_auto_f func,void * data)2199 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2200 {
2201     bam_plp_t iter;
2202     iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2203     iter->mp = mp_init();
2204     iter->head = iter->tail = mp_alloc(iter->mp);
2205     iter->max_tid = iter->max_pos = -1;
2206     iter->maxcnt = 8000;
2207     if (func) {
2208         iter->func = func;
2209         iter->data = data;
2210         iter->b = bam_init1();
2211     }
2212     return iter;
2213 }
2214 
bam_plp_init_overlaps(bam_plp_t iter)2215 void bam_plp_init_overlaps(bam_plp_t iter)
2216 {
2217     iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
2218 }
2219 
bam_plp_destroy(bam_plp_t iter)2220 void bam_plp_destroy(bam_plp_t iter)
2221 {
2222     lbnode_t *p, *pnext;
2223     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2224     for (p = iter->head; p != NULL; p = pnext) {
2225         pnext = p->next;
2226         mp_free(iter->mp, p);
2227     }
2228     mp_destroy(iter->mp);
2229     if (iter->b) bam_destroy1(iter->b);
2230     free(iter->plp);
2231     free(iter);
2232 }
2233 
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2234 void bam_plp_constructor(bam_plp_t plp,
2235                          int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2236     plp->plp_construct = func;
2237 }
2238 
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2239 void bam_plp_destructor(bam_plp_t plp,
2240                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2241     plp->plp_destruct = func;
2242 }
2243 
2244 //---------------------------------
2245 //---  Tweak overlapping reads
2246 //---------------------------------
2247 
2248 /**
2249  *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
2250  *  cigar_iref2iseq_next() - get the next CMATCH base
2251  *  @cigar:       pointer to current cigar block (rw)
2252  *  @cigar_max:   pointer just beyond the last cigar block
2253  *  @icig:        position within the current cigar block (rw)
2254  *  @iseq:        position in the sequence (rw)
2255  *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2256  *
2257  *  Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2258  */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2259 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2260 {
2261     int pos = *iref;
2262     if ( pos < 0 ) return -1;
2263     *icig = 0;
2264     *iseq = 0;
2265     *iref = 0;
2266     while ( *cigar<cigar_max )
2267     {
2268         int cig  = (**cigar) & BAM_CIGAR_MASK;
2269         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2270 
2271         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2272         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2273         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2274         {
2275             pos -= ncig;
2276             if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2277             (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2278             continue;
2279         }
2280         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2281         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2282         {
2283             pos -= ncig;
2284             if ( pos<0 ) pos = 0;
2285             (*cigar)++; *icig = 0; *iref += ncig;
2286             continue;
2287         }
2288         hts_log_error("Unexpected cigar %d", cig);
2289         assert(0);
2290     }
2291     *iseq = -1;
2292     return -1;
2293 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2294 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2295 {
2296     while ( *cigar < cigar_max )
2297     {
2298         int cig  = (**cigar) & BAM_CIGAR_MASK;
2299         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2300 
2301         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2302         {
2303             if ( *icig >= ncig - 1 ) { *icig = 0;  (*cigar)++; continue; }
2304             (*iseq)++; (*icig)++; (*iref)++;
2305             return BAM_CMATCH;
2306         }
2307         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2308         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2309         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2310         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2311         hts_log_error("Unexpected cigar %d", cig);
2312         assert(0);
2313     }
2314     *iseq = -1;
2315     *iref = -1;
2316     return -1;
2317 }
2318 
tweak_overlap_quality(bam1_t * a,bam1_t * b)2319 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2320 {
2321     uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2322     uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2323     int a_icig = 0, a_iseq = 0;
2324     int b_icig = 0, b_iseq = 0;
2325     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2326     uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
2327 
2328     int iref   = b->core.pos;
2329     int a_iref = iref - a->core.pos;
2330     int b_iref = iref - b->core.pos;
2331     int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2332     if ( a_ret<0 ) return;  // no overlap
2333     int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2334     if ( b_ret<0 ) return;  // no overlap
2335 
2336     #if DBG
2337         fprintf(stderr,"tweak %s  n_cigar=%d %d  .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2338             a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2339     #endif
2340 
2341     while ( 1 )
2342     {
2343         // Increment reference position
2344         while ( a_iref>=0 && a_iref < iref - a->core.pos )
2345             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2346         if ( a_ret<0 ) break;   // done
2347         if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2348 
2349         while ( b_iref>=0 && b_iref < iref - b->core.pos )
2350             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2351         if ( b_ret<0 ) break;   // done
2352         if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2353 
2354         iref++;
2355         if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue;   // only CMATCH positions, don't know what to do with indels
2356 
2357         if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2358         {
2359             #if DBG
2360                 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2361             #endif
2362             // we are very confident about this base
2363             int qual = a_qual[a_iseq] + b_qual[b_iseq];
2364             a_qual[a_iseq] = qual>200 ? 200 : qual;
2365             b_qual[b_iseq] = 0;
2366         }
2367         else
2368         {
2369             if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2370             {
2371                 #if DBG
2372                     fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2373                 #endif
2374                 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];  // not so confident about a_qual anymore given the mismatch
2375                 b_qual[b_iseq] = 0;
2376             }
2377             else
2378             {
2379                 #if DBG
2380                     fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2381                 #endif
2382                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2383                 a_qual[a_iseq] = 0;
2384             }
2385         }
2386     }
2387     #if DBG
2388         fprintf(stderr,"\n");
2389     #endif
2390 }
2391 
2392 // Fix overlapping reads. Simple soft-clipping did not give good results.
2393 // Lowering qualities of unwanted bases is more selective and works better.
2394 //
overlap_push(bam_plp_t iter,lbnode_t * node)2395 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2396 {
2397     if ( !iter->overlaps ) return;
2398 
2399     // mapped mates and paired reads only
2400     if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2401 
2402     // no overlap possible, unless some wild cigar
2403     if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2404 
2405     khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2406     if ( kitr==kh_end(iter->overlaps) )
2407     {
2408         int ret;
2409         kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2410         kh_value(iter->overlaps, kitr) = node;
2411     }
2412     else
2413     {
2414         lbnode_t *a = kh_value(iter->overlaps, kitr);
2415         tweak_overlap_quality(&a->b, &node->b);
2416         kh_del(olap_hash, iter->overlaps, kitr);
2417         assert(a->end-1 == a->s.end);
2418         a->end = bam_endpos(&a->b);
2419         a->s.end = a->end - 1;
2420     }
2421 }
2422 
overlap_remove(bam_plp_t iter,const bam1_t * b)2423 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2424 {
2425     if ( !iter->overlaps ) return;
2426 
2427     khiter_t kitr;
2428     if ( b )
2429     {
2430         kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2431         if ( kitr!=kh_end(iter->overlaps) )
2432             kh_del(olap_hash, iter->overlaps, kitr);
2433     }
2434     else
2435     {
2436         // remove all
2437         for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2438             if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2439     }
2440 }
2441 
2442 
2443 
2444 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2445 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2446 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2447 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2448 {
2449     if (iter->error) { *_n_plp = -1; return NULL; }
2450     *_n_plp = 0;
2451     if (iter->is_eof && iter->head == iter->tail) return NULL;
2452     while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2453         int n_plp = 0;
2454         // write iter->plp at iter->pos
2455         lbnode_t **pptr = &iter->head;
2456         while (*pptr != iter->tail) {
2457             lbnode_t *p = *pptr;
2458             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2459                 overlap_remove(iter, &p->b);
2460                 if (iter->plp_destruct)
2461                     iter->plp_destruct(iter->data, &p->b, &p->cd);
2462                 *pptr = p->next; mp_free(iter->mp, p);
2463             }
2464             else {
2465                 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2466                     if (n_plp == iter->max_plp) { // then double the capacity
2467                         iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2468                         iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2469                     }
2470                     iter->plp[n_plp].b = &p->b;
2471                     iter->plp[n_plp].cd = p->cd;
2472                     if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2473                 }
2474                 pptr = &(*pptr)->next;
2475             }
2476         }
2477         *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2478         // update iter->tid and iter->pos
2479         if (iter->head != iter->tail) {
2480             if (iter->tid > iter->head->b.core.tid) {
2481                 hts_log_error("Unsorted input. Pileup aborts");
2482                 iter->error = 1;
2483                 *_n_plp = -1;
2484                 return NULL;
2485             }
2486         }
2487         if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2488             iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2489         } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2490             iter->pos = iter->head->beg; // jump to the next position
2491         } else ++iter->pos; // scan contiguously
2492         // return
2493         if (n_plp) return iter->plp;
2494         if (iter->is_eof && iter->head == iter->tail) break;
2495     }
2496     return NULL;
2497 }
2498 
bam_plp_push(bam_plp_t iter,const bam1_t * b)2499 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2500 {
2501     if (iter->error) return -1;
2502     if (b) {
2503         if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2504         // Skip only unmapped reads here, any additional filtering must be done in iter->func
2505         if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2506         if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2507         {
2508             overlap_remove(iter, b);
2509             return 0;
2510         }
2511         bam_copy1(&iter->tail->b, b);
2512         overlap_push(iter, iter->tail);
2513 #ifndef BAM_NO_ID
2514         iter->tail->b.id = iter->id++;
2515 #endif
2516         iter->tail->beg = b->core.pos;
2517         iter->tail->end = bam_endpos(b);
2518         iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2519         if (b->core.tid < iter->max_tid) {
2520             hts_log_error("The input is not sorted (chromosomes out of order)");
2521             iter->error = 1;
2522             return -1;
2523         }
2524         if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2525             hts_log_error("The input is not sorted (reads out of order)");
2526             iter->error = 1;
2527             return -1;
2528         }
2529         iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2530         if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2531             if (iter->plp_construct)
2532                 iter->plp_construct(iter->data, b, &iter->tail->cd);
2533             iter->tail->next = mp_alloc(iter->mp);
2534             iter->tail = iter->tail->next;
2535         }
2536     } else iter->is_eof = 1;
2537     return 0;
2538 }
2539 
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2540 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2541 {
2542     const bam_pileup1_t *plp;
2543     if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2544     if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2545     else { // no pileup line can be obtained; read alignments
2546         *_n_plp = 0;
2547         if (iter->is_eof) return 0;
2548         int ret;
2549         while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2550             if (bam_plp_push(iter, iter->b) < 0) {
2551                 *_n_plp = -1;
2552                 return 0;
2553             }
2554             if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2555             // otherwise no pileup line can be returned; read the next alignment.
2556         }
2557         if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2558         bam_plp_push(iter, 0);
2559         if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2560         return 0;
2561     }
2562 }
2563 
bam_plp_reset(bam_plp_t iter)2564 void bam_plp_reset(bam_plp_t iter)
2565 {
2566     overlap_remove(iter, NULL);
2567     iter->max_tid = iter->max_pos = -1;
2568     iter->tid = iter->pos = 0;
2569     iter->is_eof = 0;
2570     while (iter->head != iter->tail) {
2571         lbnode_t *p = iter->head;
2572         iter->head = p->next;
2573         mp_free(iter->mp, p);
2574     }
2575 }
2576 
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2577 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2578 {
2579     iter->maxcnt = maxcnt;
2580 }
2581 
2582 /************************
2583  *** Mpileup iterator ***
2584  ************************/
2585 
2586 struct __bam_mplp_t {
2587     int n;
2588     uint64_t min, *pos;
2589     bam_plp_t *iter;
2590     int *n_plp;
2591     const bam_pileup1_t **plp;
2592 };
2593 
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2594 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2595 {
2596     int i;
2597     bam_mplp_t iter;
2598     iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2599     iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2600     iter->n_plp = (int*)calloc(n, sizeof(int));
2601     iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2602     iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2603     iter->n = n;
2604     iter->min = (uint64_t)-1;
2605     for (i = 0; i < n; ++i) {
2606         iter->iter[i] = bam_plp_init(func, data[i]);
2607         iter->pos[i] = iter->min;
2608     }
2609     return iter;
2610 }
2611 
bam_mplp_init_overlaps(bam_mplp_t iter)2612 void bam_mplp_init_overlaps(bam_mplp_t iter)
2613 {
2614     int i;
2615     for (i = 0; i < iter->n; ++i)
2616         bam_plp_init_overlaps(iter->iter[i]);
2617 }
2618 
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2619 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2620 {
2621     int i;
2622     for (i = 0; i < iter->n; ++i)
2623         iter->iter[i]->maxcnt = maxcnt;
2624 }
2625 
bam_mplp_destroy(bam_mplp_t iter)2626 void bam_mplp_destroy(bam_mplp_t iter)
2627 {
2628     int i;
2629     for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2630     free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2631     free(iter);
2632 }
2633 
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2634 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2635 {
2636     int i, ret = 0;
2637     uint64_t new_min = (uint64_t)-1;
2638     for (i = 0; i < iter->n; ++i) {
2639         if (iter->pos[i] == iter->min) {
2640             int tid, pos;
2641             iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2642             if ( iter->iter[i]->error ) return -1;
2643             iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2644         }
2645         if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2646     }
2647     iter->min = new_min;
2648     if (new_min == (uint64_t)-1) return 0;
2649     *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2650     for (i = 0; i < iter->n; ++i) {
2651         if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2652             n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2653             ++ret;
2654         } else n_plp[i] = 0, plp[i] = 0;
2655     }
2656     return ret;
2657 }
2658 
bam_mplp_reset(bam_mplp_t iter)2659 void bam_mplp_reset(bam_mplp_t iter)
2660 {
2661     int i;
2662     iter->min = (uint64_t)-1;
2663     for (i = 0; i < iter->n; ++i) {
2664         bam_plp_reset(iter->iter[i]);
2665         iter->pos[i] = (uint64_t)-1;
2666         iter->n_plp[i] = 0;
2667         iter->plp[i] = NULL;
2668     }
2669 }
2670 
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2671 void bam_mplp_constructor(bam_mplp_t iter,
2672                           int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2673     int i;
2674     for (i = 0; i < iter->n; ++i)
2675         bam_plp_constructor(iter->iter[i], func);
2676 }
2677 
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2678 void bam_mplp_destructor(bam_mplp_t iter,
2679                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2680     int i;
2681     for (i = 0; i < iter->n; ++i)
2682         bam_plp_destructor(iter->iter[i], func);
2683 }
2684 
2685 #endif // ~!defined(BAM_NO_PILEUP)
2686