1 /*  sam.c -- SAM and BAM file I/O and manipulation.
2 
3     Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd.
4     Copyright (C) 2010, 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
41 
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44 
45 typedef khash_t(s2i) sdict_t;
46 
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
50 
51 /**********************
52  *** BAM header I/O ***
53  **********************/
54 
bam_hdr_init()55 bam_hdr_t *bam_hdr_init()
56 {
57     return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
58 }
59 
bam_hdr_destroy(bam_hdr_t * h)60 void bam_hdr_destroy(bam_hdr_t *h)
61 {
62     int32_t i;
63     if (h == NULL) return;
64     if (h->target_name) {
65         for (i = 0; i < h->n_targets; ++i)
66             free(h->target_name[i]);
67         free(h->target_name);
68         free(h->target_len);
69     }
70     free(h->text); free(h->cigar_tab);
71     if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72     free(h);
73 }
74 
bam_hdr_dup(const bam_hdr_t * h0)75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
76 {
77     if (h0 == NULL) return NULL;
78     bam_hdr_t *h;
79     if ((h = bam_hdr_init()) == NULL) return NULL;
80     // copy the simple data
81     h->n_targets = h0->n_targets;
82     h->ignore_sam_err = h0->ignore_sam_err;
83     h->l_text = h0->l_text;
84     // Then the pointery stuff
85     h->cigar_tab = NULL;
86     h->sdict = NULL;
87     // TODO Check for memory allocation failures
88     h->text = (char*)calloc(h->l_text + 1, 1);
89     memcpy(h->text, h0->text, h->l_text);
90     h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91     h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92     int i;
93     for (i = 0; i < h->n_targets; ++i) {
94         h->target_len[i] = h0->target_len[i];
95         h->target_name[i] = strdup(h0->target_name[i]);
96     }
97     return h;
98 }
99 
100 
hdr_from_dict(sdict_t * d)101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
102 {
103     bam_hdr_t *h;
104     khint_t k;
105     h = bam_hdr_init();
106     h->sdict = d;
107     h->n_targets = kh_size(d);
108     // TODO Check for memory allocation failures
109     h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110     h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111     for (k = kh_begin(d); k != kh_end(d); ++k) {
112         if (!kh_exist(d, k)) continue;
113         h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114         h->target_len[kh_val(d, k)>>32]  = kh_val(d, k) & 0xffffffffUL;
115         kh_val(d, k) >>= 32;
116     }
117     return h;
118 }
119 
bam_hdr_read(BGZF * fp)120 bam_hdr_t *bam_hdr_read(BGZF *fp)
121 {
122     bam_hdr_t *h;
123     char buf[4];
124     int magic_len, has_EOF;
125     int32_t i, name_len, num_names = 0;
126     size_t bufsize;
127     ssize_t bytes;
128     // check EOF
129     has_EOF = bgzf_check_EOF(fp);
130     if (has_EOF < 0) {
131         perror("[W::bam_hdr_read] bgzf_check_EOF");
132     } else if (has_EOF == 0) {
133         hts_log_warning("EOF marker is absent. The input is probably truncated");
134     }
135     // read "BAM1"
136     magic_len = bgzf_read(fp, buf, 4);
137     if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138         hts_log_error("Invalid BAM binary header");
139         return 0;
140     }
141     h = bam_hdr_init();
142     if (!h) goto nomem;
143 
144     // read plain text and the number of reference sequences
145     bytes = bgzf_read(fp, &h->l_text, 4);
146     if (bytes != 4) goto read_err;
147     if (fp->is_be) ed_swap_4p(&h->l_text);
148 
149     bufsize = ((size_t) h->l_text) + 1;
150     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151     h->text = (char*)malloc(bufsize);
152     if (!h->text) goto nomem;
153     h->text[h->l_text] = 0; // make sure it is NULL terminated
154     bytes = bgzf_read(fp, h->text, h->l_text);
155     if (bytes != h->l_text) goto read_err;
156 
157     bytes = bgzf_read(fp, &h->n_targets, 4);
158     if (bytes != 4) goto read_err;
159     if (fp->is_be) ed_swap_4p(&h->n_targets);
160 
161     if (h->n_targets < 0) goto invalid;
162 
163     // read reference sequence names and lengths
164     if (h->n_targets > 0) {
165         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166         if (!h->target_name) goto nomem;
167         h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168         if (!h->target_len) goto nomem;
169     }
170     else {
171         h->target_name = NULL;
172         h->target_len = NULL;
173     }
174 
175     for (i = 0; i != h->n_targets; ++i) {
176         bytes = bgzf_read(fp, &name_len, 4);
177         if (bytes != 4) goto read_err;
178         if (fp->is_be) ed_swap_4p(&name_len);
179         if (name_len <= 0) goto invalid;
180 
181         h->target_name[i] = (char*)malloc(name_len);
182         if (!h->target_name[i]) goto nomem;
183         num_names++;
184 
185         bytes = bgzf_read(fp, h->target_name[i], name_len);
186         if (bytes != name_len) goto read_err;
187 
188         if (h->target_name[i][name_len - 1] != '\0') {
189             /* Fix missing NUL-termination.  Is this being too nice?
190                We could alternatively bail out with an error. */
191             char *new_name;
192             if (name_len == INT32_MAX) goto invalid;
193             new_name = realloc(h->target_name[i], name_len + 1);
194             if (new_name == NULL) goto nomem;
195             h->target_name[i] = new_name;
196             h->target_name[i][name_len] = '\0';
197         }
198 
199         bytes = bgzf_read(fp, &h->target_len[i], 4);
200         if (bytes != 4) goto read_err;
201         if (fp->is_be) ed_swap_4p(&h->target_len[i]);
202     }
203     return h;
204 
205  nomem:
206     hts_log_error("Out of memory");
207     goto clean;
208 
209  read_err:
210     if (bytes < 0) {
211         hts_log_error("Error reading BGZF stream");
212     } else {
213         hts_log_error("Truncated BAM header");
214     }
215     goto clean;
216 
217  invalid:
218     hts_log_error("Invalid BAM binary header");
219 
220  clean:
221     if (h != NULL) {
222         h->n_targets = num_names; // ensure we free only allocated target_names
223         bam_hdr_destroy(h);
224     }
225     return NULL;
226 }
227 
bam_hdr_write(BGZF * fp,const bam_hdr_t * h)228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
229 {
230     int32_t i, name_len, x;
231     // write "BAM1"
232     if (bgzf_write(fp, "BAM\1", 4) < 0) return -1;
233     // write plain text and the number of reference sequences
234     if (fp->is_be) {
235         x = ed_swap_4(h->l_text);
236         if (bgzf_write(fp, &x, 4) < 0) return -1;
237         if (h->l_text) {
238             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
239         }
240         x = ed_swap_4(h->n_targets);
241         if (bgzf_write(fp, &x, 4) < 0) return -1;
242     } else {
243         if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
244         if (h->l_text) {
245             if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
246         }
247         if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
248     }
249     // write sequence names and lengths
250     for (i = 0; i != h->n_targets; ++i) {
251         char *p = h->target_name[i];
252         name_len = strlen(p) + 1;
253         if (fp->is_be) {
254             x = ed_swap_4(name_len);
255             if (bgzf_write(fp, &x, 4) < 0) return -1;
256         } else {
257             if (bgzf_write(fp, &name_len, 4) < 0) return -1;
258         }
259         if (bgzf_write(fp, p, name_len) < 0) return -1;
260         if (fp->is_be) {
261             x = ed_swap_4(h->target_len[i]);
262             if (bgzf_write(fp, &x, 4) < 0) return -1;
263         } else {
264             if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
265         }
266     }
267     if (bgzf_flush(fp) < 0) return -1;
268     return 0;
269 }
270 
bam_name2id(bam_hdr_t * h,const char * ref)271 int bam_name2id(bam_hdr_t *h, const char *ref)
272 {
273     sdict_t *d = (sdict_t*)h->sdict;
274     khint_t k;
275     if (h->sdict == 0) {
276         int i, absent;
277         d = kh_init(s2i);
278         for (i = 0; i < h->n_targets; ++i) {
279             k = kh_put(s2i, d, h->target_name[i], &absent);
280             kh_val(d, k) = i;
281         }
282         h->sdict = d;
283     }
284     k = kh_get(s2i, d, ref);
285     return k == kh_end(d)? -1 : kh_val(d, k);
286 }
287 
288 /*************************
289  *** BAM alignment I/O ***
290  *************************/
291 
bam_init1()292 bam1_t *bam_init1()
293 {
294     return (bam1_t*)calloc(1, sizeof(bam1_t));
295 }
296 
do_realloc_bam_data(bam1_t * b,size_t desired)297 static int do_realloc_bam_data(bam1_t *b, size_t desired)
298 {
299     uint32_t new_m_data;
300     uint8_t *new_data;
301     new_m_data = desired;
302     kroundup32(new_m_data);
303     if (new_m_data < desired) {
304         errno = ENOMEM; // Not strictly true but we can't store the size
305         return -1;
306     }
307     new_data = realloc(b->data, new_m_data);
308     if (!new_data) return -1;
309     b->data = new_data;
310     b->m_data = new_m_data;
311     return 0;
312 }
313 
realloc_bam_data(bam1_t * b,size_t desired)314 static inline int realloc_bam_data(bam1_t *b, size_t desired)
315 {
316     if (desired <= b->m_data) return 0;
317     return do_realloc_bam_data(b, desired);
318 }
319 
possibly_expand_bam_data(bam1_t * b,size_t bytes)320 static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
321     uint32_t new_len = b->l_data + bytes;
322 
323     if (new_len > INT32_MAX || new_len < b->l_data) {
324         errno = ENOMEM;
325         return -1;
326     }
327     if (new_len <= b->m_data) return 0;
328     return do_realloc_bam_data(b, new_len);
329 }
330 
bam_destroy1(bam1_t * b)331 void bam_destroy1(bam1_t *b)
332 {
333     if (b == 0) return;
334     free(b->data); free(b);
335 }
336 
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)337 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
338 {
339     uint8_t *data = bdst->data;
340     int m_data = bdst->m_data;   // backup data and m_data
341     if (m_data < bsrc->l_data) { // double the capacity
342         m_data = bsrc->l_data; kroundup32(m_data);
343         data = (uint8_t*)realloc(data, m_data);
344     }
345     memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
346     *bdst = *bsrc; // copy the rest
347     // restore the backup
348     bdst->m_data = m_data;
349     bdst->data = data;
350     return bdst;
351 }
352 
bam_dup1(const bam1_t * bsrc)353 bam1_t *bam_dup1(const bam1_t *bsrc)
354 {
355     if (bsrc == NULL) return NULL;
356     bam1_t *bdst = bam_init1();
357     if (bdst == NULL) return NULL;
358     return bam_copy1(bdst, bsrc);
359 }
360 
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,int * rlen,int * qlen)361 void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
362 {
363     int k;
364     *rlen = *qlen = 0;
365     for (k = 0; k < n_cigar; ++k) {
366         int type = bam_cigar_type(bam_cigar_op(cigar[k]));
367         int len = bam_cigar_oplen(cigar[k]);
368         if (type & 1) *qlen += len;
369         if (type & 2) *rlen += len;
370     }
371 }
372 
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)373 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
374 {
375     int k, l;
376     for (k = l = 0; k < n_cigar; ++k)
377         if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
378             l += bam_cigar_oplen(cigar[k]);
379     return l;
380 }
381 
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)382 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
383 {
384     int k, l;
385     for (k = l = 0; k < n_cigar; ++k)
386         if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
387             l += bam_cigar_oplen(cigar[k]);
388     return l;
389 }
390 
bam_endpos(const bam1_t * b)391 int32_t bam_endpos(const bam1_t *b)
392 {
393     if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
394         return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
395     else
396         return b->core.pos + 1;
397 }
398 
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)399 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
400 {
401     bam1_core_t *c = &b->core;
402     uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
403     uint8_t *CG;
404 
405     // test where there is a real CIGAR in the CG tag to move
406     if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
407     cigar0 = bam_get_cigar(b);
408     if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
409     fake_bytes = c->n_cigar * 4;
410     if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
411     if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
412     CG_len = le_to_u32(CG + 2);
413     if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
414 
415     // move from the CG tag to the right position
416     cigar_st = (uint8_t*)cigar0 - b->data;
417     c->n_cigar = CG_len;
418     n_cigar4 = c->n_cigar * 4;
419     CG_st = CG - b->data - 2;
420     CG_en = CG_st + 8 + n_cigar4;
421     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
422     b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
423     memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
424     memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
425     if (ori_len > CG_en) // move data after the CG tag
426         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
427     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
428     if (recal_bin)
429         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
430     if (give_warning)
431         hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
432     return 1;
433 }
434 
aux_type2size(uint8_t type)435 static inline int aux_type2size(uint8_t type)
436 {
437     switch (type) {
438     case 'A': case 'c': case 'C':
439         return 1;
440     case 's': case 'S':
441         return 2;
442     case 'i': case 'I': case 'f':
443         return 4;
444     case 'd':
445         return 8;
446     case 'Z': case 'H': case 'B':
447         return type;
448     default:
449         return 0;
450     }
451 }
452 
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)453 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
454 {
455     uint32_t *cigar = (uint32_t*)(data + c->l_qname);
456     uint32_t i;
457     for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
458 }
459 
bam_read1(BGZF * fp,bam1_t * b)460 int bam_read1(BGZF *fp, bam1_t *b)
461 {
462     bam1_core_t *c = &b->core;
463     int32_t block_len, ret, i;
464     uint32_t x[8], new_l_data;
465     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
466         if (ret == 0) return -1; // normal end-of-file
467         else return -2; // truncated
468     }
469     if (fp->is_be)
470         ed_swap_4p(&block_len);
471     if (block_len < 32) return -4;  // block_len includes core data
472     if (bgzf_read(fp, x, 32) != 32) return -3;
473     if (fp->is_be) {
474         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
475     }
476     c->tid = x[0]; c->pos = x[1];
477     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
478     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
479     if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
480         return -4;
481     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
482     c->l_qseq = x[4];
483     c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
484 
485     new_l_data = block_len - 32 + c->l_extranul;
486     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
487     if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
488         + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
489         return -4;
490     if (realloc_bam_data(b, new_l_data) < 0) return -4;
491     b->l_data = new_l_data;
492 
493     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
494     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
495     c->l_qname += c->l_extranul;
496     if (b->l_data < c->l_qname ||
497         bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
498         return -4;
499     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
500     if (bam_tag2cigar(b, 0, 0) < 0)
501         return -4;
502 
503     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
504         int rlen, qlen;
505         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
506         if ((b->core.flag & BAM_FUNMAP)) rlen=1;
507         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
508         // Sanity check for broken CIGAR alignments
509         if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
510             hts_log_error("CIGAR and query sequence lengths differ for %s",
511                     bam_get_qname(b));
512             return -4;
513         }
514     }
515 
516     return 4 + block_len;
517 }
518 
bam_write1(BGZF * fp,const bam1_t * b)519 int bam_write1(BGZF *fp, const bam1_t *b)
520 {
521     const bam1_core_t *c = &b->core;
522     uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
523     int i, ok;
524     if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
525     x[0] = c->tid;
526     x[1] = c->pos;
527     x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
528     if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
529     else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
530     x[4] = c->l_qseq;
531     x[5] = c->mtid;
532     x[6] = c->mpos;
533     x[7] = c->isize;
534     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
535     if (fp->is_be) {
536         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
537         y = block_len;
538         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
539         swap_data(c, b->l_data, b->data, 1);
540     } else {
541         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
542     }
543     if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
544     if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
545     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
546         if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
547     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
548         uint8_t buf[8];
549         uint32_t cigar_st, cigar_en, cigar[2];
550         cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
551         cigar_en = cigar_st + c->n_cigar * 4;
552         cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
553         cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
554         u32_to_le(cigar[0], buf);
555         u32_to_le(cigar[1], buf + 4);
556         if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
557         if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
558         if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
559         u32_to_le(c->n_cigar, buf);
560         if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
561         if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
562     }
563     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
564     return ok? 4 + block_len : -1;
565 }
566 
567 /********************
568  *** BAM indexing ***
569  ********************/
570 
bam_index(BGZF * fp,int min_shift)571 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
572 {
573     int n_lvls, i, fmt, ret;
574     bam1_t *b;
575     hts_idx_t *idx;
576     bam_hdr_t *h;
577     h = bam_hdr_read(fp);
578     if (h == NULL) return NULL;
579     if (min_shift > 0) {
580         int64_t max_len = 0, s;
581         for (i = 0; i < h->n_targets; ++i)
582             if (max_len < h->target_len[i]) max_len = h->target_len[i];
583         max_len += 256;
584         for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
585         fmt = HTS_FMT_CSI;
586     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
587     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
588     bam_hdr_destroy(h);
589     b = bam_init1();
590     while ((ret = bam_read1(fp, b)) >= 0) {
591         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
592         if (ret < 0) goto err; // unsorted
593     }
594     if (ret < -1) goto err; // corrupted BAM file
595 
596     hts_idx_finish(idx, bgzf_tell(fp));
597     bam_destroy1(b);
598     return idx;
599 
600 err:
601     bam_destroy1(b);
602     hts_idx_destroy(idx);
603     return NULL;
604 }
605 
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)606 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
607 {
608     hts_idx_t *idx;
609     htsFile *fp;
610     int ret = 0;
611 
612     if ((fp = hts_open(fn, "r")) == 0) return -2;
613     if (nthreads)
614         hts_set_threads(fp, nthreads);
615 
616     switch (fp->format.format) {
617     case cram:
618         ret = cram_index_build(fp->fp.cram, fn, fnidx);
619         break;
620 
621     case bam:
622         idx = bam_index(fp->fp.bgzf, min_shift);
623         if (idx) {
624             ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
625             if (ret < 0) ret = -4;
626             hts_idx_destroy(idx);
627         }
628         else ret = -1;
629         break;
630 
631     default:
632         ret = -3;
633         break;
634     }
635     hts_close(fp);
636 
637     return ret;
638 }
639 
sam_index_build2(const char * fn,const char * fnidx,int min_shift)640 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
641 {
642     return sam_index_build3(fn, fnidx, min_shift, 0);
643 }
644 
sam_index_build(const char * fn,int min_shift)645 int sam_index_build(const char *fn, int min_shift)
646 {
647     return sam_index_build3(fn, NULL, min_shift, 0);
648 }
649 
650 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
651 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)652 int bam_index_build(const char *fn, int min_shift)
653 {
654     return sam_index_build2(fn, NULL, min_shift);
655 }
656 
bam_readrec(BGZF * fp,void * ignored,void * bv,int * tid,int * beg,int * end)657 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
658 {
659     bam1_t *b = bv;
660     int ret;
661     if ((ret = bam_read1(fp, b)) >= 0) {
662         *tid = b->core.tid;
663         *beg = b->core.pos;
664         *end = bam_endpos(b);
665     }
666     return ret;
667 }
668 
669 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,int * beg,int * end)670 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
671 {
672     htsFile *fp = fpv;
673     bam1_t *b = bv;
674     int ret = cram_get_bam_seq(fp->fp.cram, &b);
675     if (ret < 0)
676         return cram_eof(fp->fp.cram) ? -1 : -2;
677 
678     if (bam_tag2cigar(b, 1, 1) < 0)
679         return -2;
680 
681     *tid = b->core.tid;
682     *beg = b->core.pos;
683     *end = bam_endpos(b);
684 
685     return ret;
686 }
687 
cram_pseek(void * fp,int64_t offset,int whence)688 static int cram_pseek(void *fp, int64_t offset, int whence)
689 {
690     cram_fd *fd =  (cram_fd *)fp;
691 
692     if ((0 != cram_seek(fd, offset, SEEK_SET))
693      && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
694         return -1;
695 
696     if (fd->ctr) {
697         cram_free_container(fd->ctr);
698         if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
699             cram_free_container(fd->ctr_mt);
700 
701         fd->ctr = NULL;
702         fd->ctr_mt = NULL;
703         fd->ooc = 0;
704     }
705 
706     return 0;
707 }
708 
709 /*
710  * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
711  *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
712  *   container previously fetched. It was designed like this to integrate with the functionality
713  *   of the iterator stepping logic.
714  */
715 
cram_ptell(void * fp)716 static int64_t cram_ptell(void *fp)
717 {
718     cram_fd *fd = (cram_fd *)fp;
719     cram_container *c;
720     int64_t ret = -1L;
721 
722     if (fd && fd->fp) {
723         ret = htell(fd->fp);
724         if ((c = fd->ctr) != NULL) {
725             ret -= ((c->curr_slice < c->max_slice || c->curr_rec < c->num_records) ? c->offset + 1 : 0);
726         }
727     }
728 
729     return ret;
730 }
731 
bam_pseek(void * fp,int64_t offset,int whence)732 static int bam_pseek(void *fp, int64_t offset, int whence)
733 {
734     BGZF *fd = (BGZF *)fp;
735 
736     return bgzf_seek(fd, offset, whence);
737 }
738 
bam_ptell(void * fp)739 static int64_t bam_ptell(void *fp)
740 {
741     BGZF *fd = (BGZF *)fp;
742     if (!fd)
743         return -1L;
744 
745     return bgzf_tell(fd);
746 }
747 
748 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_bam_cram_readrec(BGZF * bgzfp,void * fpv,void * bv,int * tid,int * beg,int * end)749 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
750 {
751     htsFile *fp = fpv;
752     bam1_t *b = bv;
753     switch (fp->format.format) {
754     case bam:   return bam_read1(bgzfp, b);
755     case cram: {
756         int ret = cram_get_bam_seq(fp->fp.cram, &b);
757         if (ret < 0)
758             return cram_eof(fp->fp.cram) ? -1 : -2;
759 
760         if (bam_tag2cigar(b, 1, 1) < 0)
761             return -2;
762         return ret;
763     }
764     default:
765         // TODO Need headers available to implement this for SAM files
766         hts_log_error("Not implemented for SAM files");
767         abort();
768     }
769 }
770 
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)771 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
772 {
773     switch (fp->format.format) {
774     case bam:
775         return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
776 
777     case cram: {
778         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
779         // Cons up a fake "index" just pointing at the associated cram_fd:
780         hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
781         if (idx == NULL) return NULL;
782         idx->fmt = HTS_FMT_CRAI;
783         idx->cram = fp->fp.cram;
784         return (hts_idx_t *) idx;
785         }
786 
787     default:
788         return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
789     }
790 }
791 
sam_index_load(htsFile * fp,const char * fn)792 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
793 {
794     return sam_index_load2(fp, fn, NULL);
795 }
796 
cram_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)797 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
798 {
799     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
800     hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
801     if (iter == NULL) return NULL;
802 
803     // Cons up a dummy iterator for which hts_itr_next() will simply invoke
804     // the readrec function:
805     iter->is_cram = 1;
806     iter->read_rest = 1;
807     iter->off = NULL;
808     iter->bins.a = NULL;
809     iter->readrec = readrec;
810 
811     if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
812         cram_range r = { tid, beg+1, end };
813         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
814 
815         iter->curr_off = 0;
816         // The following fields are not required by hts_itr_next(), but are
817         // filled in in case user code wants to look at them.
818         iter->tid = tid;
819         iter->beg = beg;
820         iter->end = end;
821 
822         switch (ret) {
823         case 0:
824             break;
825 
826         case -2:
827             // No data vs this ref, so mark iterator as completed.
828             // Same as HTS_IDX_NONE.
829             iter->finished = 1;
830             break;
831 
832         default:
833             free(iter);
834             return NULL;
835         }
836     }
837     else switch (tid) {
838     case HTS_IDX_REST:
839         iter->curr_off = 0;
840         break;
841     case HTS_IDX_NONE:
842         iter->curr_off = 0;
843         iter->finished = 1;
844         break;
845     default:
846         hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
847         abort();
848         break;
849     }
850 
851     return iter;
852 }
853 
sam_itr_queryi(const hts_idx_t * idx,int tid,int beg,int end)854 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
855 {
856     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
857     if (idx == NULL)
858         return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
859     else if (cidx->fmt == HTS_FMT_CRAI)
860         return cram_itr_query(idx, tid, beg, end, cram_readrec);
861     else
862         return hts_itr_query(idx, tid, beg, end, bam_readrec);
863 }
864 
cram_name2id(void * fdv,const char * ref)865 static int cram_name2id(void *fdv, const char *ref)
866 {
867     cram_fd *fd = (cram_fd *) fdv;
868     return sam_hdr_name2ref(fd->header, ref);
869 }
870 
sam_itr_querys(const hts_idx_t * idx,bam_hdr_t * hdr,const char * region)871 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
872 {
873     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
874     if (cidx->fmt == HTS_FMT_CRAI)
875         return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
876     else
877         return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
878 }
879 
sam_itr_regions(const hts_idx_t * idx,bam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)880 hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
881 {
882     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
883     if (cidx->fmt == HTS_FMT_CRAI)
884         return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
885                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
886     else
887         return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
888                    hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell);
889 }
890 
891 /**********************
892  *** SAM header I/O ***
893  **********************/
894 
895 #include "htslib/kseq.h"
896 #include "htslib/kstring.h"
897 
sam_hdr_parse(int l_text,const char * text)898 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
899 {
900     const char *q, *r, *p;
901     khash_t(s2i) *d;
902     d = kh_init(s2i);
903     for (p = text; *p; ++p) {
904         if (strncmp(p, "@SQ\t", 4) == 0) {
905             char *sn = 0;
906             int ln = -1;
907             for (q = p + 4;; ++q) {
908                 if (strncmp(q, "SN:", 3) == 0) {
909                     q += 3;
910                     for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
911                     sn = (char*)calloc(r - q + 1, 1);
912                     strncpy(sn, q, r - q);
913                     q = r;
914                 } else if (strncmp(q, "LN:", 3) == 0)
915                     ln = strtol(q + 3, (char**)&q, 10);
916                 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
917                 if (*q == '\0' || *q == '\n') break;
918             }
919             p = q;
920             if (sn && ln >= 0) {
921                 khint_t k;
922                 int absent;
923                 k = kh_put(s2i, d, sn, &absent);
924                 if (!absent) {
925                     hts_log_warning("Duplicated sequence '%s'", sn);
926                     free(sn);
927                 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
928             }
929         }
930         while (*p != '\0' && *p != '\n') ++p;
931     }
932     return hdr_from_dict(d);
933 }
934 
935 // Minimal sanitisation of a header to ensure.
936 // - null terminated string.
937 // - all lines start with @ (also implies no blank lines).
938 //
939 // Much more could be done, but currently is not, including:
940 // - checking header types are known (HD, SQ, etc).
941 // - syntax (eg checking tab separated fields).
942 // - validating n_targets matches @SQ records.
943 // - validating target lengths against @SQ records.
sam_hdr_sanitise(bam_hdr_t * h)944 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
945     if (!h)
946         return NULL;
947 
948     // Special case for empty headers.
949     if (h->l_text == 0)
950         return h;
951 
952     uint32_t i, lnum = 0;
953     char *cp = h->text, last = '\n';
954     for (i = 0; i < h->l_text; i++) {
955         // NB: l_text excludes terminating nul.  This finds early ones.
956         if (cp[i] == 0)
957             break;
958 
959         // Error on \n[^@], including duplicate newlines
960         if (last == '\n') {
961             lnum++;
962             if (cp[i] != '@') {
963                 hts_log_error("Malformed SAM header at line %u", lnum);
964                 bam_hdr_destroy(h);
965                 return NULL;
966             }
967         }
968 
969         last = cp[i];
970     }
971 
972     if (i < h->l_text) { // Early nul found.  Complain if not just padding.
973         uint32_t j = i;
974         while (j < h->l_text && cp[j] == '\0') j++;
975         if (j < h->l_text)
976             hts_log_warning("Unexpected NUL character in header. Possibly truncated");
977     }
978 
979     // Add trailing newline and/or trailing nul if required.
980     if (last != '\n') {
981         hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
982 
983         if (h->l_text == UINT32_MAX) {
984             hts_log_error("No room for extra newline");
985             bam_hdr_destroy(h);
986             return NULL;
987         }
988 
989         if (i >= h->l_text - 1) {
990             cp = realloc(h->text, (size_t) h->l_text+2);
991             if (!cp) {
992                 bam_hdr_destroy(h);
993                 return NULL;
994             }
995             h->text = cp;
996         }
997         cp[i++] = '\n';
998 
999         // l_text may be larger already due to multiple nul padding
1000         if (h->l_text < i)
1001             h->l_text = i;
1002         cp[h->l_text] = '\0';
1003     }
1004 
1005     return h;
1006 }
1007 
sam_hdr_read(htsFile * fp)1008 bam_hdr_t *sam_hdr_read(htsFile *fp)
1009 {
1010     if (!fp) {
1011         errno = EINVAL;
1012         return NULL;
1013     }
1014 
1015     switch (fp->format.format) {
1016     case bam:
1017         return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1018 
1019     case cram:
1020         return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
1021 
1022     case sam: {
1023         kstring_t str = { 0, 0, NULL };
1024         bam_hdr_t *h = NULL;
1025         int ret, has_SQ = 0;
1026         while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1027             if (fp->line.s[0] != '@') break;
1028             if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
1029             kputsn(fp->line.s, fp->line.l, &str);
1030             kputc('\n', &str);
1031         }
1032         if (ret < -1) goto error;
1033         if (! has_SQ && fp->fn_aux) {
1034             kstring_t line = { 0, 0, NULL };
1035             hFILE *f = hopen(fp->fn_aux, "r");
1036             if (f == NULL) goto error;
1037             while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
1038                 char *tab = strchr(line.s, '\t');
1039                 if (tab == NULL) continue;
1040                 kputs("@SQ\tSN:", &str);
1041                 kputsn(line.s, tab - line.s, &str);
1042                 kputs("\tLN:", &str);
1043                 kputl(atol(tab), &str);
1044                 kputc('\n', &str);
1045             }
1046             free(line.s);
1047             if (hclose(f) != 0) {
1048                 hts_log_warning("Failed to close %s", fp->fn_aux);
1049             }
1050         }
1051         if (str.l == 0) kputsn("", 0, &str);
1052         h = sam_hdr_parse(str.l, str.s);
1053         h->l_text = str.l; h->text = str.s;
1054         return sam_hdr_sanitise(h);
1055 
1056      error:
1057         bam_hdr_destroy(h);
1058         free(str.s);
1059         return NULL;
1060         }
1061 
1062     default:
1063         abort();
1064     }
1065 }
1066 
sam_hdr_write(htsFile * fp,const bam_hdr_t * h)1067 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1068 {
1069     if (!fp || !h) {
1070         errno = EINVAL;
1071         return -1;
1072     }
1073 
1074     switch (fp->format.format) {
1075     case binary_format:
1076         fp->format.category = sequence_data;
1077         fp->format.format = bam;
1078         /* fall-through */
1079     case bam:
1080         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1081         break;
1082 
1083     case cram: {
1084         cram_fd *fd = fp->fp.cram;
1085         SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
1086         if (! hdr) return -1;
1087         if (cram_set_header(fd, hdr) < 0) return -1;
1088         if (fp->fn_aux)
1089             cram_load_reference(fd, fp->fn_aux);
1090         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1091         }
1092         break;
1093 
1094     case text_format:
1095         fp->format.category = sequence_data;
1096         fp->format.format = sam;
1097         /* fall-through */
1098     case sam: {
1099         char *p;
1100         hputs(h->text, fp->fp.hfile);
1101         p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
1102         if (p == 0) {
1103             int i;
1104             for (i = 0; i < h->n_targets; ++i) {
1105                 fp->line.l = 0;
1106                 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
1107                 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
1108                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1109             }
1110         }
1111         if ( hflush(fp->fp.hfile) != 0 ) return -1;
1112         }
1113         break;
1114 
1115     default:
1116         abort();
1117     }
1118     return 0;
1119 }
1120 
sam_hdr_change_HD(bam_hdr_t * h,const char * key,const char * val)1121 int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val)
1122 {
1123     char *p, *q, *beg = NULL, *end = NULL, *newtext;
1124     if (!h || !key)
1125         return -1;
1126 
1127     if (h->l_text > 3) {
1128         if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1129             if ((p = strchr(h->text, '\n')) == 0) return -1;
1130             *p = '\0'; // for strstr call
1131 
1132             char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1133 
1134             if ((q = strstr(h->text, tmp)) != 0) { // key exists
1135                 *p = '\n'; // change back
1136 
1137                 // mark the key:val
1138                 beg = q;
1139                 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1140                 end = q;
1141 
1142                 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1143                     && strlen(val) == end - beg - 4)
1144                      return 0; // val is the same, no need to change
1145 
1146             } else {
1147                 beg = end = p;
1148                 *p = '\n';
1149             }
1150         }
1151     }
1152     if (beg == NULL) { // no @HD
1153         if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1154             return -1;
1155         h->l_text += strlen(SAM_FORMAT_VERSION) + 8;
1156         if (val) {
1157             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1158                 return -1;
1159             h->l_text += strlen(val) + 4;
1160         }
1161         newtext = (char*)malloc(h->l_text + 1);
1162         if (!newtext) return -1;
1163 
1164         if (val)
1165             snprintf(newtext, h->l_text + 1,
1166                     "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1167         else
1168             snprintf(newtext, h->l_text + 1,
1169                     "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1170     } else { // has @HD but different or no key
1171         h->l_text = (beg - h->text) + (h->text + h->l_text - end);
1172         if (val) {
1173             if (h->l_text > UINT32_MAX - strlen(val) - 5)
1174                 return -1;
1175             h->l_text += strlen(val) + 4;
1176         }
1177         newtext = (char*)malloc(h->l_text + 1);
1178         if (!newtext) return -1;
1179 
1180         if (val) {
1181             snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s",
1182                     (int) (beg - h->text), h->text, key, val, end);
1183         } else { //delete key
1184             snprintf(newtext, h->l_text + 1, "%.*s%s",
1185                     (int) (beg - h->text), h->text, end);
1186         }
1187     }
1188     free(h->text);
1189     h->text = newtext;
1190     return 0;
1191 }
1192 
1193 /**********************
1194  *** SAM record I/O ***
1195  **********************/
1196 
sam_parse1(kstring_t * s,bam_hdr_t * h,bam1_t * b)1197 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1198 {
1199 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
1200 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
1201 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
1202 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
1203 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
1204 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
1205 
1206     uint8_t *t;
1207     char *p = s->s, *q;
1208     int i;
1209     kstring_t str;
1210     bam1_core_t *c = &b->core;
1211 
1212     str.l = b->l_data = 0;
1213     str.s = (char*)b->data; str.m = b->m_data;
1214     memset(c, 0, 32);
1215     if (h->cigar_tab == 0) {
1216         h->cigar_tab = (int8_t*) malloc(128);
1217         for (i = 0; i < 128; ++i)
1218             h->cigar_tab[i] = -1;
1219         for (i = 0; BAM_CIGAR_STR[i]; ++i)
1220             h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
1221     }
1222     // qname
1223     q = _read_token(p);
1224     _parse_warn(p - q <= 1, "empty query name");
1225     _parse_err(p - q > 252, "query name too long");
1226     kputsn_(q, p - q, &str);
1227     for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
1228         kputc_('\0', &str);
1229     c->l_qname = p - q + c->l_extranul;
1230     // flag
1231     c->flag = strtol(p, &p, 0);
1232     if (*p++ != '\t') goto err_ret; // malformated flag
1233     // chr
1234     q = _read_token(p);
1235     if (strcmp(q, "*")) {
1236         _parse_err(h->n_targets == 0, "missing SAM header");
1237         c->tid = bam_name2id(h, q);
1238         _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
1239     } else c->tid = -1;
1240     // pos
1241     c->pos = strtol(p, &p, 10) - 1;
1242     if (*p++ != '\t') goto err_ret;
1243     if (c->pos < 0 && c->tid >= 0) {
1244         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
1245         c->tid = -1;
1246     }
1247     if (c->tid < 0) c->flag |= BAM_FUNMAP;
1248     // mapq
1249     c->qual = strtol(p, &p, 10);
1250     if (*p++ != '\t') goto err_ret;
1251     // cigar
1252     if (*p != '*') {
1253         uint32_t *cigar;
1254         size_t n_cigar = 0;
1255         for (q = p; *p && *p != '\t'; ++p)
1256             if (!isdigit_c(*p)) ++n_cigar;
1257         if (*p++ != '\t') goto err_ret;
1258         _parse_err(n_cigar == 0, "no CIGAR operations");
1259         _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
1260         c->n_cigar = n_cigar;
1261         _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
1262         for (i = 0; i < c->n_cigar; ++i, ++q) {
1263             int op;
1264             cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
1265             op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
1266             _parse_err(op < 0, "unrecognized CIGAR operator");
1267             cigar[i] |= op;
1268         }
1269         // can't use bam_endpos() directly as some fields not yet set up
1270         i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
1271     } else {
1272         _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1273         c->flag |= BAM_FUNMAP;
1274         q = _read_token(p);
1275         i = 1;
1276     }
1277     c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1278     // mate chr
1279     q = _read_token(p);
1280     if (strcmp(q, "=") == 0) {
1281         c->mtid = c->tid;
1282     } else if (strcmp(q, "*") == 0) {
1283         c->mtid = -1;
1284     } else {
1285         c->mtid = bam_name2id(h, q);
1286         _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1287     }
1288     // mpos
1289     c->mpos = strtol(p, &p, 10) - 1;
1290     if (*p++ != '\t') goto err_ret;
1291     if (c->mpos < 0 && c->mtid >= 0) {
1292         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1293         c->mtid = -1;
1294     }
1295     // tlen
1296     c->isize = strtol(p, &p, 10);
1297     if (*p++ != '\t') goto err_ret;
1298     // seq
1299     q = _read_token(p);
1300     if (strcmp(q, "*")) {
1301         c->l_qseq = p - q - 1;
1302         i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1303         _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1304         i = (c->l_qseq + 1) >> 1;
1305         _get_mem(uint8_t, &t, &str, i);
1306         memset(t, 0, i);
1307         for (i = 0; i < c->l_qseq; ++i)
1308             t[i>>1] |= seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
1309     } else c->l_qseq = 0;
1310     // qual
1311     q = _read_token_aux(p);
1312     _get_mem(uint8_t, &t, &str, c->l_qseq);
1313     if (strcmp(q, "*")) {
1314         _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1315         for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1316     } else memset(t, 0xff, c->l_qseq);
1317     // aux
1318     while (p < s->s + s->l) {
1319         uint8_t type;
1320         q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1321         _parse_err(p - q - 1 < 5, "incomplete aux field");
1322         kputsn_(q, 2, &str);
1323         q += 3; type = *q++; ++q; // q points to value
1324         if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1325             _parse_err(p - q - 1 < 1, "incomplete aux field");
1326 
1327         // Ensure str has enough space for a double + type allocated.
1328         // This is so we can stuff bigger integers and floats directly into
1329         // the kstring.  Sorry.
1330         _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1331 
1332         if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1333             kputc_('A', &str);
1334             kputc_(*q, &str);
1335         } else if (type == 'i' || type == 'I') {
1336             if (*q == '-') {
1337                 long x = strtol(q, &q, 10);
1338                 if (x >= INT8_MIN) {
1339                     kputc_('c', &str); kputc_(x, &str);
1340                 } else if (x >= INT16_MIN) {
1341                     str.s[str.l++] = 's';
1342                     i16_to_le(x, (uint8_t *) str.s + str.l);
1343                     str.l += 2;
1344                 } else {
1345                     str.s[str.l++] = 'i';
1346                     i32_to_le(x, (uint8_t *) str.s + str.l);
1347                     str.l += 4;
1348                 }
1349             } else {
1350                 unsigned long x = strtoul(q, &q, 10);
1351                 if (x <= UINT8_MAX) {
1352                     kputc_('C', &str); kputc_(x, &str);
1353                 } else if (x <= UINT16_MAX) {
1354                     str.s[str.l++] = 'S';
1355                     u16_to_le(x, (uint8_t *) str.s + str.l);
1356                     str.l += 2;
1357                 } else {
1358                     str.s[str.l++] = 'I';
1359                     u32_to_le(x, (uint8_t *) str.s + str.l);
1360                     str.l += 4;
1361                 }
1362             }
1363         } else if (type == 'f') {
1364             str.s[str.l++] = 'f';
1365             float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1366             str.l += sizeof(float);
1367         } else if (type == 'd') {
1368             str.s[str.l++] = 'd';
1369             double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1370             str.l += sizeof(double);
1371         } else if (type == 'Z' || type == 'H') {
1372             _parse_err(type == 'H' && !((p-q)&1),
1373                        "hex field does not have an even number of digits");
1374             kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1375         } else if (type == 'B') {
1376             int32_t n, size;
1377             size_t bytes;
1378             char *r;
1379             _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1380             type = *q++; // q points to the first ',' following the typing byte
1381 
1382             size = aux_type2size(type);
1383             _parse_err_param(size <= 0 || size > 4,
1384                              "unrecognized type B:%c", type);
1385             _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1386 
1387             for (r = q, n = 0; *r; ++r)
1388                 if (*r == ',') ++n;
1389 
1390             // Ensure space for type + values
1391             bytes = (size_t) n * (size_t) size;
1392             _parse_err(bytes / size != n
1393                        || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1394                        "out of memory");
1395             str.s[str.l++] = 'B';
1396             str.s[str.l++] = type;
1397             i32_to_le(n, (uint8_t *) str.s + str.l);
1398             str.l += sizeof(uint32_t);
1399 
1400             // This ensures that q always ends up at the next comma after
1401             // reading a number even if it's followed by junk.  It
1402             // prevents the possibility of trying to read more than n items.
1403 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1404 
1405             if (type == 'c')      while (q + 1 < p) { int8_t   x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1406             else if (type == 'C') while (q + 1 < p) { uint8_t  x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1407             else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1408             else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1409             else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1410             else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1411             else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1412             else _parse_err_param(1, "unrecognized type B:%c", type);
1413 
1414 #undef _skip_to_comma
1415 
1416         } else _parse_err_param(1, "unrecognized type %c", type);
1417     }
1418     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1419     if (bam_tag2cigar(b, 1, 1) < 0)
1420         return -2;
1421     return 0;
1422 
1423 #undef _parse_warn
1424 #undef _parse_err
1425 #undef _parse_err_param
1426 #undef _get_mem
1427 #undef _read_token_aux
1428 #undef _read_token
1429 err_ret:
1430     b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1431     return -2;
1432 }
1433 
sam_read1(htsFile * fp,bam_hdr_t * h,bam1_t * b)1434 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1435 {
1436     switch (fp->format.format) {
1437     case bam: {
1438         int r = bam_read1(fp->fp.bgzf, b);
1439         if (r >= 0) {
1440             if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
1441                 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1442                 return -3;
1443         }
1444         return r;
1445         }
1446 
1447     case cram: {
1448         int ret = cram_get_bam_seq(fp->fp.cram, &b);
1449         if (ret < 0)
1450             return cram_eof(fp->fp.cram) ? -1 : -2;
1451 
1452         if (bam_tag2cigar(b, 1, 1) < 0)
1453             return -2;
1454         return ret;
1455     }
1456 
1457     case sam: {
1458         int ret;
1459 err_recover:
1460         if (fp->line.l == 0) {
1461             ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1462             if (ret < 0) return ret;
1463         }
1464         ret = sam_parse1(&fp->line, h, b);
1465         fp->line.l = 0;
1466         if (ret < 0) {
1467             hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1468             if (h->ignore_sam_err) goto err_recover;
1469         }
1470         return ret;
1471     }
1472 
1473     default:
1474         abort();
1475     }
1476 }
1477 
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)1478 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1479 {
1480     int i;
1481     uint8_t *s, *end;
1482     const bam1_core_t *c = &b->core;
1483 
1484     str->l = 0;
1485     kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1486     kputw(c->flag, str); kputc('\t', str); // flag
1487     if (c->tid >= 0) { // chr
1488         kputs(h->target_name[c->tid] , str);
1489         kputc('\t', str);
1490     } else kputsn("*\t", 2, str);
1491     kputw(c->pos + 1, str); kputc('\t', str); // pos
1492     kputw(c->qual, str); kputc('\t', str); // qual
1493     if (c->n_cigar) { // cigar
1494         uint32_t *cigar = bam_get_cigar(b);
1495         for (i = 0; i < c->n_cigar; ++i) {
1496             kputw(bam_cigar_oplen(cigar[i]), str);
1497             kputc(bam_cigar_opchr(cigar[i]), str);
1498         }
1499     } else kputc('*', str);
1500     kputc('\t', str);
1501     if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1502     else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1503     else {
1504         kputs(h->target_name[c->mtid], str);
1505         kputc('\t', str);
1506     }
1507     kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1508     kputw(c->isize, str); kputc('\t', str); // template len
1509     if (c->l_qseq) { // seq and qual
1510         uint8_t *s = bam_get_seq(b);
1511         for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1512         kputc('\t', str);
1513         s = bam_get_qual(b);
1514         if (s[0] == 0xff) kputc('*', str);
1515         else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1516     } else kputsn("*\t*", 3, str);
1517 
1518     s = bam_get_aux(b); // aux
1519     end = b->data + b->l_data;
1520     while (end - s >= 4) {
1521         uint8_t type, key[2];
1522         key[0] = s[0]; key[1] = s[1];
1523         s += 2; type = *s++;
1524         kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1525         if (type == 'A') {
1526             kputsn("A:", 2, str);
1527             kputc(*s, str);
1528             ++s;
1529         } else if (type == 'C') {
1530             kputsn("i:", 2, str);
1531             kputw(*s, str);
1532             ++s;
1533         } else if (type == 'c') {
1534             kputsn("i:", 2, str);
1535             kputw(*(int8_t*)s, str);
1536             ++s;
1537         } else if (type == 'S') {
1538             if (end - s >= 2) {
1539                 kputsn("i:", 2, str);
1540                 kputuw(le_to_u16(s), str);
1541                 s += 2;
1542             } else goto bad_aux;
1543         } else if (type == 's') {
1544             if (end - s >= 2) {
1545                 kputsn("i:", 2, str);
1546                 kputw(le_to_i16(s), str);
1547                 s += 2;
1548             } else goto bad_aux;
1549         } else if (type == 'I') {
1550             if (end - s >= 4) {
1551                 kputsn("i:", 2, str);
1552                 kputuw(le_to_u32(s), str);
1553                 s += 4;
1554             } else goto bad_aux;
1555         } else if (type == 'i') {
1556             if (end - s >= 4) {
1557                 kputsn("i:", 2, str);
1558                 kputw(le_to_i32(s), str);
1559                 s += 4;
1560             } else goto bad_aux;
1561         } else if (type == 'f') {
1562             if (end - s >= 4) {
1563                 ksprintf(str, "f:%g", le_to_float(s));
1564                 s += 4;
1565             } else goto bad_aux;
1566 
1567         } else if (type == 'd') {
1568             if (end - s >= 8) {
1569                 ksprintf(str, "d:%g", le_to_double(s));
1570                 s += 8;
1571             } else goto bad_aux;
1572         } else if (type == 'Z' || type == 'H') {
1573             kputc(type, str); kputc(':', str);
1574             while (s < end && *s) kputc(*s++, str);
1575             if (s >= end)
1576                 goto bad_aux;
1577             ++s;
1578         } else if (type == 'B') {
1579             uint8_t sub_type = *(s++);
1580             int sub_type_size = aux_type2size(sub_type);
1581             uint32_t n;
1582             if (sub_type_size == 0 || end - s < 4)
1583                 goto bad_aux;
1584             n = le_to_u32(s);
1585             s += 4; // now points to the start of the array
1586             if ((end - s) / sub_type_size < n)
1587                 goto bad_aux;
1588             kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1589             for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1590                 kputc(',', str);
1591                 if ('c' == sub_type)      { kputw(*(int8_t*)s, str); ++s; }
1592                 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1593                 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1594                 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1595                 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1596                 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1597                 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1598                 else goto bad_aux;  // Unknown sub-type
1599             }
1600         } else { // Unknown type
1601             goto bad_aux;
1602         }
1603     }
1604     return str->l;
1605 
1606  bad_aux:
1607     hts_log_error("Corrupted aux data for read %.*s",
1608                   b->core.l_qname, bam_get_qname(b));
1609     errno = EINVAL;
1610     return -1;
1611 }
1612 
sam_write1(htsFile * fp,const bam_hdr_t * h,const bam1_t * b)1613 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1614 {
1615     switch (fp->format.format) {
1616     case binary_format:
1617         fp->format.category = sequence_data;
1618         fp->format.format = bam;
1619         /* fall-through */
1620     case bam:
1621         return bam_write1(fp->fp.bgzf, b);
1622 
1623     case cram:
1624         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1625 
1626     case text_format:
1627         fp->format.category = sequence_data;
1628         fp->format.format = sam;
1629         /* fall-through */
1630     case sam:
1631         if (sam_format1(h, b, &fp->line) < 0) return -1;
1632         kputc('\n', &fp->line);
1633         if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1634         return fp->line.l;
1635 
1636     default:
1637         abort();
1638     }
1639 }
1640 
1641 /************************
1642  *** Auxiliary fields ***
1643  ************************/
1644 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)1645 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1646     int tsz = aux_type2size(type);
1647 
1648     if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1649 
1650     switch (tsz) {
1651         case 'H': case 'Z': case 1:  // Trivial
1652             memcpy(out, in, len);
1653             break;
1654 
1655 #define aux_val_to_le(type_t, store_le) do {                            \
1656         type_t v;                                                       \
1657         size_t i;                                                       \
1658         for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1659             memcpy(&v, in + i, sizeof(type_t));                         \
1660             store_le(v, out);                                           \
1661         }                                                               \
1662     } while (0)
1663 
1664         case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1665         case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1666         case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1667 
1668 #undef aux_val_to_le
1669 
1670         case 'B': { // Recurse!
1671             uint32_t n;
1672             if (len < 5) return -1;
1673             memcpy(&n, in + 1, 4);
1674             out[0] = in[0];
1675             u32_to_le(n, out + 1);
1676             return aux_to_le(in[0], out + 5, in + 5, len - 5);
1677         }
1678 
1679         default: // Unknown type code
1680             return -1;
1681     }
1682 
1683 
1684 
1685     return 0;
1686 }
1687 #endif
1688 
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)1689 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1690 {
1691     uint32_t new_len;
1692 
1693     assert(b->l_data >= 0);
1694     new_len = b->l_data + 3 + len;
1695     if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1696 
1697     if (realloc_bam_data(b, new_len) < 0) return -1;
1698 
1699     b->data[b->l_data] = tag[0];
1700     b->data[b->l_data + 1] = tag[1];
1701     b->data[b->l_data + 2] = type;
1702 
1703 #ifdef HTS_LITTLE_ENDIAN
1704     memcpy(b->data + b->l_data + 3, data, len);
1705 #else
1706     if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1707         errno = EINVAL;
1708         return -1;
1709     }
1710 #endif
1711 
1712     b->l_data = new_len;
1713 
1714     return 0;
1715 
1716  nomem:
1717     errno = ENOMEM;
1718     return -1;
1719 }
1720 
skip_aux(uint8_t * s,uint8_t * end)1721 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1722 {
1723     int size;
1724     uint32_t n;
1725     if (s >= end) return end;
1726     size = aux_type2size(*s); ++s; // skip type
1727     switch (size) {
1728     case 'Z':
1729     case 'H':
1730         while (*s && s < end) ++s;
1731         return s < end ? s + 1 : end;
1732     case 'B':
1733         if (end - s < 5) return NULL;
1734         size = aux_type2size(*s); ++s;
1735         n = le_to_u32(s);
1736         s += 4;
1737         if (size == 0 || end - s < size * n) return NULL;
1738         return s + size * n;
1739     case 0:
1740         return NULL;
1741     default:
1742         if (end - s < size) return NULL;
1743         return s + size;
1744     }
1745 }
1746 
bam_aux_get(const bam1_t * b,const char tag[2])1747 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1748 {
1749     uint8_t *s, *end, *t = (uint8_t *) tag;
1750     uint16_t y = (uint16_t) t[0]<<8 | t[1];
1751     s = bam_get_aux(b);
1752     end = b->data + b->l_data;
1753     while (s != NULL && end - s >= 3) {
1754         uint16_t x = (uint16_t) s[0]<<8 | s[1];
1755         s += 2;
1756         if (x == y) {
1757             // Check the tag value is valid and complete
1758             uint8_t *e = skip_aux(s, end);
1759             if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1760                 goto bad_aux;  // Unterminated string
1761             }
1762             if (e != NULL) {
1763                 return s;
1764             } else {
1765                 goto bad_aux;
1766             }
1767         }
1768         s = skip_aux(s, end);
1769     }
1770     if (s == NULL) goto bad_aux;
1771     errno = ENOENT;
1772     return NULL;
1773 
1774  bad_aux:
1775     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1776     errno = EINVAL;
1777     return NULL;
1778 }
1779 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)1780 int bam_aux_del(bam1_t *b, uint8_t *s)
1781 {
1782     uint8_t *p, *aux;
1783     int l_aux = bam_get_l_aux(b);
1784     aux = bam_get_aux(b);
1785     p = s - 2;
1786     s = skip_aux(s, aux + l_aux);
1787     if (s == NULL) goto bad_aux;
1788     memmove(p, s, l_aux - (s - aux));
1789     b->l_data -= s - p;
1790     return 0;
1791 
1792  bad_aux:
1793     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1794     errno = EINVAL;
1795     return -1;
1796 }
1797 
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)1798 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1799 {
1800     // FIXME: This is not at all efficient!
1801     uint8_t *s = bam_aux_get(b,tag);
1802     if (!s) {
1803         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1804             return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1805         } else { // Invalid aux data, give up.
1806             return -1;
1807         }
1808     }
1809     char type = *s;
1810     if (type != 'Z') {
1811         hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1812         errno = EINVAL;
1813         return -1;
1814     }
1815 
1816     bam_aux_del(b,s);
1817     s -= 2;
1818     int l_aux = bam_get_l_aux(b);
1819 
1820     ptrdiff_t s_offset = s - b->data;
1821     if (possibly_expand_bam_data(b, 3 + len) < 0) return -1;
1822     s = b->data + s_offset;
1823     b->l_data += 3 + len;
1824 
1825     memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1826     s[0] = tag[0];
1827     s[1] = tag[1];
1828     s[2] = type;
1829     memmove(s+3,data,len);
1830     return 0;
1831 }
1832 
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)1833 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
1834 {
1835     uint32_t sz, old_sz = 0, new = 0;
1836     uint8_t *s, type;
1837 
1838     if (val < INT32_MIN || val > UINT32_MAX) {
1839         errno = EOVERFLOW;
1840         return -1;
1841     }
1842     if (val < INT16_MIN)       { type = 'i'; sz = 4; }
1843     else if (val < INT8_MIN)   { type = 's'; sz = 2; }
1844     else if (val < 0)          { type = 'c'; sz = 1; }
1845     else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
1846     else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
1847     else                       { type = 'I'; sz = 4; }
1848 
1849     s = bam_aux_get(b, tag);
1850     if (s) {  // Tag present - how big was the old one?
1851         switch (*s) {
1852             case 'c': case 'C': old_sz = 1; break;
1853             case 's': case 'S': old_sz = 2; break;
1854             case 'i': case 'I': old_sz = 4; break;
1855             default: errno = EINVAL; return -1;  // Not an integer
1856         }
1857     } else {
1858         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1859             s = b->data + b->l_data;
1860             new = 1;
1861         }  else { // Invalid aux data, give up.
1862             return -1;
1863         }
1864     }
1865 
1866     if (new || old_sz < sz) {
1867         // Make room for new tag
1868         ptrdiff_t s_offset = s - b->data;
1869         if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
1870             return -1;
1871         s =  b->data + s_offset;
1872         if (new) { // Add tag id
1873             *s++ = tag[0];
1874             *s++ = tag[1];
1875         } else {   // Shift following data so we have space
1876             memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
1877         }
1878     } else {
1879         // Reuse old space.  Data value may be bigger than necessary but
1880         // we avoid having to move everything else
1881         sz = old_sz;
1882         type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
1883         assert(type > 0);
1884     }
1885     *s++ = type;
1886 #ifdef HTS_LITTLE_ENDIAN
1887     memcpy(s, &val, sz);
1888 #else
1889     switch (sz) {
1890         case 4:  u32_to_le(val, s); break;
1891         case 2:  u16_to_le(val, s); break;
1892         default: *s = val; break;
1893     }
1894 #endif
1895     b->l_data += (new ? 3 : 0) + sz - old_sz;
1896     return 0;
1897 }
1898 
bam_aux_update_float(bam1_t * b,const char tag[2],float val)1899 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
1900 {
1901     uint8_t *s = bam_aux_get(b, tag);
1902     int shrink = 0, new = 0;
1903 
1904     if (s) { // Tag present - what was it?
1905         switch (*s) {
1906             case 'f': break;
1907             case 'd': shrink = 1; break;
1908             default: errno = EINVAL; return -1;  // Not a float
1909         }
1910     } else {
1911         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1912             new = 1;
1913         }  else { // Invalid aux data, give up.
1914             return -1;
1915         }
1916     }
1917 
1918     if (new) { // Ensure there's room
1919         if (possibly_expand_bam_data(b, 3 + 4) < 0)
1920             return -1;
1921         s = b->data + b->l_data;
1922         *s++ = tag[0];
1923         *s++ = tag[1];
1924     } else if (shrink) { // Convert non-standard double tag to float
1925         memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
1926         b->l_data -= 4;
1927     }
1928     *s++ = 'f';
1929     float_to_le(val, s);
1930     if (new) b->l_data += 7;
1931 
1932     return 0;
1933 }
1934 
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)1935 int bam_aux_update_array(bam1_t *b, const char tag[2],
1936                          uint8_t type, uint32_t items, void *data)
1937 {
1938     uint8_t *s = bam_aux_get(b, tag);
1939     size_t old_sz = 0, new_sz;
1940     int new = 0;
1941 
1942     if (s) { // Tag present
1943         if (*s != 'B') { errno = EINVAL; return -1; }
1944         old_sz = aux_type2size(s[1]);
1945         if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
1946         old_sz *= le_to_u32(s + 2);
1947     } else {
1948         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
1949             s = b->data + b->l_data;
1950             new = 1;
1951         }  else { // Invalid aux data, give up.
1952             return -1;
1953         }
1954     }
1955 
1956     new_sz = aux_type2size(type);
1957     if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
1958     if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
1959     new_sz *= items;
1960 
1961     if (new || old_sz < new_sz) {
1962         // Make room for new tag
1963         ptrdiff_t s_offset = s - b->data;
1964         if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
1965             return -1;
1966         s =  b->data + s_offset;
1967     }
1968     if (new) { // Add tag id and type
1969         *s++ = tag[0];
1970         *s++ = tag[1];
1971         *s = 'B';
1972         b->l_data += 8 + new_sz;
1973     } else if (old_sz != new_sz) { // shift following data if necessary
1974         memmove(s + 6 + new_sz, s + 6 + old_sz,
1975                 b->l_data - ((s + 6 + old_sz) - b->data));
1976         b->l_data -= old_sz;
1977         b->l_data += new_sz;
1978     }
1979 
1980     s[1] = type;
1981     u32_to_le(items, s + 2);
1982 #ifdef HTS_LITTLE_ENDIAN
1983     memcpy(s + 6, data, new_sz);
1984     return 0;
1985 #else
1986     return aux_to_le(type, s + 6, data, new_sz);
1987 #endif
1988 }
1989 
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)1990 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1991                                       uint32_t idx)
1992 {
1993     switch (type) {
1994         case 'c': return le_to_i8(s + idx);
1995         case 'C': return s[idx];
1996         case 's': return le_to_i16(s + 2 * idx);
1997         case 'S': return le_to_u16(s + 2 * idx);
1998         case 'i': return le_to_i32(s + 4 * idx);
1999         case 'I': return le_to_u32(s + 4 * idx);
2000         default:
2001             errno = EINVAL;
2002             return 0;
2003     }
2004 }
2005 
bam_aux2i(const uint8_t * s)2006 int64_t bam_aux2i(const uint8_t *s)
2007 {
2008     int type;
2009     type = *s++;
2010     return get_int_aux_val(type, s, 0);
2011 }
2012 
bam_aux2f(const uint8_t * s)2013 double bam_aux2f(const uint8_t *s)
2014 {
2015     int type;
2016     type = *s++;
2017     if (type == 'd') return le_to_double(s);
2018     else if (type == 'f') return le_to_float(s);
2019     else return get_int_aux_val(type, s, 0);
2020 }
2021 
bam_aux2A(const uint8_t * s)2022 char bam_aux2A(const uint8_t *s)
2023 {
2024     int type;
2025     type = *s++;
2026     if (type == 'A') return *(char*)s;
2027     errno = EINVAL;
2028     return 0;
2029 }
2030 
bam_aux2Z(const uint8_t * s)2031 char *bam_aux2Z(const uint8_t *s)
2032 {
2033     int type;
2034     type = *s++;
2035     if (type == 'Z' || type == 'H') return (char*)s;
2036     errno = EINVAL;
2037     return 0;
2038 }
2039 
bam_auxB_len(const uint8_t * s)2040 uint32_t bam_auxB_len(const uint8_t *s)
2041 {
2042     if (s[0] != 'B') {
2043         errno = EINVAL;
2044         return 0;
2045     }
2046     return le_to_u32(s + 2);
2047 }
2048 
bam_auxB2i(const uint8_t * s,uint32_t idx)2049 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
2050 {
2051     uint32_t len = bam_auxB_len(s);
2052     if (idx >= len) {
2053         errno = ERANGE;
2054         return 0;
2055     }
2056     return get_int_aux_val(s[1], s + 6, idx);
2057 }
2058 
bam_auxB2f(const uint8_t * s,uint32_t idx)2059 double bam_auxB2f(const uint8_t *s, uint32_t idx)
2060 {
2061     uint32_t len = bam_auxB_len(s);
2062     if (idx >= len) {
2063         errno = ERANGE;
2064         return 0.0;
2065     }
2066     if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
2067     else return get_int_aux_val(s[1], s + 6, idx);
2068 }
2069 
sam_open_mode(char * mode,const char * fn,const char * format)2070 int sam_open_mode(char *mode, const char *fn, const char *format)
2071 {
2072     // TODO Parse "bam5" etc for compression level
2073     if (format == NULL) {
2074         // Try to pick a format based on the filename extension
2075         const char *ext = fn? strrchr(fn, '.') : NULL;
2076         if (ext == NULL || strchr(ext, '/')) return -1;
2077         return sam_open_mode(mode, fn, ext+1);
2078     }
2079     else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
2080     else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
2081     else if (strcmp(format, "sam") == 0) strcpy(mode, "");
2082     else return -1;
2083 
2084     return 0;
2085 }
2086 
2087 // A version of sam_open_mode that can handle ,key=value options.
2088 // The format string is allocated and returned, to be freed by the caller.
2089 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)2090 char *sam_open_mode_opts(const char *fn,
2091                          const char *mode,
2092                          const char *format)
2093 {
2094     char *mode_opts = malloc((format ? strlen(format) : 1) +
2095                              (mode   ? strlen(mode)   : 1) + 12);
2096     char *opts, *cp;
2097     int format_len;
2098 
2099     if (!mode_opts)
2100         return NULL;
2101 
2102     strcpy(mode_opts, mode ? mode : "r");
2103     cp = mode_opts + strlen(mode_opts);
2104 
2105     if (format == NULL) {
2106         // Try to pick a format based on the filename extension
2107         const char *ext = fn? strrchr(fn, '.') : NULL;
2108         if (ext == NULL || strchr(ext, '/')) {
2109             free(mode_opts);
2110             return NULL;
2111         }
2112         if (sam_open_mode(cp, fn, ext+1) == 0) {
2113             return mode_opts;
2114         } else {
2115             free(mode_opts);
2116             return NULL;
2117         }
2118     }
2119 
2120     if ((opts = strchr(format, ','))) {
2121         format_len = opts-format;
2122     } else {
2123         opts="";
2124         format_len = strlen(format);
2125     }
2126 
2127     if (strncmp(format, "bam", format_len) == 0) {
2128         *cp++ = 'b';
2129     } else if (strncmp(format, "cram", format_len) == 0) {
2130         *cp++ = 'c';
2131     } else if (strncmp(format, "cram2", format_len) == 0) {
2132         *cp++ = 'c';
2133         strcpy(cp, ",VERSION=2.1");
2134         cp += 12;
2135     } else if (strncmp(format, "cram3", format_len) == 0) {
2136         *cp++ = 'c';
2137         strcpy(cp, ",VERSION=3.0");
2138         cp += 12;
2139     } else if (strncmp(format, "sam", format_len) == 0) {
2140         ; // format mode=""
2141     } else {
2142         free(mode_opts);
2143         return NULL;
2144     }
2145 
2146     strcpy(cp, opts);
2147 
2148     return mode_opts;
2149 }
2150 
2151 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)2152 int bam_str2flag(const char *str)
2153 {
2154     char *end, *beg = (char*) str;
2155     long int flag = strtol(str, &end, 0);
2156     if ( end!=str ) return flag;    // the conversion was successful
2157     flag = 0;
2158     while ( *str )
2159     {
2160         end = beg;
2161         while ( *end && *end!=',' ) end++;
2162         if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
2163         else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
2164         else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
2165         else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
2166         else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
2167         else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
2168         else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
2169         else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
2170         else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
2171         else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
2172         else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
2173         else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
2174         else return -1;
2175         if ( !*end ) break;
2176         beg = end + 1;
2177     }
2178     return flag;
2179 }
2180 
bam_flag2str(int flag)2181 char *bam_flag2str(int flag)
2182 {
2183     kstring_t str = {0,0,0};
2184     if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
2185     if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
2186     if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
2187     if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
2188     if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
2189     if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
2190     if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
2191     if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
2192     if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
2193     if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
2194     if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
2195     if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
2196     if ( str.l == 0 ) kputsn("", 0, &str);
2197     return str.s;
2198 }
2199 
2200 
2201 /**************************
2202  *** Pileup and Mpileup ***
2203  **************************/
2204 
2205 #if !defined(BAM_NO_PILEUP)
2206 
2207 #include <assert.h>
2208 
2209 /*******************
2210  *** Memory pool ***
2211  *******************/
2212 
2213 typedef struct {
2214     int k, x, y, end;
2215 } cstate_t;
2216 
2217 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
2218 
2219 typedef struct __linkbuf_t {
2220     bam1_t b;
2221     int32_t beg, end;
2222     cstate_t s;
2223     struct __linkbuf_t *next;
2224     bam_pileup_cd cd;
2225 } lbnode_t;
2226 
2227 typedef struct {
2228     int cnt, n, max;
2229     lbnode_t **buf;
2230 } mempool_t;
2231 
mp_init(void)2232 static mempool_t *mp_init(void)
2233 {
2234     mempool_t *mp;
2235     mp = (mempool_t*)calloc(1, sizeof(mempool_t));
2236     return mp;
2237 }
mp_destroy(mempool_t * mp)2238 static void mp_destroy(mempool_t *mp)
2239 {
2240     int k;
2241     for (k = 0; k < mp->n; ++k) {
2242         free(mp->buf[k]->b.data);
2243         free(mp->buf[k]);
2244     }
2245     free(mp->buf);
2246     free(mp);
2247 }
mp_alloc(mempool_t * mp)2248 static inline lbnode_t *mp_alloc(mempool_t *mp)
2249 {
2250     ++mp->cnt;
2251     if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
2252     else return mp->buf[--mp->n];
2253 }
mp_free(mempool_t * mp,lbnode_t * p)2254 static inline void mp_free(mempool_t *mp, lbnode_t *p)
2255 {
2256     --mp->cnt; p->next = 0; // clear lbnode_t::next here
2257     if (mp->n == mp->max) {
2258         mp->max = mp->max? mp->max<<1 : 256;
2259         mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
2260     }
2261     mp->buf[mp->n++] = p;
2262 }
2263 
2264 /**********************
2265  *** CIGAR resolver ***
2266  **********************/
2267 
2268 /* s->k: the index of the CIGAR operator that has just been processed.
2269    s->x: the reference coordinate of the start of s->k
2270    s->y: the query coordiante of the start of s->k
2271  */
resolve_cigar2(bam_pileup1_t * p,int32_t pos,cstate_t * s)2272 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
2273 {
2274 #define _cop(c) ((c)&BAM_CIGAR_MASK)
2275 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
2276 
2277     bam1_t *b = p->b;
2278     bam1_core_t *c = &b->core;
2279     uint32_t *cigar = bam_get_cigar(b);
2280     int k;
2281     // determine the current CIGAR operation
2282     //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
2283     if (s->k == -1) { // never processed
2284         if (c->n_cigar == 1) { // just one operation, save a loop
2285           if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
2286         } else { // find the first match or deletion
2287             for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
2288                 int op = _cop(cigar[k]);
2289                 int l = _cln(cigar[k]);
2290                 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2291                 else if (op == BAM_CREF_SKIP) s->x += l;
2292                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2293             }
2294             assert(k < c->n_cigar);
2295             s->k = k;
2296         }
2297     } else { // the read has been processed before
2298         int op, l = _cln(cigar[s->k]);
2299         if (pos - s->x >= l) { // jump to the next operation
2300             assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
2301             op = _cop(cigar[s->k+1]);
2302             if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
2303               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2304                 s->x += l;
2305                 ++s->k;
2306             } else { // find the next M/D/N/=/X
2307               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
2308                 s->x += l;
2309                 for (k = s->k + 1; k < c->n_cigar; ++k) {
2310                     op = _cop(cigar[k]), l = _cln(cigar[k]);
2311                     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
2312                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
2313                 }
2314                 s->k = k;
2315             }
2316             assert(s->k < c->n_cigar); // otherwise a bug
2317         } // else, do nothing
2318     }
2319     { // collect pileup information
2320         int op, l;
2321         op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
2322         p->is_del = p->indel = p->is_refskip = 0;
2323         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
2324             int op2 = _cop(cigar[s->k+1]);
2325             int l2 = _cln(cigar[s->k+1]);
2326             if (op2 == BAM_CDEL) p->indel = -(int)l2;
2327             else if (op2 == BAM_CINS) p->indel = l2;
2328             else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
2329                 int l3 = 0;
2330                 for (k = s->k + 2; k < c->n_cigar; ++k) {
2331                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
2332                     if (op2 == BAM_CINS) l3 += l2;
2333                     else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
2334                 }
2335                 if (l3 > 0) p->indel = l3;
2336             }
2337         }
2338         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
2339             p->qpos = s->y + (pos - s->x);
2340         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
2341             p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
2342             p->is_refskip = (op == BAM_CREF_SKIP);
2343         } // cannot be other operations; otherwise a bug
2344         p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
2345     }
2346     return 1;
2347 }
2348 
2349 /***********************
2350  *** Pileup iterator ***
2351  ***********************/
2352 
2353 // Dictionary of overlapping reads
2354 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
2355 typedef khash_t(olap_hash) olap_hash_t;
2356 
2357 struct __bam_plp_t {
2358     mempool_t *mp;
2359     lbnode_t *head, *tail;
2360     int32_t tid, pos, max_tid, max_pos;
2361     int is_eof, max_plp, error, maxcnt;
2362     uint64_t id;
2363     bam_pileup1_t *plp;
2364     // for the "auto" interface only
2365     bam1_t *b;
2366     bam_plp_auto_f func;
2367     void *data;
2368     olap_hash_t *overlaps;
2369 
2370     // For notification of creation and destruction events
2371     // and associated client-owned pointer.
2372     int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
2373     int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
2374 };
2375 
bam_plp_init(bam_plp_auto_f func,void * data)2376 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
2377 {
2378     bam_plp_t iter;
2379     iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
2380     iter->mp = mp_init();
2381     iter->head = iter->tail = mp_alloc(iter->mp);
2382     iter->max_tid = iter->max_pos = -1;
2383     iter->maxcnt = 8000;
2384     if (func) {
2385         iter->func = func;
2386         iter->data = data;
2387         iter->b = bam_init1();
2388     }
2389     return iter;
2390 }
2391 
bam_plp_init_overlaps(bam_plp_t iter)2392 void bam_plp_init_overlaps(bam_plp_t iter)
2393 {
2394     iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
2395 }
2396 
bam_plp_destroy(bam_plp_t iter)2397 void bam_plp_destroy(bam_plp_t iter)
2398 {
2399     lbnode_t *p, *pnext;
2400     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
2401     for (p = iter->head; p != NULL; p = pnext) {
2402         pnext = p->next;
2403         mp_free(iter->mp, p);
2404     }
2405     mp_destroy(iter->mp);
2406     if (iter->b) bam_destroy1(iter->b);
2407     free(iter->plp);
2408     free(iter);
2409 }
2410 
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2411 void bam_plp_constructor(bam_plp_t plp,
2412                          int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2413     plp->plp_construct = func;
2414 }
2415 
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))2416 void bam_plp_destructor(bam_plp_t plp,
2417                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
2418     plp->plp_destruct = func;
2419 }
2420 
2421 //---------------------------------
2422 //---  Tweak overlapping reads
2423 //---------------------------------
2424 
2425 /**
2426  *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
2427  *  cigar_iref2iseq_next() - get the next CMATCH base
2428  *  @cigar:       pointer to current cigar block (rw)
2429  *  @cigar_max:   pointer just beyond the last cigar block
2430  *  @icig:        position within the current cigar block (rw)
2431  *  @iseq:        position in the sequence (rw)
2432  *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2433  *
2434  *  Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2435  */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2436 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2437 {
2438     int pos = *iref;
2439     if ( pos < 0 ) return -1;
2440     *icig = 0;
2441     *iseq = 0;
2442     *iref = 0;
2443     while ( *cigar<cigar_max )
2444     {
2445         int cig  = (**cigar) & BAM_CIGAR_MASK;
2446         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2447 
2448         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2449         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2450         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2451         {
2452             pos -= ncig;
2453             if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2454             (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2455             continue;
2456         }
2457         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2458         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2459         {
2460             pos -= ncig;
2461             if ( pos<0 ) pos = 0;
2462             (*cigar)++; *icig = 0; *iref += ncig;
2463             continue;
2464         }
2465         hts_log_error("Unexpected cigar %d", cig);
2466         assert(0);
2467     }
2468     *iseq = -1;
2469     return -1;
2470 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,int * icig,int * iseq,int * iref)2471 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2472 {
2473     while ( *cigar < cigar_max )
2474     {
2475         int cig  = (**cigar) & BAM_CIGAR_MASK;
2476         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2477 
2478         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2479         {
2480             if ( *icig >= ncig - 1 ) { *icig = 0;  (*cigar)++; continue; }
2481             (*iseq)++; (*icig)++; (*iref)++;
2482             return BAM_CMATCH;
2483         }
2484         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2485         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2486         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2487         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2488         hts_log_error("Unexpected cigar %d", cig);
2489         assert(0);
2490     }
2491     *iseq = -1;
2492     *iref = -1;
2493     return -1;
2494 }
2495 
tweak_overlap_quality(bam1_t * a,bam1_t * b)2496 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2497 {
2498     uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2499     uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2500     int a_icig = 0, a_iseq = 0;
2501     int b_icig = 0, b_iseq = 0;
2502     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2503     uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
2504 
2505     int iref   = b->core.pos;
2506     int a_iref = iref - a->core.pos;
2507     int b_iref = iref - b->core.pos;
2508     int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2509     if ( a_ret<0 ) return;  // no overlap
2510     int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2511     if ( b_ret<0 ) return;  // no overlap
2512 
2513     #if DBG
2514         fprintf(stderr,"tweak %s  n_cigar=%d %d  .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2515             a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2516     #endif
2517 
2518     while ( 1 )
2519     {
2520         // Increment reference position
2521         while ( a_iref>=0 && a_iref < iref - a->core.pos )
2522             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2523         if ( a_ret<0 ) break;   // done
2524         if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2525 
2526         while ( b_iref>=0 && b_iref < iref - b->core.pos )
2527             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2528         if ( b_ret<0 ) break;   // done
2529         if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2530 
2531         iref++;
2532         if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue;   // only CMATCH positions, don't know what to do with indels
2533 
2534         if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2535         {
2536             #if DBG
2537                 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2538             #endif
2539             // we are very confident about this base
2540             int qual = a_qual[a_iseq] + b_qual[b_iseq];
2541             a_qual[a_iseq] = qual>200 ? 200 : qual;
2542             b_qual[b_iseq] = 0;
2543         }
2544         else
2545         {
2546             if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2547             {
2548                 #if DBG
2549                     fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2550                 #endif
2551                 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];  // not so confident about a_qual anymore given the mismatch
2552                 b_qual[b_iseq] = 0;
2553             }
2554             else
2555             {
2556                 #if DBG
2557                     fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2558                 #endif
2559                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2560                 a_qual[a_iseq] = 0;
2561             }
2562         }
2563     }
2564     #if DBG
2565         fprintf(stderr,"\n");
2566     #endif
2567 }
2568 
2569 // Fix overlapping reads. Simple soft-clipping did not give good results.
2570 // Lowering qualities of unwanted bases is more selective and works better.
2571 //
overlap_push(bam_plp_t iter,lbnode_t * node)2572 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2573 {
2574     if ( !iter->overlaps ) return;
2575 
2576     // mapped mates and paired reads only
2577     if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2578 
2579     // no overlap possible, unless some wild cigar
2580     if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2581 
2582     khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2583     if ( kitr==kh_end(iter->overlaps) )
2584     {
2585         int ret;
2586         kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2587         kh_value(iter->overlaps, kitr) = node;
2588     }
2589     else
2590     {
2591         lbnode_t *a = kh_value(iter->overlaps, kitr);
2592         tweak_overlap_quality(&a->b, &node->b);
2593         kh_del(olap_hash, iter->overlaps, kitr);
2594         assert(a->end-1 == a->s.end);
2595         a->end = bam_endpos(&a->b);
2596         a->s.end = a->end - 1;
2597     }
2598 }
2599 
overlap_remove(bam_plp_t iter,const bam1_t * b)2600 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2601 {
2602     if ( !iter->overlaps ) return;
2603 
2604     khiter_t kitr;
2605     if ( b )
2606     {
2607         kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2608         if ( kitr!=kh_end(iter->overlaps) )
2609             kh_del(olap_hash, iter->overlaps, kitr);
2610     }
2611     else
2612     {
2613         // remove all
2614         for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2615             if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2616     }
2617 }
2618 
2619 
2620 
2621 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2622 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2623 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2624 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2625 {
2626     if (iter->error) { *_n_plp = -1; return NULL; }
2627     *_n_plp = 0;
2628     if (iter->is_eof && iter->head == iter->tail) return NULL;
2629     while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2630         int n_plp = 0;
2631         // write iter->plp at iter->pos
2632         lbnode_t **pptr = &iter->head;
2633         while (*pptr != iter->tail) {
2634             lbnode_t *p = *pptr;
2635             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2636                 overlap_remove(iter, &p->b);
2637                 if (iter->plp_destruct)
2638                     iter->plp_destruct(iter->data, &p->b, &p->cd);
2639                 *pptr = p->next; mp_free(iter->mp, p);
2640             }
2641             else {
2642                 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2643                     if (n_plp == iter->max_plp) { // then double the capacity
2644                         iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2645                         iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2646                     }
2647                     iter->plp[n_plp].b = &p->b;
2648                     iter->plp[n_plp].cd = p->cd;
2649                     if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2650                 }
2651                 pptr = &(*pptr)->next;
2652             }
2653         }
2654         *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2655         // update iter->tid and iter->pos
2656         if (iter->head != iter->tail) {
2657             if (iter->tid > iter->head->b.core.tid) {
2658                 hts_log_error("Unsorted input. Pileup aborts");
2659                 iter->error = 1;
2660                 *_n_plp = -1;
2661                 return NULL;
2662             }
2663         }
2664         if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2665             iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2666         } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2667             iter->pos = iter->head->beg; // jump to the next position
2668         } else ++iter->pos; // scan contiguously
2669         // return
2670         if (n_plp) return iter->plp;
2671         if (iter->is_eof && iter->head == iter->tail) break;
2672     }
2673     return NULL;
2674 }
2675 
bam_plp_push(bam_plp_t iter,const bam1_t * b)2676 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2677 {
2678     if (iter->error) return -1;
2679     if (b) {
2680         if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2681         // Skip only unmapped reads here, any additional filtering must be done in iter->func
2682         if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2683         if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2684         {
2685             overlap_remove(iter, b);
2686             return 0;
2687         }
2688         bam_copy1(&iter->tail->b, b);
2689         overlap_push(iter, iter->tail);
2690 #ifndef BAM_NO_ID
2691         iter->tail->b.id = iter->id++;
2692 #endif
2693         iter->tail->beg = b->core.pos;
2694         iter->tail->end = bam_endpos(b);
2695         iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2696         if (b->core.tid < iter->max_tid) {
2697             hts_log_error("The input is not sorted (chromosomes out of order)");
2698             iter->error = 1;
2699             return -1;
2700         }
2701         if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2702             hts_log_error("The input is not sorted (reads out of order)");
2703             iter->error = 1;
2704             return -1;
2705         }
2706         iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2707         if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2708             if (iter->plp_construct)
2709                 iter->plp_construct(iter->data, b, &iter->tail->cd);
2710             iter->tail->next = mp_alloc(iter->mp);
2711             iter->tail = iter->tail->next;
2712         }
2713     } else iter->is_eof = 1;
2714     return 0;
2715 }
2716 
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)2717 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2718 {
2719     const bam_pileup1_t *plp;
2720     if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2721     if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2722     else { // no pileup line can be obtained; read alignments
2723         *_n_plp = 0;
2724         if (iter->is_eof) return 0;
2725         int ret;
2726         while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2727             if (bam_plp_push(iter, iter->b) < 0) {
2728                 *_n_plp = -1;
2729                 return 0;
2730             }
2731             if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2732             // otherwise no pileup line can be returned; read the next alignment.
2733         }
2734         if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2735         bam_plp_push(iter, 0);
2736         if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2737         return 0;
2738     }
2739 }
2740 
bam_plp_reset(bam_plp_t iter)2741 void bam_plp_reset(bam_plp_t iter)
2742 {
2743     overlap_remove(iter, NULL);
2744     iter->max_tid = iter->max_pos = -1;
2745     iter->tid = iter->pos = 0;
2746     iter->is_eof = 0;
2747     while (iter->head != iter->tail) {
2748         lbnode_t *p = iter->head;
2749         iter->head = p->next;
2750         mp_free(iter->mp, p);
2751     }
2752 }
2753 
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)2754 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2755 {
2756     iter->maxcnt = maxcnt;
2757 }
2758 
2759 /************************
2760  *** Mpileup iterator ***
2761  ************************/
2762 
2763 struct __bam_mplp_t {
2764     int n;
2765     uint64_t min, *pos;
2766     bam_plp_t *iter;
2767     int *n_plp;
2768     const bam_pileup1_t **plp;
2769 };
2770 
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)2771 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2772 {
2773     int i;
2774     bam_mplp_t iter;
2775     iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2776     iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2777     iter->n_plp = (int*)calloc(n, sizeof(int));
2778     iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2779     iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2780     iter->n = n;
2781     iter->min = (uint64_t)-1;
2782     for (i = 0; i < n; ++i) {
2783         iter->iter[i] = bam_plp_init(func, data[i]);
2784         iter->pos[i] = iter->min;
2785     }
2786     return iter;
2787 }
2788 
bam_mplp_init_overlaps(bam_mplp_t iter)2789 void bam_mplp_init_overlaps(bam_mplp_t iter)
2790 {
2791     int i;
2792     for (i = 0; i < iter->n; ++i)
2793         bam_plp_init_overlaps(iter->iter[i]);
2794 }
2795 
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)2796 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2797 {
2798     int i;
2799     for (i = 0; i < iter->n; ++i)
2800         iter->iter[i]->maxcnt = maxcnt;
2801 }
2802 
bam_mplp_destroy(bam_mplp_t iter)2803 void bam_mplp_destroy(bam_mplp_t iter)
2804 {
2805     int i;
2806     for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2807     free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2808     free(iter);
2809 }
2810 
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)2811 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2812 {
2813     int i, ret = 0;
2814     uint64_t new_min = (uint64_t)-1;
2815     for (i = 0; i < iter->n; ++i) {
2816         if (iter->pos[i] == iter->min) {
2817             int tid, pos;
2818             iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2819             if ( iter->iter[i]->error ) return -1;
2820             iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2821         }
2822         if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2823     }
2824     iter->min = new_min;
2825     if (new_min == (uint64_t)-1) return 0;
2826     *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2827     for (i = 0; i < iter->n; ++i) {
2828         if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2829             n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2830             ++ret;
2831         } else n_plp[i] = 0, plp[i] = 0;
2832     }
2833     return ret;
2834 }
2835 
bam_mplp_reset(bam_mplp_t iter)2836 void bam_mplp_reset(bam_mplp_t iter)
2837 {
2838     int i;
2839     iter->min = (uint64_t)-1;
2840     for (i = 0; i < iter->n; ++i) {
2841         bam_plp_reset(iter->iter[i]);
2842         iter->pos[i] = (uint64_t)-1;
2843         iter->n_plp[i] = 0;
2844         iter->plp[i] = NULL;
2845     }
2846 }
2847 
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2848 void bam_mplp_constructor(bam_mplp_t iter,
2849                           int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2850     int i;
2851     for (i = 0; i < iter->n; ++i)
2852         bam_plp_constructor(iter->iter[i], func);
2853 }
2854 
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))2855 void bam_mplp_destructor(bam_mplp_t iter,
2856                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2857     int i;
2858     for (i = 0; i < iter->n; ++i)
2859         bam_plp_destructor(iter->iter[i], func);
2860 }
2861 
2862 #endif // ~!defined(BAM_NO_PILEUP)
2863