1 /*  sam.c -- SAM and BAM file I/O and manipulation.
2 
3     Copyright (C) 2008-2010, 2012-2020 Genome Research Ltd.
4     Copyright (C) 2010, 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28 
29 #include <strings.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <zlib.h>
35 #include <assert.h>
36 #include <signal.h>
37 #include <inttypes.h>
38 #include <unistd.h>
39 
40 // Suppress deprecation message for cigar_tab, which we initialise
41 #include "htslib/hts_defs.h"
42 #undef HTS_DEPRECATED
43 #define HTS_DEPRECATED(message)
44 
45 #include "htslib/sam.h"
46 #include "htslib/bgzf.h"
47 #include "cram/cram.h"
48 #include "hts_internal.h"
49 #include "sam_internal.h"
50 #include "htslib/hfile.h"
51 #include "htslib/hts_endian.h"
52 #include "header.h"
53 
54 #include "htslib/khash.h"
55 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
56 
57 #ifndef EFTYPE
58 #define EFTYPE ENOEXEC
59 #endif
60 #ifndef EOVERFLOW
61 #define EOVERFLOW ERANGE
62 #endif
63 
64 /**********************
65  *** BAM header I/O ***
66  **********************/
67 
68 HTSLIB_EXPORT
69 const int8_t bam_cigar_table[256] = {
70     // 0 .. 47
71     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
72     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
73     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
74 
75     // 48 .. 63  (including =)
76     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
77 
78     // 64 .. 79  (including MIDNHB)
79     -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
80         BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
81 
82     // 80 .. 95  (including SPX)
83     BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
84         BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
85 
86     // 96 .. 127
87     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
88     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
89 
90     // 128 .. 255
91     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
92     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
93     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
97     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
99 };
100 
sam_hdr_init()101 sam_hdr_t *sam_hdr_init()
102 {
103     sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
104     if (bh == NULL) return NULL;
105 
106     bh->cigar_tab = bam_cigar_table;
107     return bh;
108 }
109 
sam_hdr_destroy(sam_hdr_t * bh)110 void sam_hdr_destroy(sam_hdr_t *bh)
111 {
112     int32_t i;
113 
114     if (bh == NULL) return;
115 
116     if (bh->ref_count > 0) {
117         --bh->ref_count;
118         return;
119     }
120 
121     if (bh->target_name) {
122         for (i = 0; i < bh->n_targets; ++i)
123             free(bh->target_name[i]);
124         free(bh->target_name);
125         free(bh->target_len);
126     }
127     free(bh->text);
128     if (bh->hrecs)
129         sam_hrecs_free(bh->hrecs);
130     if (bh->sdict)
131         kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
132     free(bh);
133 }
134 
135 // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
136 // references before sam_hdr_t::hrecs is populated
sam_hdr_dup_sdict(const sam_hdr_t * h0,sam_hdr_t * h)137 int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
138 {
139     const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
140     khash_t(s2i) *dest_long_refs = kh_init(s2i);
141     int i;
142     if (!dest_long_refs) return -1;
143 
144     for (i = 0; i < h->n_targets; i++) {
145         int ret;
146         khiter_t ksrc, kdest;
147         if (h->target_len[i] < UINT32_MAX) continue;
148         ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
149         if (ksrc == kh_end(src_long_refs)) continue;
150         kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
151         if (ret < 0) {
152             kh_destroy(s2i, dest_long_refs);
153             return -1;
154         }
155         kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
156     }
157 
158     h->sdict = dest_long_refs;
159     return 0;
160 }
161 
sam_hdr_dup(const sam_hdr_t * h0)162 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
163 {
164     if (h0 == NULL) return NULL;
165     sam_hdr_t *h;
166     if ((h = sam_hdr_init()) == NULL) return NULL;
167     // copy the simple data
168     h->n_targets = 0;
169     h->ignore_sam_err = h0->ignore_sam_err;
170     h->l_text = 0;
171 
172     // Then the pointery stuff
173 
174     if (!h0->hrecs) {
175         h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
176         if (!h->target_len) goto fail;
177         h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
178         if (!h->target_name) goto fail;
179 
180         int i;
181         for (i = 0; i < h0->n_targets; ++i) {
182             h->target_len[i] = h0->target_len[i];
183             h->target_name[i] = strdup(h0->target_name[i]);
184             if (!h->target_name[i]) break;
185         }
186         h->n_targets = i;
187         if (i < h0->n_targets) goto fail;
188 
189         if (h0->sdict) {
190             if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
191         }
192     }
193 
194     if (h0->hrecs) {
195         kstring_t tmp = { 0, 0, NULL };
196         if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
197             free(ks_release(&tmp));
198             goto fail;
199         }
200 
201         h->l_text = tmp.l;
202         h->text   = ks_release(&tmp);
203 
204         if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
205             goto fail;
206     } else {
207         h->l_text = h0->l_text;
208         h->text = malloc(h->l_text + 1);
209         if (!h->text) goto fail;
210         memcpy(h->text, h0->text, h->l_text);
211         h->text[h->l_text] = '\0';
212     }
213 
214     return h;
215 
216  fail:
217     sam_hdr_destroy(h);
218     return NULL;
219 }
220 
bam_hdr_read(BGZF * fp)221 sam_hdr_t *bam_hdr_read(BGZF *fp)
222 {
223     sam_hdr_t *h;
224     uint8_t buf[4];
225     int magic_len, has_EOF;
226     int32_t i, name_len, num_names = 0;
227     size_t bufsize;
228     ssize_t bytes;
229     // check EOF
230     has_EOF = bgzf_check_EOF(fp);
231     if (has_EOF < 0) {
232         perror("[W::bam_hdr_read] bgzf_check_EOF");
233     } else if (has_EOF == 0) {
234         hts_log_warning("EOF marker is absent. The input is probably truncated");
235     }
236     // read "BAM1"
237     magic_len = bgzf_read(fp, buf, 4);
238     if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
239         hts_log_error("Invalid BAM binary header");
240         return 0;
241     }
242     h = sam_hdr_init();
243     if (!h) goto nomem;
244 
245     // read plain text and the number of reference sequences
246     bytes = bgzf_read(fp, buf, 4);
247     if (bytes != 4) goto read_err;
248     h->l_text = le_to_u32(buf);
249 
250     bufsize = h->l_text + 1;
251     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
252     h->text = (char*)malloc(bufsize);
253     if (!h->text) goto nomem;
254     h->text[h->l_text] = 0; // make sure it is NULL terminated
255     bytes = bgzf_read(fp, h->text, h->l_text);
256     if (bytes != h->l_text) goto read_err;
257 
258     bytes = bgzf_read(fp, &h->n_targets, 4);
259     if (bytes != 4) goto read_err;
260     if (fp->is_be) ed_swap_4p(&h->n_targets);
261 
262     if (h->n_targets < 0) goto invalid;
263 
264     // read reference sequence names and lengths
265     if (h->n_targets > 0) {
266         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
267         if (!h->target_name) goto nomem;
268         h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
269         if (!h->target_len) goto nomem;
270     }
271     else {
272         h->target_name = NULL;
273         h->target_len = NULL;
274     }
275 
276     for (i = 0; i != h->n_targets; ++i) {
277         bytes = bgzf_read(fp, &name_len, 4);
278         if (bytes != 4) goto read_err;
279         if (fp->is_be) ed_swap_4p(&name_len);
280         if (name_len <= 0) goto invalid;
281 
282         h->target_name[i] = (char*)malloc(name_len);
283         if (!h->target_name[i]) goto nomem;
284         num_names++;
285 
286         bytes = bgzf_read(fp, h->target_name[i], name_len);
287         if (bytes != name_len) goto read_err;
288 
289         if (h->target_name[i][name_len - 1] != '\0') {
290             /* Fix missing NUL-termination.  Is this being too nice?
291                We could alternatively bail out with an error. */
292             char *new_name;
293             if (name_len == INT32_MAX) goto invalid;
294             new_name = realloc(h->target_name[i], name_len + 1);
295             if (new_name == NULL) goto nomem;
296             h->target_name[i] = new_name;
297             h->target_name[i][name_len] = '\0';
298         }
299 
300         bytes = bgzf_read(fp, &h->target_len[i], 4);
301         if (bytes != 4) goto read_err;
302         if (fp->is_be) ed_swap_4p(&h->target_len[i]);
303     }
304     return h;
305 
306  nomem:
307     hts_log_error("Out of memory");
308     goto clean;
309 
310  read_err:
311     if (bytes < 0) {
312         hts_log_error("Error reading BGZF stream");
313     } else {
314         hts_log_error("Truncated BAM header");
315     }
316     goto clean;
317 
318  invalid:
319     hts_log_error("Invalid BAM binary header");
320 
321  clean:
322     if (h != NULL) {
323         h->n_targets = num_names; // ensure we free only allocated target_names
324         sam_hdr_destroy(h);
325     }
326     return NULL;
327 }
328 
bam_hdr_write(BGZF * fp,const sam_hdr_t * h)329 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
330 {
331     int32_t i, name_len, x;
332     kstring_t hdr_ks = { 0, 0, NULL };
333     char *text;
334     uint32_t l_text;
335 
336     if (!h) return -1;
337 
338     if (h->hrecs) {
339         if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
340         if (hdr_ks.l > INT32_MAX) {
341             hts_log_error("Header too long for BAM format");
342             free(hdr_ks.s);
343             return -1;
344         }
345         text = hdr_ks.s;
346         l_text = hdr_ks.l;
347     } else {
348         if (h->l_text > INT32_MAX) {
349             hts_log_error("Header too long for BAM format");
350             return -1;
351         }
352         text = h->text;
353         l_text = h->l_text;
354     }
355     // write "BAM1"
356     if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
357     // write plain text and the number of reference sequences
358     if (fp->is_be) {
359         x = ed_swap_4(l_text);
360         if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
361         if (l_text) {
362             if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
363         }
364         x = ed_swap_4(h->n_targets);
365         if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
366     } else {
367         if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
368         if (l_text) {
369             if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
370         }
371         if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
372     }
373     free(hdr_ks.s);
374     // write sequence names and lengths
375     for (i = 0; i != h->n_targets; ++i) {
376         char *p = h->target_name[i];
377         name_len = strlen(p) + 1;
378         if (fp->is_be) {
379             x = ed_swap_4(name_len);
380             if (bgzf_write(fp, &x, 4) < 0) return -1;
381         } else {
382             if (bgzf_write(fp, &name_len, 4) < 0) return -1;
383         }
384         if (bgzf_write(fp, p, name_len) < 0) return -1;
385         if (fp->is_be) {
386             x = ed_swap_4(h->target_len[i]);
387             if (bgzf_write(fp, &x, 4) < 0) return -1;
388         } else {
389             if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
390         }
391     }
392     if (bgzf_flush(fp) < 0) return -1;
393     return 0;
394 }
395 
sam_parse_region(sam_hdr_t * h,const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,int flags)396 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
397                              hts_pos_t *beg, hts_pos_t *end, int flags) {
398     return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
399 }
400 
401 /*************************
402  *** BAM alignment I/O ***
403  *************************/
404 
bam_init1()405 bam1_t *bam_init1()
406 {
407     return (bam1_t*)calloc(1, sizeof(bam1_t));
408 }
409 
sam_realloc_bam_data(bam1_t * b,size_t desired)410 int sam_realloc_bam_data(bam1_t *b, size_t desired)
411 {
412     uint32_t new_m_data;
413     uint8_t *new_data;
414     new_m_data = desired;
415     kroundup32(new_m_data);
416     if (new_m_data < desired) {
417         errno = ENOMEM; // Not strictly true but we can't store the size
418         return -1;
419     }
420     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
421         new_data = realloc(b->data, new_m_data);
422     } else {
423         if ((new_data = malloc(new_m_data)) != NULL) {
424             if (b->l_data > 0)
425                 memcpy(new_data, b->data,
426                        b->l_data < b->m_data ? b->l_data : b->m_data);
427             bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
428         }
429     }
430     if (!new_data) return -1;
431     b->data = new_data;
432     b->m_data = new_m_data;
433     return 0;
434 }
435 
bam_destroy1(bam1_t * b)436 void bam_destroy1(bam1_t *b)
437 {
438     if (b == 0) return;
439     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
440         free(b->data);
441         if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
442             // In case of reuse
443             b->data = NULL;
444             b->m_data = 0;
445             b->l_data = 0;
446         }
447     }
448 
449     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
450         free(b);
451 }
452 
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)453 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
454 {
455     if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
456     memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
457     memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
458     bdst->l_data = bsrc->l_data;
459     bdst->id = bsrc->id;
460     return bdst;
461 }
462 
bam_dup1(const bam1_t * bsrc)463 bam1_t *bam_dup1(const bam1_t *bsrc)
464 {
465     if (bsrc == NULL) return NULL;
466     bam1_t *bdst = bam_init1();
467     if (bdst == NULL) return NULL;
468     if (bam_copy1(bdst, bsrc) == NULL) {
469         bam_destroy1(bdst);
470         return NULL;
471     }
472     return bdst;
473 }
474 
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,hts_pos_t * rlen,hts_pos_t * qlen)475 static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
476                              hts_pos_t *rlen, hts_pos_t *qlen)
477 {
478     int k;
479     *rlen = *qlen = 0;
480     for (k = 0; k < n_cigar; ++k) {
481         int type = bam_cigar_type(bam_cigar_op(cigar[k]));
482         int len = bam_cigar_oplen(cigar[k]);
483         if (type & 1) *qlen += len;
484         if (type & 2) *rlen += len;
485     }
486 }
487 
subtract_check_underflow(size_t length,size_t * limit)488 static int subtract_check_underflow(size_t length, size_t *limit)
489 {
490     if (length <= *limit) {
491         *limit -= length;
492         return 0;
493     }
494 
495     return -1;
496 }
497 
bam_set1(bam1_t * bam,size_t l_qname,const char * qname,uint16_t flag,int32_t tid,hts_pos_t pos,uint8_t mapq,size_t n_cigar,const uint32_t * cigar,int32_t mtid,hts_pos_t mpos,hts_pos_t isize,size_t l_seq,const char * seq,const char * qual,size_t l_aux)498 int bam_set1(bam1_t *bam,
499              size_t l_qname, const char *qname,
500              uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
501              size_t n_cigar, const uint32_t *cigar,
502              int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
503              size_t l_seq, const char *seq, const char *qual,
504              size_t l_aux)
505 {
506     // use a default qname "*" if none is provided
507     if (l_qname == 0) {
508         l_qname = 1;
509         qname = "*";
510     }
511 
512     // note: the qname is stored nul terminated and padded as described in the
513     // documentation for the bam1_t struct.
514     size_t qname_nuls = 4 - l_qname % 4;
515 
516     // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
517     // can't use bam_endpos() directly as some fields not yet set up.
518     hts_pos_t rlen = 0, qlen = 0;
519     if (!(flag & BAM_FUNMAP)) {
520         bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
521     }
522     if (rlen == 0) {
523         rlen = 1;
524     }
525 
526     // validate parameters
527     if (l_qname > 254) {
528         hts_log_error("Query name too long");
529         errno = EINVAL;
530         return -1;
531     }
532     if (HTS_POS_MAX - rlen <= pos) {
533         hts_log_error("Read ends beyond highest supported position");
534         errno = EINVAL;
535         return -1;
536     }
537     if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
538         hts_log_error("Mapped query must have a CIGAR");
539         errno = EINVAL;
540         return -1;
541     }
542     if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
543         hts_log_error("CIGAR and query sequence are of different length");
544         errno = EINVAL;
545         return -1;
546     }
547 
548     size_t limit = INT32_MAX;
549     int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
550     u    += subtract_check_underflow(n_cigar * 4, &limit);
551     u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
552     u    += subtract_check_underflow(l_seq, &limit);
553     u    += subtract_check_underflow(l_aux, &limit);
554     if (u != 0) {
555         hts_log_error("Size overflow");
556         errno = EINVAL;
557         return -1;
558     }
559 
560     // re-allocate the data buffer as needed.
561     size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
562     if (realloc_bam_data(bam, data_len + l_aux) < 0) {
563         return -1;
564     }
565 
566     bam->l_data = (int)data_len;
567     bam->core.pos = pos;
568     bam->core.tid = tid;
569     bam->core.bin = bam_reg2bin(pos, pos + rlen);
570     bam->core.qual = mapq;
571     bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
572     bam->core.flag = flag;
573     bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
574     bam->core.n_cigar = (uint32_t)n_cigar;
575     bam->core.l_qseq = (int32_t)l_seq;
576     bam->core.mtid = mtid;
577     bam->core.mpos = mpos;
578     bam->core.isize = isize;
579 
580     uint8_t *cp = bam->data;
581     strncpy((char *)cp, qname, l_qname);
582     int i;
583     for (i = 0; i < qname_nuls; i++) {
584         cp[l_qname + i] = '\0';
585     }
586     cp += l_qname + qname_nuls;
587 
588     if (n_cigar > 0) {
589         memcpy(cp, cigar, n_cigar * 4);
590     }
591     cp += n_cigar * 4;
592 
593     for (i = 0; i + 1 < l_seq; i += 2) {
594         *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]];
595     }
596     for (; i < l_seq; i++) {
597         *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
598     }
599 
600     if (qual) {
601         memcpy(cp, qual, l_seq);
602     }
603     else {
604         memset(cp, '\xff', l_seq);
605     }
606 
607     return (int)data_len;
608 }
609 
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)610 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
611 {
612     int k;
613     hts_pos_t l;
614     for (k = l = 0; k < n_cigar; ++k)
615         if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
616             l += bam_cigar_oplen(cigar[k]);
617     return l;
618 }
619 
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)620 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
621 {
622     int k;
623     hts_pos_t l;
624     for (k = l = 0; k < n_cigar; ++k)
625         if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
626             l += bam_cigar_oplen(cigar[k]);
627     return l;
628 }
629 
bam_endpos(const bam1_t * b)630 hts_pos_t bam_endpos(const bam1_t *b)
631 {
632     hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
633     if (rlen == 0) rlen = 1;
634     return b->core.pos + rlen;
635 }
636 
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)637 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
638 {
639     bam1_core_t *c = &b->core;
640     uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
641     uint8_t *CG;
642 
643     // test where there is a real CIGAR in the CG tag to move
644     if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
645     cigar0 = bam_get_cigar(b);
646     if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
647     fake_bytes = c->n_cigar * 4;
648     int saved_errno = errno;
649     CG = bam_aux_get(b, "CG");
650     if (!CG) {
651         if (errno != ENOENT) return -1;  // Bad aux data
652         errno = saved_errno; // restore errno on expected no-CG-tag case
653         return 0;
654     }
655     if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
656     CG_len = le_to_u32(CG + 2);
657     if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
658 
659     // move from the CG tag to the right position
660     cigar_st = (uint8_t*)cigar0 - b->data;
661     c->n_cigar = CG_len;
662     n_cigar4 = c->n_cigar * 4;
663     CG_st = CG - b->data - 2;
664     CG_en = CG_st + 8 + n_cigar4;
665     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
666     b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
667     memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
668     memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
669     if (ori_len > CG_en) // move data after the CG tag
670         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
671     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
672     if (recal_bin)
673         b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
674     if (give_warning)
675         hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
676     return 1;
677 }
678 
aux_type2size(uint8_t type)679 static inline int aux_type2size(uint8_t type)
680 {
681     switch (type) {
682     case 'A': case 'c': case 'C':
683         return 1;
684     case 's': case 'S':
685         return 2;
686     case 'i': case 'I': case 'f':
687         return 4;
688     case 'd':
689         return 8;
690     case 'Z': case 'H': case 'B':
691         return type;
692     default:
693         return 0;
694     }
695 }
696 
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)697 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
698 {
699     uint32_t *cigar = (uint32_t*)(data + c->l_qname);
700     uint32_t i;
701     for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
702 }
703 
704 // Fix bad records where qname is not terminated correctly.
fixup_missing_qname_nul(bam1_t * b)705 static int fixup_missing_qname_nul(bam1_t *b) {
706     bam1_core_t *c = &b->core;
707 
708     // Note this is called before c->l_extranul is added to c->l_qname
709     if (c->l_extranul > 0) {
710         b->data[c->l_qname++] = '\0';
711         c->l_extranul--;
712     } else {
713         if (b->l_data > INT_MAX - 4) return -1;
714         if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
715         b->l_data += 4;
716         b->data[c->l_qname++] = '\0';
717         c->l_extranul = 3;
718     }
719     return 0;
720 }
721 
722 /*
723  * Note a second interface that returns a bam pointer instead would avoid bam_copy1
724  * in multi-threaded handling.  This may be worth considering for htslib2.
725  */
bam_read1(BGZF * fp,bam1_t * b)726 int bam_read1(BGZF *fp, bam1_t *b)
727 {
728     bam1_core_t *c = &b->core;
729     int32_t block_len, ret, i;
730     uint32_t x[8], new_l_data;
731 
732     b->l_data = 0;
733 
734     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
735         if (ret == 0) return -1; // normal end-of-file
736         else return -2; // truncated
737     }
738     if (fp->is_be)
739         ed_swap_4p(&block_len);
740     if (block_len < 32) return -4;  // block_len includes core data
741     if (bgzf_read(fp, x, 32) != 32) return -3;
742     if (fp->is_be) {
743         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
744     }
745     c->tid = x[0]; c->pos = (int32_t)x[1];
746     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
747     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
748     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
749     c->l_qseq = x[4];
750     c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
751 
752     new_l_data = block_len - 32 + c->l_extranul;
753     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
754     if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
755         + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
756         return -4;
757     if (realloc_bam_data(b, new_l_data) < 0) return -4;
758     b->l_data = new_l_data;
759 
760     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
761     if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
762         if (fixup_missing_qname_nul(b) < 0) return -4;
763     }
764     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
765     c->l_qname += c->l_extranul;
766     if (b->l_data < c->l_qname ||
767         bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
768         return -4;
769     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
770     if (bam_tag2cigar(b, 0, 0) < 0)
771         return -4;
772 
773     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
774         hts_pos_t rlen, qlen;
775         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
776         if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
777         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
778         // Sanity check for broken CIGAR alignments
779         if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
780             hts_log_error("CIGAR and query sequence lengths differ for %s",
781                     bam_get_qname(b));
782             return -4;
783         }
784     }
785 
786     return 4 + block_len;
787 }
788 
bam_write1(BGZF * fp,const bam1_t * b)789 int bam_write1(BGZF *fp, const bam1_t *b)
790 {
791     const bam1_core_t *c = &b->core;
792     uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
793     int i, ok;
794     if (c->l_qname - c->l_extranul > 255) {
795         hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
796         errno = EOVERFLOW;
797         return -1;
798     }
799     if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
800     if (c->pos > INT_MAX ||
801         c->mpos > INT_MAX ||
802         c->isize < INT_MIN || c->isize > INT_MAX) {
803         hts_log_error("Positional data is too large for BAM format");
804         return -1;
805     }
806     x[0] = c->tid;
807     x[1] = c->pos;
808     x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
809     if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
810     else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
811     x[4] = c->l_qseq;
812     x[5] = c->mtid;
813     x[6] = c->mpos;
814     x[7] = c->isize;
815     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
816     if (fp->is_be) {
817         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
818         y = block_len;
819         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
820         swap_data(c, b->l_data, b->data, 1);
821     } else {
822         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
823     }
824     if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
825     if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
826     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
827         if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
828     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
829         uint8_t buf[8];
830         uint32_t cigar_st, cigar_en, cigar[2];
831         hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
832         if (cigreflen >= (1<<28)) {
833             // Length of reference covered is greater than the biggest
834             // CIGAR operation currently allowed.
835             hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
836                           " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
837                           bam_get_qname(b), c->n_cigar, cigreflen);
838             return -1;
839         }
840         cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
841         cigar_en = cigar_st + c->n_cigar * 4;
842         cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
843         cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
844         u32_to_le(cigar[0], buf);
845         u32_to_le(cigar[1], buf + 4);
846         if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
847         if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
848         if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
849         u32_to_le(c->n_cigar, buf);
850         if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
851         if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
852     }
853     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
854     return ok? 4 + block_len : -1;
855 }
856 
857 /*
858  * Write a BAM file and append to the in-memory index simultaneously.
859  */
bam_write_idx1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)860 static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
861     BGZF *bfp = fp->fp.bgzf;
862 
863     if (!fp->idx)
864         return bam_write1(bfp, b);
865 
866     uint32_t block_len = b->l_data - b->core.l_extranul + 32;
867     if (bgzf_flush_try(bfp, 4 + block_len) < 0)
868         return -1;
869     if (!bfp->mt)
870         hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
871     else
872         bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
873 
874     int ret = bam_write1(bfp, b);
875     if (ret < 0)
876         return -1;
877 
878     if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
879         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
880                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
881         ret = -1;
882     }
883 
884     return ret;
885 }
886 
887 /*
888  * Set the qname in a BAM record
889  */
bam_set_qname(bam1_t * rec,const char * qname)890 int bam_set_qname(bam1_t *rec, const char *qname)
891 {
892     if (!rec) return -1;
893     if (!qname || !*qname) return -1;
894 
895     size_t old_len = rec->core.l_qname;
896     size_t new_len = strlen(qname) + 1;
897     if (new_len < 1 || new_len > 255) return -1;
898 
899     int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
900 
901     size_t new_data_len = rec->l_data - old_len + new_len + extranul;
902     if (realloc_bam_data(rec, new_data_len) < 0) return -1;
903 
904     // Make room
905     if (new_len + extranul != rec->core.l_qname)
906         memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
907     // Copy in new name and pad if needed
908     memcpy(rec->data, qname, new_len);
909     int n;
910     for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
911 
912     rec->l_data = new_data_len;
913     rec->core.l_qname = new_len + extranul;
914     rec->core.l_extranul = extranul;
915 
916     return 0;
917 }
918 
919 /********************
920  *** BAM indexing ***
921  ********************/
922 
sam_index(htsFile * fp,int min_shift)923 static hts_idx_t *sam_index(htsFile *fp, int min_shift)
924 {
925     int n_lvls, i, fmt, ret;
926     bam1_t *b;
927     hts_idx_t *idx;
928     sam_hdr_t *h;
929     h = sam_hdr_read(fp);
930     if (h == NULL) return NULL;
931     if (min_shift > 0) {
932         hts_pos_t max_len = 0, s;
933         for (i = 0; i < h->n_targets; ++i) {
934             hts_pos_t len = sam_hdr_tid2len(h, i);
935             if (max_len < len) max_len = len;
936         }
937         max_len += 256;
938         for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
939         fmt = HTS_FMT_CSI;
940     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
941     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
942     b = bam_init1();
943     while ((ret = sam_read1(fp, h, b)) >= 0) {
944         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
945         if (ret < 0) { // unsorted or doesn't fit
946             hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947             goto err;
948         }
949     }
950     if (ret < -1) goto err; // corrupted BAM file
951 
952     hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
953     sam_hdr_destroy(h);
954     bam_destroy1(b);
955     return idx;
956 
957 err:
958     bam_destroy1(b);
959     hts_idx_destroy(idx);
960     return NULL;
961 }
962 
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)963 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
964 {
965     hts_idx_t *idx;
966     htsFile *fp;
967     int ret = 0;
968 
969     if ((fp = hts_open(fn, "r")) == 0) return -2;
970     if (nthreads)
971         hts_set_threads(fp, nthreads);
972 
973     switch (fp->format.format) {
974     case cram:
975 
976         ret = cram_index_build(fp->fp.cram, fn, fnidx);
977         break;
978 
979     case bam:
980     case sam:
981         if (fp->format.compression != bgzf) {
982             hts_log_error("%s file \"%s\" not BGZF compressed",
983                           fp->format.format == bam ? "BAM" : "SAM", fn);
984             ret = -1;
985             break;
986         }
987         idx = sam_index(fp, min_shift);
988         if (idx) {
989             ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
990             if (ret < 0) ret = -4;
991             hts_idx_destroy(idx);
992         }
993         else ret = -1;
994         break;
995 
996     default:
997         ret = -3;
998         break;
999     }
1000     hts_close(fp);
1001 
1002     return ret;
1003 }
1004 
sam_index_build2(const char * fn,const char * fnidx,int min_shift)1005 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1006 {
1007     return sam_index_build3(fn, fnidx, min_shift, 0);
1008 }
1009 
sam_index_build(const char * fn,int min_shift)1010 int sam_index_build(const char *fn, int min_shift)
1011 {
1012     return sam_index_build3(fn, NULL, min_shift, 0);
1013 }
1014 
1015 // Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1016 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)1017 int bam_index_build(const char *fn, int min_shift)
1018 {
1019     return sam_index_build2(fn, NULL, min_shift);
1020 }
1021 
1022 // Initialise fp->idx for the current format type.
1023 // This must be called after the header has been written but no other data.
sam_idx_init(htsFile * fp,sam_hdr_t * h,int min_shift,const char * fnidx)1024 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1025     fp->fnidx = fnidx;
1026     if (fp->format.format == bam || fp->format.format == bcf ||
1027         (fp->format.format == sam && fp->format.compression == bgzf)) {
1028         int n_lvls, fmt = HTS_FMT_CSI;
1029         if (min_shift > 0) {
1030             int64_t max_len = 0, s;
1031             int i;
1032             for (i = 0; i < h->n_targets; ++i)
1033                 if (max_len < h->target_len[i]) max_len = h->target_len[i];
1034             max_len += 256;
1035             for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1036 
1037         } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1038 
1039         fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1040         return fp->idx ? 0 : -1;
1041     }
1042 
1043     if (fp->format.format == cram) {
1044         fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1045         return fp->fp.cram->idxfp ? 0 : -1;
1046     }
1047 
1048     return -1;
1049 }
1050 
1051 // Finishes an index. Call after the last record has been written.
1052 // Returns 0 on success, <0 on failure.
sam_idx_save(htsFile * fp)1053 int sam_idx_save(htsFile *fp) {
1054     if (fp->format.format == bam || fp->format.format == bcf ||
1055         fp->format.format == vcf || fp->format.format == sam) {
1056         int ret;
1057         if ((ret = sam_state_destroy(fp)) < 0) {
1058             errno = -ret;
1059             return -1;
1060         }
1061         if (bgzf_flush(fp->fp.bgzf) < 0)
1062             return -1;
1063         hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1064 
1065         if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1066             return -1;
1067 
1068         return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1069 
1070     } else if (fp->format.format == cram) {
1071         // flushed and closed by cram_close
1072     }
1073 
1074     return 0;
1075 }
1076 
sam_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1077 static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1078 {
1079     htsFile *fp = (htsFile *)fpv;
1080     bam1_t *b = bv;
1081     fp->line.l = 0;
1082     int ret = sam_read1(fp, fp->bam_header, b);
1083     if (ret >= 0) {
1084         *tid = b->core.tid;
1085         *beg = b->core.pos;
1086         *end = bam_endpos(b);
1087     }
1088     return ret;
1089 }
1090 
1091 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_readrec_rest(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1092 static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1093 {
1094     htsFile *fp = (htsFile *)fpv;
1095     bam1_t *b = bv;
1096     fp->line.l = 0;
1097     int ret = sam_read1(fp, fp->bam_header, b);
1098     return ret;
1099 }
1100 
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1101 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1102 {
1103     htsFile *fp = fpv;
1104     bam1_t *b = bv;
1105     int ret = cram_get_bam_seq(fp->fp.cram, &b);
1106     if (ret < 0)
1107         return cram_eof(fp->fp.cram) ? -1 : -2;
1108 
1109     if (bam_tag2cigar(b, 1, 1) < 0)
1110         return -2;
1111 
1112     *tid = b->core.tid;
1113     *beg = b->core.pos;
1114     *end = bam_endpos(b);
1115 
1116     return ret;
1117 }
1118 
cram_pseek(void * fp,int64_t offset,int whence)1119 static int cram_pseek(void *fp, int64_t offset, int whence)
1120 {
1121     cram_fd *fd =  (cram_fd *)fp;
1122 
1123     if ((0 != cram_seek(fd, offset, SEEK_SET))
1124      && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1125         return -1;
1126 
1127     fd->curr_position = offset;
1128 
1129     if (fd->ctr) {
1130         cram_free_container(fd->ctr);
1131         if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1132             cram_free_container(fd->ctr_mt);
1133 
1134         fd->ctr = NULL;
1135         fd->ctr_mt = NULL;
1136         fd->ooc = 0;
1137     }
1138 
1139     return 0;
1140 }
1141 
1142 /*
1143  * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1144  *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1145  *   container previously fetched. It was designed like this to integrate with the functionality
1146  *   of the iterator stepping logic.
1147  */
1148 
cram_ptell(void * fp)1149 static int64_t cram_ptell(void *fp)
1150 {
1151     cram_fd *fd = (cram_fd *)fp;
1152     cram_container *c;
1153     cram_slice *s;
1154     int64_t ret = -1L;
1155 
1156     if (fd) {
1157         if ((c = fd->ctr) != NULL) {
1158             if ((s = c->slice) != NULL && s->max_rec) {
1159                 if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1160                     fd->curr_position += c->offset + c->length;
1161             }
1162         }
1163         ret = fd->curr_position;
1164     }
1165 
1166     return ret;
1167 }
1168 
bam_pseek(void * fp,int64_t offset,int whence)1169 static int bam_pseek(void *fp, int64_t offset, int whence)
1170 {
1171     BGZF *fd = (BGZF *)fp;
1172 
1173     return bgzf_seek(fd, offset, whence);
1174 }
1175 
bam_ptell(void * fp)1176 static int64_t bam_ptell(void *fp)
1177 {
1178     BGZF *fd = (BGZF *)fp;
1179     if (!fd)
1180         return -1L;
1181 
1182     return bgzf_tell(fd);
1183 }
1184 
1185 
1186 
index_load(htsFile * fp,const char * fn,const char * fnidx,int flags)1187 static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1188 {
1189     switch (fp->format.format) {
1190     case bam:
1191     case sam:
1192         return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1193 
1194     case cram: {
1195         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1196 
1197         // Cons up a fake "index" just pointing at the associated cram_fd:
1198         hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1199         if (idx == NULL) return NULL;
1200         idx->fmt = HTS_FMT_CRAI;
1201         idx->cram = fp->fp.cram;
1202         return (hts_idx_t *) idx;
1203         }
1204 
1205     default:
1206         return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1207     }
1208 }
1209 
sam_index_load3(htsFile * fp,const char * fn,const char * fnidx,int flags)1210 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1211 {
1212     return index_load(fp, fn, fnidx, flags);
1213 }
1214 
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)1215 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1216     return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1217 }
1218 
sam_index_load(htsFile * fp,const char * fn)1219 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1220 {
1221     return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1222 }
1223 
cram_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)1224 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1225 {
1226     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1227     hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1228     if (iter == NULL) return NULL;
1229 
1230     // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1231     // the readrec function:
1232     iter->is_cram = 1;
1233     iter->read_rest = 1;
1234     iter->off = NULL;
1235     iter->bins.a = NULL;
1236     iter->readrec = readrec;
1237 
1238     if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1239         cram_range r = { tid, beg+1, end };
1240         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1241 
1242         iter->curr_off = 0;
1243         // The following fields are not required by hts_itr_next(), but are
1244         // filled in in case user code wants to look at them.
1245         iter->tid = tid;
1246         iter->beg = beg;
1247         iter->end = end;
1248 
1249         switch (ret) {
1250         case 0:
1251             break;
1252 
1253         case -2:
1254             // No data vs this ref, so mark iterator as completed.
1255             // Same as HTS_IDX_NONE.
1256             iter->finished = 1;
1257             break;
1258 
1259         default:
1260             free(iter);
1261             return NULL;
1262         }
1263     }
1264     else switch (tid) {
1265     case HTS_IDX_REST:
1266         iter->curr_off = 0;
1267         break;
1268     case HTS_IDX_NONE:
1269         iter->curr_off = 0;
1270         iter->finished = 1;
1271         break;
1272     default:
1273         hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1274         abort();
1275         break;
1276     }
1277 
1278     return iter;
1279 }
1280 
sam_itr_queryi(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1281 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1282 {
1283     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1284     if (idx == NULL)
1285         return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1286     else if (cidx->fmt == HTS_FMT_CRAI)
1287         return cram_itr_query(idx, tid, beg, end, sam_readrec);
1288     else
1289         return hts_itr_query(idx, tid, beg, end, sam_readrec);
1290 }
1291 
cram_name2id(void * fdv,const char * ref)1292 static int cram_name2id(void *fdv, const char *ref)
1293 {
1294     cram_fd *fd = (cram_fd *) fdv;
1295     return sam_hdr_name2tid(fd->header, ref);
1296 }
1297 
sam_itr_querys(const hts_idx_t * idx,sam_hdr_t * hdr,const char * region)1298 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1299 {
1300     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1301     return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1302                           cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1303                           sam_readrec);
1304 }
1305 
sam_itr_regarray(const hts_idx_t * idx,sam_hdr_t * hdr,char ** regarray,unsigned int regcount)1306 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1307 {
1308     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1309     hts_reglist_t *r_list = NULL;
1310     int r_count = 0;
1311 
1312     if (!cidx || !hdr)
1313         return NULL;
1314 
1315     hts_itr_t *itr = NULL;
1316     if (cidx->fmt == HTS_FMT_CRAI) {
1317         r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1318         if (!r_list)
1319             return NULL;
1320         itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1321                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1322     } else {
1323         r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1324         if (!r_list)
1325             return NULL;
1326         itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1327                    hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1328     }
1329 
1330     if (!itr)
1331         hts_reglist_free(r_list, r_count);
1332 
1333     return itr;
1334 }
1335 
sam_itr_regions(const hts_idx_t * idx,sam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)1336 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1337 {
1338     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1339 
1340     if(!cidx || !hdr || !reglist)
1341         return NULL;
1342 
1343     if (cidx->fmt == HTS_FMT_CRAI)
1344         return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1345                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1346     else
1347         return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1348                    hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1349 }
1350 
1351 /**********************
1352  *** SAM header I/O ***
1353  **********************/
1354 
1355 #include "htslib/kseq.h"
1356 #include "htslib/kstring.h"
1357 
sam_hdr_parse(size_t l_text,const char * text)1358 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1359 {
1360     sam_hdr_t *bh = sam_hdr_init();
1361     if (!bh) return NULL;
1362 
1363     if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1364         sam_hdr_destroy(bh);
1365         return NULL;
1366     }
1367 
1368     return bh;
1369 }
1370 
1371 // Minimal sanitisation of a header to ensure.
1372 // - null terminated string.
1373 // - all lines start with @ (also implies no blank lines).
1374 //
1375 // Much more could be done, but currently is not, including:
1376 // - checking header types are known (HD, SQ, etc).
1377 // - syntax (eg checking tab separated fields).
1378 // - validating n_targets matches @SQ records.
1379 // - validating target lengths against @SQ records.
sam_hdr_sanitise(sam_hdr_t * h)1380 static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1381     if (!h)
1382         return NULL;
1383 
1384     // Special case for empty headers.
1385     if (h->l_text == 0)
1386         return h;
1387 
1388     size_t i;
1389     unsigned int lnum = 0;
1390     char *cp = h->text, last = '\n';
1391     for (i = 0; i < h->l_text; i++) {
1392         // NB: l_text excludes terminating nul.  This finds early ones.
1393         if (cp[i] == 0)
1394             break;
1395 
1396         // Error on \n[^@], including duplicate newlines
1397         if (last == '\n') {
1398             lnum++;
1399             if (cp[i] != '@') {
1400                 hts_log_error("Malformed SAM header at line %u", lnum);
1401                 sam_hdr_destroy(h);
1402                 return NULL;
1403             }
1404         }
1405 
1406         last = cp[i];
1407     }
1408 
1409     if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1410         size_t j = i;
1411         while (j < h->l_text && cp[j] == '\0') j++;
1412         if (j < h->l_text)
1413             hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1414     }
1415 
1416     // Add trailing newline and/or trailing nul if required.
1417     if (last != '\n') {
1418         hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1419 
1420         if (h->l_text < 2 || i >= h->l_text - 2) {
1421             if (h->l_text >= SIZE_MAX - 2) {
1422                 hts_log_error("No room for extra newline");
1423                 sam_hdr_destroy(h);
1424                 return NULL;
1425             }
1426 
1427             cp = realloc(h->text, (size_t) h->l_text+2);
1428             if (!cp) {
1429                 sam_hdr_destroy(h);
1430                 return NULL;
1431             }
1432             h->text = cp;
1433         }
1434         cp[i++] = '\n';
1435 
1436         // l_text may be larger already due to multiple nul padding
1437         if (h->l_text < i)
1438             h->l_text = i;
1439         cp[h->l_text] = '\0';
1440     }
1441 
1442     return h;
1443 }
1444 
sam_hdr_create(htsFile * fp)1445 static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1446     kstring_t str = { 0, 0, NULL };
1447     khint_t k;
1448     sam_hdr_t* h = sam_hdr_init();
1449     const char *q, *r;
1450     char* sn = NULL;
1451     khash_t(s2i) *d = kh_init(s2i);
1452     khash_t(s2i) *long_refs = NULL;
1453     if (!h || !d)
1454         goto error;
1455 
1456     int ret, has_SQ = 0;
1457     int next_c = '@';
1458     while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1459         if (fp->line.s[0] != '@')
1460             break;
1461 
1462         if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1463             has_SQ = 1;
1464             hts_pos_t ln = -1;
1465             for (q = fp->line.s + 4;; ++q) {
1466                 if (strncmp(q, "SN:", 3) == 0) {
1467                     q += 3;
1468                     for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1469 
1470                     if (sn) {
1471                         hts_log_warning("SQ header line has more than one SN: tag");
1472                         free(sn);
1473                     }
1474                     sn = (char*)calloc(r - q + 1, 1);
1475                     if (!sn)
1476                         goto error;
1477 
1478                     strncpy(sn, q, r - q);
1479                     q = r;
1480                 } else {
1481                     if (strncmp(q, "LN:", 3) == 0)
1482                         ln = strtoll(q + 3, (char**)&q, 10);
1483                 }
1484 
1485                 while (*q != '\t' && *q != '\n' && *q != '\0')
1486                     ++q;
1487                 if (*q == '\0' || *q == '\n')
1488                     break;
1489             }
1490             if (sn) {
1491                 if (ln >= 0) {
1492                     int absent;
1493                     k = kh_put(s2i, d, sn, &absent);
1494                     if (absent < 0)
1495                         goto error;
1496 
1497                     if (!absent) {
1498                         hts_log_warning("Duplicated sequence '%s'", sn);
1499                         free(sn);
1500                     } else {
1501                         if (ln >= UINT32_MAX) {
1502                             // Stash away ref length that
1503                             // doesn't fit in target_len array
1504                             int k2;
1505                             if (!long_refs) {
1506                                 long_refs = kh_init(s2i);
1507                                 if (!long_refs)
1508                                     goto error;
1509                             }
1510                             k2 = kh_put(s2i, long_refs, sn, &absent);
1511                             if (absent < 0)
1512                                 goto error;
1513                             kh_val(long_refs, k2) = ln;
1514                             kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1515                                             | UINT32_MAX);
1516                         } else {
1517                             kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1518                         }
1519                     }
1520                 } else {
1521                     hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1522                     free(sn);
1523                 }
1524             } else {
1525                 hts_log_warning("Ignored @SQ line with missing SN: tag");
1526             }
1527             sn = NULL;
1528         }
1529         if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1530             goto error;
1531 
1532         if (kputc('\n', &str) < 0)
1533             goto error;
1534 
1535         if (fp->is_bgzf) {
1536             next_c = bgzf_peek(fp->fp.bgzf);
1537         } else {
1538             unsigned char nc;
1539             ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1540             next_c = pret > 0 ? nc : pret - 1;
1541         }
1542         if (next_c < -1)
1543             goto error;
1544     }
1545     if (next_c != '@')
1546         fp->line.l = 0;
1547 
1548     if (ret < -1)
1549         goto error;
1550 
1551     if (!has_SQ && fp->fn_aux) {
1552         kstring_t line = { 0, 0, NULL };
1553 
1554         /* The reference index (.fai) is actually needed here */
1555         char *fai_fn = fp->fn_aux;
1556         char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1557         if (fn_delim)
1558             fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1559 
1560         hFILE* f = hopen(fai_fn, "r");
1561         int e = 0, absent;
1562         if (f == NULL)
1563             goto error;
1564 
1565         while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
1566             char* tab = strchr(line.s, '\t');
1567             hts_pos_t ln;
1568 
1569             if (tab == NULL)
1570                 continue;
1571 
1572             sn = (char*)calloc(tab-line.s+1, 1);
1573             if (!sn)
1574                 break;
1575             memcpy(sn, line.s, tab-line.s);
1576             k = kh_put(s2i, d, sn, &absent);
1577             if (absent < 0)
1578                 break;
1579 
1580             ln = strtoll(tab, NULL, 10);
1581 
1582             if (!absent) {
1583                 hts_log_warning("Duplicated sequence '%s'", sn);
1584                 free(sn);
1585             } else {
1586                 if (ln >= UINT32_MAX) {
1587                     // Stash away ref length that
1588                     // doesn't fit in target_len array
1589                     khint_t k2;
1590                     int absent = -1;
1591                     if (!long_refs) {
1592                         long_refs = kh_init(s2i);
1593                         if (!long_refs)
1594                             goto error;
1595                     }
1596                     k2 = kh_put(s2i, long_refs, sn, &absent);
1597                     if (absent < 0)
1598                         goto error;
1599                     kh_val(long_refs, k2) = ln;
1600                     kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1601                                     | UINT32_MAX);
1602                 } else {
1603                     kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1604                 }
1605                 has_SQ = 1;
1606             }
1607 
1608             e |= kputs("@SQ\tSN:", &str) < 0;
1609             e |= kputsn(line.s, tab - line.s, &str) < 0;
1610             e |= kputs("\tLN:", &str) < 0;
1611             e |= kputll(ln, &str) < 0;
1612             e |= kputc('\n', &str) < 0;
1613             if (e)
1614                 break;
1615         }
1616 
1617         ks_free(&line);
1618         if (hclose(f) != 0) {
1619             hts_log_error("Error on closing %s", fai_fn);
1620             e = 1;
1621         }
1622         if (e)
1623             goto error;
1624     }
1625 
1626     if (has_SQ) {
1627         // Populate the targets array
1628         h->n_targets = kh_size(d);
1629 
1630         h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
1631         if (!h->target_name) {
1632             h->n_targets = 0;
1633             goto error;
1634         }
1635 
1636         h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
1637         if (!h->target_len) {
1638             h->n_targets = 0;
1639             goto error;
1640         }
1641 
1642         for (k = kh_begin(d); k != kh_end(d); ++k) {
1643             if (!kh_exist(d, k))
1644                 continue;
1645 
1646             h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
1647             h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
1648             kh_val(d, k) >>= 32;
1649         }
1650     }
1651 
1652     // Repurpose sdict to hold any references longer than UINT32_MAX
1653     h->sdict = long_refs;
1654 
1655     kh_destroy(s2i, d);
1656 
1657     if (str.l == 0)
1658         kputsn("", 0, &str);
1659     h->l_text = str.l;
1660     h->text = ks_release(&str);
1661     fp->bam_header = sam_hdr_sanitise(h);
1662     fp->bam_header->ref_count = 1;
1663 
1664     return fp->bam_header;
1665 
1666  error:
1667     if (h && d && (!h->target_name || !h->target_len)) {
1668         for (k = kh_begin(d); k != kh_end(d); ++k)
1669             if (kh_exist(d, k)) free((void *)kh_key(d, k));
1670     }
1671     sam_hdr_destroy(h);
1672     ks_free(&str);
1673     kh_destroy(s2i, d);
1674     kh_destroy(s2i, long_refs);
1675     if (sn) free(sn);
1676     return NULL;
1677 }
1678 
sam_hdr_read(htsFile * fp)1679 sam_hdr_t *sam_hdr_read(htsFile *fp)
1680 {
1681     if (!fp) {
1682         errno = EINVAL;
1683         return NULL;
1684     }
1685 
1686     switch (fp->format.format) {
1687     case bam:
1688         return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1689 
1690     case cram:
1691         return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1692 
1693     case sam:
1694         return sam_hdr_create(fp);
1695 
1696     case empty_format:
1697         errno = EPIPE;
1698         return NULL;
1699 
1700     default:
1701         errno = EFTYPE;
1702         return NULL;
1703     }
1704 }
1705 
sam_hdr_write(htsFile * fp,const sam_hdr_t * h)1706 int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1707 {
1708     if (!fp || !h) {
1709         errno = EINVAL;
1710         return -1;
1711     }
1712 
1713     if (!h->hrecs && !h->text)
1714         return 0;
1715 
1716     switch (fp->format.format) {
1717     case binary_format:
1718         fp->format.category = sequence_data;
1719         fp->format.format = bam;
1720         /* fall-through */
1721     case bam:
1722         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1723         break;
1724 
1725     case cram: {
1726         cram_fd *fd = fp->fp.cram;
1727         if (cram_set_header2(fd, h) < 0) return -1;
1728         if (fp->fn_aux)
1729             cram_load_reference(fd, fp->fn_aux);
1730         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1731         }
1732         break;
1733 
1734     case text_format:
1735         fp->format.category = sequence_data;
1736         fp->format.format = sam;
1737         /* fall-through */
1738     case sam: {
1739         char *text;
1740         kstring_t hdr_ks = { 0, 0, NULL };
1741         size_t l_text;
1742         ssize_t bytes;
1743         int r = 0, no_sq = 0;
1744 
1745         if (h->hrecs) {
1746             if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
1747                 return -1;
1748             text = hdr_ks.s;
1749             l_text = hdr_ks.l;
1750         } else {
1751             const char *p = NULL;
1752             do {
1753                 const char *q = p == NULL ? h->text : p + 4;
1754                 p = strstr(q, "@SQ\t");
1755             } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
1756             no_sq = p == NULL;
1757             text = h->text;
1758             l_text = h->l_text;
1759         }
1760 
1761         if (fp->is_bgzf) {
1762             bytes = bgzf_write(fp->fp.bgzf, text, l_text);
1763         } else {
1764             bytes = hwrite(fp->fp.hfile, text, l_text);
1765         }
1766         free(hdr_ks.s);
1767         if (bytes != l_text)
1768             return -1;
1769 
1770         if (no_sq) {
1771             int i;
1772             for (i = 0; i < h->n_targets; ++i) {
1773                 fp->line.l = 0;
1774                 r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
1775                 r |= kputs(h->target_name[i], &fp->line) < 0;
1776                 r |= kputsn("\tLN:", 4, &fp->line) < 0;
1777                 r |= kputw(h->target_len[i], &fp->line) < 0;
1778                 r |= kputc('\n', &fp->line) < 0;
1779                 if (r != 0)
1780                     return -1;
1781 
1782                 if (fp->is_bgzf) {
1783                     bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
1784                 } else {
1785                     bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
1786                 }
1787                 if (bytes != fp->line.l)
1788                     return -1;
1789             }
1790         }
1791         if (fp->is_bgzf) {
1792             if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
1793         } else {
1794             if (hflush(fp->fp.hfile) != 0) return -1;
1795         }
1796         }
1797         break;
1798 
1799     default:
1800         errno = EBADF;
1801         return -1;
1802     }
1803     return 0;
1804 }
1805 
old_sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)1806 static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
1807 {
1808     char *p, *q, *beg = NULL, *end = NULL, *newtext;
1809     size_t new_l_text;
1810     if (!h || !key)
1811         return -1;
1812 
1813     if (h->l_text > 3) {
1814         if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1815             if ((p = strchr(h->text, '\n')) == 0) return -1;
1816             *p = '\0'; // for strstr call
1817 
1818             char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1819 
1820             if ((q = strstr(h->text, tmp)) != 0) { // key exists
1821                 *p = '\n'; // change back
1822 
1823                 // mark the key:val
1824                 beg = q;
1825                 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1826                 end = q;
1827 
1828                 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1829                     && strlen(val) == end - beg - 4)
1830                      return 0; // val is the same, no need to change
1831 
1832             } else {
1833                 beg = end = p;
1834                 *p = '\n';
1835             }
1836         }
1837     }
1838     if (beg == NULL) { // no @HD
1839         new_l_text = h->l_text;
1840         if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1841             return -1;
1842         new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
1843         if (val) {
1844             if (new_l_text > SIZE_MAX - strlen(val) - 5)
1845                 return -1;
1846             new_l_text += strlen(val) + 4;
1847         }
1848         newtext = (char*)malloc(new_l_text + 1);
1849         if (!newtext) return -1;
1850 
1851         if (val)
1852             snprintf(newtext, new_l_text + 1,
1853                     "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1854         else
1855             snprintf(newtext, new_l_text + 1,
1856                     "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1857     } else { // has @HD but different or no key
1858         new_l_text = (beg - h->text) + (h->text + h->l_text - end);
1859         if (val) {
1860             if (new_l_text > SIZE_MAX - strlen(val) - 5)
1861                 return -1;
1862             new_l_text += strlen(val) + 4;
1863         }
1864         newtext = (char*)malloc(new_l_text + 1);
1865         if (!newtext) return -1;
1866 
1867         if (val) {
1868             snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
1869                     (int) (beg - h->text), h->text, key, val, end);
1870         } else { //delete key
1871             snprintf(newtext, new_l_text + 1, "%.*s%s",
1872                     (int) (beg - h->text), h->text, end);
1873         }
1874     }
1875     free(h->text);
1876     h->text = newtext;
1877     h->l_text = new_l_text;
1878     return 0;
1879 }
1880 
1881 
sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)1882 int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
1883 {
1884     if (!h || !key)
1885         return -1;
1886 
1887     if (!h->hrecs)
1888         return old_sam_hdr_change_HD(h, key, val);
1889 
1890     if (val) {
1891         if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
1892             return -1;
1893     } else {
1894         if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
1895             return -1;
1896     }
1897     return sam_hdr_rebuild(h);
1898 }
1899 /**********************
1900  *** SAM record I/O ***
1901  **********************/
1902 
sam_parse_B_vals(char type,uint32_t n,char * in,char ** end,char * r,bam1_t * b)1903 static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
1904                             char *r, bam1_t *b)
1905 {
1906     int orig_l = b->l_data;
1907     char *q = in;
1908     int32_t size;
1909     size_t bytes;
1910     int overflow = 0;
1911 
1912     size = aux_type2size(type);
1913     if (size <= 0 || size > 4) {
1914         hts_log_error("Unrecognized type B:%c", type);
1915         return -1;
1916     }
1917 
1918     // Ensure space for type + values
1919     bytes = (size_t) n * (size_t) size;
1920     if (bytes / size != n
1921         || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
1922         hts_log_error("Out of memory");
1923         return -1;
1924     }
1925 
1926     b->data[b->l_data++] = 'B';
1927     b->data[b->l_data++] = type;
1928     i32_to_le(n, b->data + b->l_data);
1929     b->l_data += sizeof(uint32_t);
1930     // This ensures that q always ends up at the next comma after
1931     // reading a number even if it's followed by junk.  It
1932     // prevents the possibility of trying to read more than n items.
1933 #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
1934     if (type == 'c') {
1935         while (q < r) {
1936             *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
1937             b->l_data++;
1938             skip_to_comma_(q);
1939         }
1940     } else if (type == 'C') {
1941         while (q < r) {
1942             if (*q != '-') {
1943                 *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
1944                 b->l_data++;
1945             } else {
1946                 overflow = 1;
1947             }
1948             skip_to_comma_(q);
1949         }
1950     } else if (type == 's') {
1951         while (q < r) {
1952             i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
1953             b->l_data += 2;
1954             skip_to_comma_(q);
1955         }
1956     } else if (type == 'S') {
1957         while (q < r) {
1958             if (*q != '-') {
1959                 u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
1960                 b->l_data += 2;
1961             } else {
1962                 overflow = 1;
1963             }
1964             skip_to_comma_(q);
1965         }
1966     } else if (type == 'i') {
1967         while (q < r) {
1968             i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
1969             b->l_data += 4;
1970             skip_to_comma_(q);
1971         }
1972     } else if (type == 'I') {
1973         while (q < r) {
1974             if (*q != '-') {
1975                 u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
1976                 b->l_data += 4;
1977             } else {
1978                 overflow = 1;
1979             }
1980             skip_to_comma_(q);
1981         }
1982     } else if (type == 'f') {
1983         while (q < r) {
1984             float_to_le(strtod(q + 1, &q), b->data + b->l_data);
1985             b->l_data += 4;
1986             skip_to_comma_(q);
1987         }
1988     } else {
1989         hts_log_error("Unrecognized type B:%c", type);
1990         return -1;
1991     }
1992 
1993     if (!overflow) {
1994         *end = q;
1995         return 0;
1996     } else {
1997         int64_t max = 0, min = 0, val;
1998         // Given type was incorrect.  Try to rescue the situation.
1999         q = in;
2000         overflow = 0;
2001         b->l_data = orig_l;
2002         // Find out what range of values is present
2003         while (q < r) {
2004             val = hts_str2int(q + 1, &q, 64, &overflow);
2005             if (max < val) max = val;
2006             if (min > val) min = val;
2007             skip_to_comma_(q);
2008         }
2009         // Retry with appropriate type
2010         if (!overflow) {
2011             if (min < 0) {
2012                 if (min >= INT8_MIN && max <= INT8_MAX) {
2013                     return sam_parse_B_vals('c', n, in, end, r, b);
2014                 } else if (min >= INT16_MIN && max <= INT16_MAX) {
2015                     return sam_parse_B_vals('s', n, in, end, r, b);
2016                 } else if (min >= INT32_MIN && max <= INT32_MAX) {
2017                     return sam_parse_B_vals('i', n, in, end, r, b);
2018                 }
2019             } else {
2020                 if (max < UINT8_MAX) {
2021                     return sam_parse_B_vals('C', n, in, end, r, b);
2022                 } else if (max <= UINT16_MAX) {
2023                     return sam_parse_B_vals('S', n, in, end, r, b);
2024                 } else if (max <= UINT32_MAX) {
2025                     return sam_parse_B_vals('I', n, in, end, r, b);
2026                 }
2027             }
2028         }
2029         // If here then at least one of the values is too big to store
2030         hts_log_error("Numeric value in B array out of allowed range");
2031         return -1;
2032     }
2033 #undef skip_to_comma_
2034 }
2035 
parse_sam_flag(char * v,char ** rv,int * overflow)2036 static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2037     if (*v >= '1' && *v <= '9') {
2038         return hts_str2uint(v, rv, 16, overflow);
2039     }
2040     else if (*v == '0') {
2041         // handle single-digit "0" directly; otherwise it's hex or octal
2042         if (v[1] == '\t') { *rv = v+1; return 0; }
2043         else {
2044             unsigned long val = strtoul(v, rv, 0);
2045             if (val > 65535) { *overflow = 1; return 65535; }
2046             return val;
2047         }
2048     }
2049     else {
2050         // TODO implement symbolic flag letters
2051         *rv = v;
2052         return 0;
2053     }
2054 }
2055 
sam_parse1(kstring_t * s,sam_hdr_t * h,bam1_t * b)2056 int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2057 {
2058 #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2059 
2060 #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2061 
2062 // Macro that operates on 64-bits at a time.
2063 #define COPY_MINUS_N(to,from,n,l,failed)                        \
2064     do {                                                        \
2065         uint64_u *from8 = (uint64_u *)(from);                   \
2066         uint64_u *to8 = (uint64_u *)(to);                       \
2067         uint64_t uflow = 0;                                     \
2068         size_t l8 = (l)>>3, i;                                  \
2069         for (i = 0; i < l8; i++) {                              \
2070             to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2071             uflow |= to8[i];                                    \
2072         }                                                       \
2073         for (i<<=3; i < (l); ++i) {                             \
2074             to[i] = from[i] - (n);                              \
2075             uflow |= to[i];                                     \
2076         }                                                       \
2077         failed = (uflow & 0x8080808080808080UL) > 0;            \
2078     } while (0)
2079 
2080 #else
2081 
2082 // Basic version which operates a byte at a time
2083 #define COPY_MINUS_N(to,from,n,l,failed) do {                \
2084         uint8_t uflow = 0;                                   \
2085         for (i = 0; i < (l); ++i) {                          \
2086             (to)[i] = (from)[i] - (n);                       \
2087             uflow |= (uint8_t) (to)[i];                      \
2088         }                                                    \
2089         failed = (uflow & 0x80) > 0;                         \
2090     } while (0)
2091 
2092 #endif
2093 
2094 #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2095 #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2096 #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2097 
2098     uint8_t *t;
2099 
2100     char *p = s->s, *q;
2101     int i, overflow = 0;
2102     char logbuf[40];
2103     hts_pos_t cigreflen;
2104     bam1_core_t *c = &b->core;
2105 
2106     b->l_data = 0;
2107     memset(c, 0, 32);
2108 
2109     // qname
2110     q = _read_token(p);
2111 
2112     _parse_warn(p - q <= 1, "empty query name");
2113     _parse_err(p - q > 255, "query name too long");
2114     // resize large enough for name + extranul
2115     if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2116     memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2117 
2118     c->l_extranul = (4 - (b->l_data & 3)) & 3;
2119     memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2120     b->l_data += c->l_extranul;
2121 
2122     c->l_qname = p - q + c->l_extranul;
2123 
2124     // flag
2125     c->flag = parse_sam_flag(p, &p, &overflow);
2126     if (*p++ != '\t') goto err_ret; // malformated flag
2127 
2128     // chr
2129     q = _read_token(p);
2130     if (strcmp(q, "*")) {
2131         _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2132         c->tid = bam_name2id(h, q);
2133         _parse_err(c->tid < -1, "failed to parse header");
2134         _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2135     } else c->tid = -1;
2136 
2137     // pos
2138     c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2139     if (*p++ != '\t') goto err_ret;
2140     if (c->pos < 0 && c->tid >= 0) {
2141         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2142         c->tid = -1;
2143     }
2144     if (c->tid < 0) c->flag |= BAM_FUNMAP;
2145 
2146     // mapq
2147     c->qual = hts_str2uint(p, &p, 8, &overflow);
2148     if (*p++ != '\t') goto err_ret;
2149     // cigar
2150     if (*p != '*') {
2151         uint32_t *cigar = NULL;
2152         int old_l_data = b->l_data;
2153         int n_cigar = bam_parse_cigar(p, &p, b);
2154         if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2155         cigar = (uint32_t *)(b->data + old_l_data);
2156         c->n_cigar = n_cigar;
2157 
2158         // can't use bam_endpos() directly as some fields not yet set up
2159         cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2160         if (cigreflen == 0) cigreflen = 1;
2161     } else {
2162         _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2163         c->flag |= BAM_FUNMAP;
2164         q = _read_token(p);
2165         cigreflen = 1;
2166     }
2167     _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2168                "read ends beyond highest supported position");
2169     c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2170     // mate chr
2171     q = _read_token(p);
2172     if (strcmp(q, "=") == 0) {
2173         c->mtid = c->tid;
2174     } else if (strcmp(q, "*") == 0) {
2175         c->mtid = -1;
2176     } else {
2177         c->mtid = bam_name2id(h, q);
2178         _parse_err(c->mtid < -1, "failed to parse header");
2179         _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2180     }
2181     // mpos
2182     c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2183     if (*p++ != '\t') goto err_ret;
2184     if (c->mpos < 0 && c->mtid >= 0) {
2185         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2186         c->mtid = -1;
2187     }
2188     // tlen
2189     c->isize = hts_str2int(p, &p, 64, &overflow);
2190     if (*p++ != '\t') goto err_ret;
2191     // seq
2192     q = _read_token(p);
2193     if (strcmp(q, "*")) {
2194         _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2195         c->l_qseq = p - q - 1;
2196         hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2197         _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2198         i = (c->l_qseq + 1) >> 1;
2199         _get_mem(uint8_t, &t, b, i);
2200 
2201         unsigned int lqs2 = c->l_qseq&~1, i;
2202         for (i = 0; i < lqs2; i+=2)
2203             t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2204         for (; i < c->l_qseq; ++i)
2205             t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2206     } else c->l_qseq = 0;
2207     // qual
2208     _get_mem(uint8_t, &t, b, c->l_qseq);
2209     if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2210         memset(t, 0xff, c->l_qseq);
2211         p += 2;
2212     } else {
2213         int failed = 0;
2214         _parse_err(s->l - (p - s->s) < c->l_qseq
2215                    || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2216                    "SEQ and QUAL are of different length");
2217         COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2218         _parse_err(failed, "invalid QUAL character");
2219         p += c->l_qseq + 1;
2220     }
2221     // aux
2222     q = p;
2223     p = s->s + s->l;
2224     while (q < p) {
2225         char type;
2226         _parse_err(p - q < 5, "incomplete aux field");
2227         _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2228         // Copy over id
2229         if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2230         memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2231         q += 3; type = *q++; ++q; // q points to value
2232         if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2233             _parse_err(*q <= '\t', "incomplete aux field");
2234 
2235         // Ensure enough space for a double + type allocated.
2236         if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2237 
2238         if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2239             b->data[b->l_data++] = 'A';
2240             b->data[b->l_data++] = *q++;
2241         } else if (type == 'i' || type == 'I') {
2242             if (*q == '-') {
2243                 int32_t x = hts_str2int(q, &q, 32, &overflow);
2244                 if (x >= INT8_MIN) {
2245                     b->data[b->l_data++] = 'c';
2246                     b->data[b->l_data++] = x;
2247                 } else if (x >= INT16_MIN) {
2248                     b->data[b->l_data++] = 's';
2249                     i16_to_le(x, b->data + b->l_data);
2250                     b->l_data += 2;
2251                 } else {
2252                     b->data[b->l_data++] = 'i';
2253                     i32_to_le(x, b->data + b->l_data);
2254                     b->l_data += 4;
2255                 }
2256             } else {
2257                 uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2258                 if (x <= UINT8_MAX) {
2259                     b->data[b->l_data++] = 'C';
2260                     b->data[b->l_data++] = x;
2261                 } else if (x <= UINT16_MAX) {
2262                     b->data[b->l_data++] = 'S';
2263                     u16_to_le(x, b->data + b->l_data);
2264                     b->l_data += 2;
2265                 } else {
2266                     b->data[b->l_data++] = 'I';
2267                     u32_to_le(x, b->data + b->l_data);
2268                     b->l_data += 4;
2269                 }
2270             }
2271         } else if (type == 'f') {
2272             b->data[b->l_data++] = 'f';
2273             float_to_le(strtod(q, &q), b->data + b->l_data);
2274             b->l_data += sizeof(float);
2275         } else if (type == 'd') {
2276             b->data[b->l_data++] = 'd';
2277             double_to_le(strtod(q, &q), b->data + b->l_data);
2278             b->l_data += sizeof(double);
2279         } else if (type == 'Z' || type == 'H') {
2280             char *end = strchr(q, '\t');
2281             if (!end) end = q + strlen(q);
2282             _parse_err(type == 'H' && ((end-q)&1) != 0,
2283                        "hex field does not have an even number of digits");
2284             b->data[b->l_data++] = type;
2285             if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2286             memcpy(b->data + b->l_data, q, end - q);
2287             b->l_data += end - q;
2288             b->data[b->l_data++] = '\0';
2289             q = end;
2290         } else if (type == 'B') {
2291             uint32_t n;
2292             char *r;
2293             type = *q++; // q points to the first ',' following the typing byte
2294             _parse_err(*q && *q != ',' && *q != '\t',
2295                        "B aux field type not followed by ','");
2296 
2297             for (r = q, n = 0; *r > '\t'; ++r)
2298                 if (*r == ',') ++n;
2299 
2300             if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2301                 goto err_ret;
2302         } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2303 
2304         while (*q > '\t') { q++; } // Skip any junk to next tab
2305         q++;
2306     }
2307 
2308     _parse_err(overflow != 0, "numeric value out of allowed range");
2309 
2310     if (bam_tag2cigar(b, 1, 1) < 0)
2311         return -2;
2312     return 0;
2313 
2314 #undef _parse_warn
2315 #undef _parse_err
2316 #undef _get_mem
2317 #undef _read_token
2318 err_ret:
2319     return -2;
2320 }
2321 
read_ncigar(const char * q)2322 static uint32_t read_ncigar(const char *q) {
2323     uint32_t n_cigar = 0;
2324     for (; *q && *q != '\t'; ++q)
2325         if (!isdigit_c(*q)) ++n_cigar;
2326     if (!n_cigar) {
2327         hts_log_error("No CIGAR operations");
2328         return 0;
2329     }
2330     if (n_cigar >= 2147483647) {
2331         hts_log_error("Too many CIGAR operations");
2332         return 0;
2333     }
2334 
2335     return n_cigar;
2336 }
2337 
2338 /*! @function
2339  @abstract  Parse a CIGAR string into preallocated a uint32_t array
2340  @param  in      [in]  pointer to the source string
2341  @param  a_cigar [out]  address of the destination uint32_t buffer
2342  @return         number of processed input characters; 0 on error
2343  */
parse_cigar(const char * in,uint32_t * a_cigar,uint32_t n_cigar)2344 static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2345     int i, overflow = 0;
2346     const char *p = in;
2347     for (i = 0; i < n_cigar; i++) {
2348         uint32_t len;
2349         int op;
2350         char *q;
2351         len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2352         if (q == p) {
2353             hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2354             return 0;
2355         }
2356         if (overflow) {
2357             hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2358             return 0;
2359         }
2360         p = q;
2361         op = bam_cigar_table[(unsigned char)*p++];
2362         if (op < 0) {
2363             hts_log_error("Unrecognized CIGAR operator");
2364             return 0;
2365         }
2366         a_cigar[i] = len;
2367         a_cigar[i] |= op;
2368     }
2369 
2370     return p-in;
2371 }
2372 
sam_parse_cigar(const char * in,char ** end,uint32_t ** a_cigar,size_t * a_mem)2373 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2374     size_t n_cigar = 0;
2375     int diff;
2376 
2377     if (!in || !a_cigar || !a_mem) {
2378         hts_log_error("NULL pointer arguments");
2379         return -1;
2380     }
2381     if (end) *end = (char *)in;
2382 
2383     if (*in == '*') {
2384         if (end) (*end)++;
2385         return 0;
2386     }
2387     n_cigar = read_ncigar(in);
2388     if (!n_cigar) return 0;
2389     if (n_cigar > *a_mem) {
2390         uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2391         if (a_tmp) {
2392             *a_cigar = a_tmp;
2393             *a_mem = n_cigar;
2394         } else {
2395             hts_log_error("Memory allocation error");
2396             return -1;
2397         }
2398     }
2399 
2400     if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2401     if (end) *end = (char *)in+diff;
2402 
2403     return n_cigar;
2404 }
2405 
bam_parse_cigar(const char * in,char ** end,bam1_t * b)2406 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2407     size_t n_cigar = 0;
2408     int diff;
2409 
2410     if (!in || !b) {
2411         hts_log_error("NULL pointer arguments");
2412         return -1;
2413     }
2414     if (end) *end = (char *)in;
2415 
2416     if (*in == '*') {
2417         if (end) (*end)++;
2418         return 0;
2419     }
2420     n_cigar = read_ncigar(in);
2421     if (!n_cigar) return 0;
2422     if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
2423         hts_log_error("Memory allocation error");
2424         return -1;
2425     }
2426 
2427     if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
2428     b->l_data += (n_cigar * sizeof(uint32_t));
2429     if (end) *end = (char *)in+diff;
2430 
2431     return n_cigar;
2432 }
2433 
2434 /*
2435  * -----------------------------------------------------------------------------
2436  * SAM threading
2437  */
2438 // Size of SAM text block (reading)
2439 #define NM 240000
2440 // Number of BAM records (writing)
2441 #define NB 1000
2442 
2443 struct SAM_state;
2444 
2445 // Output job - a block of BAM records
2446 typedef struct sp_bams {
2447     struct sp_bams *next;
2448     int serial;
2449 
2450     bam1_t *bams;
2451     int nbams, abams; // used and alloc
2452 
2453     struct SAM_state *fd;
2454 } sp_bams;
2455 
2456 // Input job - a block of SAM text
2457 typedef struct sp_lines {
2458     struct sp_lines *next;
2459     int serial;
2460 
2461     char *data;
2462     int data_size;
2463     int alloc;
2464 
2465     struct SAM_state *fd;
2466     sp_bams *bams;
2467 } sp_lines;
2468 
2469 enum sam_cmd {
2470     SAM_NONE = 0,
2471     SAM_CLOSE,
2472     SAM_CLOSE_DONE,
2473 };
2474 
2475 typedef struct SAM_state {
2476     sam_hdr_t *h;
2477 
2478     hts_tpool *p;
2479     int own_pool;
2480     pthread_mutex_t lines_m;
2481     hts_tpool_process *q;
2482     pthread_t dispatcher;
2483     int dispatcher_set;
2484 
2485     sp_lines *lines;
2486     sp_bams *bams;
2487 
2488     sp_bams *curr_bam;
2489     int curr_idx;
2490     int serial;
2491 
2492     // Be warned: moving these mutexes around in this struct can reduce
2493     // threading performance by up to 70%!
2494     pthread_mutex_t command_m;
2495     pthread_cond_t command_c;
2496     enum sam_cmd command;
2497 
2498     // One of the E* errno codes
2499     int errcode;
2500 
2501     htsFile *fp;
2502 } SAM_state;
2503 
2504 // Returns a SAM_state struct from a generic hFILE.
2505 //
2506 // Returns NULL on failure.
sam_state_create(htsFile * fp)2507 static SAM_state *sam_state_create(htsFile *fp) {
2508     // Ideally sam_open wouldn't be a #define to hts_open but instead would
2509     // be a redirect call with an additional 'S' mode.  This in turn would
2510     // correctly set the designed format to sam instead of a generic
2511     // text_format.
2512     if (fp->format.format != sam && fp->format.format != text_format)
2513         return NULL;
2514 
2515     SAM_state *fd = calloc(1, sizeof(*fd));
2516     if (!fd)
2517         return NULL;
2518 
2519     fp->state = fd;
2520     fd->fp = fp;
2521 
2522     return fd;
2523 }
2524 
2525 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
2526 static void *sam_format_worker(void *arg);
2527 
sam_state_err(SAM_state * fd,int errcode)2528 static void sam_state_err(SAM_state *fd, int errcode) {
2529     pthread_mutex_lock(&fd->command_m);
2530     if (!fd->errcode)
2531         fd->errcode = errcode;
2532     pthread_mutex_unlock(&fd->command_m);
2533 }
2534 
sam_free_sp_bams(sp_bams * b)2535 static void sam_free_sp_bams(sp_bams *b) {
2536     if (!b)
2537         return;
2538 
2539     if (b->bams) {
2540         int i;
2541         for (i = 0; i < b->abams; i++) {
2542             if (b->bams[i].data)
2543                 free(b->bams[i].data);
2544         }
2545         free(b->bams);
2546     }
2547     free(b);
2548 }
2549 
2550 // Destroys the state produce by sam_state_create.
sam_state_destroy(htsFile * fp)2551 int sam_state_destroy(htsFile *fp) {
2552     int ret = 0;
2553 
2554     if (!fp->state)
2555         return 0;
2556 
2557     SAM_state *fd = fp->state;
2558     if (fd->p) {
2559         if (fd->h) {
2560             // Notify sam_dispatcher we're closing
2561             pthread_mutex_lock(&fd->command_m);
2562             if (fd->command != SAM_CLOSE_DONE)
2563                 fd->command = SAM_CLOSE;
2564             pthread_cond_signal(&fd->command_c);
2565             ret = -fd->errcode;
2566             if (fd->q)
2567                 hts_tpool_wake_dispatch(fd->q); // unstick the reader
2568 
2569             if (!fp->is_write && fd->q && fd->dispatcher_set) {
2570                 for (;;) {
2571                     // Avoid deadlocks with dispatcher
2572                     if (fd->command == SAM_CLOSE_DONE)
2573                         break;
2574                     hts_tpool_wake_dispatch(fd->q);
2575                     pthread_mutex_unlock(&fd->command_m);
2576                     usleep(10000);
2577                     pthread_mutex_lock(&fd->command_m);
2578                 }
2579             }
2580             pthread_mutex_unlock(&fd->command_m);
2581 
2582             if (fp->is_write) {
2583                 // Dispatch the last partial block.
2584                 sp_bams *gb = fd->curr_bam;
2585                 if (!ret && gb && gb->nbams > 0 && fd->q)
2586                     ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
2587 
2588                 // Flush and drain output
2589                 if (fd->q)
2590                     hts_tpool_process_flush(fd->q);
2591                 pthread_mutex_lock(&fd->command_m);
2592                 if (!ret) ret = -fd->errcode;
2593                 pthread_mutex_unlock(&fd->command_m);
2594 
2595                 while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
2596                     usleep(10000);
2597                     pthread_mutex_lock(&fd->command_m);
2598                     ret = -fd->errcode;
2599                     // not empty but shutdown implies error
2600                     if (hts_tpool_process_is_shutdown(fd->q) && !ret)
2601                         ret = EIO;
2602                     pthread_mutex_unlock(&fd->command_m);
2603                 }
2604                 if (fd->q)
2605                     hts_tpool_process_shutdown(fd->q);
2606             }
2607 
2608             // Wait for it to acknowledge
2609             if (fd->dispatcher_set)
2610                 pthread_join(fd->dispatcher, NULL);
2611             if (!ret) ret = -fd->errcode;
2612         }
2613 
2614         // Tidy up memory
2615         if (fd->q)
2616             hts_tpool_process_destroy(fd->q);
2617 
2618         if (fd->own_pool && fp->format.compression == no_compression) {
2619             hts_tpool_destroy(fd->p);
2620             fd->p = NULL;
2621         }
2622         pthread_mutex_destroy(&fd->lines_m);
2623         pthread_mutex_destroy(&fd->command_m);
2624         pthread_cond_destroy(&fd->command_c);
2625 
2626         sp_lines *l = fd->lines;
2627         while (l) {
2628             sp_lines *n = l->next;
2629             free(l->data);
2630             free(l);
2631             l = n;
2632         }
2633 
2634         sp_bams *b = fd->bams;
2635         while (b) {
2636             if (fd->curr_bam == b)
2637                 fd->curr_bam = NULL;
2638             sp_bams *n = b->next;
2639             sam_free_sp_bams(b);
2640             b = n;
2641         }
2642 
2643         if (fd->curr_bam)
2644             sam_free_sp_bams(fd->curr_bam);
2645 
2646         // Decrement counter by one, maybe destroying too.
2647         // This is to permit the caller using bam_hdr_destroy
2648         // before sam_close without triggering decode errors
2649         // in the background threads.
2650         bam_hdr_destroy(fd->h);
2651     }
2652 
2653     free(fp->state);
2654     fp->state = NULL;
2655     return ret;
2656 }
2657 
2658 // Cleanup function - job for sam_parse_worker; result for sam_format_worker
cleanup_sp_lines(void * arg)2659 static void cleanup_sp_lines(void *arg) {
2660     sp_lines *gl = (sp_lines *)arg;
2661     if (!gl) return;
2662 
2663     // Should always be true for lines passed to / from thread workers.
2664     assert(gl->next == NULL);
2665 
2666     free(gl->data);
2667     sam_free_sp_bams(gl->bams);
2668     free(gl);
2669 }
2670 
2671 // Run from one of the worker threads.
2672 // Convert a passed in array of lines to array of BAMs, returning
2673 // the result back to the thread queue.
sam_parse_worker(void * arg)2674 static void *sam_parse_worker(void *arg) {
2675     sp_lines *gl = (sp_lines *)arg;
2676     sp_bams *gb = NULL;
2677     char *lines = gl->data;
2678     int i;
2679     bam1_t *b;
2680     SAM_state *fd = gl->fd;
2681 
2682     // Use a block of BAM structs we had earlier if available.
2683     pthread_mutex_lock(&fd->lines_m);
2684     if (fd->bams) {
2685         gb = fd->bams;
2686         fd->bams = gb->next;
2687     }
2688     pthread_mutex_unlock(&fd->lines_m);
2689 
2690     if (gb == NULL) {
2691         gb = calloc(1, sizeof(*gb));
2692         if (!gb) {
2693             return NULL;
2694         }
2695         gb->abams = 100;
2696         gb->bams = b = calloc(gb->abams, sizeof(*b));
2697         if (!gb->bams) {
2698             sam_state_err(fd, ENOMEM);
2699             goto err;
2700         }
2701         gb->nbams = 0;
2702     }
2703     gb->serial = gl->serial;
2704     gb->next = NULL;
2705 
2706     b = (bam1_t *)gb->bams;
2707     if (!b) {
2708         sam_state_err(fd, ENOMEM);
2709         goto err;
2710     }
2711 
2712     i = 0;
2713     char *cp = lines, *cp_end = lines + gl->data_size;
2714     while (cp < cp_end) {
2715         if (i >= gb->abams) {
2716             int old_abams = gb->abams;
2717             gb->abams *= 2;
2718             b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
2719             if (!b) {
2720                 gb->abams /= 2;
2721                 sam_state_err(fd, ENOMEM);
2722                 goto err;
2723             }
2724             memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
2725             gb->bams = b;
2726         }
2727 
2728         // Ideally we'd get sam_parse1 to return the number of
2729         // bytes decoded and to be able to stop on newline as
2730         // well as \0.
2731         //
2732         // We can then avoid the additional strchr loop.
2733         // It's around 6% of our CPU cost, albeit threadable.
2734         //
2735         // However this is an API change so for now we copy.
2736 
2737         char *nl = strchr(cp, '\n');
2738         char *line_end;
2739         if (nl) {
2740             line_end = nl;
2741             if (line_end > cp && *(line_end - 1) == '\r')
2742                 line_end--;
2743             nl++;
2744         } else {
2745             nl = line_end = cp_end;
2746         }
2747         *line_end = '\0';
2748         kstring_t ks = { line_end - cp, gl->alloc, cp };
2749         if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
2750             sam_state_err(fd, errno ? errno : EIO);
2751             cleanup_sp_lines(gl);
2752             goto err;
2753         }
2754         cp = nl;
2755         i++;
2756     }
2757     gb->nbams = i;
2758 
2759     pthread_mutex_lock(&fd->lines_m);
2760     gl->next = fd->lines;
2761     fd->lines = gl;
2762     pthread_mutex_unlock(&fd->lines_m);
2763     return gb;
2764 
2765  err:
2766     sam_free_sp_bams(gb);
2767     return NULL;
2768 }
2769 
sam_parse_eof(void * arg)2770 static void *sam_parse_eof(void *arg) {
2771     return NULL;
2772 }
2773 
2774 // Cleanup function - result for sam_parse_worker; job for sam_format_worker
cleanup_sp_bams(void * arg)2775 static void cleanup_sp_bams(void *arg) {
2776     sam_free_sp_bams((sp_bams *) arg);
2777 }
2778 
2779 // Runs in its own thread.
2780 // Reads a block of text (SAM) and sends a new job to the thread queue to
2781 // translate this to BAM.
sam_dispatcher_read(void * vp)2782 static void *sam_dispatcher_read(void *vp) {
2783     htsFile *fp = vp;
2784     kstring_t line = {0};
2785     int line_frag = 0;
2786     SAM_state *fd = fp->state;
2787     sp_lines *l = NULL;
2788 
2789     // Pre-allocate buffer for left-over bits of line (exact size doesn't
2790     // matter as it will grow if necessary).
2791     if (ks_resize(&line, 1000) < 0)
2792         goto err;
2793 
2794     for (;;) {
2795         // Check for command
2796         pthread_mutex_lock(&fd->command_m);
2797         switch (fd->command) {
2798 
2799         case SAM_CLOSE:
2800             pthread_cond_signal(&fd->command_c);
2801             pthread_mutex_unlock(&fd->command_m);
2802             hts_tpool_process_shutdown(fd->q);
2803             goto tidyup;
2804 
2805         default:
2806             break;
2807         }
2808         pthread_mutex_unlock(&fd->command_m);
2809 
2810         pthread_mutex_lock(&fd->lines_m);
2811         if (fd->lines) {
2812             // reuse existing line buffer
2813             l = fd->lines;
2814             fd->lines = l->next;
2815         }
2816         pthread_mutex_unlock(&fd->lines_m);
2817 
2818         if (l == NULL) {
2819             // none to reuse, to create a new one
2820             l = calloc(1, sizeof(*l));
2821             if (!l)
2822                 goto err;
2823             l->alloc = NM;
2824             l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
2825             if (!l->data) {
2826                 free(l);
2827                 l = NULL;
2828                 goto err;
2829             }
2830             l->fd = fd;
2831         }
2832         l->next = NULL;
2833 
2834         if (l->alloc < line_frag+NM/2) {
2835             char *rp = realloc(l->data, line_frag+NM/2 +8);
2836             if (!rp)
2837                 goto err;
2838             l->alloc = line_frag+NM/2;
2839             l->data = rp;
2840         }
2841         memcpy(l->data, line.s, line_frag);
2842 
2843         l->data_size = line_frag;
2844         ssize_t nbytes;
2845     longer_line:
2846         if (fp->is_bgzf)
2847             nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
2848         else
2849             nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
2850         if (nbytes < 0) {
2851             sam_state_err(fd, errno ? errno : EIO);
2852             goto err;
2853         } else if (nbytes == 0)
2854             break; // EOF
2855         l->data_size += nbytes;
2856 
2857         // trim to last \n. Maybe \r\n, but that's still fine
2858         if (nbytes == l->alloc - line_frag) {
2859             char *cp_end = l->data + l->data_size;
2860             char *cp = cp_end-1;
2861 
2862             while (cp > (char *)l->data && *cp != '\n')
2863                 cp--;
2864 
2865             // entire buffer is part of a single line
2866             if (cp == l->data) {
2867                 line_frag = l->data_size;
2868                 char *rp = realloc(l->data, l->alloc * 2 + 8);
2869                 if (!rp)
2870                     goto err;
2871                 l->alloc *= 2;
2872                 l->data = rp;
2873                 assert(l->alloc >= l->data_size);
2874                 assert(l->alloc >= line_frag);
2875                 assert(l->alloc >= l->alloc - line_frag);
2876                 goto longer_line;
2877             }
2878             cp++;
2879 
2880             // line holds the remainder of our line.
2881             if (ks_resize(&line, cp_end - cp) < 0)
2882                 goto err;
2883             memcpy(line.s, cp, cp_end - cp);
2884             line_frag = cp_end - cp;
2885             l->data_size = l->alloc - line_frag;
2886         } else {
2887             // out of buffer
2888             line_frag = 0;
2889         }
2890 
2891         l->serial = fd->serial++;
2892         //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
2893         if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
2894                                 cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
2895             goto err;
2896         pthread_mutex_lock(&fd->command_m);
2897         if (fd->command == SAM_CLOSE) {
2898             pthread_mutex_unlock(&fd->command_m);
2899             l = NULL;
2900             goto tidyup;
2901         }
2902         l = NULL;  // Now "owned" by sam_parse_worker()
2903         pthread_mutex_unlock(&fd->command_m);
2904     }
2905 
2906     if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
2907         goto err;
2908 
2909     // At EOF, wait for close request.
2910     // (In future if we add support for seek, this is where we need to catch it.)
2911     for (;;) {
2912         pthread_mutex_lock(&fd->command_m);
2913         if (fd->command == SAM_NONE)
2914             pthread_cond_wait(&fd->command_c, &fd->command_m);
2915         switch (fd->command) {
2916         case SAM_CLOSE:
2917             pthread_cond_signal(&fd->command_c);
2918             pthread_mutex_unlock(&fd->command_m);
2919             hts_tpool_process_shutdown(fd->q);
2920             goto tidyup;
2921 
2922         default:
2923             pthread_mutex_unlock(&fd->command_m);
2924             break;
2925         }
2926     }
2927 
2928  tidyup:
2929     pthread_mutex_lock(&fd->command_m);
2930     fd->command = SAM_CLOSE_DONE;
2931     pthread_cond_signal(&fd->command_c);
2932     pthread_mutex_unlock(&fd->command_m);
2933 
2934     if (l) {
2935         pthread_mutex_lock(&fd->lines_m);
2936         l->next = fd->lines;
2937         fd->lines = l;
2938         pthread_mutex_unlock(&fd->lines_m);
2939     }
2940     free(line.s);
2941 
2942     return NULL;
2943 
2944  err:
2945     sam_state_err(fd, errno ? errno : ENOMEM);
2946     hts_tpool_process_shutdown(fd->q);
2947     goto tidyup;
2948 }
2949 
2950 // Runs in its own thread.
2951 // Takes encoded blocks of SAM off the thread results queue and writes them
2952 // to our output stream.
sam_dispatcher_write(void * vp)2953 static void *sam_dispatcher_write(void *vp) {
2954     htsFile *fp = vp;
2955     SAM_state *fd = fp->state;
2956     hts_tpool_result *r;
2957 
2958     // Iterates until result queue is shutdown, where it returns NULL.
2959     while ((r = hts_tpool_next_result_wait(fd->q))) {
2960         sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
2961         if (!gl) {
2962             sam_state_err(fd, ENOMEM);
2963             goto err;
2964         }
2965 
2966         if (fp->idx) {
2967             sp_bams *gb = gl->bams;
2968             int i = 0, count = 0;
2969             while (i < gl->data_size) {
2970                 int j = i;
2971                 while (i < gl->data_size && gl->data[i] != '\n')
2972                     i++;
2973                 if (i < gl->data_size)
2974                     i++;
2975 
2976                 if (fp->is_bgzf) {
2977                     if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
2978                         goto err;
2979                 } else {
2980                     if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
2981                         goto err;
2982                 }
2983 
2984                 bam1_t *b = &gb->bams[count++];
2985                 if (fp->format.compression == bgzf) {
2986                     if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
2987                                       b->core.tid, b->core.pos, bam_endpos(b),
2988                                       bgzf_tell(fp->fp.bgzf),
2989                                       !(b->core.flag&BAM_FUNMAP)) < 0) {
2990                         sam_state_err(fd, errno ? errno : ENOMEM);
2991                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
2992                                 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
2993                         goto err;
2994                     }
2995                 } else {
2996                     if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
2997                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
2998                         sam_state_err(fd, errno ? errno : ENOMEM);
2999                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3000                                 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3001                         goto err;
3002                     }
3003                 }
3004             }
3005 
3006             assert(count == gb->nbams);
3007 
3008             // Add bam array to free-list
3009             pthread_mutex_lock(&fd->lines_m);
3010             gb->next = fd->bams;
3011             fd->bams = gl->bams;
3012             gl->bams = NULL;
3013             pthread_mutex_unlock(&fd->lines_m);
3014         } else {
3015             if (fp->is_bgzf) {
3016                 if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size)
3017                     goto err;
3018             } else {
3019                 if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3020                     goto err;
3021             }
3022         }
3023 
3024         hts_tpool_delete_result(r, 0);
3025 
3026         // Also updated by main thread
3027         pthread_mutex_lock(&fd->lines_m);
3028         gl->next = fd->lines;
3029         fd->lines = gl;
3030         pthread_mutex_unlock(&fd->lines_m);
3031     }
3032 
3033     sam_state_err(fd, 0); // success
3034     hts_tpool_process_shutdown(fd->q);
3035     return NULL;
3036 
3037  err:
3038     sam_state_err(fd, errno ? errno : EIO);
3039     return (void *)-1;
3040 }
3041 
3042 // Run from one of the worker threads.
3043 // Convert a passed in array of BAMs (sp_bams) and converts to a block
3044 // of text SAM records (sp_lines).
sam_format_worker(void * arg)3045 static void *sam_format_worker(void *arg) {
3046     sp_bams *gb = (sp_bams *)arg;
3047     sp_lines *gl = NULL;
3048     int i;
3049     SAM_state *fd = gb->fd;
3050     htsFile *fp = fd->fp;
3051 
3052     // Use a block of SAM strings we had earlier if available.
3053     pthread_mutex_lock(&fd->lines_m);
3054     if (fd->lines) {
3055         gl = fd->lines;
3056         fd->lines = gl->next;
3057     }
3058     pthread_mutex_unlock(&fd->lines_m);
3059 
3060     if (gl == NULL) {
3061         gl = calloc(1, sizeof(*gl));
3062         if (!gl) {
3063             sam_state_err(fd, ENOMEM);
3064             return NULL;
3065         }
3066         gl->alloc = gl->data_size = 0;
3067         gl->data = NULL;
3068     }
3069     gl->serial = gb->serial;
3070     gl->next = NULL;
3071 
3072     kstring_t ks = {0, gl->alloc, gl->data};
3073 
3074     for (i = 0; i < gb->nbams; i++) {
3075         if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3076             sam_state_err(fd, errno ? errno : EIO);
3077             goto err;
3078         }
3079         kputc('\n', &ks);
3080     }
3081 
3082     pthread_mutex_lock(&fd->lines_m);
3083     gl->data_size = ks.l;
3084     gl->alloc = ks.m;
3085     gl->data = ks.s;
3086 
3087     if (fp->idx) {
3088         // Keep hold of the bam array a little longer as
3089         // sam_dispatcher_write needs to use them for building the index.
3090         gl->bams = gb;
3091     } else {
3092         // Add bam array to free-list
3093         gb->next = fd->bams;
3094         fd->bams = gb;
3095     }
3096     pthread_mutex_unlock(&fd->lines_m);
3097 
3098     return gl;
3099 
3100  err:
3101     // Possible race between this and fd->curr_bam.
3102     // Easier to not free and leave it on the input list so it
3103     // gets freed there instead?
3104     // sam_free_sp_bams(gb);
3105     if (gl) {
3106         free(gl->data);
3107         free(gl);
3108     }
3109     return NULL;
3110 }
3111 
sam_set_thread_pool(htsFile * fp,htsThreadPool * p)3112 int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3113     if (fp->state)
3114         return 0;
3115 
3116     if (!(fp->state = sam_state_create(fp)))
3117         return -1;
3118     SAM_state *fd = (SAM_state *)fp->state;
3119 
3120     pthread_mutex_init(&fd->lines_m, NULL);
3121     pthread_mutex_init(&fd->command_m, NULL);
3122     pthread_cond_init(&fd->command_c, NULL);
3123     fd->p = p->pool;
3124     int qsize = p->qsize;
3125     if (!qsize)
3126         qsize = 2*hts_tpool_size(fd->p);
3127     fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3128     if (!fd->q) {
3129         sam_state_destroy(fp);
3130         return -1;
3131     }
3132 
3133     if (fp->format.compression == bgzf)
3134         return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3135 
3136     return 0;
3137 }
3138 
sam_set_threads(htsFile * fp,int nthreads)3139 int sam_set_threads(htsFile *fp, int nthreads) {
3140     if (nthreads <= 0)
3141         return 0;
3142 
3143     htsThreadPool p;
3144     p.pool = hts_tpool_init(nthreads);
3145     p.qsize = nthreads*2;
3146 
3147     int ret = sam_set_thread_pool(fp, &p);
3148     if (ret < 0)
3149         return ret;
3150 
3151     SAM_state *fd = (SAM_state *)fp->state;
3152     fd->own_pool = 1;
3153 
3154     return 0;
3155 }
3156 
3157 // Returns 0 on success,
3158 //        -1 on EOF,
3159 //       <-1 on error
sam_read1(htsFile * fp,sam_hdr_t * h,bam1_t * b)3160 int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
3161 {
3162     switch (fp->format.format) {
3163     case bam: {
3164         int r = bam_read1(fp->fp.bgzf, b);
3165         if (h && r >= 0) {
3166             if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
3167                 b->core.mtid >= h->n_targets || b->core.mtid < -1) {
3168                 errno = ERANGE;
3169                 return -3;
3170             }
3171         }
3172         return r;
3173         }
3174 
3175     case cram: {
3176         int ret = cram_get_bam_seq(fp->fp.cram, &b);
3177         if (ret < 0)
3178             return cram_eof(fp->fp.cram) ? -1 : -2;
3179 
3180         if (bam_tag2cigar(b, 1, 1) < 0)
3181             return -2;
3182         return ret;
3183     }
3184 
3185     case sam: {
3186         // Consume 1st line after header parsing as it wasn't using peek
3187         if (fp->line.l != 0) {
3188             int ret = sam_parse1(&fp->line, h, b);
3189             fp->line.l = 0;
3190             return ret;
3191         }
3192 
3193         if (fp->state) {
3194             SAM_state *fd = (SAM_state *)fp->state;
3195 
3196             if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
3197                 // We don't support multi-threaded SAM parsing with seeks yet.
3198                 int ret;
3199                 if ((ret = sam_state_destroy(fp)) < 0) {
3200                     errno = -ret;
3201                     return -2;
3202                 }
3203                 if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
3204                     return -1;
3205                 fp->fp.bgzf->seeked = 0;
3206                 goto err_recover;
3207             }
3208 
3209             if (!fd->h) {
3210                 fd->h = h;
3211                 fd->h->ref_count++;
3212                 // Ensure hrecs is initialised now as we don't want multiple
3213                 // threads trying to do this simultaneously.
3214                 if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
3215                     return -2;
3216 
3217                 // We can only do this once we've got a header
3218                 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
3219                                    fp) != 0)
3220                     return -2;
3221                 fd->dispatcher_set = 1;
3222             }
3223 
3224             if (fd->h != h) {
3225                 hts_log_error("SAM multi-threaded decoding does not support changing header");
3226                 return -1;
3227             }
3228 
3229             sp_bams *gb = fd->curr_bam;
3230             if (!gb) {
3231                 if (fd->errcode) {
3232                     // In case reader failed
3233                     errno = fd->errcode;
3234                     return -2;
3235                 }
3236                 hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
3237                 if (!r)
3238                     return -2;
3239                 fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
3240                 hts_tpool_delete_result(r, 0);
3241             }
3242             if (!gb)
3243                 return fd->errcode ? -2 : -1;
3244             bam1_t *b_array = (bam1_t *)gb->bams;
3245             if (fd->curr_idx < gb->nbams)
3246                 if (!bam_copy1(b, &b_array[fd->curr_idx++]))
3247                     return -2;
3248             if (fd->curr_idx == gb->nbams) {
3249                 pthread_mutex_lock(&fd->lines_m);
3250                 gb->next = fd->bams;
3251                 fd->bams = gb;
3252                 pthread_mutex_unlock(&fd->lines_m);
3253 
3254                 fd->curr_bam = NULL;
3255                 fd->curr_idx = 0;
3256             }
3257 
3258             return 0;
3259 
3260         } else  {
3261             int ret;
3262         err_recover:
3263 
3264             ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3265             if (ret < 0) return ret;
3266 
3267             ret = sam_parse1(&fp->line, h, b);
3268             fp->line.l = 0;
3269             if (ret < 0) {
3270                 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
3271                 if (h->ignore_sam_err) goto err_recover;
3272             }
3273             return ret;
3274         }
3275     }
3276 
3277     case empty_format:
3278         errno = EPIPE;
3279         return -3;
3280 
3281     default:
3282         errno = EFTYPE;
3283         return -3;
3284     }
3285 }
3286 
3287 
sam_format1_append(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)3288 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
3289 {
3290     int i, r = 0;
3291     uint8_t *s, *end;
3292     const bam1_core_t *c = &b->core;
3293 
3294     if (c->l_qname == 0)
3295         return -1;
3296     r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
3297     r |= kputc_('\t', str); // query name
3298     r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
3299     if (c->tid >= 0) { // chr
3300         r |= kputs(h->target_name[c->tid] , str);
3301         r |= kputc_('\t', str);
3302     } else r |= kputsn_("*\t", 2, str);
3303     r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
3304     r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
3305     if (c->n_cigar) { // cigar
3306         uint32_t *cigar = bam_get_cigar(b);
3307         for (i = 0; i < c->n_cigar; ++i) {
3308             r |= kputw(bam_cigar_oplen(cigar[i]), str);
3309             r |= kputc_(bam_cigar_opchr(cigar[i]), str);
3310         }
3311     } else r |= kputc_('*', str);
3312     r |= kputc_('\t', str);
3313     if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
3314     else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
3315     else {
3316         r |= kputs(h->target_name[c->mtid], str);
3317         r |= kputc_('\t', str);
3318     }
3319     r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
3320     r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
3321     if (c->l_qseq) { // seq and qual
3322         uint8_t *s = bam_get_seq(b);
3323         if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
3324         char *cp = str->s + str->l;
3325 
3326         // Sequence, 2 bases at a time
3327         nibble2base(s, cp, c->l_qseq);
3328         cp[c->l_qseq] = '\t';
3329         cp += c->l_qseq+1;
3330 
3331         // Quality
3332         s = bam_get_qual(b);
3333         i = 0;
3334         if (s[0] == 0xff) {
3335             cp[i++] = '*';
3336         } else {
3337             // local copy of c->l_qseq to aid unrolling
3338             uint32_t lqseq = c->l_qseq;
3339             for (i = 0; i < lqseq; ++i)
3340                 cp[i]=s[i]+33;
3341         }
3342         cp[i] = 0;
3343         cp += i;
3344         str->l = cp - str->s;
3345     } else r |= kputsn_("*\t*", 3, str);
3346 
3347     s = bam_get_aux(b); // aux
3348     end = b->data + b->l_data;
3349 
3350     while (end - s >= 4) {
3351         r |= kputc_('\t', str);
3352         if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
3353             goto bad_aux;
3354     }
3355     r |= kputsn("", 0, str); // nul terminate
3356     if (r < 0) goto mem_err;
3357 
3358     return str->l;
3359 
3360  bad_aux:
3361     hts_log_error("Corrupted aux data for read %.*s",
3362                   b->core.l_qname, bam_get_qname(b));
3363     errno = EINVAL;
3364     return -1;
3365 
3366  mem_err:
3367     hts_log_error("Out of memory");
3368     errno = ENOMEM;
3369     return -1;
3370 }
3371 
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)3372 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
3373 {
3374     str->l = 0;
3375     return sam_format1_append(h, b, str);
3376 }
3377 
3378 // Sadly we need to be able to modify the bam_hdr here so we can
3379 // reference count the structure.
sam_write1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)3380 int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
3381 {
3382     switch (fp->format.format) {
3383     case binary_format:
3384         fp->format.category = sequence_data;
3385         fp->format.format = bam;
3386         /* fall-through */
3387     case bam:
3388         return bam_write_idx1(fp, h, b);
3389 
3390     case cram:
3391         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
3392 
3393     case text_format:
3394         fp->format.category = sequence_data;
3395         fp->format.format = sam;
3396         /* fall-through */
3397     case sam:
3398         if (fp->state) {
3399             SAM_state *fd = (SAM_state *)fp->state;
3400 
3401             // Threaded output
3402             if (!fd->h) {
3403                 // NB: discard const.  We don't actually modify sam_hdr_t here,
3404                 // just data pointed to by it (which is a bit weasely still),
3405                 // but out cached pointer must be non-const as we want to
3406                 // destroy it later on and sam_hdr_destroy takes non-const.
3407                 //
3408                 // We do this because some tools do sam_hdr_destroy; sam_close
3409                 // while others do sam_close; sam_hdr_destroy.  The former is
3410                 // an issue as we need the header still when flushing.
3411                 fd->h = (sam_hdr_t *)h;
3412                 fd->h->ref_count++;
3413 
3414                 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
3415                                    fp) != 0)
3416                     return -2;
3417                 fd->dispatcher_set = 1;
3418             }
3419 
3420             if (fd->h != h) {
3421                 hts_log_error("SAM multi-threaded decoding does not support changing header");
3422                 return -2;
3423             }
3424 
3425             // Find a suitable BAM array to copy to
3426             sp_bams *gb = fd->curr_bam;
3427             if (!gb) {
3428                 pthread_mutex_lock(&fd->lines_m);
3429                 if (fd->bams) {
3430                     fd->curr_bam = gb = fd->bams;
3431                     fd->bams = gb->next;
3432                     gb->next = NULL;
3433                     gb->nbams = 0;
3434                     pthread_mutex_unlock(&fd->lines_m);
3435                 } else {
3436                     pthread_mutex_unlock(&fd->lines_m);
3437                     if (!(gb = calloc(1, sizeof(*gb)))) return -1;
3438                     if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) {
3439                         free(gb);
3440                         return -1;
3441                     }
3442                     gb->nbams = 0;
3443                     gb->abams = NB;
3444                     gb->fd = fd;
3445                     fd->curr_idx = 0;
3446                     fd->curr_bam = gb;
3447                 }
3448             }
3449 
3450             if (!bam_copy1(&gb->bams[gb->nbams++], b))
3451                 return -2;
3452 
3453             // Dispatch if full
3454             if (gb->nbams == NB) {
3455                 gb->serial = fd->serial++;
3456                 //fprintf(stderr, "Dispatch another %d bams\n", NB);
3457                 pthread_mutex_lock(&fd->command_m);
3458                 if (fd->errcode != 0) {
3459                     pthread_mutex_unlock(&fd->command_m);
3460                     return -fd->errcode;
3461                 }
3462                 if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
3463                                         cleanup_sp_bams,
3464                                         cleanup_sp_lines, 0) < 0) {
3465                     pthread_mutex_unlock(&fd->command_m);
3466                     return -1;
3467                 }
3468                 pthread_mutex_unlock(&fd->command_m);
3469                 fd->curr_bam = NULL;
3470             }
3471 
3472             // Dummy value as we don't know how long it really is.
3473             // We could track file sizes via a SAM_state field, but I don't think
3474             // it is necessary.
3475             return 1;
3476         } else {
3477             if (sam_format1(h, b, &fp->line) < 0) return -1;
3478             kputc('\n', &fp->line);
3479             if (fp->is_bgzf) {
3480                 if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
3481             } else {
3482                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
3483             }
3484 
3485             if (fp->idx) {
3486                 if (fp->format.compression == bgzf) {
3487                     if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3488                                       bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3489                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3490                                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
3491                         return -1;
3492                     }
3493                 } else {
3494                     if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3495                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3496                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3497                                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
3498                         return -1;
3499                     }
3500                 }
3501             }
3502 
3503             return fp->line.l;
3504         }
3505 
3506     default:
3507         errno = EBADF;
3508         return -1;
3509     }
3510 }
3511 
3512 /************************
3513  *** Auxiliary fields ***
3514  ************************/
3515 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)3516 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
3517     int tsz = aux_type2size(type);
3518 
3519     if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
3520 
3521     switch (tsz) {
3522         case 'H': case 'Z': case 1:  // Trivial
3523             memcpy(out, in, len);
3524             break;
3525 
3526 #define aux_val_to_le(type_t, store_le) do {                            \
3527         type_t v;                                                       \
3528         size_t i;                                                       \
3529         for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
3530             memcpy(&v, in + i, sizeof(type_t));                         \
3531             store_le(v, out);                                           \
3532         }                                                               \
3533     } while (0)
3534 
3535         case 2: aux_val_to_le(uint16_t, u16_to_le); break;
3536         case 4: aux_val_to_le(uint32_t, u32_to_le); break;
3537         case 8: aux_val_to_le(uint64_t, u64_to_le); break;
3538 
3539 #undef aux_val_to_le
3540 
3541         case 'B': { // Recurse!
3542             uint32_t n;
3543             if (len < 5) return -1;
3544             memcpy(&n, in + 1, 4);
3545             out[0] = in[0];
3546             u32_to_le(n, out + 1);
3547             return aux_to_le(in[0], out + 5, in + 5, len - 5);
3548         }
3549 
3550         default: // Unknown type code
3551             return -1;
3552     }
3553 
3554 
3555 
3556     return 0;
3557 }
3558 #endif
3559 
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)3560 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
3561 {
3562     uint32_t new_len;
3563 
3564     assert(b->l_data >= 0);
3565     new_len = b->l_data + 3 + len;
3566     if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
3567 
3568     if (realloc_bam_data(b, new_len) < 0) return -1;
3569 
3570     b->data[b->l_data] = tag[0];
3571     b->data[b->l_data + 1] = tag[1];
3572     b->data[b->l_data + 2] = type;
3573 
3574 #ifdef HTS_LITTLE_ENDIAN
3575     memcpy(b->data + b->l_data + 3, data, len);
3576 #else
3577     if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
3578         errno = EINVAL;
3579         return -1;
3580     }
3581 #endif
3582 
3583     b->l_data = new_len;
3584 
3585     return 0;
3586 
3587  nomem:
3588     errno = ENOMEM;
3589     return -1;
3590 }
3591 
skip_aux(uint8_t * s,uint8_t * end)3592 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
3593 {
3594     int size;
3595     uint32_t n;
3596     if (s >= end) return end;
3597     size = aux_type2size(*s); ++s; // skip type
3598     switch (size) {
3599     case 'Z':
3600     case 'H':
3601         while (s < end && *s) ++s;
3602         return s < end ? s + 1 : end;
3603     case 'B':
3604         if (end - s < 5) return NULL;
3605         size = aux_type2size(*s); ++s;
3606         n = le_to_u32(s);
3607         s += 4;
3608         if (size == 0 || end - s < size * n) return NULL;
3609         return s + size * n;
3610     case 0:
3611         return NULL;
3612     default:
3613         if (end - s < size) return NULL;
3614         return s + size;
3615     }
3616 }
3617 
bam_aux_get(const bam1_t * b,const char tag[2])3618 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
3619 {
3620     uint8_t *s, *end, *t = (uint8_t *) tag;
3621     uint16_t y = (uint16_t) t[0]<<8 | t[1];
3622     s = bam_get_aux(b);
3623     end = b->data + b->l_data;
3624     while (s != NULL && end - s >= 3) {
3625         uint16_t x = (uint16_t) s[0]<<8 | s[1];
3626         s += 2;
3627         if (x == y) {
3628             // Check the tag value is valid and complete
3629             uint8_t *e = skip_aux(s, end);
3630             if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
3631                 goto bad_aux;  // Unterminated string
3632             }
3633             if (e != NULL) {
3634                 return s;
3635             } else {
3636                 goto bad_aux;
3637             }
3638         }
3639         s = skip_aux(s, end);
3640     }
3641     if (s == NULL) goto bad_aux;
3642     errno = ENOENT;
3643     return NULL;
3644 
3645  bad_aux:
3646     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
3647     errno = EINVAL;
3648     return NULL;
3649 }
3650 
3651 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)3652 int bam_aux_del(bam1_t *b, uint8_t *s)
3653 {
3654     uint8_t *p, *aux;
3655     int l_aux = bam_get_l_aux(b);
3656     aux = bam_get_aux(b);
3657     p = s - 2;
3658     s = skip_aux(s, aux + l_aux);
3659     if (s == NULL) goto bad_aux;
3660     memmove(p, s, l_aux - (s - aux));
3661     b->l_data -= s - p;
3662     return 0;
3663 
3664  bad_aux:
3665     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
3666     errno = EINVAL;
3667     return -1;
3668 }
3669 
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)3670 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
3671 {
3672     // FIXME: This is not at all efficient!
3673     size_t ln = len >= 0 ? len : strlen(data) + 1;
3674     size_t old_ln = 0;
3675     int need_nul = ln == 0 || data[ln - 1] != '\0';
3676     int save_errno = errno;
3677     int new_tag = 0;
3678     uint8_t *s = bam_aux_get(b,tag), *e;
3679 
3680     if (s) {  // Replacing existing tag
3681         char type = *s;
3682         if (type != 'Z') {
3683             hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
3684             errno = EINVAL;
3685             return -1;
3686         }
3687         s++;
3688         e = memchr(s, '\0', b->data + b->l_data - s);
3689         old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
3690         s -= 3;
3691     } else {
3692         if (errno != ENOENT) { // Invalid aux data, give up
3693             return -1;
3694         } else { // Tag doesn't exist - put it on the end
3695             errno = save_errno;
3696             s = b->data + b->l_data;
3697             new_tag = 3;
3698         }
3699     }
3700 
3701     if (old_ln < ln + need_nul + new_tag) {
3702         ptrdiff_t s_offset = s - b->data;
3703         if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
3704             return -1;
3705         s = b->data + s_offset;
3706     }
3707     if (!new_tag) {
3708         memmove(s + 3 + ln + need_nul,
3709                 s + 3 + old_ln,
3710                 b->l_data - (s + 3 - b->data) - old_ln);
3711     }
3712     b->l_data += new_tag + ln + need_nul - old_ln;
3713 
3714     s[0] = tag[0];
3715     s[1] = tag[1];
3716     s[2] = 'Z';
3717     memmove(s+3,data,ln);
3718     if (need_nul) s[3 + ln] = '\0';
3719     return 0;
3720 }
3721 
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)3722 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
3723 {
3724     uint32_t sz, old_sz = 0, new = 0;
3725     uint8_t *s, type;
3726 
3727     if (val < INT32_MIN || val > UINT32_MAX) {
3728         errno = EOVERFLOW;
3729         return -1;
3730     }
3731     if (val < INT16_MIN)       { type = 'i'; sz = 4; }
3732     else if (val < INT8_MIN)   { type = 's'; sz = 2; }
3733     else if (val < 0)          { type = 'c'; sz = 1; }
3734     else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
3735     else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
3736     else                       { type = 'I'; sz = 4; }
3737 
3738     s = bam_aux_get(b, tag);
3739     if (s) {  // Tag present - how big was the old one?
3740         switch (*s) {
3741             case 'c': case 'C': old_sz = 1; break;
3742             case 's': case 'S': old_sz = 2; break;
3743             case 'i': case 'I': old_sz = 4; break;
3744             default: errno = EINVAL; return -1;  // Not an integer
3745         }
3746     } else {
3747         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
3748             s = b->data + b->l_data;
3749             new = 1;
3750         }  else { // Invalid aux data, give up.
3751             return -1;
3752         }
3753     }
3754 
3755     if (new || old_sz < sz) {
3756         // Make room for new tag
3757         ptrdiff_t s_offset = s - b->data;
3758         if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
3759             return -1;
3760         s =  b->data + s_offset;
3761         if (new) { // Add tag id
3762             *s++ = tag[0];
3763             *s++ = tag[1];
3764         } else {   // Shift following data so we have space
3765             memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
3766         }
3767     } else {
3768         // Reuse old space.  Data value may be bigger than necessary but
3769         // we avoid having to move everything else
3770         sz = old_sz;
3771         type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
3772         assert(type > 0);
3773     }
3774     *s++ = type;
3775 #ifdef HTS_LITTLE_ENDIAN
3776     memcpy(s, &val, sz);
3777 #else
3778     switch (sz) {
3779         case 4:  u32_to_le(val, s); break;
3780         case 2:  u16_to_le(val, s); break;
3781         default: *s = val; break;
3782     }
3783 #endif
3784     b->l_data += (new ? 3 : 0) + sz - old_sz;
3785     return 0;
3786 }
3787 
bam_aux_update_float(bam1_t * b,const char tag[2],float val)3788 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
3789 {
3790     uint8_t *s = bam_aux_get(b, tag);
3791     int shrink = 0, new = 0;
3792 
3793     if (s) { // Tag present - what was it?
3794         switch (*s) {
3795             case 'f': break;
3796             case 'd': shrink = 1; break;
3797             default: errno = EINVAL; return -1;  // Not a float
3798         }
3799     } else {
3800         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
3801             new = 1;
3802         }  else { // Invalid aux data, give up.
3803             return -1;
3804         }
3805     }
3806 
3807     if (new) { // Ensure there's room
3808         if (possibly_expand_bam_data(b, 3 + 4) < 0)
3809             return -1;
3810         s = b->data + b->l_data;
3811         *s++ = tag[0];
3812         *s++ = tag[1];
3813     } else if (shrink) { // Convert non-standard double tag to float
3814         memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
3815         b->l_data -= 4;
3816     }
3817     *s++ = 'f';
3818     float_to_le(val, s);
3819     if (new) b->l_data += 7;
3820 
3821     return 0;
3822 }
3823 
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)3824 int bam_aux_update_array(bam1_t *b, const char tag[2],
3825                          uint8_t type, uint32_t items, void *data)
3826 {
3827     uint8_t *s = bam_aux_get(b, tag);
3828     size_t old_sz = 0, new_sz;
3829     int new = 0;
3830 
3831     if (s) { // Tag present
3832         if (*s != 'B') { errno = EINVAL; return -1; }
3833         old_sz = aux_type2size(s[1]);
3834         if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
3835         old_sz *= le_to_u32(s + 2);
3836     } else {
3837         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
3838             s = b->data + b->l_data;
3839             new = 1;
3840         }  else { // Invalid aux data, give up.
3841             return -1;
3842         }
3843     }
3844 
3845     new_sz = aux_type2size(type);
3846     if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
3847     if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
3848     new_sz *= items;
3849 
3850     if (new || old_sz < new_sz) {
3851         // Make room for new tag
3852         ptrdiff_t s_offset = s - b->data;
3853         if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
3854             return -1;
3855         s =  b->data + s_offset;
3856     }
3857     if (new) { // Add tag id and type
3858         *s++ = tag[0];
3859         *s++ = tag[1];
3860         *s = 'B';
3861         b->l_data += 8 + new_sz;
3862     } else if (old_sz != new_sz) { // shift following data if necessary
3863         memmove(s + 6 + new_sz, s + 6 + old_sz,
3864                 b->l_data - ((s + 6 + old_sz) - b->data));
3865         b->l_data -= old_sz;
3866         b->l_data += new_sz;
3867     }
3868 
3869     s[1] = type;
3870     u32_to_le(items, s + 2);
3871 #ifdef HTS_LITTLE_ENDIAN
3872     memcpy(s + 6, data, new_sz);
3873     return 0;
3874 #else
3875     return aux_to_le(type, s + 6, data, new_sz);
3876 #endif
3877 }
3878 
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)3879 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
3880                                       uint32_t idx)
3881 {
3882     switch (type) {
3883         case 'c': return le_to_i8(s + idx);
3884         case 'C': return s[idx];
3885         case 's': return le_to_i16(s + 2 * idx);
3886         case 'S': return le_to_u16(s + 2 * idx);
3887         case 'i': return le_to_i32(s + 4 * idx);
3888         case 'I': return le_to_u32(s + 4 * idx);
3889         default:
3890             errno = EINVAL;
3891             return 0;
3892     }
3893 }
3894 
bam_aux2i(const uint8_t * s)3895 int64_t bam_aux2i(const uint8_t *s)
3896 {
3897     int type;
3898     type = *s++;
3899     return get_int_aux_val(type, s, 0);
3900 }
3901 
bam_aux2f(const uint8_t * s)3902 double bam_aux2f(const uint8_t *s)
3903 {
3904     int type;
3905     type = *s++;
3906     if (type == 'd') return le_to_double(s);
3907     else if (type == 'f') return le_to_float(s);
3908     else return get_int_aux_val(type, s, 0);
3909 }
3910 
bam_aux2A(const uint8_t * s)3911 char bam_aux2A(const uint8_t *s)
3912 {
3913     int type;
3914     type = *s++;
3915     if (type == 'A') return *(char*)s;
3916     errno = EINVAL;
3917     return 0;
3918 }
3919 
bam_aux2Z(const uint8_t * s)3920 char *bam_aux2Z(const uint8_t *s)
3921 {
3922     int type;
3923     type = *s++;
3924     if (type == 'Z' || type == 'H') return (char*)s;
3925     errno = EINVAL;
3926     return 0;
3927 }
3928 
bam_auxB_len(const uint8_t * s)3929 uint32_t bam_auxB_len(const uint8_t *s)
3930 {
3931     if (s[0] != 'B') {
3932         errno = EINVAL;
3933         return 0;
3934     }
3935     return le_to_u32(s + 2);
3936 }
3937 
bam_auxB2i(const uint8_t * s,uint32_t idx)3938 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
3939 {
3940     uint32_t len = bam_auxB_len(s);
3941     if (idx >= len) {
3942         errno = ERANGE;
3943         return 0;
3944     }
3945     return get_int_aux_val(s[1], s + 6, idx);
3946 }
3947 
bam_auxB2f(const uint8_t * s,uint32_t idx)3948 double bam_auxB2f(const uint8_t *s, uint32_t idx)
3949 {
3950     uint32_t len = bam_auxB_len(s);
3951     if (idx >= len) {
3952         errno = ERANGE;
3953         return 0.0;
3954     }
3955     if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
3956     else return get_int_aux_val(s[1], s + 6, idx);
3957 }
3958 
sam_open_mode(char * mode,const char * fn,const char * format)3959 int sam_open_mode(char *mode, const char *fn, const char *format)
3960 {
3961     // TODO Parse "bam5" etc for compression level
3962     if (format == NULL) {
3963         // Try to pick a format based on the filename extension
3964         char extension[HTS_MAX_EXT_LEN];
3965         if (find_file_extension(fn, extension) < 0) return -1;
3966         return sam_open_mode(mode, fn, extension);
3967     }
3968     else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
3969     else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
3970     else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
3971     else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
3972     else return -1;
3973 
3974     return 0;
3975 }
3976 
3977 // A version of sam_open_mode that can handle ,key=value options.
3978 // The format string is allocated and returned, to be freed by the caller.
3979 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)3980 char *sam_open_mode_opts(const char *fn,
3981                          const char *mode,
3982                          const char *format)
3983 {
3984     char *mode_opts = malloc((format ? strlen(format) : 1) +
3985                              (mode   ? strlen(mode)   : 1) + 12);
3986     char *opts, *cp;
3987     int format_len;
3988 
3989     if (!mode_opts)
3990         return NULL;
3991 
3992     strcpy(mode_opts, mode ? mode : "r");
3993     cp = mode_opts + strlen(mode_opts);
3994 
3995     if (format == NULL) {
3996         // Try to pick a format based on the filename extension
3997         char extension[HTS_MAX_EXT_LEN];
3998         if (find_file_extension(fn, extension) < 0) {
3999             free(mode_opts);
4000             return NULL;
4001         }
4002         if (sam_open_mode(cp, fn, extension) == 0) {
4003             return mode_opts;
4004         } else {
4005             free(mode_opts);
4006             return NULL;
4007         }
4008     }
4009 
4010     if ((opts = strchr(format, ','))) {
4011         format_len = opts-format;
4012     } else {
4013         opts="";
4014         format_len = strlen(format);
4015     }
4016 
4017     if (strncmp(format, "bam", format_len) == 0) {
4018         *cp++ = 'b';
4019     } else if (strncmp(format, "cram", format_len) == 0) {
4020         *cp++ = 'c';
4021     } else if (strncmp(format, "cram2", format_len) == 0) {
4022         *cp++ = 'c';
4023         strcpy(cp, ",VERSION=2.1");
4024         cp += 12;
4025     } else if (strncmp(format, "cram3", format_len) == 0) {
4026         *cp++ = 'c';
4027         strcpy(cp, ",VERSION=3.0");
4028         cp += 12;
4029     } else if (strncmp(format, "sam", format_len) == 0) {
4030         ; // format mode=""
4031     } else if (strncmp(format, "sam.gz", format_len) == 0) {
4032         *cp++ = 'z';
4033     } else {
4034         free(mode_opts);
4035         return NULL;
4036     }
4037 
4038     strcpy(cp, opts);
4039 
4040     return mode_opts;
4041 }
4042 
4043 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)4044 int bam_str2flag(const char *str)
4045 {
4046     char *end, *beg = (char*) str;
4047     long int flag = strtol(str, &end, 0);
4048     if ( end!=str ) return flag;    // the conversion was successful
4049     flag = 0;
4050     while ( *str )
4051     {
4052         end = beg;
4053         while ( *end && *end!=',' ) end++;
4054         if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
4055         else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
4056         else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
4057         else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
4058         else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
4059         else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
4060         else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
4061         else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
4062         else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
4063         else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
4064         else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
4065         else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
4066         else return -1;
4067         if ( !*end ) break;
4068         beg = end + 1;
4069     }
4070     return flag;
4071 }
4072 
bam_flag2str(int flag)4073 char *bam_flag2str(int flag)
4074 {
4075     kstring_t str = {0,0,0};
4076     if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
4077     if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
4078     if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
4079     if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
4080     if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
4081     if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
4082     if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
4083     if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
4084     if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
4085     if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
4086     if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
4087     if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
4088     if ( str.l == 0 ) kputsn("", 0, &str);
4089     return str.s;
4090 }
4091 
4092 
4093 /**************************
4094  *** Pileup and Mpileup ***
4095  **************************/
4096 
4097 #if !defined(BAM_NO_PILEUP)
4098 
4099 #include <assert.h>
4100 
4101 /*******************
4102  *** Memory pool ***
4103  *******************/
4104 
4105 typedef struct {
4106     int k, y;
4107     hts_pos_t x, end;
4108 } cstate_t;
4109 
4110 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
4111 
4112 typedef struct __linkbuf_t {
4113     bam1_t b;
4114     hts_pos_t beg, end;
4115     cstate_t s;
4116     struct __linkbuf_t *next;
4117     bam_pileup_cd cd;
4118 } lbnode_t;
4119 
4120 typedef struct {
4121     int cnt, n, max;
4122     lbnode_t **buf;
4123 } mempool_t;
4124 
mp_init(void)4125 static mempool_t *mp_init(void)
4126 {
4127     mempool_t *mp;
4128     mp = (mempool_t*)calloc(1, sizeof(mempool_t));
4129     return mp;
4130 }
mp_destroy(mempool_t * mp)4131 static void mp_destroy(mempool_t *mp)
4132 {
4133     int k;
4134     for (k = 0; k < mp->n; ++k) {
4135         free(mp->buf[k]->b.data);
4136         free(mp->buf[k]);
4137     }
4138     free(mp->buf);
4139     free(mp);
4140 }
mp_alloc(mempool_t * mp)4141 static inline lbnode_t *mp_alloc(mempool_t *mp)
4142 {
4143     ++mp->cnt;
4144     if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
4145     else return mp->buf[--mp->n];
4146 }
mp_free(mempool_t * mp,lbnode_t * p)4147 static inline void mp_free(mempool_t *mp, lbnode_t *p)
4148 {
4149     --mp->cnt; p->next = 0; // clear lbnode_t::next here
4150     if (mp->n == mp->max) {
4151         mp->max = mp->max? mp->max<<1 : 256;
4152         mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
4153     }
4154     mp->buf[mp->n++] = p;
4155 }
4156 
4157 /**********************
4158  *** CIGAR resolver ***
4159  **********************/
4160 
4161 /* s->k: the index of the CIGAR operator that has just been processed.
4162    s->x: the reference coordinate of the start of s->k
4163    s->y: the query coordinate of the start of s->k
4164  */
resolve_cigar2(bam_pileup1_t * p,hts_pos_t pos,cstate_t * s)4165 static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
4166 {
4167 #define _cop(c) ((c)&BAM_CIGAR_MASK)
4168 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
4169 
4170     bam1_t *b = p->b;
4171     bam1_core_t *c = &b->core;
4172     uint32_t *cigar = bam_get_cigar(b);
4173     int k;
4174     // determine the current CIGAR operation
4175     //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
4176     if (s->k == -1) { // never processed
4177         p->qpos = 0;
4178         if (c->n_cigar == 1) { // just one operation, save a loop
4179           if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
4180         } else { // find the first match or deletion
4181             for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
4182                 int op = _cop(cigar[k]);
4183                 int l = _cln(cigar[k]);
4184                 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
4185                     op == BAM_CEQUAL || op == BAM_CDIFF) break;
4186                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
4187             }
4188             assert(k < c->n_cigar);
4189             s->k = k;
4190         }
4191     } else { // the read has been processed before
4192         int op, l = _cln(cigar[s->k]);
4193         if (pos - s->x >= l) { // jump to the next operation
4194             assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
4195             op = _cop(cigar[s->k+1]);
4196             if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
4197               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
4198                 s->x += l;
4199                 ++s->k;
4200             } else { // find the next M/D/N/=/X
4201               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
4202                 s->x += l;
4203                 for (k = s->k + 1; k < c->n_cigar; ++k) {
4204                     op = _cop(cigar[k]), l = _cln(cigar[k]);
4205                     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
4206                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
4207                 }
4208                 s->k = k;
4209             }
4210             assert(s->k < c->n_cigar); // otherwise a bug
4211         } // else, do nothing
4212     }
4213     { // collect pileup information
4214         int op, l;
4215         op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
4216         p->is_del = p->indel = p->is_refskip = 0;
4217         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
4218             int op2 = _cop(cigar[s->k+1]);
4219             int l2 = _cln(cigar[s->k+1]);
4220             if (op2 == BAM_CDEL) p->indel = -(int)l2;
4221             else if (op2 == BAM_CINS) p->indel = l2;
4222             else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
4223                 int l3 = 0;
4224                 for (k = s->k + 2; k < c->n_cigar; ++k) {
4225                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
4226                     if (op2 == BAM_CINS) l3 += l2;
4227                     else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
4228                 }
4229                 if (l3 > 0) p->indel = l3;
4230             }
4231         }
4232         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
4233             p->qpos = s->y + (pos - s->x);
4234         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
4235             p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
4236             p->is_refskip = (op == BAM_CREF_SKIP);
4237         } // cannot be other operations; otherwise a bug
4238         p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
4239     }
4240     p->cigar_ind = s->k;
4241     return 1;
4242 }
4243 
4244 /*******************************
4245  *** Expansion of insertions ***
4246  *******************************/
4247 
4248 /*
4249  * Fills out the kstring with the padded insertion sequence for the current
4250  * location in 'p'.  If this is not an insertion site, the string is blank.
4251  *
4252  * Returns the length of insertion string on success;
4253  *        -1 on failure.
4254  */
bam_plp_insertion(const bam_pileup1_t * p,kstring_t * ins,int * del_len)4255 int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
4256     int j, k, indel;
4257     uint32_t *cigar;
4258 
4259     if (p->indel <= 0) {
4260         if (ks_resize(ins, 1) < 0)
4261             return -1;
4262         ins->l = 0;
4263         ins->s[0] = '\0';
4264         return 0;
4265     }
4266 
4267     if (del_len)
4268         *del_len = 0;
4269 
4270     // Measure indel length including pads
4271     indel = 0;
4272     k = p->cigar_ind+1;
4273     cigar = bam_get_cigar(p->b);
4274     while (k < p->b->core.n_cigar) {
4275         switch (cigar[k] & BAM_CIGAR_MASK) {
4276         case BAM_CPAD:
4277         case BAM_CINS:
4278             indel += (cigar[k] >> BAM_CIGAR_SHIFT);
4279             break;
4280         default:
4281             k = p->b->core.n_cigar;
4282             break;
4283         }
4284         k++;
4285     }
4286     ins->l = indel;
4287 
4288     // Produce sequence
4289     if (ks_resize(ins, indel+1) < 0)
4290         return -1;
4291     indel = 0;
4292     k = p->cigar_ind+1;
4293     j = 1;
4294     while (k < p->b->core.n_cigar) {
4295         int l, c;
4296         switch (cigar[k] & BAM_CIGAR_MASK) {
4297         case BAM_CPAD:
4298             for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
4299                 ins->s[indel++] = '*';
4300             break;
4301         case BAM_CINS:
4302             for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
4303                 c = seq_nt16_str[bam_seqi(bam_get_seq(p->b),
4304                                           p->qpos + j - p->is_del)];
4305                 ins->s[indel++] = c;
4306             }
4307             break;
4308         case BAM_CDEL:
4309             // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
4310             if (del_len)
4311                 *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
4312             // fall through
4313         default:
4314             k = p->b->core.n_cigar;
4315             break;
4316         }
4317         k++;
4318     }
4319     ins->s[indel] = '\0';
4320 
4321     return indel;
4322 }
4323 
4324 /***********************
4325  *** Pileup iterator ***
4326  ***********************/
4327 
4328 // Dictionary of overlapping reads
4329 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
4330 typedef khash_t(olap_hash) olap_hash_t;
4331 
4332 struct bam_plp_s {
4333     mempool_t *mp;
4334     lbnode_t *head, *tail;
4335     int32_t tid, max_tid;
4336     hts_pos_t pos, max_pos;
4337     int is_eof, max_plp, error, maxcnt;
4338     uint64_t id;
4339     bam_pileup1_t *plp;
4340     // for the "auto" interface only
4341     bam1_t *b;
4342     bam_plp_auto_f func;
4343     void *data;
4344     olap_hash_t *overlaps;
4345 
4346     // For notification of creation and destruction events
4347     // and associated client-owned pointer.
4348     int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
4349     int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
4350 };
4351 
bam_plp_init(bam_plp_auto_f func,void * data)4352 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
4353 {
4354     bam_plp_t iter;
4355     iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
4356     iter->mp = mp_init();
4357     iter->head = iter->tail = mp_alloc(iter->mp);
4358     iter->max_tid = iter->max_pos = -1;
4359     iter->maxcnt = 8000;
4360     if (func) {
4361         iter->func = func;
4362         iter->data = data;
4363         iter->b = bam_init1();
4364     }
4365     return iter;
4366 }
4367 
bam_plp_init_overlaps(bam_plp_t iter)4368 int bam_plp_init_overlaps(bam_plp_t iter)
4369 {
4370     iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
4371     return iter->overlaps ? 0 : -1;
4372 }
4373 
bam_plp_destroy(bam_plp_t iter)4374 void bam_plp_destroy(bam_plp_t iter)
4375 {
4376     lbnode_t *p, *pnext;
4377     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
4378     for (p = iter->head; p != NULL; p = pnext) {
4379         pnext = p->next;
4380         mp_free(iter->mp, p);
4381     }
4382     mp_destroy(iter->mp);
4383     if (iter->b) bam_destroy1(iter->b);
4384     free(iter->plp);
4385     free(iter);
4386 }
4387 
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))4388 void bam_plp_constructor(bam_plp_t plp,
4389                          int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
4390     plp->plp_construct = func;
4391 }
4392 
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))4393 void bam_plp_destructor(bam_plp_t plp,
4394                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
4395     plp->plp_destruct = func;
4396 }
4397 
4398 //---------------------------------
4399 //---  Tweak overlapping reads
4400 //---------------------------------
4401 
4402 /**
4403  *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
4404  *  cigar_iref2iseq_next() - get the next CMATCH base
4405  *  @cigar:       pointer to current cigar block (rw)
4406  *  @cigar_max:   pointer just beyond the last cigar block
4407  *  @icig:        position within the current cigar block (rw)
4408  *  @iseq:        position in the sequence (rw)
4409  *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
4410  *
4411  *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
4412  *  or -2 on error.
4413  */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)4414 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref)
4415 {
4416     hts_pos_t pos = *iref;
4417     if ( pos < 0 ) return -1;
4418     *icig = 0;
4419     *iseq = 0;
4420     *iref = 0;
4421     while ( *cigar<cigar_max )
4422     {
4423         int cig  = (**cigar) & BAM_CIGAR_MASK;
4424         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
4425 
4426         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4427         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
4428         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
4429         {
4430             pos -= ncig;
4431             if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
4432             (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
4433             continue;
4434         }
4435         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4436         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
4437         {
4438             pos -= ncig;
4439             if ( pos<0 ) pos = 0;
4440             (*cigar)++; *icig = 0; *iref += ncig;
4441             continue;
4442         }
4443         hts_log_error("Unexpected cigar %d", cig);
4444         return -2;
4445     }
4446     *iseq = -1;
4447     return -1;
4448 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)4449 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref)
4450 {
4451     while ( *cigar < cigar_max )
4452     {
4453         int cig  = (**cigar) & BAM_CIGAR_MASK;
4454         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
4455 
4456         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
4457         {
4458             if ( *icig >= ncig - 1 ) { *icig = 0;  (*cigar)++; continue; }
4459             (*iseq)++; (*icig)++; (*iref)++;
4460             return BAM_CMATCH;
4461         }
4462         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
4463         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4464         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4465         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
4466         hts_log_error("Unexpected cigar %d", cig);
4467         return -2;
4468     }
4469     *iseq = -1;
4470     *iref = -1;
4471     return -1;
4472 }
4473 
tweak_overlap_quality(bam1_t * a,bam1_t * b)4474 static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
4475 {
4476     uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
4477     uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
4478     hts_pos_t a_icig = 0, a_iseq = 0;
4479     hts_pos_t b_icig = 0, b_iseq = 0;
4480     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
4481     uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
4482 
4483     hts_pos_t iref   = b->core.pos;
4484     hts_pos_t a_iref = iref - a->core.pos;
4485     hts_pos_t b_iref = iref - b->core.pos;
4486     int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
4487     if ( a_ret<0 ) return a_ret<-1 ? -1:0;  // no overlap or error
4488     int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
4489     if ( b_ret<0 ) return b_ret<-1 ? -1:0;  // no overlap or error
4490 
4491     #if DBG
4492         fprintf(stderr,"tweak %s  n_cigar=%d %d  .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
4493             a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
4494     #endif
4495 
4496     int err = 0;
4497     while ( 1 )
4498     {
4499         // Increment reference position
4500         while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
4501             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
4502         if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; }   // done
4503         if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
4504 
4505         while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
4506             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
4507         if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; }  // done
4508         if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
4509 
4510         iref++;
4511         if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue;   // only CMATCH positions, don't know what to do with indels
4512 
4513         if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
4514             return -1;  // Fell off end of sequence, bad CIGAR?
4515 
4516         if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
4517         {
4518             #if DBG
4519                 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
4520             #endif
4521             // we are very confident about this base
4522             int qual = a_qual[a_iseq] + b_qual[b_iseq];
4523             a_qual[a_iseq] = qual>200 ? 200 : qual;
4524             b_qual[b_iseq] = 0;
4525         }
4526         else
4527         {
4528             if ( a_qual[a_iseq] >= b_qual[b_iseq] )
4529             {
4530                 #if DBG
4531                     fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
4532                 #endif
4533                 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];  // not so confident about a_qual anymore given the mismatch
4534                 b_qual[b_iseq] = 0;
4535             }
4536             else
4537             {
4538                 #if DBG
4539                     fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
4540                 #endif
4541                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
4542                 a_qual[a_iseq] = 0;
4543             }
4544         }
4545     }
4546     #if DBG
4547         fprintf(stderr,"\n");
4548     #endif
4549     return err;
4550 }
4551 
4552 // Fix overlapping reads. Simple soft-clipping did not give good results.
4553 // Lowering qualities of unwanted bases is more selective and works better.
4554 //
4555 // Returns 0 on success, -1 on failure
overlap_push(bam_plp_t iter,lbnode_t * node)4556 static int overlap_push(bam_plp_t iter, lbnode_t *node)
4557 {
4558     if ( !iter->overlaps ) return 0;
4559 
4560     // mapped mates and paired reads only
4561     if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
4562 
4563     // no overlap possible, unless some wild cigar
4564     if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
4565          || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
4566          && node->b.core.mpos >= node->end) // for those wild cigars
4567        ) return 0;
4568 
4569     khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
4570     if ( kitr==kh_end(iter->overlaps) )
4571     {
4572         // Only add reads where the mate is still to arrive
4573         if (node->b.core.mpos >= node->b.core.pos ||
4574             ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
4575             int ret;
4576             kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
4577             if (ret < 0) return -1;
4578             kh_value(iter->overlaps, kitr) = node;
4579         }
4580     }
4581     else
4582     {
4583         lbnode_t *a = kh_value(iter->overlaps, kitr);
4584         int err = tweak_overlap_quality(&a->b, &node->b);
4585         kh_del(olap_hash, iter->overlaps, kitr);
4586         assert(a->end-1 == a->s.end);
4587         a->end = bam_endpos(&a->b);
4588         a->s.end = a->end - 1;
4589         return err;
4590     }
4591     return 0;
4592 }
4593 
overlap_remove(bam_plp_t iter,const bam1_t * b)4594 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
4595 {
4596     if ( !iter->overlaps ) return;
4597 
4598     khiter_t kitr;
4599     if ( b )
4600     {
4601         kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
4602         if ( kitr!=kh_end(iter->overlaps) )
4603             kh_del(olap_hash, iter->overlaps, kitr);
4604     }
4605     else
4606     {
4607         // remove all
4608         for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
4609             if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
4610     }
4611 }
4612 
4613 
4614 
4615 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
4616 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
4617 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp64_next(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)4618 const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
4619 {
4620     if (iter->error) { *_n_plp = -1; return NULL; }
4621     *_n_plp = 0;
4622     if (iter->is_eof && iter->head == iter->tail) return NULL;
4623     while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
4624         int n_plp = 0;
4625         // write iter->plp at iter->pos
4626         lbnode_t **pptr = &iter->head;
4627         while (*pptr != iter->tail) {
4628             lbnode_t *p = *pptr;
4629             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
4630                 overlap_remove(iter, &p->b);
4631                 if (iter->plp_destruct)
4632                     iter->plp_destruct(iter->data, &p->b, &p->cd);
4633                 *pptr = p->next; mp_free(iter->mp, p);
4634             }
4635             else {
4636                 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
4637                     if (n_plp == iter->max_plp) { // then double the capacity
4638                         iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
4639                         iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
4640                     }
4641                     iter->plp[n_plp].b = &p->b;
4642                     iter->plp[n_plp].cd = p->cd;
4643                     if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
4644                 }
4645                 pptr = &(*pptr)->next;
4646             }
4647         }
4648         *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
4649         // update iter->tid and iter->pos
4650         if (iter->head != iter->tail) {
4651             if (iter->tid > iter->head->b.core.tid) {
4652                 hts_log_error("Unsorted input. Pileup aborts");
4653                 iter->error = 1;
4654                 *_n_plp = -1;
4655                 return NULL;
4656             }
4657         }
4658         if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
4659             iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
4660         } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
4661             iter->pos = iter->head->beg; // jump to the next position
4662         } else ++iter->pos; // scan contiguously
4663         // return
4664         if (n_plp) return iter->plp;
4665         if (iter->is_eof && iter->head == iter->tail) break;
4666     }
4667     return NULL;
4668 }
4669 
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)4670 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
4671 {
4672     hts_pos_t pos64 = 0;
4673     const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
4674     if (pos64 < INT_MAX) {
4675         *_pos = pos64;
4676     } else {
4677         hts_log_error("Position %"PRId64" too large", pos64);
4678         *_pos = INT_MAX;
4679         iter->error = 1;
4680         *_n_plp = -1;
4681         return NULL;
4682     }
4683     return p;
4684 }
4685 
bam_plp_push(bam_plp_t iter,const bam1_t * b)4686 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
4687 {
4688     if (iter->error) return -1;
4689     if (b) {
4690         if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
4691         // Skip only unmapped reads here, any additional filtering must be done in iter->func
4692         if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
4693         if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
4694         {
4695             overlap_remove(iter, b);
4696             return 0;
4697         }
4698         if (bam_copy1(&iter->tail->b, b) == NULL)
4699             return -1;
4700         iter->tail->b.id = iter->id++;
4701         iter->tail->beg = b->core.pos;
4702         // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
4703         iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
4704         iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
4705         if (b->core.tid < iter->max_tid) {
4706             hts_log_error("The input is not sorted (chromosomes out of order)");
4707             iter->error = 1;
4708             return -1;
4709         }
4710         if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
4711             hts_log_error("The input is not sorted (reads out of order)");
4712             iter->error = 1;
4713             return -1;
4714         }
4715         iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
4716         if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
4717             lbnode_t *next = mp_alloc(iter->mp);
4718             if (!next) {
4719                 iter->error = 1;
4720                 return -1;
4721             }
4722             if (iter->plp_construct)
4723                 iter->plp_construct(iter->data, &iter->tail->b,
4724                                     &iter->tail->cd);
4725             if (overlap_push(iter, iter->tail) < 0) {
4726                 mp_free(iter->mp, next);
4727                 iter->error = 1;
4728                 return -1;
4729             }
4730             iter->tail->next = next;
4731             iter->tail = iter->tail->next;
4732         }
4733     } else iter->is_eof = 1;
4734     return 0;
4735 }
4736 
bam_plp64_auto(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)4737 const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
4738 {
4739     const bam_pileup1_t *plp;
4740     if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
4741     if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4742     else { // no pileup line can be obtained; read alignments
4743         *_n_plp = 0;
4744         if (iter->is_eof) return 0;
4745         int ret;
4746         while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
4747             if (bam_plp_push(iter, iter->b) < 0) {
4748                 *_n_plp = -1;
4749                 return 0;
4750             }
4751             if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4752             // otherwise no pileup line can be returned; read the next alignment.
4753         }
4754         if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
4755         if (bam_plp_push(iter, 0) < 0) {
4756             *_n_plp = -1;
4757             return 0;
4758         }
4759         if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4760         return 0;
4761     }
4762 }
4763 
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)4764 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
4765 {
4766     hts_pos_t pos64 = 0;
4767     const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
4768     if (pos64 < INT_MAX) {
4769         *_pos = pos64;
4770     } else {
4771         hts_log_error("Position %"PRId64" too large", pos64);
4772         *_pos = INT_MAX;
4773         iter->error = 1;
4774         *_n_plp = -1;
4775         return NULL;
4776     }
4777     return p;
4778 }
4779 
bam_plp_reset(bam_plp_t iter)4780 void bam_plp_reset(bam_plp_t iter)
4781 {
4782     overlap_remove(iter, NULL);
4783     iter->max_tid = iter->max_pos = -1;
4784     iter->tid = iter->pos = 0;
4785     iter->is_eof = 0;
4786     while (iter->head != iter->tail) {
4787         lbnode_t *p = iter->head;
4788         iter->head = p->next;
4789         mp_free(iter->mp, p);
4790     }
4791 }
4792 
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)4793 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
4794 {
4795     iter->maxcnt = maxcnt;
4796 }
4797 
4798 /************************
4799  *** Mpileup iterator ***
4800  ************************/
4801 
4802 struct bam_mplp_s {
4803     int n;
4804     int32_t min_tid, *tid;
4805     hts_pos_t min_pos, *pos;
4806     bam_plp_t *iter;
4807     int *n_plp;
4808     const bam_pileup1_t **plp;
4809 };
4810 
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)4811 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
4812 {
4813     int i;
4814     bam_mplp_t iter;
4815     iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
4816     iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
4817     iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
4818     iter->n_plp = (int*)calloc(n, sizeof(int));
4819     iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
4820     iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
4821     iter->n = n;
4822     iter->min_pos = HTS_POS_MAX;
4823     iter->min_tid = (uint32_t)-1;
4824     for (i = 0; i < n; ++i) {
4825         iter->iter[i] = bam_plp_init(func, data[i]);
4826         iter->pos[i] = iter->min_pos;
4827         iter->tid[i] = iter->min_tid;
4828     }
4829     return iter;
4830 }
4831 
bam_mplp_init_overlaps(bam_mplp_t iter)4832 int bam_mplp_init_overlaps(bam_mplp_t iter)
4833 {
4834     int i, r = 0;
4835     for (i = 0; i < iter->n; ++i)
4836         r |= bam_plp_init_overlaps(iter->iter[i]);
4837     return r == 0 ? 0 : -1;
4838 }
4839 
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)4840 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
4841 {
4842     int i;
4843     for (i = 0; i < iter->n; ++i)
4844         iter->iter[i]->maxcnt = maxcnt;
4845 }
4846 
bam_mplp_destroy(bam_mplp_t iter)4847 void bam_mplp_destroy(bam_mplp_t iter)
4848 {
4849     int i;
4850     for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
4851     free(iter->iter); free(iter->pos); free(iter->tid);
4852     free(iter->n_plp); free(iter->plp);
4853     free(iter);
4854 }
4855 
bam_mplp64_auto(bam_mplp_t iter,int * _tid,hts_pos_t * _pos,int * n_plp,const bam_pileup1_t ** plp)4856 int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
4857 {
4858     int i, ret = 0;
4859     hts_pos_t new_min_pos = HTS_POS_MAX;
4860     uint32_t new_min_tid = (uint32_t)-1;
4861     for (i = 0; i < iter->n; ++i) {
4862         if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
4863             int tid;
4864             hts_pos_t pos;
4865             iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
4866             if ( iter->iter[i]->error ) return -1;
4867             if (iter->plp[i]) {
4868                 iter->tid[i] = tid;
4869                 iter->pos[i] = pos;
4870             } else {
4871                 iter->tid[i] = 0;
4872                 iter->pos[i] = 0;
4873             }
4874         }
4875         if (iter->plp[i]) {
4876             if (iter->tid[i] < new_min_tid) {
4877                 new_min_tid = iter->tid[i];
4878                 new_min_pos = iter->pos[i];
4879             } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
4880                 new_min_pos = iter->pos[i];
4881             }
4882         }
4883     }
4884     iter->min_pos = new_min_pos;
4885     iter->min_tid = new_min_tid;
4886     if (new_min_pos == HTS_POS_MAX) return 0;
4887     *_tid = new_min_tid; *_pos = new_min_pos;
4888     for (i = 0; i < iter->n; ++i) {
4889         if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
4890             n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
4891             ++ret;
4892         } else n_plp[i] = 0, plp[i] = 0;
4893     }
4894     return ret;
4895 }
4896 
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)4897 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
4898 {
4899     hts_pos_t pos64 = 0;
4900     int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
4901     if (ret >= 0) {
4902         if (pos64 < INT_MAX) {
4903             *_pos = pos64;
4904         } else {
4905             hts_log_error("Position %"PRId64" too large", pos64);
4906             *_pos = INT_MAX;
4907             return -1;
4908         }
4909     }
4910     return ret;
4911 }
4912 
bam_mplp_reset(bam_mplp_t iter)4913 void bam_mplp_reset(bam_mplp_t iter)
4914 {
4915     int i;
4916     iter->min_pos = HTS_POS_MAX;
4917     iter->min_tid = (uint32_t)-1;
4918     for (i = 0; i < iter->n; ++i) {
4919         bam_plp_reset(iter->iter[i]);
4920         iter->pos[i] = HTS_POS_MAX;
4921         iter->tid[i] = (uint32_t)-1;
4922         iter->n_plp[i] = 0;
4923         iter->plp[i] = NULL;
4924     }
4925 }
4926 
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))4927 void bam_mplp_constructor(bam_mplp_t iter,
4928                           int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
4929     int i;
4930     for (i = 0; i < iter->n; ++i)
4931         bam_plp_constructor(iter->iter[i], func);
4932 }
4933 
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))4934 void bam_mplp_destructor(bam_mplp_t iter,
4935                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
4936     int i;
4937     for (i = 0; i < iter->n; ++i)
4938         bam_plp_destructor(iter->iter[i], func);
4939 }
4940 
4941 #endif // ~!defined(BAM_NO_PILEUP)
4942