1 /*  sam.c -- SAM and BAM file I/O and manipulation.
2 
3     Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd.
4     Copyright (C) 2010, 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28 
29 #include <strings.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <zlib.h>
35 #include <assert.h>
36 #include <signal.h>
37 #include <inttypes.h>
38 #include <unistd.h>
39 
40 // Suppress deprecation message for cigar_tab, which we initialise
41 #include "htslib/hts_defs.h"
42 #undef HTS_DEPRECATED
43 #define HTS_DEPRECATED(message)
44 
45 #include "htslib/sam.h"
46 #include "htslib/bgzf.h"
47 #include "cram/cram.h"
48 #include "hts_internal.h"
49 #include "sam_internal.h"
50 #include "htslib/hfile.h"
51 #include "htslib/hts_endian.h"
52 #include "htslib/hts_expr.h"
53 #include "header.h"
54 
55 #include "htslib/khash.h"
56 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
57 KHASH_SET_INIT_INT(tag)
58 
59 #ifndef EFTYPE
60 #define EFTYPE ENOEXEC
61 #endif
62 #ifndef EOVERFLOW
63 #define EOVERFLOW ERANGE
64 #endif
65 
66 /**********************
67  *** BAM header I/O ***
68  **********************/
69 
70 HTSLIB_EXPORT
71 const int8_t bam_cigar_table[256] = {
72     // 0 .. 47
73     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
74     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
75     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
76 
77     // 48 .. 63  (including =)
78     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
79 
80     // 64 .. 79  (including MIDNHB)
81     -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
82         BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
83 
84     // 80 .. 95  (including SPX)
85     BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
86         BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
87 
88     // 96 .. 127
89     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
90     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
91 
92     // 128 .. 255
93     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
97     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
101 };
102 
sam_hdr_init()103 sam_hdr_t *sam_hdr_init()
104 {
105     sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
106     if (bh == NULL) return NULL;
107 
108     bh->cigar_tab = bam_cigar_table;
109     return bh;
110 }
111 
sam_hdr_destroy(sam_hdr_t * bh)112 void sam_hdr_destroy(sam_hdr_t *bh)
113 {
114     int32_t i;
115 
116     if (bh == NULL) return;
117 
118     if (bh->ref_count > 0) {
119         --bh->ref_count;
120         return;
121     }
122 
123     if (bh->target_name) {
124         for (i = 0; i < bh->n_targets; ++i)
125             free(bh->target_name[i]);
126         free(bh->target_name);
127         free(bh->target_len);
128     }
129     free(bh->text);
130     if (bh->hrecs)
131         sam_hrecs_free(bh->hrecs);
132     if (bh->sdict)
133         kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
134     free(bh);
135 }
136 
137 // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
138 // references before sam_hdr_t::hrecs is populated
sam_hdr_dup_sdict(const sam_hdr_t * h0,sam_hdr_t * h)139 int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
140 {
141     const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
142     khash_t(s2i) *dest_long_refs = kh_init(s2i);
143     int i;
144     if (!dest_long_refs) return -1;
145 
146     for (i = 0; i < h->n_targets; i++) {
147         int ret;
148         khiter_t ksrc, kdest;
149         if (h->target_len[i] < UINT32_MAX) continue;
150         ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
151         if (ksrc == kh_end(src_long_refs)) continue;
152         kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
153         if (ret < 0) {
154             kh_destroy(s2i, dest_long_refs);
155             return -1;
156         }
157         kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
158     }
159 
160     h->sdict = dest_long_refs;
161     return 0;
162 }
163 
sam_hdr_dup(const sam_hdr_t * h0)164 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
165 {
166     if (h0 == NULL) return NULL;
167     sam_hdr_t *h;
168     if ((h = sam_hdr_init()) == NULL) return NULL;
169     // copy the simple data
170     h->n_targets = 0;
171     h->ignore_sam_err = h0->ignore_sam_err;
172     h->l_text = 0;
173 
174     // Then the pointery stuff
175 
176     if (!h0->hrecs) {
177         h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
178         if (!h->target_len) goto fail;
179         h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
180         if (!h->target_name) goto fail;
181 
182         int i;
183         for (i = 0; i < h0->n_targets; ++i) {
184             h->target_len[i] = h0->target_len[i];
185             h->target_name[i] = strdup(h0->target_name[i]);
186             if (!h->target_name[i]) break;
187         }
188         h->n_targets = i;
189         if (i < h0->n_targets) goto fail;
190 
191         if (h0->sdict) {
192             if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
193         }
194     }
195 
196     if (h0->hrecs) {
197         kstring_t tmp = { 0, 0, NULL };
198         if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
199             free(ks_release(&tmp));
200             goto fail;
201         }
202 
203         h->l_text = tmp.l;
204         h->text   = ks_release(&tmp);
205 
206         if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
207             goto fail;
208     } else {
209         h->l_text = h0->l_text;
210         h->text = malloc(h->l_text + 1);
211         if (!h->text) goto fail;
212         memcpy(h->text, h0->text, h->l_text);
213         h->text[h->l_text] = '\0';
214     }
215 
216     return h;
217 
218  fail:
219     sam_hdr_destroy(h);
220     return NULL;
221 }
222 
bam_hdr_read(BGZF * fp)223 sam_hdr_t *bam_hdr_read(BGZF *fp)
224 {
225     sam_hdr_t *h;
226     uint8_t buf[4];
227     int magic_len, has_EOF;
228     int32_t i, name_len, num_names = 0;
229     size_t bufsize;
230     ssize_t bytes;
231     // check EOF
232     has_EOF = bgzf_check_EOF(fp);
233     if (has_EOF < 0) {
234         perror("[W::bam_hdr_read] bgzf_check_EOF");
235     } else if (has_EOF == 0) {
236         hts_log_warning("EOF marker is absent. The input is probably truncated");
237     }
238     // read "BAM1"
239     magic_len = bgzf_read(fp, buf, 4);
240     if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
241         hts_log_error("Invalid BAM binary header");
242         return 0;
243     }
244     h = sam_hdr_init();
245     if (!h) goto nomem;
246 
247     // read plain text and the number of reference sequences
248     bytes = bgzf_read(fp, buf, 4);
249     if (bytes != 4) goto read_err;
250     h->l_text = le_to_u32(buf);
251 
252     bufsize = h->l_text + 1;
253     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
254     h->text = (char*)malloc(bufsize);
255     if (!h->text) goto nomem;
256     h->text[h->l_text] = 0; // make sure it is NULL terminated
257     bytes = bgzf_read(fp, h->text, h->l_text);
258     if (bytes != h->l_text) goto read_err;
259 
260     bytes = bgzf_read(fp, &h->n_targets, 4);
261     if (bytes != 4) goto read_err;
262     if (fp->is_be) ed_swap_4p(&h->n_targets);
263 
264     if (h->n_targets < 0) goto invalid;
265 
266     // read reference sequence names and lengths
267     if (h->n_targets > 0) {
268         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
269         if (!h->target_name) goto nomem;
270         h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
271         if (!h->target_len) goto nomem;
272     }
273     else {
274         h->target_name = NULL;
275         h->target_len = NULL;
276     }
277 
278     for (i = 0; i != h->n_targets; ++i) {
279         bytes = bgzf_read(fp, &name_len, 4);
280         if (bytes != 4) goto read_err;
281         if (fp->is_be) ed_swap_4p(&name_len);
282         if (name_len <= 0) goto invalid;
283 
284         h->target_name[i] = (char*)malloc(name_len);
285         if (!h->target_name[i]) goto nomem;
286         num_names++;
287 
288         bytes = bgzf_read(fp, h->target_name[i], name_len);
289         if (bytes != name_len) goto read_err;
290 
291         if (h->target_name[i][name_len - 1] != '\0') {
292             /* Fix missing NUL-termination.  Is this being too nice?
293                We could alternatively bail out with an error. */
294             char *new_name;
295             if (name_len == INT32_MAX) goto invalid;
296             new_name = realloc(h->target_name[i], name_len + 1);
297             if (new_name == NULL) goto nomem;
298             h->target_name[i] = new_name;
299             h->target_name[i][name_len] = '\0';
300         }
301 
302         bytes = bgzf_read(fp, &h->target_len[i], 4);
303         if (bytes != 4) goto read_err;
304         if (fp->is_be) ed_swap_4p(&h->target_len[i]);
305     }
306     return h;
307 
308  nomem:
309     hts_log_error("Out of memory");
310     goto clean;
311 
312  read_err:
313     if (bytes < 0) {
314         hts_log_error("Error reading BGZF stream");
315     } else {
316         hts_log_error("Truncated BAM header");
317     }
318     goto clean;
319 
320  invalid:
321     hts_log_error("Invalid BAM binary header");
322 
323  clean:
324     if (h != NULL) {
325         h->n_targets = num_names; // ensure we free only allocated target_names
326         sam_hdr_destroy(h);
327     }
328     return NULL;
329 }
330 
bam_hdr_write(BGZF * fp,const sam_hdr_t * h)331 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
332 {
333     int32_t i, name_len, x;
334     kstring_t hdr_ks = { 0, 0, NULL };
335     char *text;
336     uint32_t l_text;
337 
338     if (!h) return -1;
339 
340     if (h->hrecs) {
341         if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
342         if (hdr_ks.l > INT32_MAX) {
343             hts_log_error("Header too long for BAM format");
344             free(hdr_ks.s);
345             return -1;
346         }
347         text = hdr_ks.s;
348         l_text = hdr_ks.l;
349     } else {
350         if (h->l_text > INT32_MAX) {
351             hts_log_error("Header too long for BAM format");
352             return -1;
353         }
354         text = h->text;
355         l_text = h->l_text;
356     }
357     // write "BAM1"
358     if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
359     // write plain text and the number of reference sequences
360     if (fp->is_be) {
361         x = ed_swap_4(l_text);
362         if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
363         if (l_text) {
364             if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
365         }
366         x = ed_swap_4(h->n_targets);
367         if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
368     } else {
369         if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
370         if (l_text) {
371             if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
372         }
373         if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
374     }
375     free(hdr_ks.s);
376     // write sequence names and lengths
377     for (i = 0; i != h->n_targets; ++i) {
378         char *p = h->target_name[i];
379         name_len = strlen(p) + 1;
380         if (fp->is_be) {
381             x = ed_swap_4(name_len);
382             if (bgzf_write(fp, &x, 4) < 0) return -1;
383         } else {
384             if (bgzf_write(fp, &name_len, 4) < 0) return -1;
385         }
386         if (bgzf_write(fp, p, name_len) < 0) return -1;
387         if (fp->is_be) {
388             x = ed_swap_4(h->target_len[i]);
389             if (bgzf_write(fp, &x, 4) < 0) return -1;
390         } else {
391             if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
392         }
393     }
394     if (bgzf_flush(fp) < 0) return -1;
395     return 0;
396 }
397 
sam_parse_region(sam_hdr_t * h,const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,int flags)398 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
399                              hts_pos_t *beg, hts_pos_t *end, int flags) {
400     return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
401 }
402 
403 /*************************
404  *** BAM alignment I/O ***
405  *************************/
406 
bam_init1()407 bam1_t *bam_init1()
408 {
409     return (bam1_t*)calloc(1, sizeof(bam1_t));
410 }
411 
sam_realloc_bam_data(bam1_t * b,size_t desired)412 int sam_realloc_bam_data(bam1_t *b, size_t desired)
413 {
414     uint32_t new_m_data;
415     uint8_t *new_data;
416     new_m_data = desired;
417     kroundup32(new_m_data);
418     if (new_m_data < desired) {
419         errno = ENOMEM; // Not strictly true but we can't store the size
420         return -1;
421     }
422     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
423         new_data = realloc(b->data, new_m_data);
424     } else {
425         if ((new_data = malloc(new_m_data)) != NULL) {
426             if (b->l_data > 0)
427                 memcpy(new_data, b->data,
428                        b->l_data < b->m_data ? b->l_data : b->m_data);
429             bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
430         }
431     }
432     if (!new_data) return -1;
433     b->data = new_data;
434     b->m_data = new_m_data;
435     return 0;
436 }
437 
bam_destroy1(bam1_t * b)438 void bam_destroy1(bam1_t *b)
439 {
440     if (b == 0) return;
441     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
442         free(b->data);
443         if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
444             // In case of reuse
445             b->data = NULL;
446             b->m_data = 0;
447             b->l_data = 0;
448         }
449     }
450 
451     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
452         free(b);
453 }
454 
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)455 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
456 {
457     if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
458     memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
459     memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
460     bdst->l_data = bsrc->l_data;
461     bdst->id = bsrc->id;
462     return bdst;
463 }
464 
bam_dup1(const bam1_t * bsrc)465 bam1_t *bam_dup1(const bam1_t *bsrc)
466 {
467     if (bsrc == NULL) return NULL;
468     bam1_t *bdst = bam_init1();
469     if (bdst == NULL) return NULL;
470     if (bam_copy1(bdst, bsrc) == NULL) {
471         bam_destroy1(bdst);
472         return NULL;
473     }
474     return bdst;
475 }
476 
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,hts_pos_t * rlen,hts_pos_t * qlen)477 static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
478                              hts_pos_t *rlen, hts_pos_t *qlen)
479 {
480     int k;
481     *rlen = *qlen = 0;
482     for (k = 0; k < n_cigar; ++k) {
483         int type = bam_cigar_type(bam_cigar_op(cigar[k]));
484         int len = bam_cigar_oplen(cigar[k]);
485         if (type & 1) *qlen += len;
486         if (type & 2) *rlen += len;
487     }
488 }
489 
subtract_check_underflow(size_t length,size_t * limit)490 static int subtract_check_underflow(size_t length, size_t *limit)
491 {
492     if (length <= *limit) {
493         *limit -= length;
494         return 0;
495     }
496 
497     return -1;
498 }
499 
bam_set1(bam1_t * bam,size_t l_qname,const char * qname,uint16_t flag,int32_t tid,hts_pos_t pos,uint8_t mapq,size_t n_cigar,const uint32_t * cigar,int32_t mtid,hts_pos_t mpos,hts_pos_t isize,size_t l_seq,const char * seq,const char * qual,size_t l_aux)500 int bam_set1(bam1_t *bam,
501              size_t l_qname, const char *qname,
502              uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
503              size_t n_cigar, const uint32_t *cigar,
504              int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
505              size_t l_seq, const char *seq, const char *qual,
506              size_t l_aux)
507 {
508     // use a default qname "*" if none is provided
509     if (l_qname == 0) {
510         l_qname = 1;
511         qname = "*";
512     }
513 
514     // note: the qname is stored nul terminated and padded as described in the
515     // documentation for the bam1_t struct.
516     size_t qname_nuls = 4 - l_qname % 4;
517 
518     // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
519     // can't use bam_endpos() directly as some fields not yet set up.
520     hts_pos_t rlen = 0, qlen = 0;
521     if (!(flag & BAM_FUNMAP)) {
522         bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
523     }
524     if (rlen == 0) {
525         rlen = 1;
526     }
527 
528     // validate parameters
529     if (l_qname > 254) {
530         hts_log_error("Query name too long");
531         errno = EINVAL;
532         return -1;
533     }
534     if (HTS_POS_MAX - rlen <= pos) {
535         hts_log_error("Read ends beyond highest supported position");
536         errno = EINVAL;
537         return -1;
538     }
539     if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
540         hts_log_error("Mapped query must have a CIGAR");
541         errno = EINVAL;
542         return -1;
543     }
544     if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
545         hts_log_error("CIGAR and query sequence are of different length");
546         errno = EINVAL;
547         return -1;
548     }
549 
550     size_t limit = INT32_MAX;
551     int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
552     u    += subtract_check_underflow(n_cigar * 4, &limit);
553     u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
554     u    += subtract_check_underflow(l_seq, &limit);
555     u    += subtract_check_underflow(l_aux, &limit);
556     if (u != 0) {
557         hts_log_error("Size overflow");
558         errno = EINVAL;
559         return -1;
560     }
561 
562     // re-allocate the data buffer as needed.
563     size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
564     if (realloc_bam_data(bam, data_len + l_aux) < 0) {
565         return -1;
566     }
567 
568     bam->l_data = (int)data_len;
569     bam->core.pos = pos;
570     bam->core.tid = tid;
571     bam->core.bin = bam_reg2bin(pos, pos + rlen);
572     bam->core.qual = mapq;
573     bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
574     bam->core.flag = flag;
575     bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
576     bam->core.n_cigar = (uint32_t)n_cigar;
577     bam->core.l_qseq = (int32_t)l_seq;
578     bam->core.mtid = mtid;
579     bam->core.mpos = mpos;
580     bam->core.isize = isize;
581 
582     uint8_t *cp = bam->data;
583     strncpy((char *)cp, qname, l_qname);
584     int i;
585     for (i = 0; i < qname_nuls; i++) {
586         cp[l_qname + i] = '\0';
587     }
588     cp += l_qname + qname_nuls;
589 
590     if (n_cigar > 0) {
591         memcpy(cp, cigar, n_cigar * 4);
592     }
593     cp += n_cigar * 4;
594 
595     for (i = 0; i + 1 < l_seq; i += 2) {
596         *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]];
597     }
598     for (; i < l_seq; i++) {
599         *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
600     }
601 
602     if (qual) {
603         memcpy(cp, qual, l_seq);
604     }
605     else {
606         memset(cp, '\xff', l_seq);
607     }
608 
609     return (int)data_len;
610 }
611 
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)612 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
613 {
614     int k;
615     hts_pos_t l;
616     for (k = l = 0; k < n_cigar; ++k)
617         if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
618             l += bam_cigar_oplen(cigar[k]);
619     return l;
620 }
621 
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)622 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
623 {
624     int k;
625     hts_pos_t l;
626     for (k = l = 0; k < n_cigar; ++k)
627         if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
628             l += bam_cigar_oplen(cigar[k]);
629     return l;
630 }
631 
bam_endpos(const bam1_t * b)632 hts_pos_t bam_endpos(const bam1_t *b)
633 {
634     hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
635     if (rlen == 0) rlen = 1;
636     return b->core.pos + rlen;
637 }
638 
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)639 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
640 {
641     bam1_core_t *c = &b->core;
642     uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
643     uint8_t *CG;
644 
645     // test where there is a real CIGAR in the CG tag to move
646     if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
647     cigar0 = bam_get_cigar(b);
648     if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
649     fake_bytes = c->n_cigar * 4;
650     int saved_errno = errno;
651     CG = bam_aux_get(b, "CG");
652     if (!CG) {
653         if (errno != ENOENT) return -1;  // Bad aux data
654         errno = saved_errno; // restore errno on expected no-CG-tag case
655         return 0;
656     }
657     if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
658         return 0; // not of type B,I
659     CG_len = le_to_u32(CG + 2);
660     if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
661 
662     // move from the CG tag to the right position
663     cigar_st = (uint8_t*)cigar0 - b->data;
664     c->n_cigar = CG_len;
665     n_cigar4 = c->n_cigar * 4;
666     CG_st = CG - b->data - 2;
667     CG_en = CG_st + 8 + n_cigar4;
668     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
669     b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
670     memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
671     memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
672     if (ori_len > CG_en) // move data after the CG tag
673         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
674     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
675     if (recal_bin)
676         b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
677     if (give_warning)
678         hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
679     return 1;
680 }
681 
aux_type2size(uint8_t type)682 static inline int aux_type2size(uint8_t type)
683 {
684     switch (type) {
685     case 'A': case 'c': case 'C':
686         return 1;
687     case 's': case 'S':
688         return 2;
689     case 'i': case 'I': case 'f':
690         return 4;
691     case 'd':
692         return 8;
693     case 'Z': case 'H': case 'B':
694         return type;
695     default:
696         return 0;
697     }
698 }
699 
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)700 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
701 {
702     uint32_t *cigar = (uint32_t*)(data + c->l_qname);
703     uint32_t i;
704     for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
705 }
706 
707 // Fix bad records where qname is not terminated correctly.
fixup_missing_qname_nul(bam1_t * b)708 static int fixup_missing_qname_nul(bam1_t *b) {
709     bam1_core_t *c = &b->core;
710 
711     // Note this is called before c->l_extranul is added to c->l_qname
712     if (c->l_extranul > 0) {
713         b->data[c->l_qname++] = '\0';
714         c->l_extranul--;
715     } else {
716         if (b->l_data > INT_MAX - 4) return -1;
717         if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
718         b->l_data += 4;
719         b->data[c->l_qname++] = '\0';
720         c->l_extranul = 3;
721     }
722     return 0;
723 }
724 
725 /*
726  * Note a second interface that returns a bam pointer instead would avoid bam_copy1
727  * in multi-threaded handling.  This may be worth considering for htslib2.
728  */
bam_read1(BGZF * fp,bam1_t * b)729 int bam_read1(BGZF *fp, bam1_t *b)
730 {
731     bam1_core_t *c = &b->core;
732     int32_t block_len, ret, i;
733     uint32_t x[8], new_l_data;
734 
735     b->l_data = 0;
736 
737     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
738         if (ret == 0) return -1; // normal end-of-file
739         else return -2; // truncated
740     }
741     if (fp->is_be)
742         ed_swap_4p(&block_len);
743     if (block_len < 32) return -4;  // block_len includes core data
744     if (bgzf_read(fp, x, 32) != 32) return -3;
745     if (fp->is_be) {
746         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
747     }
748     c->tid = x[0]; c->pos = (int32_t)x[1];
749     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
750     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
751     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
752     c->l_qseq = x[4];
753     c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
754 
755     new_l_data = block_len - 32 + c->l_extranul;
756     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
757     if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
758         + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
759         return -4;
760     if (realloc_bam_data(b, new_l_data) < 0) return -4;
761     b->l_data = new_l_data;
762 
763     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
764     if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
765         if (fixup_missing_qname_nul(b) < 0) return -4;
766     }
767     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
768     c->l_qname += c->l_extranul;
769     if (b->l_data < c->l_qname ||
770         bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
771         return -4;
772     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
773     if (bam_tag2cigar(b, 0, 0) < 0)
774         return -4;
775 
776     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
777         hts_pos_t rlen, qlen;
778         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
779         if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
780         b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
781         // Sanity check for broken CIGAR alignments
782         if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
783             hts_log_error("CIGAR and query sequence lengths differ for %s",
784                     bam_get_qname(b));
785             return -4;
786         }
787     }
788 
789     return 4 + block_len;
790 }
791 
bam_write1(BGZF * fp,const bam1_t * b)792 int bam_write1(BGZF *fp, const bam1_t *b)
793 {
794     const bam1_core_t *c = &b->core;
795     uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
796     int i, ok;
797     if (c->l_qname - c->l_extranul > 255) {
798         hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
799         errno = EOVERFLOW;
800         return -1;
801     }
802     if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
803     if (c->pos > INT_MAX ||
804         c->mpos > INT_MAX ||
805         c->isize < INT_MIN || c->isize > INT_MAX) {
806         hts_log_error("Positional data is too large for BAM format");
807         return -1;
808     }
809     x[0] = c->tid;
810     x[1] = c->pos;
811     x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
812     if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
813     else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
814     x[4] = c->l_qseq;
815     x[5] = c->mtid;
816     x[6] = c->mpos;
817     x[7] = c->isize;
818     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
819     if (fp->is_be) {
820         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
821         y = block_len;
822         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
823         swap_data(c, b->l_data, b->data, 1);
824     } else {
825         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
826     }
827     if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
828     if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
829     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
830         if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
831     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
832         uint8_t buf[8];
833         uint32_t cigar_st, cigar_en, cigar[2];
834         hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
835         if (cigreflen >= (1<<28)) {
836             // Length of reference covered is greater than the biggest
837             // CIGAR operation currently allowed.
838             hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
839                           " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
840                           bam_get_qname(b), c->n_cigar, cigreflen);
841             return -1;
842         }
843         cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
844         cigar_en = cigar_st + c->n_cigar * 4;
845         cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
846         cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
847         u32_to_le(cigar[0], buf);
848         u32_to_le(cigar[1], buf + 4);
849         if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
850         if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
851         if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
852         u32_to_le(c->n_cigar, buf);
853         if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
854         if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
855     }
856     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
857     return ok? 4 + block_len : -1;
858 }
859 
860 /*
861  * Write a BAM file and append to the in-memory index simultaneously.
862  */
bam_write_idx1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)863 static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
864     BGZF *bfp = fp->fp.bgzf;
865 
866     if (!fp->idx)
867         return bam_write1(bfp, b);
868 
869     uint32_t block_len = b->l_data - b->core.l_extranul + 32;
870     if (bgzf_flush_try(bfp, 4 + block_len) < 0)
871         return -1;
872     if (!bfp->mt)
873         hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
874     else
875         bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
876 
877     int ret = bam_write1(bfp, b);
878     if (ret < 0)
879         return -1;
880 
881     if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
882         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
883                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
884         ret = -1;
885     }
886 
887     return ret;
888 }
889 
890 /*
891  * Set the qname in a BAM record
892  */
bam_set_qname(bam1_t * rec,const char * qname)893 int bam_set_qname(bam1_t *rec, const char *qname)
894 {
895     if (!rec) return -1;
896     if (!qname || !*qname) return -1;
897 
898     size_t old_len = rec->core.l_qname;
899     size_t new_len = strlen(qname) + 1;
900     if (new_len < 1 || new_len > 255) return -1;
901 
902     int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
903 
904     size_t new_data_len = rec->l_data - old_len + new_len + extranul;
905     if (realloc_bam_data(rec, new_data_len) < 0) return -1;
906 
907     // Make room
908     if (new_len + extranul != rec->core.l_qname)
909         memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
910     // Copy in new name and pad if needed
911     memcpy(rec->data, qname, new_len);
912     int n;
913     for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
914 
915     rec->l_data = new_data_len;
916     rec->core.l_qname = new_len + extranul;
917     rec->core.l_extranul = extranul;
918 
919     return 0;
920 }
921 
922 /********************
923  *** BAM indexing ***
924  ********************/
925 
sam_index(htsFile * fp,int min_shift)926 static hts_idx_t *sam_index(htsFile *fp, int min_shift)
927 {
928     int n_lvls, i, fmt, ret;
929     bam1_t *b;
930     hts_idx_t *idx;
931     sam_hdr_t *h;
932     h = sam_hdr_read(fp);
933     if (h == NULL) return NULL;
934     if (min_shift > 0) {
935         hts_pos_t max_len = 0, s;
936         for (i = 0; i < h->n_targets; ++i) {
937             hts_pos_t len = sam_hdr_tid2len(h, i);
938             if (max_len < len) max_len = len;
939         }
940         max_len += 256;
941         for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
942         fmt = HTS_FMT_CSI;
943     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
944     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
945     b = bam_init1();
946     while ((ret = sam_read1(fp, h, b)) >= 0) {
947         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
948         if (ret < 0) { // unsorted or doesn't fit
949             hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
950             goto err;
951         }
952     }
953     if (ret < -1) goto err; // corrupted BAM file
954 
955     hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
956     sam_hdr_destroy(h);
957     bam_destroy1(b);
958     return idx;
959 
960 err:
961     bam_destroy1(b);
962     hts_idx_destroy(idx);
963     return NULL;
964 }
965 
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)966 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
967 {
968     hts_idx_t *idx;
969     htsFile *fp;
970     int ret = 0;
971 
972     if ((fp = hts_open(fn, "r")) == 0) return -2;
973     if (nthreads)
974         hts_set_threads(fp, nthreads);
975 
976     switch (fp->format.format) {
977     case cram:
978 
979         ret = cram_index_build(fp->fp.cram, fn, fnidx);
980         break;
981 
982     case bam:
983     case sam:
984         if (fp->format.compression != bgzf) {
985             hts_log_error("%s file \"%s\" not BGZF compressed",
986                           fp->format.format == bam ? "BAM" : "SAM", fn);
987             ret = -1;
988             break;
989         }
990         idx = sam_index(fp, min_shift);
991         if (idx) {
992             ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
993             if (ret < 0) ret = -4;
994             hts_idx_destroy(idx);
995         }
996         else ret = -1;
997         break;
998 
999     default:
1000         ret = -3;
1001         break;
1002     }
1003     hts_close(fp);
1004 
1005     return ret;
1006 }
1007 
sam_index_build2(const char * fn,const char * fnidx,int min_shift)1008 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1009 {
1010     return sam_index_build3(fn, fnidx, min_shift, 0);
1011 }
1012 
sam_index_build(const char * fn,int min_shift)1013 int sam_index_build(const char *fn, int min_shift)
1014 {
1015     return sam_index_build3(fn, NULL, min_shift, 0);
1016 }
1017 
1018 // Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1019 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)1020 int bam_index_build(const char *fn, int min_shift)
1021 {
1022     return sam_index_build2(fn, NULL, min_shift);
1023 }
1024 
1025 // Initialise fp->idx for the current format type.
1026 // This must be called after the header has been written but no other data.
sam_idx_init(htsFile * fp,sam_hdr_t * h,int min_shift,const char * fnidx)1027 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1028     fp->fnidx = fnidx;
1029     if (fp->format.format == bam || fp->format.format == bcf ||
1030         (fp->format.format == sam && fp->format.compression == bgzf)) {
1031         int n_lvls, fmt = HTS_FMT_CSI;
1032         if (min_shift > 0) {
1033             int64_t max_len = 0, s;
1034             int i;
1035             for (i = 0; i < h->n_targets; ++i)
1036                 if (max_len < h->target_len[i]) max_len = h->target_len[i];
1037             max_len += 256;
1038             for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1039 
1040         } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1041 
1042         fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1043         return fp->idx ? 0 : -1;
1044     }
1045 
1046     if (fp->format.format == cram) {
1047         fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1048         return fp->fp.cram->idxfp ? 0 : -1;
1049     }
1050 
1051     return -1;
1052 }
1053 
1054 // Finishes an index. Call after the last record has been written.
1055 // Returns 0 on success, <0 on failure.
sam_idx_save(htsFile * fp)1056 int sam_idx_save(htsFile *fp) {
1057     if (fp->format.format == bam || fp->format.format == bcf ||
1058         fp->format.format == vcf || fp->format.format == sam) {
1059         int ret;
1060         if ((ret = sam_state_destroy(fp)) < 0) {
1061             errno = -ret;
1062             return -1;
1063         }
1064         if (bgzf_flush(fp->fp.bgzf) < 0)
1065             return -1;
1066         hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1067 
1068         if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1069             return -1;
1070 
1071         return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1072 
1073     } else if (fp->format.format == cram) {
1074         // flushed and closed by cram_close
1075     }
1076 
1077     return 0;
1078 }
1079 
sam_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1080 static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1081 {
1082     htsFile *fp = (htsFile *)fpv;
1083     bam1_t *b = bv;
1084     fp->line.l = 0;
1085     int ret = sam_read1(fp, fp->bam_header, b);
1086     if (ret >= 0) {
1087         *tid = b->core.tid;
1088         *beg = b->core.pos;
1089         *end = bam_endpos(b);
1090     }
1091     return ret;
1092 }
1093 
1094 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_readrec_rest(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1095 static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1096 {
1097     htsFile *fp = (htsFile *)fpv;
1098     bam1_t *b = bv;
1099     fp->line.l = 0;
1100     int ret = sam_read1(fp, fp->bam_header, b);
1101     return ret;
1102 }
1103 
1104 // Internal (for now) func used by bam_sym_lookup.  This is copied from
1105 // samtools/bam.c.
bam_get_library(const bam_hdr_t * h,const bam1_t * b)1106 static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1107 {
1108     const char *rg;
1109     kstring_t lib = { 0, 0, NULL };
1110     rg = (char *)bam_aux_get(b, "RG");
1111 
1112     if (!rg)
1113         return NULL;
1114     else
1115         rg++;
1116 
1117     if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1118         return NULL;
1119 
1120     static char LB_text[1024];
1121     int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1122 
1123     memcpy(LB_text, lib.s, len);
1124     LB_text[len] = 0;
1125 
1126     free(lib.s);
1127 
1128     return LB_text;
1129 }
1130 
1131 
1132 // Bam record pointer and SAM header combined
1133 typedef struct {
1134     const sam_hdr_t *h;
1135     const bam1_t *b;
1136 } hb_pair;
1137 
1138 // Looks up variable names in str and replaces them with their value.
1139 // Also supports aux tags.
1140 //
1141 // Note the expression parser deliberately overallocates str size so it
1142 // is safe to use memcmp over strcmp.
bam_sym_lookup(void * data,char * str,char ** end,hts_expr_val_t * res)1143 static int bam_sym_lookup(void *data, char *str, char **end,
1144                           hts_expr_val_t *res) {
1145     hb_pair *hb = (hb_pair *)data;
1146     const bam1_t *b = hb->b;
1147 
1148     res->is_str = 0;
1149     switch(*str) {
1150     case 'c':
1151         if (memcmp(str, "cigar", 5) == 0) {
1152             *end = str+5;
1153             res->is_str = 1;
1154             ks_clear(&res->s);
1155             uint32_t *cigar = bam_get_cigar(b);
1156             int i, n = b->core.n_cigar, r = 0;
1157             if (n) {
1158                 for (i = 0; i < n; i++) {
1159                     r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1160                     r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1161                 }
1162                 r |= kputs("", &res->s) < 0;
1163             } else {
1164                 r |= kputs("*", &res->s) < 0;
1165             }
1166             return r ? -1 : 0;
1167         }
1168         break;
1169 
1170     case 'e':
1171         if (memcmp(str, "endpos", 6) == 0) {
1172             *end = str+6;
1173             res->d = bam_endpos(b);
1174             return 0;
1175         }
1176         break;
1177 
1178     case 'f':
1179         if (memcmp(str, "flag", 4) == 0) {
1180             str = *end = str+4;
1181             if (*str != '.') {
1182                 res->d = b->core.flag;
1183                 return 0;
1184             } else {
1185                 str++;
1186                 if (!memcmp(str, "paired", 6)) {
1187                     *end = str+6;
1188                     res->d = b->core.flag & BAM_FPAIRED;
1189                     return 0;
1190                 } else if (!memcmp(str, "proper_pair", 11)) {
1191                     *end = str+11;
1192                     res->d = b->core.flag & BAM_FPROPER_PAIR;
1193                     return 0;
1194                 } else if (!memcmp(str, "unmap", 5)) {
1195                     *end = str+5;
1196                     res->d = b->core.flag & BAM_FUNMAP;
1197                     return 0;
1198                 } else if (!memcmp(str, "munmap", 6)) {
1199                     *end = str+6;
1200                     res->d = b->core.flag & BAM_FMUNMAP;
1201                     return 0;
1202                 } else if (!memcmp(str, "reverse", 7)) {
1203                     *end = str+7;
1204                     res->d = b->core.flag & BAM_FREVERSE;
1205                     return 0;
1206                 } else if (!memcmp(str, "mreverse", 8)) {
1207                     *end = str+8;
1208                     res->d = b->core.flag & BAM_FMREVERSE;
1209                     return 0;
1210                 } else if (!memcmp(str, "read1", 5)) {
1211                     *end = str+5;
1212                     res->d = b->core.flag & BAM_FREAD1;
1213                     return 0;
1214                 } else if (!memcmp(str, "read2", 5)) {
1215                     *end = str+5;
1216                     res->d = b->core.flag & BAM_FREAD2;
1217                     return 0;
1218                 } else if (!memcmp(str, "secondary", 9)) {
1219                     *end = str+9;
1220                     res->d = b->core.flag & BAM_FSECONDARY;
1221                     return 0;
1222                 } else if (!memcmp(str, "qcfail", 6)) {
1223                     *end = str+6;
1224                     res->d = b->core.flag & BAM_FQCFAIL;
1225                     return 0;
1226                 } else if (!memcmp(str, "dup", 3)) {
1227                     *end = str+3;
1228                     res->d = b->core.flag & BAM_FDUP;
1229                     return 0;
1230                 } else if (!memcmp(str, "supplementary", 13)) {
1231                     *end = str+13;
1232                     res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1233                     return 0;
1234                 } else {
1235                     hts_log_error("Unrecognised flag string");
1236                     return -1;
1237                 }
1238             }
1239         }
1240         break;
1241 
1242     case 'l':
1243         if (memcmp(str, "library", 7) == 0) {
1244             *end = str+7;
1245             res->is_str = 1;
1246             const char *lib = bam_get_library(hb->h, b);
1247             kputs(lib ? lib : "", ks_clear(&res->s));
1248             return 0;
1249         }
1250         break;
1251 
1252     case 'm':
1253         if (memcmp(str, "mapq", 4) == 0) {
1254             *end = str+4;
1255             res->d = b->core.qual;
1256             return 0;
1257         } else if (memcmp(str, "mpos", 4) == 0) {
1258             *end = str+4;
1259             res->d = b->core.mpos+1;
1260             return 0;
1261         } else if (memcmp(str, "mrname", 6) == 0) {
1262             *end = str+6;
1263             res->is_str = 1;
1264             const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1265             kputs(rn ? rn : "*", ks_clear(&res->s));
1266             return 0;
1267         } else if (memcmp(str, "mrefid", 6) == 0) {
1268             *end = str+6;
1269             res->d = b->core.mtid;
1270             return 0;
1271         }
1272         break;
1273 
1274     case 'n':
1275         if (memcmp(str, "ncigar", 6) == 0) {
1276             *end = str+6;
1277             res->d = b->core.n_cigar;
1278             return 0;
1279         }
1280         break;
1281 
1282     case 'p':
1283         if (memcmp(str, "pos", 3) == 0) {
1284             *end = str+3;
1285             res->d = b->core.pos+1;
1286             return 0;
1287         } else if (memcmp(str, "pnext", 5) == 0) {
1288             *end = str+5;
1289             res->d = b->core.mpos+1;
1290             return 0;
1291         }
1292         break;
1293 
1294     case 'q':
1295         if (memcmp(str, "qlen", 4) == 0) {
1296             *end = str+4;
1297             res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1298             return 0;
1299         } else if (memcmp(str, "qname", 5) == 0) {
1300             *end = str+5;
1301             res->is_str = 1;
1302             kputs(bam_get_qname(b), ks_clear(&res->s));
1303             return 0;
1304         } else if (memcmp(str, "qual", 4) == 0) {
1305             *end = str+4;
1306             ks_clear(&res->s);
1307             if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1308                 return -1;
1309             memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1310             res->s.l = b->core.l_qseq;
1311             res->is_str = 1;
1312             return 0;
1313         }
1314         break;
1315 
1316     case 'r':
1317         if (memcmp(str, "rlen", 4) == 0) {
1318             *end = str+4;
1319             res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1320             return 0;
1321         } else if (memcmp(str, "rname", 5) == 0) {
1322             *end = str+5;
1323             res->is_str = 1;
1324             const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1325             kputs(rn ? rn : "*", ks_clear(&res->s));
1326             return 0;
1327         } else if (memcmp(str, "rnext", 5) == 0) {
1328             *end = str+5;
1329             res->is_str = 1;
1330             const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1331             kputs(rn ? rn : "*", ks_clear(&res->s));
1332             return 0;
1333         } else if (memcmp(str, "refid", 5) == 0) {
1334             *end = str+5;
1335             res->d = b->core.tid;
1336             return 0;
1337         }
1338         break;
1339 
1340     case 's':
1341         if (memcmp(str, "seq", 3) == 0) {
1342             *end = str+3;
1343             ks_clear(&res->s);
1344             if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1345                 return -1;
1346             nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1347             res->s.s[b->core.l_qseq] = 0;
1348             res->s.l = b->core.l_qseq;
1349             res->is_str = 1;
1350             return 0;
1351         }
1352         break;
1353 
1354     case 't':
1355         if (memcmp(str, "tlen", 4) == 0) {
1356             *end = str+4;
1357             res->d = b->core.isize;
1358             return 0;
1359         }
1360         break;
1361 
1362     case '[':
1363         if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1364             /* aux tags */
1365             *end = str+4;
1366 
1367             uint8_t *aux = bam_aux_get(b, str+1);
1368             if (aux) {
1369                 // we define the truth of a tag to be its presence, even if 0.
1370                 res->is_true = 1;
1371                 switch (*aux) {
1372                 case 'Z':
1373                 case 'H':
1374                     res->is_str = 1;
1375                     kputs((char *)aux+1, ks_clear(&res->s));
1376                     break;
1377 
1378                 case 'A':
1379                     res->is_str = 1;
1380                     kputsn((char *)aux+1, 1, ks_clear(&res->s));
1381                     break;
1382 
1383                 case 'i': case 'I':
1384                 case 's': case 'S':
1385                 case 'c': case 'C':
1386                     res->is_str = 0;
1387                     res->d = bam_aux2i(aux);
1388                     break;
1389 
1390                 case 'f':
1391                 case 'd':
1392                     res->is_str = 0;
1393                     res->d = bam_aux2f(aux);
1394                     break;
1395 
1396                 default:
1397                     hts_log_error("Aux type '%c not yet supported by filters",
1398                                   *aux);
1399                     return -1;
1400                 }
1401                 return 0;
1402 
1403             } else {
1404                 // hence absent tags are always false (and strings)
1405                 res->is_str = 1;
1406                 res->s.l = 0;
1407                 res->d = 0;
1408                 res->is_true = 0;
1409                 return 0;
1410             }
1411         }
1412         break;
1413     }
1414 
1415     // All successful matches in switch should return 0.
1416     // So if we didn't match, it's a parse error.
1417     return -1;
1418 }
1419 
1420 // Returns 1 when accepted by the filter, 0 if not, -1 on error.
sam_passes_filter(const sam_hdr_t * h,const bam1_t * b,hts_filter_t * filt)1421 int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1422 {
1423     hb_pair hb = {h, b};
1424     hts_expr_val_t res;
1425     if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) {
1426         hts_log_error("Couldn't process filter expression");
1427         hts_expr_val_free(&res);
1428         return -1;
1429     }
1430 
1431     int t = res.is_true;
1432     hts_expr_val_free(&res);
1433 
1434     return t;
1435 }
1436 
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1437 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1438 {
1439     htsFile *fp = fpv;
1440     bam1_t *b = bv;
1441     int pass_filter, ret;
1442 
1443     do {
1444         ret = cram_get_bam_seq(fp->fp.cram, &b);
1445         if (ret < 0)
1446             return cram_eof(fp->fp.cram) ? -1 : -2;
1447 
1448         if (bam_tag2cigar(b, 1, 1) < 0)
1449             return -2;
1450 
1451         *tid = b->core.tid;
1452         *beg = b->core.pos;
1453         *end = bam_endpos(b);
1454 
1455         if (fp->filter) {
1456             pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1457             if (pass_filter < 0)
1458                 return -2;
1459         } else {
1460             pass_filter = 1;
1461         }
1462     } while (pass_filter == 0);
1463 
1464     return ret;
1465 }
1466 
cram_pseek(void * fp,int64_t offset,int whence)1467 static int cram_pseek(void *fp, int64_t offset, int whence)
1468 {
1469     cram_fd *fd =  (cram_fd *)fp;
1470 
1471     if ((0 != cram_seek(fd, offset, SEEK_SET))
1472      && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1473         return -1;
1474 
1475     fd->curr_position = offset;
1476 
1477     if (fd->ctr) {
1478         cram_free_container(fd->ctr);
1479         if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1480             cram_free_container(fd->ctr_mt);
1481 
1482         fd->ctr = NULL;
1483         fd->ctr_mt = NULL;
1484         fd->ooc = 0;
1485     }
1486 
1487     return 0;
1488 }
1489 
1490 /*
1491  * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1492  *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1493  *   container previously fetched. It was designed like this to integrate with the functionality
1494  *   of the iterator stepping logic.
1495  */
1496 
cram_ptell(void * fp)1497 static int64_t cram_ptell(void *fp)
1498 {
1499     cram_fd *fd = (cram_fd *)fp;
1500     cram_container *c;
1501     cram_slice *s;
1502     int64_t ret = -1L;
1503 
1504     if (fd) {
1505         if ((c = fd->ctr) != NULL) {
1506             if ((s = c->slice) != NULL && s->max_rec) {
1507                 if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1508                     fd->curr_position += c->offset + c->length;
1509             }
1510         }
1511         ret = fd->curr_position;
1512     }
1513 
1514     return ret;
1515 }
1516 
bam_pseek(void * fp,int64_t offset,int whence)1517 static int bam_pseek(void *fp, int64_t offset, int whence)
1518 {
1519     BGZF *fd = (BGZF *)fp;
1520 
1521     return bgzf_seek(fd, offset, whence);
1522 }
1523 
bam_ptell(void * fp)1524 static int64_t bam_ptell(void *fp)
1525 {
1526     BGZF *fd = (BGZF *)fp;
1527     if (!fd)
1528         return -1L;
1529 
1530     return bgzf_tell(fd);
1531 }
1532 
1533 
1534 
index_load(htsFile * fp,const char * fn,const char * fnidx,int flags)1535 static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1536 {
1537     switch (fp->format.format) {
1538     case bam:
1539     case sam:
1540         return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1541 
1542     case cram: {
1543         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1544 
1545         // Cons up a fake "index" just pointing at the associated cram_fd:
1546         hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1547         if (idx == NULL) return NULL;
1548         idx->fmt = HTS_FMT_CRAI;
1549         idx->cram = fp->fp.cram;
1550         return (hts_idx_t *) idx;
1551         }
1552 
1553     default:
1554         return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1555     }
1556 }
1557 
sam_index_load3(htsFile * fp,const char * fn,const char * fnidx,int flags)1558 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1559 {
1560     return index_load(fp, fn, fnidx, flags);
1561 }
1562 
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)1563 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1564     return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1565 }
1566 
sam_index_load(htsFile * fp,const char * fn)1567 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1568 {
1569     return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1570 }
1571 
cram_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)1572 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1573 {
1574     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1575     hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1576     if (iter == NULL) return NULL;
1577 
1578     // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1579     // the readrec function:
1580     iter->is_cram = 1;
1581     iter->read_rest = 1;
1582     iter->off = NULL;
1583     iter->bins.a = NULL;
1584     iter->readrec = readrec;
1585 
1586     if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1587         cram_range r = { tid, beg+1, end };
1588         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1589 
1590         iter->curr_off = 0;
1591         // The following fields are not required by hts_itr_next(), but are
1592         // filled in in case user code wants to look at them.
1593         iter->tid = tid;
1594         iter->beg = beg;
1595         iter->end = end;
1596 
1597         switch (ret) {
1598         case 0:
1599             break;
1600 
1601         case -2:
1602             // No data vs this ref, so mark iterator as completed.
1603             // Same as HTS_IDX_NONE.
1604             iter->finished = 1;
1605             break;
1606 
1607         default:
1608             free(iter);
1609             return NULL;
1610         }
1611     }
1612     else switch (tid) {
1613     case HTS_IDX_REST:
1614         iter->curr_off = 0;
1615         break;
1616     case HTS_IDX_NONE:
1617         iter->curr_off = 0;
1618         iter->finished = 1;
1619         break;
1620     default:
1621         hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1622         abort();
1623         break;
1624     }
1625 
1626     return iter;
1627 }
1628 
sam_itr_queryi(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1629 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1630 {
1631     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1632     if (idx == NULL)
1633         return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1634     else if (cidx->fmt == HTS_FMT_CRAI)
1635         return cram_itr_query(idx, tid, beg, end, sam_readrec);
1636     else
1637         return hts_itr_query(idx, tid, beg, end, sam_readrec);
1638 }
1639 
cram_name2id(void * fdv,const char * ref)1640 static int cram_name2id(void *fdv, const char *ref)
1641 {
1642     cram_fd *fd = (cram_fd *) fdv;
1643     return sam_hdr_name2tid(fd->header, ref);
1644 }
1645 
sam_itr_querys(const hts_idx_t * idx,sam_hdr_t * hdr,const char * region)1646 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1647 {
1648     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1649     return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1650                           cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1651                           sam_readrec);
1652 }
1653 
sam_itr_regarray(const hts_idx_t * idx,sam_hdr_t * hdr,char ** regarray,unsigned int regcount)1654 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1655 {
1656     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1657     hts_reglist_t *r_list = NULL;
1658     int r_count = 0;
1659 
1660     if (!cidx || !hdr)
1661         return NULL;
1662 
1663     hts_itr_t *itr = NULL;
1664     if (cidx->fmt == HTS_FMT_CRAI) {
1665         r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1666         if (!r_list)
1667             return NULL;
1668         itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1669                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1670     } else {
1671         r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1672         if (!r_list)
1673             return NULL;
1674         itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1675                    hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1676     }
1677 
1678     if (!itr)
1679         hts_reglist_free(r_list, r_count);
1680 
1681     return itr;
1682 }
1683 
sam_itr_regions(const hts_idx_t * idx,sam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)1684 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1685 {
1686     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1687 
1688     if(!cidx || !hdr || !reglist)
1689         return NULL;
1690 
1691     if (cidx->fmt == HTS_FMT_CRAI)
1692         return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1693                    hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1694     else
1695         return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1696                    hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1697 }
1698 
1699 /**********************
1700  *** SAM header I/O ***
1701  **********************/
1702 
1703 #include "htslib/kseq.h"
1704 #include "htslib/kstring.h"
1705 
sam_hdr_parse(size_t l_text,const char * text)1706 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1707 {
1708     sam_hdr_t *bh = sam_hdr_init();
1709     if (!bh) return NULL;
1710 
1711     if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1712         sam_hdr_destroy(bh);
1713         return NULL;
1714     }
1715 
1716     return bh;
1717 }
1718 
valid_sam_header_type(const char * s)1719 static int valid_sam_header_type(const char *s) {
1720     if (s[0] != '@') return 0;
1721     switch (s[1]) {
1722     case 'H':
1723         return s[2] == 'D' && s[3] == '\t';
1724     case 'S':
1725         return s[2] == 'Q' && s[3] == '\t';
1726     case 'R':
1727     case 'P':
1728         return s[2] == 'G' && s[3] == '\t';
1729     case 'C':
1730         return s[2] == 'O';
1731     }
1732     return 0;
1733 }
1734 
1735 // Minimal sanitisation of a header to ensure.
1736 // - null terminated string.
1737 // - all lines start with @ (also implies no blank lines).
1738 //
1739 // Much more could be done, but currently is not, including:
1740 // - checking header types are known (HD, SQ, etc).
1741 // - syntax (eg checking tab separated fields).
1742 // - validating n_targets matches @SQ records.
1743 // - validating target lengths against @SQ records.
sam_hdr_sanitise(sam_hdr_t * h)1744 static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1745     if (!h)
1746         return NULL;
1747 
1748     // Special case for empty headers.
1749     if (h->l_text == 0)
1750         return h;
1751 
1752     size_t i;
1753     unsigned int lnum = 0;
1754     char *cp = h->text, last = '\n';
1755     for (i = 0; i < h->l_text; i++) {
1756         // NB: l_text excludes terminating nul.  This finds early ones.
1757         if (cp[i] == 0)
1758             break;
1759 
1760         // Error on \n[^@], including duplicate newlines
1761         if (last == '\n') {
1762             lnum++;
1763             if (cp[i] != '@') {
1764                 hts_log_error("Malformed SAM header at line %u", lnum);
1765                 sam_hdr_destroy(h);
1766                 return NULL;
1767             }
1768         }
1769 
1770         last = cp[i];
1771     }
1772 
1773     if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1774         size_t j = i;
1775         while (j < h->l_text && cp[j] == '\0') j++;
1776         if (j < h->l_text)
1777             hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1778     }
1779 
1780     // Add trailing newline and/or trailing nul if required.
1781     if (last != '\n') {
1782         hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1783 
1784         if (h->l_text < 2 || i >= h->l_text - 2) {
1785             if (h->l_text >= SIZE_MAX - 2) {
1786                 hts_log_error("No room for extra newline");
1787                 sam_hdr_destroy(h);
1788                 return NULL;
1789             }
1790 
1791             cp = realloc(h->text, (size_t) h->l_text+2);
1792             if (!cp) {
1793                 sam_hdr_destroy(h);
1794                 return NULL;
1795             }
1796             h->text = cp;
1797         }
1798         cp[i++] = '\n';
1799 
1800         // l_text may be larger already due to multiple nul padding
1801         if (h->l_text < i)
1802             h->l_text = i;
1803         cp[h->l_text] = '\0';
1804     }
1805 
1806     return h;
1807 }
1808 
known_stderr(const char * tool,const char * advice)1809 static void known_stderr(const char *tool, const char *advice) {
1810     hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
1811     hts_log_warning("%s", advice);
1812 }
1813 
warn_if_known_stderr(const char * line)1814 static void warn_if_known_stderr(const char *line) {
1815     if (strstr(line, "M::bwa_idx_load_from_disk") != NULL)
1816         known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
1817     else if (strstr(line, "M::mem_pestat") != NULL)
1818         known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
1819     else if (strstr(line, "loaded/built the index") != NULL)
1820         known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
1821 }
1822 
sam_hdr_create(htsFile * fp)1823 static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1824     kstring_t str = { 0, 0, NULL };
1825     khint_t k;
1826     sam_hdr_t* h = sam_hdr_init();
1827     const char *q, *r;
1828     char* sn = NULL;
1829     khash_t(s2i) *d = kh_init(s2i);
1830     khash_t(s2i) *long_refs = NULL;
1831     if (!h || !d)
1832         goto error;
1833 
1834     int ret, has_SQ = 0;
1835     int next_c = '@';
1836     while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1837         if (fp->line.s[0] != '@')
1838             break;
1839 
1840         if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1841             has_SQ = 1;
1842             hts_pos_t ln = -1;
1843             for (q = fp->line.s + 4;; ++q) {
1844                 if (strncmp(q, "SN:", 3) == 0) {
1845                     q += 3;
1846                     for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1847 
1848                     if (sn) {
1849                         hts_log_warning("SQ header line has more than one SN: tag");
1850                         free(sn);
1851                     }
1852                     sn = (char*)calloc(r - q + 1, 1);
1853                     if (!sn)
1854                         goto error;
1855 
1856                     strncpy(sn, q, r - q);
1857                     q = r;
1858                 } else {
1859                     if (strncmp(q, "LN:", 3) == 0)
1860                         ln = strtoll(q + 3, (char**)&q, 10);
1861                 }
1862 
1863                 while (*q != '\t' && *q != '\n' && *q != '\0')
1864                     ++q;
1865                 if (*q == '\0' || *q == '\n')
1866                     break;
1867             }
1868             if (sn) {
1869                 if (ln >= 0) {
1870                     int absent;
1871                     k = kh_put(s2i, d, sn, &absent);
1872                     if (absent < 0)
1873                         goto error;
1874 
1875                     if (!absent) {
1876                         hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
1877                         free(sn);
1878                     } else {
1879                         sn = NULL;
1880                         if (ln >= UINT32_MAX) {
1881                             // Stash away ref length that
1882                             // doesn't fit in target_len array
1883                             int k2;
1884                             if (!long_refs) {
1885                                 long_refs = kh_init(s2i);
1886                                 if (!long_refs)
1887                                     goto error;
1888                             }
1889                             k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1890                             if (absent < 0)
1891                                 goto error;
1892                             kh_val(long_refs, k2) = ln;
1893                             kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1894                                             | UINT32_MAX);
1895                         } else {
1896                             kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1897                         }
1898                     }
1899                 } else {
1900                     hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1901                     warn_if_known_stderr(fp->line.s);
1902                     free(sn);
1903                 }
1904             } else {
1905                 hts_log_warning("Ignored @SQ line with missing SN: tag");
1906                 warn_if_known_stderr(fp->line.s);
1907             }
1908             sn = NULL;
1909         }
1910         else if (!valid_sam_header_type(fp->line.s)) {
1911             hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1912             warn_if_known_stderr(fp->line.s);
1913             goto error;
1914         }
1915 
1916         if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1917             goto error;
1918 
1919         if (kputc('\n', &str) < 0)
1920             goto error;
1921 
1922         if (fp->is_bgzf) {
1923             next_c = bgzf_peek(fp->fp.bgzf);
1924         } else {
1925             unsigned char nc;
1926             ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1927             next_c = pret > 0 ? nc : pret - 1;
1928         }
1929         if (next_c < -1)
1930             goto error;
1931     }
1932     if (next_c != '@')
1933         fp->line.l = 0;
1934 
1935     if (ret < -1)
1936         goto error;
1937 
1938     if (!has_SQ && fp->fn_aux) {
1939         kstring_t line = { 0, 0, NULL };
1940 
1941         /* The reference index (.fai) is actually needed here */
1942         char *fai_fn = fp->fn_aux;
1943         char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1944         if (fn_delim)
1945             fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1946 
1947         hFILE* f = hopen(fai_fn, "r");
1948         int e = 0, absent;
1949         if (f == NULL)
1950             goto error;
1951 
1952         while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
1953             char* tab = strchr(line.s, '\t');
1954             hts_pos_t ln;
1955 
1956             if (tab == NULL)
1957                 continue;
1958 
1959             sn = (char*)calloc(tab-line.s+1, 1);
1960             if (!sn) {
1961                 e = 1;
1962                 break;
1963             }
1964             memcpy(sn, line.s, tab-line.s);
1965             k = kh_put(s2i, d, sn, &absent);
1966             if (absent < 0) {
1967                 e = 1;
1968                 break;
1969             }
1970 
1971             ln = strtoll(tab, NULL, 10);
1972 
1973             if (!absent) {
1974                 hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
1975                 free(sn);
1976                 sn = NULL;
1977             } else {
1978                 sn = NULL;
1979                 if (ln >= UINT32_MAX) {
1980                     // Stash away ref length that
1981                     // doesn't fit in target_len array
1982                     khint_t k2;
1983                     int absent = -1;
1984                     if (!long_refs) {
1985                         long_refs = kh_init(s2i);
1986                         if (!long_refs) {
1987                             e = 1;
1988                             break;
1989                         }
1990                     }
1991                     k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1992                     if (absent < 0) {
1993                          e = 1;
1994                          break;
1995                     }
1996                     kh_val(long_refs, k2) = ln;
1997                     kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1998                                     | UINT32_MAX);
1999                 } else {
2000                     kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2001                 }
2002                 has_SQ = 1;
2003             }
2004 
2005             e |= kputs("@SQ\tSN:", &str) < 0;
2006             e |= kputsn(line.s, tab - line.s, &str) < 0;
2007             e |= kputs("\tLN:", &str) < 0;
2008             e |= kputll(ln, &str) < 0;
2009             e |= kputc('\n', &str) < 0;
2010             if (e)
2011                 break;
2012         }
2013 
2014         ks_free(&line);
2015         if (hclose(f) != 0) {
2016             hts_log_error("Error on closing %s", fai_fn);
2017             e = 1;
2018         }
2019         if (e)
2020             goto error;
2021     }
2022 
2023     if (has_SQ) {
2024         // Populate the targets array
2025         h->n_targets = kh_size(d);
2026 
2027         h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
2028         if (!h->target_name) {
2029             h->n_targets = 0;
2030             goto error;
2031         }
2032 
2033         h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
2034         if (!h->target_len) {
2035             h->n_targets = 0;
2036             goto error;
2037         }
2038 
2039         for (k = kh_begin(d); k != kh_end(d); ++k) {
2040             if (!kh_exist(d, k))
2041                 continue;
2042 
2043             h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
2044             h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
2045             kh_val(d, k) >>= 32;
2046         }
2047     }
2048 
2049     // Repurpose sdict to hold any references longer than UINT32_MAX
2050     h->sdict = long_refs;
2051 
2052     kh_destroy(s2i, d);
2053 
2054     if (str.l == 0)
2055         kputsn("", 0, &str);
2056     h->l_text = str.l;
2057     h->text = ks_release(&str);
2058     fp->bam_header = sam_hdr_sanitise(h);
2059     fp->bam_header->ref_count = 1;
2060 
2061     return fp->bam_header;
2062 
2063  error:
2064     if (h && d && (!h->target_name || !h->target_len)) {
2065         for (k = kh_begin(d); k != kh_end(d); ++k)
2066             if (kh_exist(d, k)) free((void *)kh_key(d, k));
2067     }
2068     sam_hdr_destroy(h);
2069     ks_free(&str);
2070     kh_destroy(s2i, d);
2071     kh_destroy(s2i, long_refs);
2072     if (sn) free(sn);
2073     return NULL;
2074 }
2075 
sam_hdr_read(htsFile * fp)2076 sam_hdr_t *sam_hdr_read(htsFile *fp)
2077 {
2078     if (!fp) {
2079         errno = EINVAL;
2080         return NULL;
2081     }
2082 
2083     switch (fp->format.format) {
2084     case bam:
2085         return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
2086 
2087     case cram:
2088         return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
2089 
2090     case sam:
2091         return sam_hdr_create(fp);
2092 
2093     case fastq_format:
2094     case fasta_format:
2095         return sam_hdr_init();
2096 
2097     case empty_format:
2098         errno = EPIPE;
2099         return NULL;
2100 
2101     default:
2102         errno = EFTYPE;
2103         return NULL;
2104     }
2105 }
2106 
sam_hdr_write(htsFile * fp,const sam_hdr_t * h)2107 int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
2108 {
2109     if (!fp || !h) {
2110         errno = EINVAL;
2111         return -1;
2112     }
2113 
2114     switch (fp->format.format) {
2115     case binary_format:
2116         fp->format.category = sequence_data;
2117         fp->format.format = bam;
2118         /* fall-through */
2119     case bam:
2120         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
2121         break;
2122 
2123     case cram: {
2124         cram_fd *fd = fp->fp.cram;
2125         if (cram_set_header2(fd, h) < 0) return -1;
2126         if (fp->fn_aux)
2127             cram_load_reference(fd, fp->fn_aux);
2128         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
2129         }
2130         break;
2131 
2132     case text_format:
2133         fp->format.category = sequence_data;
2134         fp->format.format = sam;
2135         /* fall-through */
2136     case sam: {
2137         if (!h->hrecs && !h->text)
2138             return 0;
2139         char *text;
2140         kstring_t hdr_ks = { 0, 0, NULL };
2141         size_t l_text;
2142         ssize_t bytes;
2143         int r = 0, no_sq = 0;
2144 
2145         if (h->hrecs) {
2146             if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2147                 return -1;
2148             text = hdr_ks.s;
2149             l_text = hdr_ks.l;
2150         } else {
2151             const char *p = NULL;
2152             do {
2153                 const char *q = p == NULL ? h->text : p + 4;
2154                 p = strstr(q, "@SQ\t");
2155             } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2156             no_sq = p == NULL;
2157             text = h->text;
2158             l_text = h->l_text;
2159         }
2160 
2161         if (fp->is_bgzf) {
2162             bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2163         } else {
2164             bytes = hwrite(fp->fp.hfile, text, l_text);
2165         }
2166         free(hdr_ks.s);
2167         if (bytes != l_text)
2168             return -1;
2169 
2170         if (no_sq) {
2171             int i;
2172             for (i = 0; i < h->n_targets; ++i) {
2173                 fp->line.l = 0;
2174                 r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2175                 r |= kputs(h->target_name[i], &fp->line) < 0;
2176                 r |= kputsn("\tLN:", 4, &fp->line) < 0;
2177                 r |= kputw(h->target_len[i], &fp->line) < 0;
2178                 r |= kputc('\n', &fp->line) < 0;
2179                 if (r != 0)
2180                     return -1;
2181 
2182                 if (fp->is_bgzf) {
2183                     bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2184                 } else {
2185                     bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2186                 }
2187                 if (bytes != fp->line.l)
2188                     return -1;
2189             }
2190         }
2191         if (fp->is_bgzf) {
2192             if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2193         } else {
2194             if (hflush(fp->fp.hfile) != 0) return -1;
2195         }
2196         }
2197         break;
2198 
2199     case fastq_format:
2200     case fasta_format:
2201         // Nothing to output; FASTQ has no file headers.
2202         break;
2203 
2204     default:
2205         errno = EBADF;
2206         return -1;
2207     }
2208     return 0;
2209 }
2210 
old_sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)2211 static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2212 {
2213     char *p, *q, *beg = NULL, *end = NULL, *newtext;
2214     size_t new_l_text;
2215     if (!h || !key)
2216         return -1;
2217 
2218     if (h->l_text > 3) {
2219         if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2220             if ((p = strchr(h->text, '\n')) == 0) return -1;
2221             *p = '\0'; // for strstr call
2222 
2223             char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2224 
2225             if ((q = strstr(h->text, tmp)) != 0) { // key exists
2226                 *p = '\n'; // change back
2227 
2228                 // mark the key:val
2229                 beg = q;
2230                 for (q += 4; *q != '\n' && *q != '\t'; ++q);
2231                 end = q;
2232 
2233                 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2234                     && strlen(val) == end - beg - 4)
2235                      return 0; // val is the same, no need to change
2236 
2237             } else {
2238                 beg = end = p;
2239                 *p = '\n';
2240             }
2241         }
2242     }
2243     if (beg == NULL) { // no @HD
2244         new_l_text = h->l_text;
2245         if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2246             return -1;
2247         new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2248         if (val) {
2249             if (new_l_text > SIZE_MAX - strlen(val) - 5)
2250                 return -1;
2251             new_l_text += strlen(val) + 4;
2252         }
2253         newtext = (char*)malloc(new_l_text + 1);
2254         if (!newtext) return -1;
2255 
2256         if (val)
2257             snprintf(newtext, new_l_text + 1,
2258                     "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2259         else
2260             snprintf(newtext, new_l_text + 1,
2261                     "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2262     } else { // has @HD but different or no key
2263         new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2264         if (val) {
2265             if (new_l_text > SIZE_MAX - strlen(val) - 5)
2266                 return -1;
2267             new_l_text += strlen(val) + 4;
2268         }
2269         newtext = (char*)malloc(new_l_text + 1);
2270         if (!newtext) return -1;
2271 
2272         if (val) {
2273             snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2274                     (int) (beg - h->text), h->text, key, val, end);
2275         } else { //delete key
2276             snprintf(newtext, new_l_text + 1, "%.*s%s",
2277                     (int) (beg - h->text), h->text, end);
2278         }
2279     }
2280     free(h->text);
2281     h->text = newtext;
2282     h->l_text = new_l_text;
2283     return 0;
2284 }
2285 
2286 
sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)2287 int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2288 {
2289     if (!h || !key)
2290         return -1;
2291 
2292     if (!h->hrecs)
2293         return old_sam_hdr_change_HD(h, key, val);
2294 
2295     if (val) {
2296         if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2297             return -1;
2298     } else {
2299         if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2300             return -1;
2301     }
2302     return sam_hdr_rebuild(h);
2303 }
2304 /**********************
2305  *** SAM record I/O ***
2306  **********************/
2307 
sam_parse_B_vals(char type,uint32_t n,char * in,char ** end,char * r,bam1_t * b)2308 static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
2309                             char *r, bam1_t *b)
2310 {
2311     int orig_l = b->l_data;
2312     char *q = in;
2313     int32_t size;
2314     size_t bytes;
2315     int overflow = 0;
2316 
2317     size = aux_type2size(type);
2318     if (size <= 0 || size > 4) {
2319         hts_log_error("Unrecognized type B:%c", type);
2320         return -1;
2321     }
2322 
2323     // Ensure space for type + values
2324     bytes = (size_t) n * (size_t) size;
2325     if (bytes / size != n
2326         || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2327         hts_log_error("Out of memory");
2328         return -1;
2329     }
2330 
2331     b->data[b->l_data++] = 'B';
2332     b->data[b->l_data++] = type;
2333     i32_to_le(n, b->data + b->l_data);
2334     b->l_data += sizeof(uint32_t);
2335     // This ensures that q always ends up at the next comma after
2336     // reading a number even if it's followed by junk.  It
2337     // prevents the possibility of trying to read more than n items.
2338 #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2339     if (type == 'c') {
2340         while (q < r) {
2341             *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
2342             b->l_data++;
2343             skip_to_comma_(q);
2344         }
2345     } else if (type == 'C') {
2346         while (q < r) {
2347             if (*q != '-') {
2348                 *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
2349                 b->l_data++;
2350             } else {
2351                 overflow = 1;
2352             }
2353             skip_to_comma_(q);
2354         }
2355     } else if (type == 's') {
2356         while (q < r) {
2357             i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
2358             b->l_data += 2;
2359             skip_to_comma_(q);
2360         }
2361     } else if (type == 'S') {
2362         while (q < r) {
2363             if (*q != '-') {
2364                 u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
2365                 b->l_data += 2;
2366             } else {
2367                 overflow = 1;
2368             }
2369             skip_to_comma_(q);
2370         }
2371     } else if (type == 'i') {
2372         while (q < r) {
2373             i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
2374             b->l_data += 4;
2375             skip_to_comma_(q);
2376         }
2377     } else if (type == 'I') {
2378         while (q < r) {
2379             if (*q != '-') {
2380                 u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
2381                 b->l_data += 4;
2382             } else {
2383                 overflow = 1;
2384             }
2385             skip_to_comma_(q);
2386         }
2387     } else if (type == 'f') {
2388         while (q < r) {
2389             float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2390             b->l_data += 4;
2391             skip_to_comma_(q);
2392         }
2393     } else {
2394         hts_log_error("Unrecognized type B:%c", type);
2395         return -1;
2396     }
2397 
2398     if (!overflow) {
2399         *end = q;
2400         return 0;
2401     } else {
2402         int64_t max = 0, min = 0, val;
2403         // Given type was incorrect.  Try to rescue the situation.
2404         q = in;
2405         overflow = 0;
2406         b->l_data = orig_l;
2407         // Find out what range of values is present
2408         while (q < r) {
2409             val = hts_str2int(q + 1, &q, 64, &overflow);
2410             if (max < val) max = val;
2411             if (min > val) min = val;
2412             skip_to_comma_(q);
2413         }
2414         // Retry with appropriate type
2415         if (!overflow) {
2416             if (min < 0) {
2417                 if (min >= INT8_MIN && max <= INT8_MAX) {
2418                     return sam_parse_B_vals('c', n, in, end, r, b);
2419                 } else if (min >= INT16_MIN && max <= INT16_MAX) {
2420                     return sam_parse_B_vals('s', n, in, end, r, b);
2421                 } else if (min >= INT32_MIN && max <= INT32_MAX) {
2422                     return sam_parse_B_vals('i', n, in, end, r, b);
2423                 }
2424             } else {
2425                 if (max < UINT8_MAX) {
2426                     return sam_parse_B_vals('C', n, in, end, r, b);
2427                 } else if (max <= UINT16_MAX) {
2428                     return sam_parse_B_vals('S', n, in, end, r, b);
2429                 } else if (max <= UINT32_MAX) {
2430                     return sam_parse_B_vals('I', n, in, end, r, b);
2431                 }
2432             }
2433         }
2434         // If here then at least one of the values is too big to store
2435         hts_log_error("Numeric value in B array out of allowed range");
2436         return -1;
2437     }
2438 #undef skip_to_comma_
2439 }
2440 
parse_sam_flag(char * v,char ** rv,int * overflow)2441 static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2442     if (*v >= '1' && *v <= '9') {
2443         return hts_str2uint(v, rv, 16, overflow);
2444     }
2445     else if (*v == '0') {
2446         // handle single-digit "0" directly; otherwise it's hex or octal
2447         if (v[1] == '\t') { *rv = v+1; return 0; }
2448         else {
2449             unsigned long val = strtoul(v, rv, 0);
2450             if (val > 65535) { *overflow = 1; return 65535; }
2451             return val;
2452         }
2453     }
2454     else {
2455         // TODO implement symbolic flag letters
2456         *rv = v;
2457         return 0;
2458     }
2459 }
2460 
2461 // Parse tag line and append to bam object b.
2462 // Shared by both SAM and FASTQ parsers.
2463 //
2464 // The difference between the two is how lenient we are to recognising
2465 // non-compliant strings.  The FASTQ parser glosses over arbitrary
2466 // non-SAM looking strings.
aux_parse(char * start,char * end,bam1_t * b,int lenient,khash_t (tag)* tag_whitelist)2467 static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2468                             khash_t(tag) *tag_whitelist) {
2469     int overflow = 0;
2470     int checkpoint;
2471     char logbuf[40];
2472     char *q = start, *p = end;
2473 
2474 #define _parse_err(cond, ...)                   \
2475     do {                                        \
2476         if (cond) {                             \
2477             if (lenient) {                      \
2478                 while (q < p && !isspace_c(*q))   \
2479                     q++;                        \
2480                 while (q < p && isspace_c(*q))    \
2481                     q++;                        \
2482                 b->l_data = checkpoint;         \
2483                 goto loop;                      \
2484             } else {                            \
2485                 hts_log_error(__VA_ARGS__);     \
2486                 goto err_ret;                   \
2487             }                                   \
2488         }                                       \
2489     } while (0)
2490 
2491     while (q < p) loop: {
2492         char type;
2493         checkpoint = b->l_data;
2494         if (p - q < 5) {
2495             if (lenient) {
2496                 break;
2497             } else {
2498                 hts_log_error("Incomplete aux field");
2499                 goto err_ret;
2500             }
2501         }
2502         _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2503 
2504         if (lenient && (q[2] | q[4]) != ':') {
2505             while (q < p && !isspace_c(*q))
2506                 q++;
2507             while (q < p && isspace_c(*q))
2508                 q++;
2509             continue;
2510         }
2511 
2512         if (tag_whitelist) {
2513             int tt = q[0]*256 + q[1];
2514             if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2515                 while (q < p && *q != '\t')
2516                     q++;
2517                 continue;
2518             }
2519         }
2520 
2521         // Copy over id
2522         if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2523         memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2524         q += 3; type = *q++; ++q; // q points to value
2525         if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2526             _parse_err(*q <= '\t', "incomplete aux field");
2527 
2528         // Ensure enough space for a double + type allocated.
2529         if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2530 
2531         if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2532             b->data[b->l_data++] = 'A';
2533             b->data[b->l_data++] = *q++;
2534         } else if (type == 'i' || type == 'I') {
2535             if (*q == '-') {
2536                 int32_t x = hts_str2int(q, &q, 32, &overflow);
2537                 if (x >= INT8_MIN) {
2538                     b->data[b->l_data++] = 'c';
2539                     b->data[b->l_data++] = x;
2540                 } else if (x >= INT16_MIN) {
2541                     b->data[b->l_data++] = 's';
2542                     i16_to_le(x, b->data + b->l_data);
2543                     b->l_data += 2;
2544                 } else {
2545                     b->data[b->l_data++] = 'i';
2546                     i32_to_le(x, b->data + b->l_data);
2547                     b->l_data += 4;
2548                 }
2549             } else {
2550                 uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2551                 if (x <= UINT8_MAX) {
2552                     b->data[b->l_data++] = 'C';
2553                     b->data[b->l_data++] = x;
2554                 } else if (x <= UINT16_MAX) {
2555                     b->data[b->l_data++] = 'S';
2556                     u16_to_le(x, b->data + b->l_data);
2557                     b->l_data += 2;
2558                 } else {
2559                     b->data[b->l_data++] = 'I';
2560                     u32_to_le(x, b->data + b->l_data);
2561                     b->l_data += 4;
2562                 }
2563             }
2564         } else if (type == 'f') {
2565             b->data[b->l_data++] = 'f';
2566             float_to_le(strtod(q, &q), b->data + b->l_data);
2567             b->l_data += sizeof(float);
2568         } else if (type == 'd') {
2569             b->data[b->l_data++] = 'd';
2570             double_to_le(strtod(q, &q), b->data + b->l_data);
2571             b->l_data += sizeof(double);
2572         } else if (type == 'Z' || type == 'H') {
2573             char *end = strchr(q, '\t');
2574             if (!end) end = q + strlen(q);
2575             _parse_err(type == 'H' && ((end-q)&1) != 0,
2576                        "hex field does not have an even number of digits");
2577             b->data[b->l_data++] = type;
2578             if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2579             memcpy(b->data + b->l_data, q, end - q);
2580             b->l_data += end - q;
2581             b->data[b->l_data++] = '\0';
2582             q = end;
2583         } else if (type == 'B') {
2584             uint32_t n;
2585             char *r;
2586             type = *q++; // q points to the first ',' following the typing byte
2587             _parse_err(*q && *q != ',' && *q != '\t',
2588                        "B aux field type not followed by ','");
2589 
2590             for (r = q, n = 0; *r > '\t'; ++r)
2591                 if (*r == ',') ++n;
2592 
2593             if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2594                 goto err_ret;
2595         } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2596 
2597         while (*q > '\t') { q++; } // Skip any junk to next tab
2598         q++;
2599     }
2600 
2601     _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2602 #undef _parse_err
2603 
2604     return 0;
2605 
2606 err_ret:
2607     return -2;
2608 }
2609 
sam_parse1(kstring_t * s,sam_hdr_t * h,bam1_t * b)2610 int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2611 {
2612 #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2613 
2614 #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2615 
2616 // Macro that operates on 64-bits at a time.
2617 #define COPY_MINUS_N(to,from,n,l,failed)                        \
2618     do {                                                        \
2619         uint64_u *from8 = (uint64_u *)(from);                   \
2620         uint64_u *to8 = (uint64_u *)(to);                       \
2621         uint64_t uflow = 0;                                     \
2622         size_t l8 = (l)>>3, i;                                  \
2623         for (i = 0; i < l8; i++) {                              \
2624             to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2625             uflow |= to8[i];                                    \
2626         }                                                       \
2627         for (i<<=3; i < (l); ++i) {                             \
2628             to[i] = from[i] - (n);                              \
2629             uflow |= to[i];                                     \
2630         }                                                       \
2631         failed = (uflow & 0x8080808080808080UL) > 0;            \
2632     } while (0)
2633 
2634 #else
2635 
2636 // Basic version which operates a byte at a time
2637 #define COPY_MINUS_N(to,from,n,l,failed) do {                \
2638         uint8_t uflow = 0;                                   \
2639         for (i = 0; i < (l); ++i) {                          \
2640             (to)[i] = (from)[i] - (n);                       \
2641             uflow |= (uint8_t) (to)[i];                      \
2642         }                                                    \
2643         failed = (uflow & 0x80) > 0;                         \
2644     } while (0)
2645 
2646 #endif
2647 
2648 #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2649 #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2650 #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2651 
2652     uint8_t *t;
2653 
2654     char *p = s->s, *q;
2655     int i, overflow = 0;
2656     char logbuf[40];
2657     hts_pos_t cigreflen;
2658     bam1_core_t *c = &b->core;
2659 
2660     b->l_data = 0;
2661     memset(c, 0, 32);
2662 
2663     // qname
2664     q = _read_token(p);
2665 
2666     _parse_warn(p - q <= 1, "empty query name");
2667     _parse_err(p - q > 255, "query name too long");
2668     // resize large enough for name + extranul
2669     if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2670     memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2671 
2672     c->l_extranul = (4 - (b->l_data & 3)) & 3;
2673     memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2674     b->l_data += c->l_extranul;
2675 
2676     c->l_qname = p - q + c->l_extranul;
2677 
2678     // flag
2679     c->flag = parse_sam_flag(p, &p, &overflow);
2680     if (*p++ != '\t') goto err_ret; // malformated flag
2681 
2682     // chr
2683     q = _read_token(p);
2684     if (strcmp(q, "*")) {
2685         _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2686         c->tid = bam_name2id(h, q);
2687         _parse_err(c->tid < -1, "failed to parse header");
2688         _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2689     } else c->tid = -1;
2690 
2691     // pos
2692     c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2693     if (*p++ != '\t') goto err_ret;
2694     if (c->pos < 0 && c->tid >= 0) {
2695         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2696         c->tid = -1;
2697     }
2698     if (c->tid < 0) c->flag |= BAM_FUNMAP;
2699 
2700     // mapq
2701     c->qual = hts_str2uint(p, &p, 8, &overflow);
2702     if (*p++ != '\t') goto err_ret;
2703     // cigar
2704     if (*p != '*') {
2705         uint32_t *cigar = NULL;
2706         int old_l_data = b->l_data;
2707         int n_cigar = bam_parse_cigar(p, &p, b);
2708         if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2709         cigar = (uint32_t *)(b->data + old_l_data);
2710         c->n_cigar = n_cigar;
2711 
2712         // can't use bam_endpos() directly as some fields not yet set up
2713         cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2714         if (cigreflen == 0) cigreflen = 1;
2715     } else {
2716         _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2717         c->flag |= BAM_FUNMAP;
2718         q = _read_token(p);
2719         cigreflen = 1;
2720     }
2721     _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2722                "read ends beyond highest supported position");
2723     c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2724     // mate chr
2725     q = _read_token(p);
2726     if (strcmp(q, "=") == 0) {
2727         c->mtid = c->tid;
2728     } else if (strcmp(q, "*") == 0) {
2729         c->mtid = -1;
2730     } else {
2731         c->mtid = bam_name2id(h, q);
2732         _parse_err(c->mtid < -1, "failed to parse header");
2733         _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2734     }
2735     // mpos
2736     c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2737     if (*p++ != '\t') goto err_ret;
2738     if (c->mpos < 0 && c->mtid >= 0) {
2739         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2740         c->mtid = -1;
2741     }
2742     // tlen
2743     c->isize = hts_str2int(p, &p, 64, &overflow);
2744     if (*p++ != '\t') goto err_ret;
2745     // seq
2746     q = _read_token(p);
2747     if (strcmp(q, "*")) {
2748         _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2749         c->l_qseq = p - q - 1;
2750         hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2751         _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2752         i = (c->l_qseq + 1) >> 1;
2753         _get_mem(uint8_t, &t, b, i);
2754 
2755         unsigned int lqs2 = c->l_qseq&~1, i;
2756         for (i = 0; i < lqs2; i+=2)
2757             t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2758         for (; i < c->l_qseq; ++i)
2759             t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2760     } else c->l_qseq = 0;
2761     // qual
2762     _get_mem(uint8_t, &t, b, c->l_qseq);
2763     if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2764         memset(t, 0xff, c->l_qseq);
2765         p += 2;
2766     } else {
2767         int failed = 0;
2768         _parse_err(s->l - (p - s->s) < c->l_qseq
2769                    || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2770                    "SEQ and QUAL are of different length");
2771         COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2772         _parse_err(failed, "invalid QUAL character");
2773         p += c->l_qseq + 1;
2774     }
2775 
2776     // aux
2777     if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2778         goto err_ret;
2779 
2780     if (bam_tag2cigar(b, 1, 1) < 0)
2781         return -2;
2782     return 0;
2783 
2784 #undef _parse_warn
2785 #undef _parse_err
2786 #undef _get_mem
2787 #undef _read_token
2788 err_ret:
2789     return -2;
2790 }
2791 
read_ncigar(const char * q)2792 static uint32_t read_ncigar(const char *q) {
2793     uint32_t n_cigar = 0;
2794     for (; *q && *q != '\t'; ++q)
2795         if (!isdigit_c(*q)) ++n_cigar;
2796     if (!n_cigar) {
2797         hts_log_error("No CIGAR operations");
2798         return 0;
2799     }
2800     if (n_cigar >= 2147483647) {
2801         hts_log_error("Too many CIGAR operations");
2802         return 0;
2803     }
2804 
2805     return n_cigar;
2806 }
2807 
2808 /*! @function
2809  @abstract  Parse a CIGAR string into preallocated a uint32_t array
2810  @param  in      [in]  pointer to the source string
2811  @param  a_cigar [out]  address of the destination uint32_t buffer
2812  @return         number of processed input characters; 0 on error
2813  */
parse_cigar(const char * in,uint32_t * a_cigar,uint32_t n_cigar)2814 static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2815     int i, overflow = 0;
2816     const char *p = in;
2817     for (i = 0; i < n_cigar; i++) {
2818         uint32_t len;
2819         int op;
2820         char *q;
2821         len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2822         if (q == p) {
2823             hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2824             return 0;
2825         }
2826         if (overflow) {
2827             hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2828             return 0;
2829         }
2830         p = q;
2831         op = bam_cigar_table[(unsigned char)*p++];
2832         if (op < 0) {
2833             hts_log_error("Unrecognized CIGAR operator");
2834             return 0;
2835         }
2836         a_cigar[i] = len;
2837         a_cigar[i] |= op;
2838     }
2839 
2840     return p-in;
2841 }
2842 
sam_parse_cigar(const char * in,char ** end,uint32_t ** a_cigar,size_t * a_mem)2843 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2844     size_t n_cigar = 0;
2845     int diff;
2846 
2847     if (!in || !a_cigar || !a_mem) {
2848         hts_log_error("NULL pointer arguments");
2849         return -1;
2850     }
2851     if (end) *end = (char *)in;
2852 
2853     if (*in == '*') {
2854         if (end) (*end)++;
2855         return 0;
2856     }
2857     n_cigar = read_ncigar(in);
2858     if (!n_cigar) return 0;
2859     if (n_cigar > *a_mem) {
2860         uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2861         if (a_tmp) {
2862             *a_cigar = a_tmp;
2863             *a_mem = n_cigar;
2864         } else {
2865             hts_log_error("Memory allocation error");
2866             return -1;
2867         }
2868     }
2869 
2870     if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2871     if (end) *end = (char *)in+diff;
2872 
2873     return n_cigar;
2874 }
2875 
bam_parse_cigar(const char * in,char ** end,bam1_t * b)2876 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2877     size_t n_cigar = 0;
2878     int diff;
2879 
2880     if (!in || !b) {
2881         hts_log_error("NULL pointer arguments");
2882         return -1;
2883     }
2884     if (end) *end = (char *)in;
2885 
2886     if (*in == '*') {
2887         if (end) (*end)++;
2888         return 0;
2889     }
2890     n_cigar = read_ncigar(in);
2891     if (!n_cigar) return 0;
2892     if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
2893         hts_log_error("Memory allocation error");
2894         return -1;
2895     }
2896 
2897     if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
2898     b->l_data += (n_cigar * sizeof(uint32_t));
2899     if (end) *end = (char *)in+diff;
2900 
2901     return n_cigar;
2902 }
2903 
2904 /*
2905  * -----------------------------------------------------------------------------
2906  * SAM threading
2907  */
2908 // Size of SAM text block (reading)
2909 #define NM 240000
2910 // Number of BAM records (writing)
2911 #define NB 1000
2912 
2913 struct SAM_state;
2914 
2915 // Output job - a block of BAM records
2916 typedef struct sp_bams {
2917     struct sp_bams *next;
2918     int serial;
2919 
2920     bam1_t *bams;
2921     int nbams, abams; // used and alloc
2922 
2923     struct SAM_state *fd;
2924 } sp_bams;
2925 
2926 // Input job - a block of SAM text
2927 typedef struct sp_lines {
2928     struct sp_lines *next;
2929     int serial;
2930 
2931     char *data;
2932     int data_size;
2933     int alloc;
2934 
2935     struct SAM_state *fd;
2936     sp_bams *bams;
2937 } sp_lines;
2938 
2939 enum sam_cmd {
2940     SAM_NONE = 0,
2941     SAM_CLOSE,
2942     SAM_CLOSE_DONE,
2943 };
2944 
2945 typedef struct SAM_state {
2946     sam_hdr_t *h;
2947 
2948     hts_tpool *p;
2949     int own_pool;
2950     pthread_mutex_t lines_m;
2951     hts_tpool_process *q;
2952     pthread_t dispatcher;
2953     int dispatcher_set;
2954 
2955     sp_lines *lines;
2956     sp_bams *bams;
2957 
2958     sp_bams *curr_bam;
2959     int curr_idx;
2960     int serial;
2961 
2962     // Be warned: moving these mutexes around in this struct can reduce
2963     // threading performance by up to 70%!
2964     pthread_mutex_t command_m;
2965     pthread_cond_t command_c;
2966     enum sam_cmd command;
2967 
2968     // One of the E* errno codes
2969     int errcode;
2970 
2971     htsFile *fp;
2972 } SAM_state;
2973 
2974 // Returns a SAM_state struct from a generic hFILE.
2975 //
2976 // Returns NULL on failure.
sam_state_create(htsFile * fp)2977 static SAM_state *sam_state_create(htsFile *fp) {
2978     // Ideally sam_open wouldn't be a #define to hts_open but instead would
2979     // be a redirect call with an additional 'S' mode.  This in turn would
2980     // correctly set the designed format to sam instead of a generic
2981     // text_format.
2982     if (fp->format.format != sam && fp->format.format != text_format)
2983         return NULL;
2984 
2985     SAM_state *fd = calloc(1, sizeof(*fd));
2986     if (!fd)
2987         return NULL;
2988 
2989     fp->state = fd;
2990     fd->fp = fp;
2991 
2992     return fd;
2993 }
2994 
2995 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
2996 static void *sam_format_worker(void *arg);
2997 
sam_state_err(SAM_state * fd,int errcode)2998 static void sam_state_err(SAM_state *fd, int errcode) {
2999     pthread_mutex_lock(&fd->command_m);
3000     if (!fd->errcode)
3001         fd->errcode = errcode;
3002     pthread_mutex_unlock(&fd->command_m);
3003 }
3004 
sam_free_sp_bams(sp_bams * b)3005 static void sam_free_sp_bams(sp_bams *b) {
3006     if (!b)
3007         return;
3008 
3009     if (b->bams) {
3010         int i;
3011         for (i = 0; i < b->abams; i++) {
3012             if (b->bams[i].data)
3013                 free(b->bams[i].data);
3014         }
3015         free(b->bams);
3016     }
3017     free(b);
3018 }
3019 
3020 // Destroys the state produce by sam_state_create.
sam_state_destroy(htsFile * fp)3021 int sam_state_destroy(htsFile *fp) {
3022     int ret = 0;
3023 
3024     if (!fp->state)
3025         return 0;
3026 
3027     SAM_state *fd = fp->state;
3028     if (fd->p) {
3029         if (fd->h) {
3030             // Notify sam_dispatcher we're closing
3031             pthread_mutex_lock(&fd->command_m);
3032             if (fd->command != SAM_CLOSE_DONE)
3033                 fd->command = SAM_CLOSE;
3034             pthread_cond_signal(&fd->command_c);
3035             ret = -fd->errcode;
3036             if (fd->q)
3037                 hts_tpool_wake_dispatch(fd->q); // unstick the reader
3038 
3039             if (!fp->is_write && fd->q && fd->dispatcher_set) {
3040                 for (;;) {
3041                     // Avoid deadlocks with dispatcher
3042                     if (fd->command == SAM_CLOSE_DONE)
3043                         break;
3044                     hts_tpool_wake_dispatch(fd->q);
3045                     pthread_mutex_unlock(&fd->command_m);
3046                     usleep(10000);
3047                     pthread_mutex_lock(&fd->command_m);
3048                 }
3049             }
3050             pthread_mutex_unlock(&fd->command_m);
3051 
3052             if (fp->is_write) {
3053                 // Dispatch the last partial block.
3054                 sp_bams *gb = fd->curr_bam;
3055                 if (!ret && gb && gb->nbams > 0 && fd->q)
3056                     ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3057 
3058                 // Flush and drain output
3059                 if (fd->q)
3060                     hts_tpool_process_flush(fd->q);
3061                 pthread_mutex_lock(&fd->command_m);
3062                 if (!ret) ret = -fd->errcode;
3063                 pthread_mutex_unlock(&fd->command_m);
3064 
3065                 while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3066                     usleep(10000);
3067                     pthread_mutex_lock(&fd->command_m);
3068                     ret = -fd->errcode;
3069                     // not empty but shutdown implies error
3070                     if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3071                         ret = EIO;
3072                     pthread_mutex_unlock(&fd->command_m);
3073                 }
3074                 if (fd->q)
3075                     hts_tpool_process_shutdown(fd->q);
3076             }
3077 
3078             // Wait for it to acknowledge
3079             if (fd->dispatcher_set)
3080                 pthread_join(fd->dispatcher, NULL);
3081             if (!ret) ret = -fd->errcode;
3082         }
3083 
3084         // Tidy up memory
3085         if (fd->q)
3086             hts_tpool_process_destroy(fd->q);
3087 
3088         if (fd->own_pool && fp->format.compression == no_compression) {
3089             hts_tpool_destroy(fd->p);
3090             fd->p = NULL;
3091         }
3092         pthread_mutex_destroy(&fd->lines_m);
3093         pthread_mutex_destroy(&fd->command_m);
3094         pthread_cond_destroy(&fd->command_c);
3095 
3096         sp_lines *l = fd->lines;
3097         while (l) {
3098             sp_lines *n = l->next;
3099             free(l->data);
3100             free(l);
3101             l = n;
3102         }
3103 
3104         sp_bams *b = fd->bams;
3105         while (b) {
3106             if (fd->curr_bam == b)
3107                 fd->curr_bam = NULL;
3108             sp_bams *n = b->next;
3109             sam_free_sp_bams(b);
3110             b = n;
3111         }
3112 
3113         if (fd->curr_bam)
3114             sam_free_sp_bams(fd->curr_bam);
3115 
3116         // Decrement counter by one, maybe destroying too.
3117         // This is to permit the caller using bam_hdr_destroy
3118         // before sam_close without triggering decode errors
3119         // in the background threads.
3120         bam_hdr_destroy(fd->h);
3121     }
3122 
3123     free(fp->state);
3124     fp->state = NULL;
3125     return ret;
3126 }
3127 
3128 // Cleanup function - job for sam_parse_worker; result for sam_format_worker
cleanup_sp_lines(void * arg)3129 static void cleanup_sp_lines(void *arg) {
3130     sp_lines *gl = (sp_lines *)arg;
3131     if (!gl) return;
3132 
3133     // Should always be true for lines passed to / from thread workers.
3134     assert(gl->next == NULL);
3135 
3136     free(gl->data);
3137     sam_free_sp_bams(gl->bams);
3138     free(gl);
3139 }
3140 
3141 // Run from one of the worker threads.
3142 // Convert a passed in array of lines to array of BAMs, returning
3143 // the result back to the thread queue.
sam_parse_worker(void * arg)3144 static void *sam_parse_worker(void *arg) {
3145     sp_lines *gl = (sp_lines *)arg;
3146     sp_bams *gb = NULL;
3147     char *lines = gl->data;
3148     int i;
3149     bam1_t *b;
3150     SAM_state *fd = gl->fd;
3151 
3152     // Use a block of BAM structs we had earlier if available.
3153     pthread_mutex_lock(&fd->lines_m);
3154     if (fd->bams) {
3155         gb = fd->bams;
3156         fd->bams = gb->next;
3157     }
3158     pthread_mutex_unlock(&fd->lines_m);
3159 
3160     if (gb == NULL) {
3161         gb = calloc(1, sizeof(*gb));
3162         if (!gb) {
3163             return NULL;
3164         }
3165         gb->abams = 100;
3166         gb->bams = b = calloc(gb->abams, sizeof(*b));
3167         if (!gb->bams) {
3168             sam_state_err(fd, ENOMEM);
3169             goto err;
3170         }
3171         gb->nbams = 0;
3172     }
3173     gb->serial = gl->serial;
3174     gb->next = NULL;
3175 
3176     b = (bam1_t *)gb->bams;
3177     if (!b) {
3178         sam_state_err(fd, ENOMEM);
3179         goto err;
3180     }
3181 
3182     i = 0;
3183     char *cp = lines, *cp_end = lines + gl->data_size;
3184     while (cp < cp_end) {
3185         if (i >= gb->abams) {
3186             int old_abams = gb->abams;
3187             gb->abams *= 2;
3188             b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3189             if (!b) {
3190                 gb->abams /= 2;
3191                 sam_state_err(fd, ENOMEM);
3192                 goto err;
3193             }
3194             memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3195             gb->bams = b;
3196         }
3197 
3198         // Ideally we'd get sam_parse1 to return the number of
3199         // bytes decoded and to be able to stop on newline as
3200         // well as \0.
3201         //
3202         // We can then avoid the additional strchr loop.
3203         // It's around 6% of our CPU cost, albeit threadable.
3204         //
3205         // However this is an API change so for now we copy.
3206 
3207         char *nl = strchr(cp, '\n');
3208         char *line_end;
3209         if (nl) {
3210             line_end = nl;
3211             if (line_end > cp && *(line_end - 1) == '\r')
3212                 line_end--;
3213             nl++;
3214         } else {
3215             nl = line_end = cp_end;
3216         }
3217         *line_end = '\0';
3218         kstring_t ks = { line_end - cp, gl->alloc, cp };
3219         if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3220             sam_state_err(fd, errno ? errno : EIO);
3221             cleanup_sp_lines(gl);
3222             goto err;
3223         }
3224         cp = nl;
3225         i++;
3226     }
3227     gb->nbams = i;
3228 
3229     pthread_mutex_lock(&fd->lines_m);
3230     gl->next = fd->lines;
3231     fd->lines = gl;
3232     pthread_mutex_unlock(&fd->lines_m);
3233     return gb;
3234 
3235  err:
3236     sam_free_sp_bams(gb);
3237     return NULL;
3238 }
3239 
sam_parse_eof(void * arg)3240 static void *sam_parse_eof(void *arg) {
3241     return NULL;
3242 }
3243 
3244 // Cleanup function - result for sam_parse_worker; job for sam_format_worker
cleanup_sp_bams(void * arg)3245 static void cleanup_sp_bams(void *arg) {
3246     sam_free_sp_bams((sp_bams *) arg);
3247 }
3248 
3249 // Runs in its own thread.
3250 // Reads a block of text (SAM) and sends a new job to the thread queue to
3251 // translate this to BAM.
sam_dispatcher_read(void * vp)3252 static void *sam_dispatcher_read(void *vp) {
3253     htsFile *fp = vp;
3254     kstring_t line = {0};
3255     int line_frag = 0;
3256     SAM_state *fd = fp->state;
3257     sp_lines *l = NULL;
3258 
3259     // Pre-allocate buffer for left-over bits of line (exact size doesn't
3260     // matter as it will grow if necessary).
3261     if (ks_resize(&line, 1000) < 0)
3262         goto err;
3263 
3264     for (;;) {
3265         // Check for command
3266         pthread_mutex_lock(&fd->command_m);
3267         switch (fd->command) {
3268 
3269         case SAM_CLOSE:
3270             pthread_cond_signal(&fd->command_c);
3271             pthread_mutex_unlock(&fd->command_m);
3272             hts_tpool_process_shutdown(fd->q);
3273             goto tidyup;
3274 
3275         default:
3276             break;
3277         }
3278         pthread_mutex_unlock(&fd->command_m);
3279 
3280         pthread_mutex_lock(&fd->lines_m);
3281         if (fd->lines) {
3282             // reuse existing line buffer
3283             l = fd->lines;
3284             fd->lines = l->next;
3285         }
3286         pthread_mutex_unlock(&fd->lines_m);
3287 
3288         if (l == NULL) {
3289             // none to reuse, to create a new one
3290             l = calloc(1, sizeof(*l));
3291             if (!l)
3292                 goto err;
3293             l->alloc = NM;
3294             l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3295             if (!l->data) {
3296                 free(l);
3297                 l = NULL;
3298                 goto err;
3299             }
3300             l->fd = fd;
3301         }
3302         l->next = NULL;
3303 
3304         if (l->alloc < line_frag+NM/2) {
3305             char *rp = realloc(l->data, line_frag+NM/2 +8);
3306             if (!rp)
3307                 goto err;
3308             l->alloc = line_frag+NM/2;
3309             l->data = rp;
3310         }
3311         memcpy(l->data, line.s, line_frag);
3312 
3313         l->data_size = line_frag;
3314         ssize_t nbytes;
3315     longer_line:
3316         if (fp->is_bgzf)
3317             nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3318         else
3319             nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3320         if (nbytes < 0) {
3321             sam_state_err(fd, errno ? errno : EIO);
3322             goto err;
3323         } else if (nbytes == 0)
3324             break; // EOF
3325         l->data_size += nbytes;
3326 
3327         // trim to last \n. Maybe \r\n, but that's still fine
3328         if (nbytes == l->alloc - line_frag) {
3329             char *cp_end = l->data + l->data_size;
3330             char *cp = cp_end-1;
3331 
3332             while (cp > (char *)l->data && *cp != '\n')
3333                 cp--;
3334 
3335             // entire buffer is part of a single line
3336             if (cp == l->data) {
3337                 line_frag = l->data_size;
3338                 char *rp = realloc(l->data, l->alloc * 2 + 8);
3339                 if (!rp)
3340                     goto err;
3341                 l->alloc *= 2;
3342                 l->data = rp;
3343                 assert(l->alloc >= l->data_size);
3344                 assert(l->alloc >= line_frag);
3345                 assert(l->alloc >= l->alloc - line_frag);
3346                 goto longer_line;
3347             }
3348             cp++;
3349 
3350             // line holds the remainder of our line.
3351             if (ks_resize(&line, cp_end - cp) < 0)
3352                 goto err;
3353             memcpy(line.s, cp, cp_end - cp);
3354             line_frag = cp_end - cp;
3355             l->data_size = l->alloc - line_frag;
3356         } else {
3357             // out of buffer
3358             line_frag = 0;
3359         }
3360 
3361         l->serial = fd->serial++;
3362         //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3363         if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3364                                 cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3365             goto err;
3366         pthread_mutex_lock(&fd->command_m);
3367         if (fd->command == SAM_CLOSE) {
3368             pthread_mutex_unlock(&fd->command_m);
3369             l = NULL;
3370             goto tidyup;
3371         }
3372         l = NULL;  // Now "owned" by sam_parse_worker()
3373         pthread_mutex_unlock(&fd->command_m);
3374     }
3375 
3376     if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3377         goto err;
3378 
3379     // At EOF, wait for close request.
3380     // (In future if we add support for seek, this is where we need to catch it.)
3381     for (;;) {
3382         pthread_mutex_lock(&fd->command_m);
3383         if (fd->command == SAM_NONE)
3384             pthread_cond_wait(&fd->command_c, &fd->command_m);
3385         switch (fd->command) {
3386         case SAM_CLOSE:
3387             pthread_cond_signal(&fd->command_c);
3388             pthread_mutex_unlock(&fd->command_m);
3389             hts_tpool_process_shutdown(fd->q);
3390             goto tidyup;
3391 
3392         default:
3393             pthread_mutex_unlock(&fd->command_m);
3394             break;
3395         }
3396     }
3397 
3398  tidyup:
3399     pthread_mutex_lock(&fd->command_m);
3400     fd->command = SAM_CLOSE_DONE;
3401     pthread_cond_signal(&fd->command_c);
3402     pthread_mutex_unlock(&fd->command_m);
3403 
3404     if (l) {
3405         pthread_mutex_lock(&fd->lines_m);
3406         l->next = fd->lines;
3407         fd->lines = l;
3408         pthread_mutex_unlock(&fd->lines_m);
3409     }
3410     free(line.s);
3411 
3412     return NULL;
3413 
3414  err:
3415     sam_state_err(fd, errno ? errno : ENOMEM);
3416     hts_tpool_process_shutdown(fd->q);
3417     goto tidyup;
3418 }
3419 
3420 // Runs in its own thread.
3421 // Takes encoded blocks of SAM off the thread results queue and writes them
3422 // to our output stream.
sam_dispatcher_write(void * vp)3423 static void *sam_dispatcher_write(void *vp) {
3424     htsFile *fp = vp;
3425     SAM_state *fd = fp->state;
3426     hts_tpool_result *r;
3427 
3428     // Iterates until result queue is shutdown, where it returns NULL.
3429     while ((r = hts_tpool_next_result_wait(fd->q))) {
3430         sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3431         if (!gl) {
3432             sam_state_err(fd, ENOMEM);
3433             goto err;
3434         }
3435 
3436         if (fp->idx) {
3437             sp_bams *gb = gl->bams;
3438             int i = 0, count = 0;
3439             while (i < gl->data_size) {
3440                 int j = i;
3441                 while (i < gl->data_size && gl->data[i] != '\n')
3442                     i++;
3443                 if (i < gl->data_size)
3444                     i++;
3445 
3446                 if (fp->is_bgzf) {
3447                     if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3448                         goto err;
3449                 } else {
3450                     if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3451                         goto err;
3452                 }
3453 
3454                 bam1_t *b = &gb->bams[count++];
3455                 if (fp->format.compression == bgzf) {
3456                     if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3457                                       b->core.tid, b->core.pos, bam_endpos(b),
3458                                       bgzf_tell(fp->fp.bgzf),
3459                                       !(b->core.flag&BAM_FUNMAP)) < 0) {
3460                         sam_state_err(fd, errno ? errno : ENOMEM);
3461                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3462                                 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3463                         goto err;
3464                     }
3465                 } else {
3466                     if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3467                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3468                         sam_state_err(fd, errno ? errno : ENOMEM);
3469                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3470                                 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3471                         goto err;
3472                     }
3473                 }
3474             }
3475 
3476             assert(count == gb->nbams);
3477 
3478             // Add bam array to free-list
3479             pthread_mutex_lock(&fd->lines_m);
3480             gb->next = fd->bams;
3481             fd->bams = gl->bams;
3482             gl->bams = NULL;
3483             pthread_mutex_unlock(&fd->lines_m);
3484         } else {
3485             if (fp->is_bgzf) {
3486                 if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size)
3487                     goto err;
3488             } else {
3489                 if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3490                     goto err;
3491             }
3492         }
3493 
3494         hts_tpool_delete_result(r, 0);
3495 
3496         // Also updated by main thread
3497         pthread_mutex_lock(&fd->lines_m);
3498         gl->next = fd->lines;
3499         fd->lines = gl;
3500         pthread_mutex_unlock(&fd->lines_m);
3501     }
3502 
3503     sam_state_err(fd, 0); // success
3504     hts_tpool_process_shutdown(fd->q);
3505     return NULL;
3506 
3507  err:
3508     sam_state_err(fd, errno ? errno : EIO);
3509     return (void *)-1;
3510 }
3511 
3512 // Run from one of the worker threads.
3513 // Convert a passed in array of BAMs (sp_bams) and converts to a block
3514 // of text SAM records (sp_lines).
sam_format_worker(void * arg)3515 static void *sam_format_worker(void *arg) {
3516     sp_bams *gb = (sp_bams *)arg;
3517     sp_lines *gl = NULL;
3518     int i;
3519     SAM_state *fd = gb->fd;
3520     htsFile *fp = fd->fp;
3521 
3522     // Use a block of SAM strings we had earlier if available.
3523     pthread_mutex_lock(&fd->lines_m);
3524     if (fd->lines) {
3525         gl = fd->lines;
3526         fd->lines = gl->next;
3527     }
3528     pthread_mutex_unlock(&fd->lines_m);
3529 
3530     if (gl == NULL) {
3531         gl = calloc(1, sizeof(*gl));
3532         if (!gl) {
3533             sam_state_err(fd, ENOMEM);
3534             return NULL;
3535         }
3536         gl->alloc = gl->data_size = 0;
3537         gl->data = NULL;
3538     }
3539     gl->serial = gb->serial;
3540     gl->next = NULL;
3541 
3542     kstring_t ks = {0, gl->alloc, gl->data};
3543 
3544     for (i = 0; i < gb->nbams; i++) {
3545         if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3546             sam_state_err(fd, errno ? errno : EIO);
3547             goto err;
3548         }
3549         kputc('\n', &ks);
3550     }
3551 
3552     pthread_mutex_lock(&fd->lines_m);
3553     gl->data_size = ks.l;
3554     gl->alloc = ks.m;
3555     gl->data = ks.s;
3556 
3557     if (fp->idx) {
3558         // Keep hold of the bam array a little longer as
3559         // sam_dispatcher_write needs to use them for building the index.
3560         gl->bams = gb;
3561     } else {
3562         // Add bam array to free-list
3563         gb->next = fd->bams;
3564         fd->bams = gb;
3565     }
3566     pthread_mutex_unlock(&fd->lines_m);
3567 
3568     return gl;
3569 
3570  err:
3571     // Possible race between this and fd->curr_bam.
3572     // Easier to not free and leave it on the input list so it
3573     // gets freed there instead?
3574     // sam_free_sp_bams(gb);
3575     if (gl) {
3576         free(gl->data);
3577         free(gl);
3578     }
3579     return NULL;
3580 }
3581 
sam_set_thread_pool(htsFile * fp,htsThreadPool * p)3582 int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3583     if (fp->state)
3584         return 0;
3585 
3586     if (!(fp->state = sam_state_create(fp)))
3587         return -1;
3588     SAM_state *fd = (SAM_state *)fp->state;
3589 
3590     pthread_mutex_init(&fd->lines_m, NULL);
3591     pthread_mutex_init(&fd->command_m, NULL);
3592     pthread_cond_init(&fd->command_c, NULL);
3593     fd->p = p->pool;
3594     int qsize = p->qsize;
3595     if (!qsize)
3596         qsize = 2*hts_tpool_size(fd->p);
3597     fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3598     if (!fd->q) {
3599         sam_state_destroy(fp);
3600         return -1;
3601     }
3602 
3603     if (fp->format.compression == bgzf)
3604         return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3605 
3606     return 0;
3607 }
3608 
sam_set_threads(htsFile * fp,int nthreads)3609 int sam_set_threads(htsFile *fp, int nthreads) {
3610     if (nthreads <= 0)
3611         return 0;
3612 
3613     htsThreadPool p;
3614     p.pool = hts_tpool_init(nthreads);
3615     p.qsize = nthreads*2;
3616 
3617     int ret = sam_set_thread_pool(fp, &p);
3618     if (ret < 0)
3619         return ret;
3620 
3621     SAM_state *fd = (SAM_state *)fp->state;
3622     fd->own_pool = 1;
3623 
3624     return 0;
3625 }
3626 
3627 typedef struct {
3628     kstring_t name;
3629     kstring_t comment; // NB: pointer into name, do not free
3630     kstring_t seq;
3631     kstring_t qual;
3632     int casava;
3633     int aux;
3634     int rnum;
3635     char BC[3];         // aux tag ID for barcode
3636     khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3637     char nprefix;
3638     int sra_names;
3639 } fastq_state;
3640 
3641 // Initialise fastq state.
3642 // Name char of '@' or '>' distinguishes fastq vs fasta variant
fastq_state_init(int name_char)3643 static fastq_state *fastq_state_init(int name_char) {
3644     fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3645     if (!x)
3646         return NULL;
3647     strcpy(x->BC, "BC");
3648     x->nprefix = name_char;
3649 
3650     return x;
3651 }
3652 
fastq_state_destroy(htsFile * fp)3653 void fastq_state_destroy(htsFile *fp) {
3654     if (fp->state) {
3655         fastq_state *x = (fastq_state *)fp->state;
3656         if (x->tags)
3657             kh_destroy(tag, x->tags);
3658         ks_free(&x->name);
3659         ks_free(&x->seq);
3660         ks_free(&x->qual);
3661         free(fp->state);
3662     }
3663 }
3664 
fastq_state_set(samFile * fp,enum hts_fmt_option opt,...)3665 int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3666     va_list args;
3667 
3668     if (!fp)
3669         return -1;
3670     if (!fp->state)
3671         if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3672                                            ? '@' : '>')))
3673             return -1;
3674 
3675     fastq_state *x = (fastq_state *)fp->state;
3676 
3677     switch (opt) {
3678     case FASTQ_OPT_CASAVA:
3679         x->casava = 1;
3680         break;
3681 
3682     case FASTQ_OPT_NAME2:
3683         x->sra_names = 1;
3684         break;
3685 
3686     case FASTQ_OPT_AUX: {
3687         va_start(args, opt);
3688         x->aux = 1;
3689         char *tag = va_arg(args, char *);
3690         va_end(args);
3691         if (tag && strcmp(tag, "1") != 0) {
3692             if (!x->tags)
3693                 if (!(x->tags = kh_init(tag)))
3694                     return -1;
3695 
3696             size_t i, tlen = strlen(tag);
3697             for (i = 0; i+3 <= tlen+1; i += 3) {
3698                 if (tag[i+0] == ',' || tag[i+1] == ',' ||
3699                     !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3700                     hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3701                     break;
3702                 }
3703                 int ret, tcode = tag[i+0]*256 + tag[i+1];
3704                 kh_put(tag, x->tags, tcode, &ret);
3705                 if (ret < 0)
3706                     return -1;
3707             }
3708         }
3709         break;
3710     }
3711 
3712     case FASTQ_OPT_BARCODE: {
3713         va_start(args, opt);
3714         char *bc = va_arg(args, char *);
3715         va_end(args);
3716         strncpy(x->BC, bc, 2);
3717         x->BC[2] = 0;
3718         break;
3719     }
3720 
3721     case FASTQ_OPT_RNUM:
3722         x->rnum = 1;
3723         break;
3724 
3725     default:
3726         break;
3727     }
3728     return 0;
3729 }
3730 
fastq_parse1(htsFile * fp,bam1_t * b)3731 static int fastq_parse1(htsFile *fp, bam1_t *b) {
3732     fastq_state *x = (fastq_state *)fp->state;
3733     size_t i, l;
3734     int ret = 0;
3735 
3736     if (fp->format.format == fasta_format && fp->line.s) {
3737         // For FASTA we've already read the >name line; steal it
3738         // Not the most efficient, but we don't optimise for fasta reading.
3739         if (fp->line.l == 0)
3740             return -1; // EOF
3741 
3742         free(x->name.s);
3743         x->name = fp->line;
3744         fp->line.l = fp->line.m = 0;
3745         fp->line.s = NULL;
3746     } else {
3747         // Read a FASTQ format entry.
3748         ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3749         if (ret == -1)
3750             return -1;  // EOF
3751         else if (ret < -1)
3752             return ret; // ERR
3753     }
3754 
3755     // Name
3756 
3757     if (*x->name.s != x->nprefix)
3758         return -2;
3759 
3760     // Reverse the SRA strangeness of putting the run_name.number before
3761     // the read name.
3762     i = 0;
3763     char *name = x->name.s+1;
3764     if (x->sra_names) {
3765         char *cp = strpbrk(x->name.s, " \t");
3766         if (cp) {
3767             while (*cp == ' ' || *cp == '\t')
3768                 cp++;
3769             *--cp = '@';
3770             i = cp - x->name.s;
3771             name = cp+1;
3772         }
3773     }
3774 
3775     l = x->name.l;
3776     char *s = x->name.s;
3777     while (i < l && !isspace_c(s[i]))
3778         i++;
3779     if (i < l) {
3780         s[i] = 0;
3781         x->name.l = i++;
3782     }
3783 
3784     // Comment; a kstring struct, but pointer into name line.  (Do not free)
3785     while (i < l && isspace_c(s[i]))
3786         i++;
3787     x->comment.s = s+i;
3788     x->comment.l = l - i;
3789 
3790     // Seq
3791     x->seq.l = 0;
3792     for (;;) {
3793         if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3794             if (fp->format.format == fastq_format || ret < -1)
3795                 return -2;
3796         if (*fp->line.s == (fp->format.format == fastq_format ? '+' : '>')
3797             || ret == -1)
3798             break;
3799         if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3800             return -2;
3801     }
3802 
3803     // Qual
3804     if (fp->format.format == fastq_format) {
3805         size_t remainder = x->seq.l;
3806         x->qual.l = 0;
3807         do {
3808             if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3809                 return -2;
3810             if (fp->line.l > remainder)
3811                 return -2;
3812             if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
3813                 return -2;
3814             remainder -= fp->line.l;
3815         } while (remainder > 0);
3816 
3817         // Decr qual
3818         for (i = 0; i < x->qual.l; i++)
3819             x->qual.s[i] -= '!';
3820     }
3821 
3822     int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
3823     if (x->name.l > 2 &&
3824         x->name.s[x->name.l-2] == '/' &&
3825         isdigit_c(x->name.s[x->name.l-1])) {
3826         switch(x->name.s[x->name.l-1]) {
3827         case '1': flag |= BAM_FREAD1 | pflag; break;
3828         case '2': flag |= BAM_FREAD2 | pflag; break;
3829         default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
3830         }
3831         x->name.s[x->name.l-=2] = 0;
3832     }
3833 
3834     // Convert to BAM
3835     ret = bam_set1(b,
3836                    x->name.s + x->name.l - name, name,
3837                    flag,
3838                    -1, -1, 0, // ref '*', pos, mapq,
3839                    0, NULL,     // no cigar,
3840                    -1, -1, 0,    // mate
3841                    x->seq.l, x->seq.s, x->qual.s,
3842                    0);
3843 
3844     // Identify Illumina CASAVA strings.
3845     // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
3846     char *barcode = NULL;
3847     int barcode_len = 0;
3848     kstring_t *kc = &x->comment;
3849     char *endptr;
3850     if (x->casava &&
3851         // \d:[YN]:\d+:[ACGTN]+
3852         kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
3853         strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
3854         && *endptr == ':') {
3855 
3856         // read num
3857         switch(kc->s[0]) {
3858         case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
3859         case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
3860         default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
3861         }
3862 
3863         if (kc->s[2] == 'Y')
3864             b->core.flag |= BAM_FQCFAIL;
3865 
3866         // Barcode, maybe numeric in which case we skip it
3867         if (!isdigit_c(endptr[1])) {
3868             barcode = endptr+1;
3869             for (i = barcode - kc->s; i < kc->l; i++)
3870                 if (isspace_c(kc->s[i]))
3871                     break;
3872 
3873             kc->s[i] = 0;
3874             barcode_len = i+1-(barcode - kc->s);
3875         }
3876     }
3877 
3878     if (ret >= 0 && barcode_len)
3879         if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
3880             ret = -2;
3881 
3882     if (!x->aux)
3883         return ret;
3884 
3885     // Identify any SAM style aux tags in comments too.
3886     if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
3887         ret = -2;
3888 
3889     return ret;
3890 }
3891 
3892 // Internal component of sam_read1 below
sam_read1_bam(htsFile * fp,sam_hdr_t * h,bam1_t * b)3893 static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
3894     int ret = bam_read1(fp->fp.bgzf, b);
3895     if (h && ret >= 0) {
3896         if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
3897             b->core.mtid >= h->n_targets || b->core.mtid < -1) {
3898             errno = ERANGE;
3899             return -3;
3900         }
3901     }
3902     return ret;
3903 }
3904 
3905 // Internal component of sam_read1 below
sam_read1_cram(htsFile * fp,sam_hdr_t * h,bam1_t ** b)3906 static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
3907     int ret = cram_get_bam_seq(fp->fp.cram, b);
3908     if (ret < 0)
3909         return cram_eof(fp->fp.cram) ? -1 : -2;
3910 
3911     if (bam_tag2cigar(*b, 1, 1) < 0)
3912         return -2;
3913 
3914     return ret;
3915 }
3916 
3917 // Internal component of sam_read1 below
sam_read1_sam(htsFile * fp,sam_hdr_t * h,bam1_t * b)3918 static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
3919     int ret;
3920 
3921     // Consume 1st line after header parsing as it wasn't using peek
3922     if (fp->line.l != 0) {
3923         ret = sam_parse1(&fp->line, h, b);
3924         fp->line.l = 0;
3925         return ret;
3926     }
3927 
3928     if (fp->state) {
3929         SAM_state *fd = (SAM_state *)fp->state;
3930 
3931         if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
3932             // We don't support multi-threaded SAM parsing with seeks yet.
3933             int ret;
3934             if ((ret = sam_state_destroy(fp)) < 0) {
3935                 errno = -ret;
3936                 return -2;
3937             }
3938             if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
3939                 return -1;
3940             fp->fp.bgzf->seeked = 0;
3941             goto err_recover;
3942         }
3943 
3944         if (!fd->h) {
3945             fd->h = h;
3946             fd->h->ref_count++;
3947             // Ensure hrecs is initialised now as we don't want multiple
3948             // threads trying to do this simultaneously.
3949             if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
3950                 return -2;
3951 
3952             // We can only do this once we've got a header
3953             if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
3954                                fp) != 0)
3955                 return -2;
3956             fd->dispatcher_set = 1;
3957         }
3958 
3959         if (fd->h != h) {
3960             hts_log_error("SAM multi-threaded decoding does not support changing header");
3961             return -1;
3962         }
3963 
3964         sp_bams *gb = fd->curr_bam;
3965         if (!gb) {
3966             if (fd->errcode) {
3967                 // In case reader failed
3968                 errno = fd->errcode;
3969                 return -2;
3970             }
3971             hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
3972             if (!r)
3973                 return -2;
3974             fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
3975             hts_tpool_delete_result(r, 0);
3976         }
3977         if (!gb)
3978             return fd->errcode ? -2 : -1;
3979         bam1_t *b_array = (bam1_t *)gb->bams;
3980         if (fd->curr_idx < gb->nbams)
3981             if (!bam_copy1(b, &b_array[fd->curr_idx++]))
3982                 return -2;
3983         if (fd->curr_idx == gb->nbams) {
3984             pthread_mutex_lock(&fd->lines_m);
3985             gb->next = fd->bams;
3986             fd->bams = gb;
3987             pthread_mutex_unlock(&fd->lines_m);
3988 
3989             fd->curr_bam = NULL;
3990             fd->curr_idx = 0;
3991         }
3992 
3993         ret = 0;
3994 
3995     } else  {
3996     err_recover:
3997         ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3998         if (ret < 0) return ret;
3999 
4000         ret = sam_parse1(&fp->line, h, b);
4001         fp->line.l = 0;
4002         if (ret < 0) {
4003             hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4004             if (h->ignore_sam_err) goto err_recover;
4005         }
4006     }
4007 
4008     return ret;
4009 }
4010 
4011 // Returns 0 on success,
4012 //        -1 on EOF,
4013 //       <-1 on error
sam_read1(htsFile * fp,sam_hdr_t * h,bam1_t * b)4014 int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4015 {
4016     int ret, pass_filter;
4017 
4018     do {
4019         switch (fp->format.format) {
4020         case bam:
4021             ret = sam_read1_bam(fp, h, b);
4022             break;
4023 
4024         case cram:
4025             ret = sam_read1_cram(fp, h, &b);
4026             break;
4027 
4028         case sam:
4029             ret = sam_read1_sam(fp, h, b);
4030             break;
4031 
4032         case fasta_format:
4033         case fastq_format: {
4034             fastq_state *x = (fastq_state *)fp->state;
4035             if (!x) {
4036                 if (!(fp->state = fastq_state_init(fp->format.format
4037                                                    == fastq_format ? '@' : '>')))
4038                     return -2;
4039             }
4040 
4041             return fastq_parse1(fp, b);
4042         }
4043 
4044         case empty_format:
4045             errno = EPIPE;
4046             return -3;
4047 
4048         default:
4049             errno = EFTYPE;
4050             return -3;
4051         }
4052 
4053         pass_filter = (ret >= 0 && fp->filter)
4054             ? sam_passes_filter(h, b, fp->filter)
4055             : 1;
4056     } while (pass_filter == 0);
4057 
4058     return pass_filter < 0 ? -2 : ret;
4059 }
4060 
4061 
sam_format1_append(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)4062 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4063 {
4064     int i, r = 0;
4065     uint8_t *s, *end;
4066     const bam1_core_t *c = &b->core;
4067 
4068     if (c->l_qname == 0)
4069         return -1;
4070     r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4071     r |= kputc_('\t', str); // query name
4072     r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4073     if (c->tid >= 0) { // chr
4074         r |= kputs(h->target_name[c->tid] , str);
4075         r |= kputc_('\t', str);
4076     } else r |= kputsn_("*\t", 2, str);
4077     r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4078     r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4079     if (c->n_cigar) { // cigar
4080         uint32_t *cigar = bam_get_cigar(b);
4081         for (i = 0; i < c->n_cigar; ++i) {
4082             r |= kputw(bam_cigar_oplen(cigar[i]), str);
4083             r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4084         }
4085     } else r |= kputc_('*', str);
4086     r |= kputc_('\t', str);
4087     if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4088     else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4089     else {
4090         r |= kputs(h->target_name[c->mtid], str);
4091         r |= kputc_('\t', str);
4092     }
4093     r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4094     r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4095     if (c->l_qseq) { // seq and qual
4096         uint8_t *s = bam_get_seq(b);
4097         if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4098         char *cp = str->s + str->l;
4099 
4100         // Sequence, 2 bases at a time
4101         nibble2base(s, cp, c->l_qseq);
4102         cp[c->l_qseq] = '\t';
4103         cp += c->l_qseq+1;
4104 
4105         // Quality
4106         s = bam_get_qual(b);
4107         i = 0;
4108         if (s[0] == 0xff) {
4109             cp[i++] = '*';
4110         } else {
4111             // local copy of c->l_qseq to aid unrolling
4112             uint32_t lqseq = c->l_qseq;
4113             for (i = 0; i < lqseq; ++i)
4114                 cp[i]=s[i]+33;
4115         }
4116         cp[i] = 0;
4117         cp += i;
4118         str->l = cp - str->s;
4119     } else r |= kputsn_("*\t*", 3, str);
4120 
4121     s = bam_get_aux(b); // aux
4122     end = b->data + b->l_data;
4123 
4124     while (end - s >= 4) {
4125         r |= kputc_('\t', str);
4126         if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4127             goto bad_aux;
4128     }
4129     r |= kputsn("", 0, str); // nul terminate
4130     if (r < 0) goto mem_err;
4131 
4132     return str->l;
4133 
4134  bad_aux:
4135     hts_log_error("Corrupted aux data for read %.*s",
4136                   b->core.l_qname, bam_get_qname(b));
4137     errno = EINVAL;
4138     return -1;
4139 
4140  mem_err:
4141     hts_log_error("Out of memory");
4142     errno = ENOMEM;
4143     return -1;
4144 }
4145 
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)4146 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4147 {
4148     str->l = 0;
4149     return sam_format1_append(h, b, str);
4150 }
4151 
4152 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
fastq_format1(fastq_state * x,const bam1_t * b,kstring_t * str)4153 int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4154 {
4155     unsigned flag = b->core.flag;
4156     int i, e = 0, len = b->core.l_qseq;
4157     uint8_t *seq, *qual;
4158 
4159     str->l = 0;
4160 
4161     if (len == 0) return 0;
4162 
4163     // Name
4164     if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4165         return -1;
4166 
4167     // /1 or /2 suffix
4168     if (x && x->rnum && (flag & BAM_FPAIRED)) {
4169         int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4170         if (r12 == BAM_FREAD1) {
4171             if (kputs("/1", str) == EOF)
4172                 return -1;
4173         } else if (r12 == BAM_FREAD2) {
4174             if (kputs("/2", str) == EOF)
4175                 return -1;
4176         }
4177     }
4178 
4179     // Illumina CASAVA tag.
4180     // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4181     if (x && x->casava) {
4182         int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4183         char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4184         uint8_t *bc = bam_aux_get(b, x->BC);
4185         if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4186                      bc ? (char *)bc+1 : "0") < 0)
4187             return -1;
4188 
4189         // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4190         if (bc) {
4191             int l = strlen((char *)bc+1);
4192             char *c = (char *)str->s + str->l - l;
4193             for (i = 0; i < l; i++)
4194                 if (!isalpha_c(c[i]))
4195                     c[i] = '+';
4196         }
4197     }
4198 
4199     // Aux tags
4200     if (x && x->aux) {
4201         uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4202         while (s && end - s >= 4) {
4203             int tt = s[0]*256 + s[1];
4204             if (x->tags == NULL ||
4205                 kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4206                 e |= kputc_('\t', str) < 0;
4207                 if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4208                     return -1;
4209             } else {
4210                 s = skip_aux(s+2, end);
4211             }
4212         }
4213         e |= kputsn("", 0, str) < 0; // nul terminate
4214     }
4215 
4216     if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4217     e |= kputc_('\n', str) < 0;
4218 
4219     // Seq line
4220     seq = bam_get_seq(b);
4221     if (flag & BAM_FREVERSE)
4222         for (i = len-1; i >= 0; i--)
4223             e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4224     else
4225         for (i = 0; i < len; i++)
4226             e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4227 
4228 
4229     // Qual line
4230     if (x->nprefix == '@') {
4231         kputsn("\n+\n", 3, str);
4232         qual = bam_get_qual(b);
4233         if (qual[0] == 0xff)
4234             for (i = 0; i < len; i++)
4235                 e |= kputc_('B', str) < 0;
4236         else if (flag & BAM_FREVERSE)
4237             for (i = len-1; i >= 0; i--)
4238                 e |= kputc_(33 + qual[i], str) < 0;
4239         else
4240             for (i = 0; i < len; i++)
4241                 e |= kputc_(33 + qual[i], str) < 0;
4242 
4243     }
4244     e |= kputc('\n', str) < 0;
4245 
4246     return e ? -1 : str->l;
4247 }
4248 
4249 // Sadly we need to be able to modify the bam_hdr here so we can
4250 // reference count the structure.
sam_write1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)4251 int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4252 {
4253     switch (fp->format.format) {
4254     case binary_format:
4255         fp->format.category = sequence_data;
4256         fp->format.format = bam;
4257         /* fall-through */
4258     case bam:
4259         return bam_write_idx1(fp, h, b);
4260 
4261     case cram:
4262         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4263 
4264     case text_format:
4265         fp->format.category = sequence_data;
4266         fp->format.format = sam;
4267         /* fall-through */
4268     case sam:
4269         if (fp->state) {
4270             SAM_state *fd = (SAM_state *)fp->state;
4271 
4272             // Threaded output
4273             if (!fd->h) {
4274                 // NB: discard const.  We don't actually modify sam_hdr_t here,
4275                 // just data pointed to by it (which is a bit weasely still),
4276                 // but out cached pointer must be non-const as we want to
4277                 // destroy it later on and sam_hdr_destroy takes non-const.
4278                 //
4279                 // We do this because some tools do sam_hdr_destroy; sam_close
4280                 // while others do sam_close; sam_hdr_destroy.  The former is
4281                 // an issue as we need the header still when flushing.
4282                 fd->h = (sam_hdr_t *)h;
4283                 fd->h->ref_count++;
4284 
4285                 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4286                                    fp) != 0)
4287                     return -2;
4288                 fd->dispatcher_set = 1;
4289             }
4290 
4291             if (fd->h != h) {
4292                 hts_log_error("SAM multi-threaded decoding does not support changing header");
4293                 return -2;
4294             }
4295 
4296             // Find a suitable BAM array to copy to
4297             sp_bams *gb = fd->curr_bam;
4298             if (!gb) {
4299                 pthread_mutex_lock(&fd->lines_m);
4300                 if (fd->bams) {
4301                     fd->curr_bam = gb = fd->bams;
4302                     fd->bams = gb->next;
4303                     gb->next = NULL;
4304                     gb->nbams = 0;
4305                     pthread_mutex_unlock(&fd->lines_m);
4306                 } else {
4307                     pthread_mutex_unlock(&fd->lines_m);
4308                     if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4309                     if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) {
4310                         free(gb);
4311                         return -1;
4312                     }
4313                     gb->nbams = 0;
4314                     gb->abams = NB;
4315                     gb->fd = fd;
4316                     fd->curr_idx = 0;
4317                     fd->curr_bam = gb;
4318                 }
4319             }
4320 
4321             if (!bam_copy1(&gb->bams[gb->nbams++], b))
4322                 return -2;
4323 
4324             // Dispatch if full
4325             if (gb->nbams == NB) {
4326                 gb->serial = fd->serial++;
4327                 //fprintf(stderr, "Dispatch another %d bams\n", NB);
4328                 pthread_mutex_lock(&fd->command_m);
4329                 if (fd->errcode != 0) {
4330                     pthread_mutex_unlock(&fd->command_m);
4331                     return -fd->errcode;
4332                 }
4333                 if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4334                                         cleanup_sp_bams,
4335                                         cleanup_sp_lines, 0) < 0) {
4336                     pthread_mutex_unlock(&fd->command_m);
4337                     return -1;
4338                 }
4339                 pthread_mutex_unlock(&fd->command_m);
4340                 fd->curr_bam = NULL;
4341             }
4342 
4343             // Dummy value as we don't know how long it really is.
4344             // We could track file sizes via a SAM_state field, but I don't think
4345             // it is necessary.
4346             return 1;
4347         } else {
4348             if (sam_format1(h, b, &fp->line) < 0) return -1;
4349             kputc('\n', &fp->line);
4350             if (fp->is_bgzf) {
4351                 if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4352             } else {
4353                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4354             }
4355 
4356             if (fp->idx) {
4357                 if (fp->format.compression == bgzf) {
4358                     if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4359                                       bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4360                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4361                                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4362                         return -1;
4363                     }
4364                 } else {
4365                     if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4366                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4367                         hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4368                                 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4369                         return -1;
4370                     }
4371                 }
4372             }
4373 
4374             return fp->line.l;
4375         }
4376 
4377 
4378     case fasta_format:
4379     case fastq_format: {
4380         fastq_state *x = (fastq_state *)fp->state;
4381         if (!x) {
4382             if (!(fp->state = fastq_state_init(fp->format.format
4383                                                == fastq_format ? '@' : '>')))
4384                 return -2;
4385         }
4386 
4387         if (fastq_format1(fp->state, b, &fp->line) < 0)
4388             return -1;
4389         if (fp->is_bgzf) {
4390             if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4391                 return -1;
4392         } else {
4393             if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4394                 return -1;
4395         }
4396         return fp->line.l;
4397     }
4398 
4399     default:
4400         errno = EBADF;
4401         return -1;
4402     }
4403 }
4404 
4405 /************************
4406  *** Auxiliary fields ***
4407  ************************/
4408 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)4409 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4410     int tsz = aux_type2size(type);
4411 
4412     if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4413 
4414     switch (tsz) {
4415         case 'H': case 'Z': case 1:  // Trivial
4416             memcpy(out, in, len);
4417             break;
4418 
4419 #define aux_val_to_le(type_t, store_le) do {                            \
4420         type_t v;                                                       \
4421         size_t i;                                                       \
4422         for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4423             memcpy(&v, in + i, sizeof(type_t));                         \
4424             store_le(v, out);                                           \
4425         }                                                               \
4426     } while (0)
4427 
4428         case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4429         case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4430         case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4431 
4432 #undef aux_val_to_le
4433 
4434         case 'B': { // Recurse!
4435             uint32_t n;
4436             if (len < 5) return -1;
4437             memcpy(&n, in + 1, 4);
4438             out[0] = in[0];
4439             u32_to_le(n, out + 1);
4440             return aux_to_le(in[0], out + 5, in + 5, len - 5);
4441         }
4442 
4443         default: // Unknown type code
4444             return -1;
4445     }
4446 
4447 
4448 
4449     return 0;
4450 }
4451 #endif
4452 
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)4453 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4454 {
4455     uint32_t new_len;
4456 
4457     assert(b->l_data >= 0);
4458     new_len = b->l_data + 3 + len;
4459     if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4460 
4461     if (realloc_bam_data(b, new_len) < 0) return -1;
4462 
4463     b->data[b->l_data] = tag[0];
4464     b->data[b->l_data + 1] = tag[1];
4465     b->data[b->l_data + 2] = type;
4466 
4467 #ifdef HTS_LITTLE_ENDIAN
4468     memcpy(b->data + b->l_data + 3, data, len);
4469 #else
4470     if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4471         errno = EINVAL;
4472         return -1;
4473     }
4474 #endif
4475 
4476     b->l_data = new_len;
4477 
4478     return 0;
4479 
4480  nomem:
4481     errno = ENOMEM;
4482     return -1;
4483 }
4484 
skip_aux(uint8_t * s,uint8_t * end)4485 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4486 {
4487     int size;
4488     uint32_t n;
4489     if (s >= end) return end;
4490     size = aux_type2size(*s); ++s; // skip type
4491     switch (size) {
4492     case 'Z':
4493     case 'H':
4494         while (s < end && *s) ++s;
4495         return s < end ? s + 1 : end;
4496     case 'B':
4497         if (end - s < 5) return NULL;
4498         size = aux_type2size(*s); ++s;
4499         n = le_to_u32(s);
4500         s += 4;
4501         if (size == 0 || end - s < size * n) return NULL;
4502         return s + size * n;
4503     case 0:
4504         return NULL;
4505     default:
4506         if (end - s < size) return NULL;
4507         return s + size;
4508     }
4509 }
4510 
bam_aux_get(const bam1_t * b,const char tag[2])4511 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4512 {
4513     uint8_t *s, *end, *t = (uint8_t *) tag;
4514     uint16_t y = (uint16_t) t[0]<<8 | t[1];
4515     s = bam_get_aux(b);
4516     end = b->data + b->l_data;
4517     while (s != NULL && end - s >= 3) {
4518         uint16_t x = (uint16_t) s[0]<<8 | s[1];
4519         s += 2;
4520         if (x == y) {
4521             // Check the tag value is valid and complete
4522             uint8_t *e = skip_aux(s, end);
4523             if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
4524                 goto bad_aux;  // Unterminated string
4525             }
4526             if (e != NULL) {
4527                 return s;
4528             } else {
4529                 goto bad_aux;
4530             }
4531         }
4532         s = skip_aux(s, end);
4533     }
4534     if (s == NULL) goto bad_aux;
4535     errno = ENOENT;
4536     return NULL;
4537 
4538  bad_aux:
4539     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4540     errno = EINVAL;
4541     return NULL;
4542 }
4543 
4544 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)4545 int bam_aux_del(bam1_t *b, uint8_t *s)
4546 {
4547     uint8_t *p, *aux;
4548     int l_aux = bam_get_l_aux(b);
4549     aux = bam_get_aux(b);
4550     p = s - 2;
4551     s = skip_aux(s, aux + l_aux);
4552     if (s == NULL) goto bad_aux;
4553     memmove(p, s, l_aux - (s - aux));
4554     b->l_data -= s - p;
4555     return 0;
4556 
4557  bad_aux:
4558     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4559     errno = EINVAL;
4560     return -1;
4561 }
4562 
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)4563 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4564 {
4565     // FIXME: This is not at all efficient!
4566     size_t ln = len >= 0 ? len : strlen(data) + 1;
4567     size_t old_ln = 0;
4568     int need_nul = ln == 0 || data[ln - 1] != '\0';
4569     int save_errno = errno;
4570     int new_tag = 0;
4571     uint8_t *s = bam_aux_get(b,tag), *e;
4572 
4573     if (s) {  // Replacing existing tag
4574         char type = *s;
4575         if (type != 'Z') {
4576             hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4577             errno = EINVAL;
4578             return -1;
4579         }
4580         s++;
4581         e = memchr(s, '\0', b->data + b->l_data - s);
4582         old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4583         s -= 3;
4584     } else {
4585         if (errno != ENOENT) { // Invalid aux data, give up
4586             return -1;
4587         } else { // Tag doesn't exist - put it on the end
4588             errno = save_errno;
4589             s = b->data + b->l_data;
4590             new_tag = 3;
4591         }
4592     }
4593 
4594     if (old_ln < ln + need_nul + new_tag) {
4595         ptrdiff_t s_offset = s - b->data;
4596         if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4597             return -1;
4598         s = b->data + s_offset;
4599     }
4600     if (!new_tag) {
4601         memmove(s + 3 + ln + need_nul,
4602                 s + 3 + old_ln,
4603                 b->l_data - (s + 3 - b->data) - old_ln);
4604     }
4605     b->l_data += new_tag + ln + need_nul - old_ln;
4606 
4607     s[0] = tag[0];
4608     s[1] = tag[1];
4609     s[2] = 'Z';
4610     memmove(s+3,data,ln);
4611     if (need_nul) s[3 + ln] = '\0';
4612     return 0;
4613 }
4614 
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)4615 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4616 {
4617     uint32_t sz, old_sz = 0, new = 0;
4618     uint8_t *s, type;
4619 
4620     if (val < INT32_MIN || val > UINT32_MAX) {
4621         errno = EOVERFLOW;
4622         return -1;
4623     }
4624     if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4625     else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4626     else if (val < 0)          { type = 'c'; sz = 1; }
4627     else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4628     else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4629     else                       { type = 'I'; sz = 4; }
4630 
4631     s = bam_aux_get(b, tag);
4632     if (s) {  // Tag present - how big was the old one?
4633         switch (*s) {
4634             case 'c': case 'C': old_sz = 1; break;
4635             case 's': case 'S': old_sz = 2; break;
4636             case 'i': case 'I': old_sz = 4; break;
4637             default: errno = EINVAL; return -1;  // Not an integer
4638         }
4639     } else {
4640         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4641             s = b->data + b->l_data;
4642             new = 1;
4643         }  else { // Invalid aux data, give up.
4644             return -1;
4645         }
4646     }
4647 
4648     if (new || old_sz < sz) {
4649         // Make room for new tag
4650         ptrdiff_t s_offset = s - b->data;
4651         if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4652             return -1;
4653         s =  b->data + s_offset;
4654         if (new) { // Add tag id
4655             *s++ = tag[0];
4656             *s++ = tag[1];
4657         } else {   // Shift following data so we have space
4658             memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4659         }
4660     } else {
4661         // Reuse old space.  Data value may be bigger than necessary but
4662         // we avoid having to move everything else
4663         sz = old_sz;
4664         type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4665         assert(type > 0);
4666     }
4667     *s++ = type;
4668 #ifdef HTS_LITTLE_ENDIAN
4669     memcpy(s, &val, sz);
4670 #else
4671     switch (sz) {
4672         case 4:  u32_to_le(val, s); break;
4673         case 2:  u16_to_le(val, s); break;
4674         default: *s = val; break;
4675     }
4676 #endif
4677     b->l_data += (new ? 3 : 0) + sz - old_sz;
4678     return 0;
4679 }
4680 
bam_aux_update_float(bam1_t * b,const char tag[2],float val)4681 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
4682 {
4683     uint8_t *s = bam_aux_get(b, tag);
4684     int shrink = 0, new = 0;
4685 
4686     if (s) { // Tag present - what was it?
4687         switch (*s) {
4688             case 'f': break;
4689             case 'd': shrink = 1; break;
4690             default: errno = EINVAL; return -1;  // Not a float
4691         }
4692     } else {
4693         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4694             new = 1;
4695         }  else { // Invalid aux data, give up.
4696             return -1;
4697         }
4698     }
4699 
4700     if (new) { // Ensure there's room
4701         if (possibly_expand_bam_data(b, 3 + 4) < 0)
4702             return -1;
4703         s = b->data + b->l_data;
4704         *s++ = tag[0];
4705         *s++ = tag[1];
4706     } else if (shrink) { // Convert non-standard double tag to float
4707         memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
4708         b->l_data -= 4;
4709     }
4710     *s++ = 'f';
4711     float_to_le(val, s);
4712     if (new) b->l_data += 7;
4713 
4714     return 0;
4715 }
4716 
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)4717 int bam_aux_update_array(bam1_t *b, const char tag[2],
4718                          uint8_t type, uint32_t items, void *data)
4719 {
4720     uint8_t *s = bam_aux_get(b, tag);
4721     size_t old_sz = 0, new_sz;
4722     int new = 0;
4723 
4724     if (s) { // Tag present
4725         if (*s != 'B') { errno = EINVAL; return -1; }
4726         old_sz = aux_type2size(s[1]);
4727         if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
4728         old_sz *= le_to_u32(s + 2);
4729     } else {
4730         if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4731             s = b->data + b->l_data;
4732             new = 1;
4733         }  else { // Invalid aux data, give up.
4734             return -1;
4735         }
4736     }
4737 
4738     new_sz = aux_type2size(type);
4739     if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
4740     if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
4741     new_sz *= items;
4742 
4743     if (new || old_sz < new_sz) {
4744         // Make room for new tag
4745         ptrdiff_t s_offset = s - b->data;
4746         if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
4747             return -1;
4748         s =  b->data + s_offset;
4749     }
4750     if (new) { // Add tag id and type
4751         *s++ = tag[0];
4752         *s++ = tag[1];
4753         *s = 'B';
4754         b->l_data += 8 + new_sz;
4755     } else if (old_sz != new_sz) { // shift following data if necessary
4756         memmove(s + 6 + new_sz, s + 6 + old_sz,
4757                 b->l_data - ((s + 6 + old_sz) - b->data));
4758         b->l_data -= old_sz;
4759         b->l_data += new_sz;
4760     }
4761 
4762     s[1] = type;
4763     u32_to_le(items, s + 2);
4764 #ifdef HTS_LITTLE_ENDIAN
4765     memcpy(s + 6, data, new_sz);
4766     return 0;
4767 #else
4768     return aux_to_le(type, s + 6, data, new_sz);
4769 #endif
4770 }
4771 
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)4772 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
4773                                       uint32_t idx)
4774 {
4775     switch (type) {
4776         case 'c': return le_to_i8(s + idx);
4777         case 'C': return s[idx];
4778         case 's': return le_to_i16(s + 2 * idx);
4779         case 'S': return le_to_u16(s + 2 * idx);
4780         case 'i': return le_to_i32(s + 4 * idx);
4781         case 'I': return le_to_u32(s + 4 * idx);
4782         default:
4783             errno = EINVAL;
4784             return 0;
4785     }
4786 }
4787 
bam_aux2i(const uint8_t * s)4788 int64_t bam_aux2i(const uint8_t *s)
4789 {
4790     int type;
4791     type = *s++;
4792     return get_int_aux_val(type, s, 0);
4793 }
4794 
bam_aux2f(const uint8_t * s)4795 double bam_aux2f(const uint8_t *s)
4796 {
4797     int type;
4798     type = *s++;
4799     if (type == 'd') return le_to_double(s);
4800     else if (type == 'f') return le_to_float(s);
4801     else return get_int_aux_val(type, s, 0);
4802 }
4803 
bam_aux2A(const uint8_t * s)4804 char bam_aux2A(const uint8_t *s)
4805 {
4806     int type;
4807     type = *s++;
4808     if (type == 'A') return *(char*)s;
4809     errno = EINVAL;
4810     return 0;
4811 }
4812 
bam_aux2Z(const uint8_t * s)4813 char *bam_aux2Z(const uint8_t *s)
4814 {
4815     int type;
4816     type = *s++;
4817     if (type == 'Z' || type == 'H') return (char*)s;
4818     errno = EINVAL;
4819     return 0;
4820 }
4821 
bam_auxB_len(const uint8_t * s)4822 uint32_t bam_auxB_len(const uint8_t *s)
4823 {
4824     if (s[0] != 'B') {
4825         errno = EINVAL;
4826         return 0;
4827     }
4828     return le_to_u32(s + 2);
4829 }
4830 
bam_auxB2i(const uint8_t * s,uint32_t idx)4831 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
4832 {
4833     uint32_t len = bam_auxB_len(s);
4834     if (idx >= len) {
4835         errno = ERANGE;
4836         return 0;
4837     }
4838     return get_int_aux_val(s[1], s + 6, idx);
4839 }
4840 
bam_auxB2f(const uint8_t * s,uint32_t idx)4841 double bam_auxB2f(const uint8_t *s, uint32_t idx)
4842 {
4843     uint32_t len = bam_auxB_len(s);
4844     if (idx >= len) {
4845         errno = ERANGE;
4846         return 0.0;
4847     }
4848     if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
4849     else return get_int_aux_val(s[1], s + 6, idx);
4850 }
4851 
sam_open_mode(char * mode,const char * fn,const char * format)4852 int sam_open_mode(char *mode, const char *fn, const char *format)
4853 {
4854     // TODO Parse "bam5" etc for compression level
4855     if (format == NULL) {
4856         // Try to pick a format based on the filename extension
4857         char extension[HTS_MAX_EXT_LEN];
4858         if (find_file_extension(fn, extension) < 0) return -1;
4859         return sam_open_mode(mode, fn, extension);
4860     }
4861     else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
4862     else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
4863     else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
4864     else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
4865     else if (strcasecmp(format, "fastq") == 0 ||
4866              strcasecmp(format, "fq") == 0) strcpy(mode, "f");
4867     else if (strcasecmp(format, "fastq.gz") == 0 ||
4868              strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
4869     else if (strcasecmp(format, "fasta") == 0 ||
4870              strcasecmp(format, "fa") == 0) strcpy(mode, "F");
4871     else if (strcasecmp(format, "fasta.gz") == 0 ||
4872              strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
4873     else return -1;
4874 
4875     return 0;
4876 }
4877 
4878 // A version of sam_open_mode that can handle ,key=value options.
4879 // The format string is allocated and returned, to be freed by the caller.
4880 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)4881 char *sam_open_mode_opts(const char *fn,
4882                          const char *mode,
4883                          const char *format)
4884 {
4885     char *mode_opts = malloc((format ? strlen(format) : 1) +
4886                              (mode   ? strlen(mode)   : 1) + 12);
4887     char *opts, *cp;
4888     int format_len;
4889 
4890     if (!mode_opts)
4891         return NULL;
4892 
4893     strcpy(mode_opts, mode ? mode : "r");
4894     cp = mode_opts + strlen(mode_opts);
4895 
4896     if (format == NULL) {
4897         // Try to pick a format based on the filename extension
4898         char extension[HTS_MAX_EXT_LEN];
4899         if (find_file_extension(fn, extension) < 0) {
4900             free(mode_opts);
4901             return NULL;
4902         }
4903         if (sam_open_mode(cp, fn, extension) == 0) {
4904             return mode_opts;
4905         } else {
4906             free(mode_opts);
4907             return NULL;
4908         }
4909     }
4910 
4911     if ((opts = strchr(format, ','))) {
4912         format_len = opts-format;
4913     } else {
4914         opts="";
4915         format_len = strlen(format);
4916     }
4917 
4918     if (strncmp(format, "bam", format_len) == 0) {
4919         *cp++ = 'b';
4920     } else if (strncmp(format, "cram", format_len) == 0) {
4921         *cp++ = 'c';
4922     } else if (strncmp(format, "cram2", format_len) == 0) {
4923         *cp++ = 'c';
4924         strcpy(cp, ",VERSION=2.1");
4925         cp += 12;
4926     } else if (strncmp(format, "cram3", format_len) == 0) {
4927         *cp++ = 'c';
4928         strcpy(cp, ",VERSION=3.0");
4929         cp += 12;
4930     } else if (strncmp(format, "sam", format_len) == 0) {
4931         ; // format mode=""
4932     } else if (strncmp(format, "sam.gz", format_len) == 0) {
4933         *cp++ = 'z';
4934     } else if (strncmp(format, "fastq", format_len) == 0 ||
4935                strncmp(format, "fq", format_len) == 0) {
4936         *cp++ = 'f';
4937     } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
4938                strncmp(format, "fq.gz", format_len) == 0) {
4939         *cp++ = 'f';
4940         *cp++ = 'z';
4941     } else if (strncmp(format, "fasta", format_len) == 0 ||
4942                strncmp(format, "fa", format_len) == 0) {
4943         *cp++ = 'F';
4944     } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
4945                strncmp(format, "fa", format_len) == 0) {
4946         *cp++ = 'F';
4947         *cp++ = 'z';
4948     } else {
4949         free(mode_opts);
4950         return NULL;
4951     }
4952 
4953     strcpy(cp, opts);
4954 
4955     return mode_opts;
4956 }
4957 
4958 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)4959 int bam_str2flag(const char *str)
4960 {
4961     char *end, *beg = (char*) str;
4962     long int flag = strtol(str, &end, 0);
4963     if ( end!=str ) return flag;    // the conversion was successful
4964     flag = 0;
4965     while ( *str )
4966     {
4967         end = beg;
4968         while ( *end && *end!=',' ) end++;
4969         if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
4970         else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
4971         else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
4972         else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
4973         else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
4974         else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
4975         else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
4976         else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
4977         else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
4978         else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
4979         else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
4980         else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
4981         else return -1;
4982         if ( !*end ) break;
4983         beg = end + 1;
4984     }
4985     return flag;
4986 }
4987 
bam_flag2str(int flag)4988 char *bam_flag2str(int flag)
4989 {
4990     kstring_t str = {0,0,0};
4991     if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
4992     if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
4993     if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
4994     if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
4995     if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
4996     if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
4997     if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
4998     if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
4999     if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5000     if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5001     if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5002     if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5003     if ( str.l == 0 ) kputsn("", 0, &str);
5004     return str.s;
5005 }
5006 
5007 
5008 /**************************
5009  *** Pileup and Mpileup ***
5010  **************************/
5011 
5012 #if !defined(BAM_NO_PILEUP)
5013 
5014 #include <assert.h>
5015 
5016 /*******************
5017  *** Memory pool ***
5018  *******************/
5019 
5020 typedef struct {
5021     int k, y;
5022     hts_pos_t x, end;
5023 } cstate_t;
5024 
5025 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5026 
5027 typedef struct __linkbuf_t {
5028     bam1_t b;
5029     hts_pos_t beg, end;
5030     cstate_t s;
5031     struct __linkbuf_t *next;
5032     bam_pileup_cd cd;
5033 } lbnode_t;
5034 
5035 typedef struct {
5036     int cnt, n, max;
5037     lbnode_t **buf;
5038 } mempool_t;
5039 
mp_init(void)5040 static mempool_t *mp_init(void)
5041 {
5042     mempool_t *mp;
5043     mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5044     return mp;
5045 }
mp_destroy(mempool_t * mp)5046 static void mp_destroy(mempool_t *mp)
5047 {
5048     int k;
5049     for (k = 0; k < mp->n; ++k) {
5050         free(mp->buf[k]->b.data);
5051         free(mp->buf[k]);
5052     }
5053     free(mp->buf);
5054     free(mp);
5055 }
mp_alloc(mempool_t * mp)5056 static inline lbnode_t *mp_alloc(mempool_t *mp)
5057 {
5058     ++mp->cnt;
5059     if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5060     else return mp->buf[--mp->n];
5061 }
mp_free(mempool_t * mp,lbnode_t * p)5062 static inline void mp_free(mempool_t *mp, lbnode_t *p)
5063 {
5064     --mp->cnt; p->next = 0; // clear lbnode_t::next here
5065     if (mp->n == mp->max) {
5066         mp->max = mp->max? mp->max<<1 : 256;
5067         mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5068     }
5069     mp->buf[mp->n++] = p;
5070 }
5071 
5072 /**********************
5073  *** CIGAR resolver ***
5074  **********************/
5075 
5076 /* s->k: the index of the CIGAR operator that has just been processed.
5077    s->x: the reference coordinate of the start of s->k
5078    s->y: the query coordinate of the start of s->k
5079  */
resolve_cigar2(bam_pileup1_t * p,hts_pos_t pos,cstate_t * s)5080 static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5081 {
5082 #define _cop(c) ((c)&BAM_CIGAR_MASK)
5083 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5084 
5085     bam1_t *b = p->b;
5086     bam1_core_t *c = &b->core;
5087     uint32_t *cigar = bam_get_cigar(b);
5088     int k;
5089     // determine the current CIGAR operation
5090     //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5091     if (s->k == -1) { // never processed
5092         p->qpos = 0;
5093         if (c->n_cigar == 1) { // just one operation, save a loop
5094           if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5095         } else { // find the first match or deletion
5096             for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5097                 int op = _cop(cigar[k]);
5098                 int l = _cln(cigar[k]);
5099                 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5100                     op == BAM_CEQUAL || op == BAM_CDIFF) break;
5101                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5102             }
5103             assert(k < c->n_cigar);
5104             s->k = k;
5105         }
5106     } else { // the read has been processed before
5107         int op, l = _cln(cigar[s->k]);
5108         if (pos - s->x >= l) { // jump to the next operation
5109             assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5110             op = _cop(cigar[s->k+1]);
5111             if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5112               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5113                 s->x += l;
5114                 ++s->k;
5115             } else { // find the next M/D/N/=/X
5116               if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5117                 s->x += l;
5118                 for (k = s->k + 1; k < c->n_cigar; ++k) {
5119                     op = _cop(cigar[k]), l = _cln(cigar[k]);
5120                     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5121                     else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5122                 }
5123                 s->k = k;
5124             }
5125             assert(s->k < c->n_cigar); // otherwise a bug
5126         } // else, do nothing
5127     }
5128     { // collect pileup information
5129         int op, l;
5130         op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5131         p->is_del = p->indel = p->is_refskip = 0;
5132         if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5133             int op2 = _cop(cigar[s->k+1]);
5134             int l2 = _cln(cigar[s->k+1]);
5135             if (op2 == BAM_CDEL) p->indel = -(int)l2;
5136             else if (op2 == BAM_CINS) p->indel = l2;
5137             else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
5138                 int l3 = 0;
5139                 for (k = s->k + 2; k < c->n_cigar; ++k) {
5140                     op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5141                     if (op2 == BAM_CINS) l3 += l2;
5142                     else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5143                 }
5144                 if (l3 > 0) p->indel = l3;
5145             }
5146         }
5147         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5148             p->qpos = s->y + (pos - s->x);
5149         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5150             p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5151             p->is_refskip = (op == BAM_CREF_SKIP);
5152         } // cannot be other operations; otherwise a bug
5153         p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5154     }
5155     p->cigar_ind = s->k;
5156     return 1;
5157 }
5158 
5159 /*******************************
5160  *** Expansion of insertions ***
5161  *******************************/
5162 
5163 /*
5164  * Fills out the kstring with the padded insertion sequence for the current
5165  * location in 'p'.  If this is not an insertion site, the string is blank.
5166  *
5167  * This variant handles base modifications, but only when "m" is non-NULL.
5168  *
5169  * Returns the number of inserted base on success, with string length being
5170  *        accessable via ins->l;
5171  *        -1 on failure.
5172  */
bam_plp_insertion_mod(const bam_pileup1_t * p,hts_base_mod_state * m,kstring_t * ins,int * del_len)5173 int bam_plp_insertion_mod(const bam_pileup1_t *p,
5174                           hts_base_mod_state *m,
5175                           kstring_t *ins, int *del_len) {
5176     int j, k, indel, nb = 0;
5177     uint32_t *cigar;
5178 
5179     if (p->indel <= 0) {
5180         if (ks_resize(ins, 1) < 0)
5181             return -1;
5182         ins->l = 0;
5183         ins->s[0] = '\0';
5184         return 0;
5185     }
5186 
5187     if (del_len)
5188         *del_len = 0;
5189 
5190     // Measure indel length including pads
5191     indel = 0;
5192     k = p->cigar_ind+1;
5193     cigar = bam_get_cigar(p->b);
5194     while (k < p->b->core.n_cigar) {
5195         switch (cigar[k] & BAM_CIGAR_MASK) {
5196         case BAM_CPAD:
5197         case BAM_CINS:
5198             indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5199             break;
5200         default:
5201             k = p->b->core.n_cigar;
5202             break;
5203         }
5204         k++;
5205     }
5206     nb = ins->l = indel;
5207 
5208     // Produce sequence
5209     if (ks_resize(ins, indel+1) < 0)
5210         return -1;
5211     indel = 0;
5212     k = p->cigar_ind+1;
5213     j = 1;
5214     while (k < p->b->core.n_cigar) {
5215         int l, c;
5216         switch (cigar[k] & BAM_CIGAR_MASK) {
5217         case BAM_CPAD:
5218             for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5219                 ins->s[indel++] = '*';
5220             break;
5221         case BAM_CINS:
5222             for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5223                 c = seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5224                                           p->qpos + j - p->is_del)];
5225                 ins->s[indel++] = c;
5226                 int nm;
5227                 hts_base_mod mod[256];
5228                 if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5229                                                 m, mod, 256)) > 0) {
5230                     if (ks_resize(ins, ins->l + nm*16+3) < 0)
5231                         return -1;
5232                     ins->s[indel++] = '[';
5233                     int j;
5234                     for (j = 0; j < nm; j++) {
5235                         char qual[20];
5236                         if (mod[j].qual >= 0)
5237                             sprintf(qual, "%d", mod[j].qual);
5238                         else
5239                             *qual=0;
5240                         if (mod[j].modified_base < 0)
5241                             // ChEBI
5242                             indel += sprintf(&ins->s[indel], "%c(%d)%s",
5243                                              "+-"[mod[j].strand],
5244                                              -mod[j].modified_base,
5245                                              qual);
5246                         else
5247                             indel += sprintf(&ins->s[indel], "%c%c%s",
5248                                              "+-"[mod[j].strand],
5249                                              mod[j].modified_base,
5250                                              qual);
5251                     }
5252                     ins->s[indel++] = ']';
5253                 }
5254             }
5255             break;
5256         case BAM_CDEL:
5257             // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5258             if (del_len)
5259                 *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5260             // fall through
5261         default:
5262             k = p->b->core.n_cigar;
5263             break;
5264         }
5265         k++;
5266     }
5267     ins->s[indel] = '\0';
5268     ins->l = indel; // string length
5269 
5270     return nb;      // base length
5271 }
5272 
5273 /*
5274  * Fills out the kstring with the padded insertion sequence for the current
5275  * location in 'p'.  If this is not an insertion site, the string is blank.
5276  *
5277  * This is the original interface with no capability for reporting base
5278  * modifications.
5279  *
5280  * Returns the length of insertion string on success;
5281  *        -1 on failure.
5282  */
bam_plp_insertion(const bam_pileup1_t * p,kstring_t * ins,int * del_len)5283 int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5284     return bam_plp_insertion_mod(p, NULL, ins, del_len);
5285 }
5286 
5287 /***********************
5288  *** Pileup iterator ***
5289  ***********************/
5290 
5291 // Dictionary of overlapping reads
5292 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5293 typedef khash_t(olap_hash) olap_hash_t;
5294 
5295 struct bam_plp_s {
5296     mempool_t *mp;
5297     lbnode_t *head, *tail;
5298     int32_t tid, max_tid;
5299     hts_pos_t pos, max_pos;
5300     int is_eof, max_plp, error, maxcnt;
5301     uint64_t id;
5302     bam_pileup1_t *plp;
5303     // for the "auto" interface only
5304     bam1_t *b;
5305     bam_plp_auto_f func;
5306     void *data;
5307     olap_hash_t *overlaps;
5308 
5309     // For notification of creation and destruction events
5310     // and associated client-owned pointer.
5311     int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5312     int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5313 };
5314 
bam_plp_init(bam_plp_auto_f func,void * data)5315 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5316 {
5317     bam_plp_t iter;
5318     iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5319     iter->mp = mp_init();
5320     iter->head = iter->tail = mp_alloc(iter->mp);
5321     iter->max_tid = iter->max_pos = -1;
5322     iter->maxcnt = 8000;
5323     if (func) {
5324         iter->func = func;
5325         iter->data = data;
5326         iter->b = bam_init1();
5327     }
5328     return iter;
5329 }
5330 
bam_plp_init_overlaps(bam_plp_t iter)5331 int bam_plp_init_overlaps(bam_plp_t iter)
5332 {
5333     iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5334     return iter->overlaps ? 0 : -1;
5335 }
5336 
bam_plp_destroy(bam_plp_t iter)5337 void bam_plp_destroy(bam_plp_t iter)
5338 {
5339     lbnode_t *p, *pnext;
5340     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5341     for (p = iter->head; p != NULL; p = pnext) {
5342         pnext = p->next;
5343         mp_free(iter->mp, p);
5344     }
5345     mp_destroy(iter->mp);
5346     if (iter->b) bam_destroy1(iter->b);
5347     free(iter->plp);
5348     free(iter);
5349 }
5350 
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))5351 void bam_plp_constructor(bam_plp_t plp,
5352                          int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5353     plp->plp_construct = func;
5354 }
5355 
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))5356 void bam_plp_destructor(bam_plp_t plp,
5357                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5358     plp->plp_destruct = func;
5359 }
5360 
5361 //---------------------------------
5362 //---  Tweak overlapping reads
5363 //---------------------------------
5364 
5365 /**
5366  *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5367  *  cigar_iref2iseq_next() - get the next CMATCH base
5368  *  @cigar:       pointer to current cigar block (rw)
5369  *  @cigar_max:   pointer just beyond the last cigar block
5370  *  @icig:        position within the current cigar block (rw)
5371  *  @iseq:        position in the sequence (rw)
5372  *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5373  *
5374  *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5375  *  or -2 on error.
5376  */
cigar_iref2iseq_set(const uint32_t ** cigar,const uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)5377 static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5378                                       const uint32_t *cigar_max,
5379                                       hts_pos_t *icig,
5380                                       hts_pos_t *iseq,
5381                                       hts_pos_t *iref)
5382 {
5383     hts_pos_t pos = *iref;
5384     if ( pos < 0 ) return -1;
5385     *icig = 0;
5386     *iseq = 0;
5387     *iref = 0;
5388     while ( *cigar<cigar_max )
5389     {
5390         int cig  = (**cigar) & BAM_CIGAR_MASK;
5391         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5392 
5393         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5394         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5395         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5396         {
5397             pos -= ncig;
5398             if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5399             (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5400             continue;
5401         }
5402         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5403         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5404         {
5405             pos -= ncig;
5406             if ( pos<0 ) pos = 0;
5407             (*cigar)++; *icig = 0; *iref += ncig;
5408             continue;
5409         }
5410         hts_log_error("Unexpected cigar %d", cig);
5411         return -2;
5412     }
5413     *iseq = -1;
5414     return -1;
5415 }
cigar_iref2iseq_next(const uint32_t ** cigar,const uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)5416 static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5417                                        const uint32_t *cigar_max,
5418                                        hts_pos_t *icig,
5419                                        hts_pos_t *iseq,
5420                                        hts_pos_t *iref)
5421 {
5422     while ( *cigar < cigar_max )
5423     {
5424         int cig  = (**cigar) & BAM_CIGAR_MASK;
5425         int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5426 
5427         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5428         {
5429             if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5430             (*iseq)++; (*icig)++; (*iref)++;
5431             return BAM_CMATCH;
5432         }
5433         if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5434         if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5435         if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5436         if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5437         hts_log_error("Unexpected cigar %d", cig);
5438         return -2;
5439     }
5440     *iseq = -1;
5441     *iref = -1;
5442     return -1;
5443 }
5444 
5445 // Given overlapping read 'a' (left) and 'b' (right) on the same
5446 // template, adjust quality values to zero for either a or b.
5447 // Note versions 1.12 and earlier always removed quality from 'b' for
5448 // matching bases.  Now we select a or b semi-randomly based on name hash.
5449 // Returns 0 on success,
5450 //        -1 on failure
tweak_overlap_quality(bam1_t * a,bam1_t * b)5451 static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5452 {
5453     const uint32_t *a_cigar = bam_get_cigar(a),
5454         *a_cigar_max = a_cigar + a->core.n_cigar;
5455     const uint32_t *b_cigar = bam_get_cigar(b),
5456         *b_cigar_max = b_cigar + b->core.n_cigar;
5457     hts_pos_t a_icig = 0, a_iseq = 0;
5458     hts_pos_t b_icig = 0, b_iseq = 0;
5459     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5460     uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5461 
5462     hts_pos_t iref   = b->core.pos;
5463     hts_pos_t a_iref = iref - a->core.pos;
5464     hts_pos_t b_iref = iref - b->core.pos;
5465 
5466     int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5467                                     &a_icig, &a_iseq, &a_iref);
5468     if ( a_ret<0 )
5469         // no overlap or error
5470         return a_ret<-1 ? -1:0;
5471 
5472     int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5473                                     &b_icig, &b_iseq, &b_iref);
5474     if ( b_ret<0 )
5475         // no overlap or error
5476         return b_ret<-1 ? -1:0;
5477 
5478     // Determine which seq is the one getting modified qualities.
5479     uint8_t amul, bmul;
5480     if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5481         amul = 1;
5482         bmul = 0;
5483     } else {
5484         amul = 0;
5485         bmul = 1;
5486     }
5487 
5488     // Loop over the overlapping region nulling qualities in either
5489     // seq a or b.
5490     int err = 0;
5491     while ( 1 )
5492     {
5493         // Step to next matching reference position in a and b
5494         while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5495             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5496                                          &a_icig, &a_iseq, &a_iref);
5497         if ( a_ret<0 ) { // done
5498             err = a_ret<-1?-1:0;
5499             break;
5500         }
5501         if ( iref < a_iref + a->core.pos )
5502             iref = a_iref + a->core.pos;
5503 
5504         while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5505             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5506                                          &b_iseq, &b_iref);
5507         if ( b_ret<0 ) { // done
5508             err = b_ret<-1?-1:0;
5509             break;
5510         }
5511         if ( iref < b_iref + b->core.pos )
5512             iref = b_iref + b->core.pos;
5513 
5514         iref++;
5515 
5516         if ( a_iref+a->core.pos != b_iref+b->core.pos )
5517             // only CMATCH positions, don't know what to do with indels
5518             continue;
5519 
5520         if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5521             // Fell off end of sequence, bad CIGAR?
5522             return -1;
5523 
5524         // We're finally at the same ref base in both a and b.
5525         // Check if the bases match (confident) or mismatch
5526         // (not so confident).
5527         if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5528             // We are very confident about this base.  Use sum of quals
5529             int qual = a_qual[a_iseq] + b_qual[b_iseq];
5530             a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5531             b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5532         } else {
5533             // Not so confident about anymore given the mismatch.
5534             // Reduce qual for lowest quality base.
5535             if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5536                 // A highest qual base; keep
5537                 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5538                 b_qual[b_iseq] = 0;
5539             } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5540                 // B highest qual base; keep
5541                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5542                 a_qual[a_iseq] = 0;
5543             } else {
5544                 // Both equal, so pick randomly
5545                 a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5546                 b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5547             }
5548         }
5549     }
5550 
5551     return err;
5552 }
5553 
5554 // Fix overlapping reads. Simple soft-clipping did not give good results.
5555 // Lowering qualities of unwanted bases is more selective and works better.
5556 //
5557 // Returns 0 on success, -1 on failure
overlap_push(bam_plp_t iter,lbnode_t * node)5558 static int overlap_push(bam_plp_t iter, lbnode_t *node)
5559 {
5560     if ( !iter->overlaps ) return 0;
5561 
5562     // mapped mates and paired reads only
5563     if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5564 
5565     // no overlap possible, unless some wild cigar
5566     if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5567          || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5568          && node->b.core.mpos >= node->end) // for those wild cigars
5569        ) return 0;
5570 
5571     khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5572     if ( kitr==kh_end(iter->overlaps) )
5573     {
5574         // Only add reads where the mate is still to arrive
5575         if (node->b.core.mpos >= node->b.core.pos ||
5576             ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5577             int ret;
5578             kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5579             if (ret < 0) return -1;
5580             kh_value(iter->overlaps, kitr) = node;
5581         }
5582     }
5583     else
5584     {
5585         lbnode_t *a = kh_value(iter->overlaps, kitr);
5586         int err = tweak_overlap_quality(&a->b, &node->b);
5587         kh_del(olap_hash, iter->overlaps, kitr);
5588         assert(a->end-1 == a->s.end);
5589         return err;
5590     }
5591     return 0;
5592 }
5593 
overlap_remove(bam_plp_t iter,const bam1_t * b)5594 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5595 {
5596     if ( !iter->overlaps ) return;
5597 
5598     khiter_t kitr;
5599     if ( b )
5600     {
5601         kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5602         if ( kitr!=kh_end(iter->overlaps) )
5603             kh_del(olap_hash, iter->overlaps, kitr);
5604     }
5605     else
5606     {
5607         // remove all
5608         for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5609             if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5610     }
5611 }
5612 
5613 
5614 
5615 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5616 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
5617 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp64_next(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)5618 const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5619 {
5620     if (iter->error) { *_n_plp = -1; return NULL; }
5621     *_n_plp = 0;
5622     if (iter->is_eof && iter->head == iter->tail) return NULL;
5623     while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
5624         int n_plp = 0;
5625         // write iter->plp at iter->pos
5626         lbnode_t **pptr = &iter->head;
5627         while (*pptr != iter->tail) {
5628             lbnode_t *p = *pptr;
5629             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
5630                 overlap_remove(iter, &p->b);
5631                 if (iter->plp_destruct)
5632                     iter->plp_destruct(iter->data, &p->b, &p->cd);
5633                 *pptr = p->next; mp_free(iter->mp, p);
5634             }
5635             else {
5636                 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
5637                     if (n_plp == iter->max_plp) { // then double the capacity
5638                         iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
5639                         iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
5640                     }
5641                     iter->plp[n_plp].b = &p->b;
5642                     iter->plp[n_plp].cd = p->cd;
5643                     if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
5644                 }
5645                 pptr = &(*pptr)->next;
5646             }
5647         }
5648         *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
5649         // update iter->tid and iter->pos
5650         if (iter->head != iter->tail) {
5651             if (iter->tid > iter->head->b.core.tid) {
5652                 hts_log_error("Unsorted input. Pileup aborts");
5653                 iter->error = 1;
5654                 *_n_plp = -1;
5655                 return NULL;
5656             }
5657         }
5658         if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
5659             iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
5660         } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
5661             iter->pos = iter->head->beg; // jump to the next position
5662         } else ++iter->pos; // scan contiguously
5663         // return
5664         if (n_plp) return iter->plp;
5665         if (iter->is_eof && iter->head == iter->tail) break;
5666     }
5667     return NULL;
5668 }
5669 
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)5670 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
5671 {
5672     hts_pos_t pos64 = 0;
5673     const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
5674     if (pos64 < INT_MAX) {
5675         *_pos = pos64;
5676     } else {
5677         hts_log_error("Position %"PRId64" too large", pos64);
5678         *_pos = INT_MAX;
5679         iter->error = 1;
5680         *_n_plp = -1;
5681         return NULL;
5682     }
5683     return p;
5684 }
5685 
bam_plp_push(bam_plp_t iter,const bam1_t * b)5686 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
5687 {
5688     if (iter->error) return -1;
5689     if (b) {
5690         if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
5691         // Skip only unmapped reads here, any additional filtering must be done in iter->func
5692         if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
5693         if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
5694         {
5695             overlap_remove(iter, b);
5696             return 0;
5697         }
5698         if (bam_copy1(&iter->tail->b, b) == NULL)
5699             return -1;
5700         iter->tail->b.id = iter->id++;
5701         iter->tail->beg = b->core.pos;
5702         // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
5703         iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
5704         iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
5705         if (b->core.tid < iter->max_tid) {
5706             hts_log_error("The input is not sorted (chromosomes out of order)");
5707             iter->error = 1;
5708             return -1;
5709         }
5710         if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
5711             hts_log_error("The input is not sorted (reads out of order)");
5712             iter->error = 1;
5713             return -1;
5714         }
5715         iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
5716         if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
5717             lbnode_t *next = mp_alloc(iter->mp);
5718             if (!next) {
5719                 iter->error = 1;
5720                 return -1;
5721             }
5722             if (iter->plp_construct) {
5723                 if (iter->plp_construct(iter->data, &iter->tail->b,
5724                                         &iter->tail->cd) < 0) {
5725                     mp_free(iter->mp, next);
5726                     iter->error = 1;
5727                     return -1;
5728                 }
5729             }
5730             if (overlap_push(iter, iter->tail) < 0) {
5731                 mp_free(iter->mp, next);
5732                 iter->error = 1;
5733                 return -1;
5734             }
5735             iter->tail->next = next;
5736             iter->tail = iter->tail->next;
5737         }
5738     } else iter->is_eof = 1;
5739     return 0;
5740 }
5741 
bam_plp64_auto(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)5742 const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5743 {
5744     const bam_pileup1_t *plp;
5745     if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
5746     if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5747     else { // no pileup line can be obtained; read alignments
5748         *_n_plp = 0;
5749         if (iter->is_eof) return 0;
5750         int ret;
5751         while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
5752             if (bam_plp_push(iter, iter->b) < 0) {
5753                 *_n_plp = -1;
5754                 return 0;
5755             }
5756             if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5757             // otherwise no pileup line can be returned; read the next alignment.
5758         }
5759         if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
5760         if (bam_plp_push(iter, 0) < 0) {
5761             *_n_plp = -1;
5762             return 0;
5763         }
5764         if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5765         return 0;
5766     }
5767 }
5768 
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)5769 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
5770 {
5771     hts_pos_t pos64 = 0;
5772     const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
5773     if (pos64 < INT_MAX) {
5774         *_pos = pos64;
5775     } else {
5776         hts_log_error("Position %"PRId64" too large", pos64);
5777         *_pos = INT_MAX;
5778         iter->error = 1;
5779         *_n_plp = -1;
5780         return NULL;
5781     }
5782     return p;
5783 }
5784 
bam_plp_reset(bam_plp_t iter)5785 void bam_plp_reset(bam_plp_t iter)
5786 {
5787     overlap_remove(iter, NULL);
5788     iter->max_tid = iter->max_pos = -1;
5789     iter->tid = iter->pos = 0;
5790     iter->is_eof = 0;
5791     while (iter->head != iter->tail) {
5792         lbnode_t *p = iter->head;
5793         iter->head = p->next;
5794         mp_free(iter->mp, p);
5795     }
5796 }
5797 
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)5798 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
5799 {
5800     iter->maxcnt = maxcnt;
5801 }
5802 
5803 /************************
5804  *** Mpileup iterator ***
5805  ************************/
5806 
5807 struct bam_mplp_s {
5808     int n;
5809     int32_t min_tid, *tid;
5810     hts_pos_t min_pos, *pos;
5811     bam_plp_t *iter;
5812     int *n_plp;
5813     const bam_pileup1_t **plp;
5814 };
5815 
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)5816 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
5817 {
5818     int i;
5819     bam_mplp_t iter;
5820     iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
5821     iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
5822     iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
5823     iter->n_plp = (int*)calloc(n, sizeof(int));
5824     iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
5825     iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
5826     iter->n = n;
5827     iter->min_pos = HTS_POS_MAX;
5828     iter->min_tid = (uint32_t)-1;
5829     for (i = 0; i < n; ++i) {
5830         iter->iter[i] = bam_plp_init(func, data[i]);
5831         iter->pos[i] = iter->min_pos;
5832         iter->tid[i] = iter->min_tid;
5833     }
5834     return iter;
5835 }
5836 
bam_mplp_init_overlaps(bam_mplp_t iter)5837 int bam_mplp_init_overlaps(bam_mplp_t iter)
5838 {
5839     int i, r = 0;
5840     for (i = 0; i < iter->n; ++i)
5841         r |= bam_plp_init_overlaps(iter->iter[i]);
5842     return r == 0 ? 0 : -1;
5843 }
5844 
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)5845 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
5846 {
5847     int i;
5848     for (i = 0; i < iter->n; ++i)
5849         iter->iter[i]->maxcnt = maxcnt;
5850 }
5851 
bam_mplp_destroy(bam_mplp_t iter)5852 void bam_mplp_destroy(bam_mplp_t iter)
5853 {
5854     int i;
5855     for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
5856     free(iter->iter); free(iter->pos); free(iter->tid);
5857     free(iter->n_plp); free(iter->plp);
5858     free(iter);
5859 }
5860 
bam_mplp64_auto(bam_mplp_t iter,int * _tid,hts_pos_t * _pos,int * n_plp,const bam_pileup1_t ** plp)5861 int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
5862 {
5863     int i, ret = 0;
5864     hts_pos_t new_min_pos = HTS_POS_MAX;
5865     uint32_t new_min_tid = (uint32_t)-1;
5866     for (i = 0; i < iter->n; ++i) {
5867         if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
5868             int tid;
5869             hts_pos_t pos;
5870             iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
5871             if ( iter->iter[i]->error ) return -1;
5872             if (iter->plp[i]) {
5873                 iter->tid[i] = tid;
5874                 iter->pos[i] = pos;
5875             } else {
5876                 iter->tid[i] = 0;
5877                 iter->pos[i] = 0;
5878             }
5879         }
5880         if (iter->plp[i]) {
5881             if (iter->tid[i] < new_min_tid) {
5882                 new_min_tid = iter->tid[i];
5883                 new_min_pos = iter->pos[i];
5884             } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
5885                 new_min_pos = iter->pos[i];
5886             }
5887         }
5888     }
5889     iter->min_pos = new_min_pos;
5890     iter->min_tid = new_min_tid;
5891     if (new_min_pos == HTS_POS_MAX) return 0;
5892     *_tid = new_min_tid; *_pos = new_min_pos;
5893     for (i = 0; i < iter->n; ++i) {
5894         if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
5895             n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
5896             ++ret;
5897         } else n_plp[i] = 0, plp[i] = 0;
5898     }
5899     return ret;
5900 }
5901 
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)5902 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
5903 {
5904     hts_pos_t pos64 = 0;
5905     int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
5906     if (ret >= 0) {
5907         if (pos64 < INT_MAX) {
5908             *_pos = pos64;
5909         } else {
5910             hts_log_error("Position %"PRId64" too large", pos64);
5911             *_pos = INT_MAX;
5912             return -1;
5913         }
5914     }
5915     return ret;
5916 }
5917 
bam_mplp_reset(bam_mplp_t iter)5918 void bam_mplp_reset(bam_mplp_t iter)
5919 {
5920     int i;
5921     iter->min_pos = HTS_POS_MAX;
5922     iter->min_tid = (uint32_t)-1;
5923     for (i = 0; i < iter->n; ++i) {
5924         bam_plp_reset(iter->iter[i]);
5925         iter->pos[i] = HTS_POS_MAX;
5926         iter->tid[i] = (uint32_t)-1;
5927         iter->n_plp[i] = 0;
5928         iter->plp[i] = NULL;
5929     }
5930 }
5931 
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))5932 void bam_mplp_constructor(bam_mplp_t iter,
5933                           int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
5934     int i;
5935     for (i = 0; i < iter->n; ++i)
5936         bam_plp_constructor(iter->iter[i], func);
5937 }
5938 
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))5939 void bam_mplp_destructor(bam_mplp_t iter,
5940                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
5941     int i;
5942     for (i = 0; i < iter->n; ++i)
5943         bam_plp_destructor(iter->iter[i], func);
5944 }
5945 
5946 #endif // ~!defined(BAM_NO_PILEUP)
5947 
5948 // ---------------------------
5949 // Base Modification retrieval
5950 //
5951 // These operate by recording state in an opaque type, allocated and freed
5952 // via the functions below.
5953 //
5954 // Initially we call bam_parse_basemod to process the tags and record the
5955 // modifications in the state structure, and then functions such as
5956 // bam_next_basemod can iterate over this cached state.
5957 
5958 /*
5959  * Base modification are stored in MM/Mm tags as <mod_list> defined as
5960  *
5961  * <mod_list>        ::= <mod_chain><mod_list> | ""
5962  * <mod_chain>       ::= <canonical_base><strand><mod-list><delta-list>
5963  *
5964  * <canonical_base>  ::= "A" | "C" | "G" | "T" | "N".
5965  *
5966  * <strand>          ::= "+" | "-".
5967  *
5968  * <mod-list>        ::= <simple-mod-list> | <ChEBI-code>
5969  * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod>
5970  * <ChEBI-code>      ::= <integer>
5971  * <simple-mod>      ::= <letter>
5972  *
5973  * <delta-list>      ::= "," <integer> <delta-list> | ";"
5974  *
5975  * We do not allocate additional memory other than the fixed size
5976  * state, thus we track up to 256 pointers to different locations
5977  * within the MM and ML tags.  Each pointer is for a distinct
5978  * modification code (simple or ChEBI), meaning some may point to the
5979  * same delta-list when multiple codes are combined together
5980  * (e.g. "C+mh,1,5,18,3;").  This is the MM[] array.
5981  *
5982  * Each numeric in the delta-list is tracked in MMcount[], counted
5983  * down until it hits zero in which case the next delta is fetched.
5984  *
5985  * ML array similarly holds the locations in the quality (ML) tag per
5986  * type, but these are interleaved so C+mhfc,10,15 will have 4 types
5987  * all pointing to the same delta position, but in ML we store
5988  * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1).  This ML
5989  * also has MLstride indicating how many positions along ML to jump
5990  * each time we consume a base. (4 in our above example, but usually 1
5991  * for the simple case).
5992  *
5993  * One complexity of the base modification system is that mods are
5994  * always stored in the original DNA orientation.  This is so that
5995  * tools that may reverse-complement a sequence (eg "samtools fastq -T
5996  * MM,ML") can pass through these modification tags irrespective of
5997  * whether they have any knowledge of their internal workings.
5998  *
5999  * Because we don't wish to allocate extra memory, we cannot simply
6000  * reverse the MM and ML tags.  Sadly this means we have to manage the
6001  * reverse complementing ourselves on-the-fly.
6002  * For reversed reads we start at the right end of MM and no longer
6003  * stop at the semicolon.  Instead we use MMend[] array to mark the
6004  * termination point.
6005  */
6006 #define MAX_BASE_MOD 256
6007 struct hts_base_mod_state {
6008     int type[MAX_BASE_MOD];     // char or minus-CHEBI
6009     int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15)
6010     char strand[MAX_BASE_MOD];  // strand of modification; + or -
6011     int MMcount[MAX_BASE_MOD];  // no. canonical bases left until next mod
6012     char *MM[MAX_BASE_MOD];     // next pos delta (string)
6013     char *MMend[MAX_BASE_MOD];  // end of pos-delta string
6014     uint8_t *ML[MAX_BASE_MOD];  // next qual
6015     int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
6016     int seq_pos;                // current position along sequence
6017     int nmods;                  // used array size (0 to MAX_BASE_MOD-1).
6018 };
6019 
hts_base_mod_state_alloc(void)6020 hts_base_mod_state *hts_base_mod_state_alloc(void) {
6021     return calloc(1, sizeof(hts_base_mod_state));
6022 }
6023 
hts_base_mod_state_free(hts_base_mod_state * state)6024 void hts_base_mod_state_free(hts_base_mod_state *state) {
6025     free(state);
6026 }
6027 
6028 /*
6029  * Count frequency of A, C, G, T and N canonical bases in the sequence
6030  */
seq_freq(const bam1_t * b,int freq[16])6031 static void seq_freq(const bam1_t *b, int freq[16]) {
6032     int i;
6033 
6034     memset(freq, 0, 16*sizeof(*freq));
6035     uint8_t *seq = bam_get_seq(b);
6036     for (i = 0; i < b->core.l_qseq; i++)
6037         freq[bam_seqi(seq, i)]++;
6038     freq[15] = b->core.l_qseq; // all bases count as N for base mods
6039 }
6040 
6041 //0123456789ABCDEF
6042 //=ACMGRSVTWYHKDBN  aka seq_nt16_str[]
6043 //=TGKCYSBAWRDMHVN  comp1ement of seq_nt16_str
6044 //084C2A6E195D3B7F
6045 static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
6046 
6047 /*
6048  * Parse the MM and ML tags to populate the base mod state.
6049  * This structure will have been previously allocated via
6050  * hts_base_mod_state_alloc, but it does not need to be repeatedly
6051  * freed and allocated for each new bam record. (Although obviously
6052  * it requires a new call to this function.)
6053  *
6054  */
bam_parse_basemod(const bam1_t * b,hts_base_mod_state * state)6055 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
6056     // Read MM and ML tags
6057     uint8_t *mm = bam_aux_get(b, "MM");
6058     if (!mm) mm = bam_aux_get(b, "Mm");
6059     if (!mm)
6060         return 0;
6061     if (mm[0] != 'Z') {
6062         hts_log_error("MM tag is not of type Z");
6063         return -1;
6064     }
6065 
6066     uint8_t *ml = bam_aux_get(b, "ML");
6067     if (!ml) ml = bam_aux_get(b, "Ml");
6068     if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
6069         hts_log_error("ML tag is not of type B,C");
6070         return -1;
6071     }
6072     uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
6073     if (ml) ml += 6;
6074 
6075     state->seq_pos = 0;
6076 
6077     // Aggregate freqs of ACGTN if reversed, to get final-delta (later)
6078     int freq[16];
6079     if (b->core.flag & BAM_FREVERSE)
6080         seq_freq(b, freq);
6081 
6082     char *cp = (char *)mm+1;
6083     int mod_num = 0;
6084     while (*cp) {
6085         for (; *cp; cp++) {
6086             // cp should be [ACGTNU][+-][^,]*(,\d+)*;
6087             unsigned char btype = *cp++;
6088 
6089             if (btype != 'A' && btype != 'C' &&
6090                 btype != 'G' && btype != 'T' &&
6091                 btype != 'U' && btype != 'N')
6092                 return -1;
6093             if (btype == 'U') btype = 'T';
6094 
6095             btype = seq_nt16_table[btype];
6096 
6097             // Strand
6098             if (*cp != '+' && *cp != '-')
6099                 return -1; // malformed
6100             char strand = *cp++;
6101 
6102             // List of modification types
6103             char *ms = cp, *me; // mod code start and end
6104             char *cp_end = NULL;
6105             int chebi = 0;
6106             if (isdigit(*cp)) {
6107                 chebi = strtol(cp, &cp_end, 10);
6108                 cp = cp_end;
6109                 ms = cp-1;
6110             } else {
6111                 while (*cp && *cp != ',' && *cp != ';')
6112                     cp++;
6113                 if (*cp == '\0')
6114                     return -1;
6115             }
6116             me = cp;
6117 
6118             long delta;
6119             int n = 0; // nth symbol in a multi-mod string
6120             int stride = me-ms;
6121             int ndelta = 0;
6122 
6123             if (b->core.flag & BAM_FREVERSE) {
6124                 // We process the sequence in left to right order,
6125                 // but delta is successive count of bases to skip
6126                 // counting right to left.  This also means the number
6127                 // of bases to skip at left edge is unrecorded (as it's
6128                 // the remainder).
6129                 //
6130                 // To output mods in left to right, we step through the
6131                 // MM list in reverse and need to identify the left-end
6132                 // "remainder" delta.
6133                 int total_seq = 0;
6134                 for (;;) {
6135                     cp += (*cp == ',');
6136                     if (*cp == 0 || *cp == ';')
6137                         break;
6138 
6139                     delta = strtol(cp, &cp_end, 10);
6140                     if (cp_end == cp) {
6141                         hts_log_error("Hit end of MM tag. Missing semicolon?");
6142                         return -1;
6143                     }
6144 
6145                     cp = cp_end;
6146                     total_seq += delta+1;
6147                     ndelta++;
6148                 }
6149                 delta = freq[seqi_rc[btype]] - total_seq; // remainder
6150             } else {
6151                 delta = *cp == ','
6152                     ? strtol(cp+1, &cp_end, 10)
6153                     : 0;
6154                 if (!cp_end) {
6155                     // empty list
6156                     delta = INT_MAX;
6157                     cp_end = cp+1;
6158                 }
6159             }
6160             // Now delta is first in list or computed remainder,
6161             // and cp_end is either start or end of the MM list.
6162             while (ms < me) {
6163                 state->type     [mod_num] = chebi ? -chebi : *ms;
6164                 state->strand   [mod_num] = (strand == '-');
6165                 state->canonical[mod_num] = btype;
6166                 state->MLstride [mod_num] = stride;
6167 
6168                 state->MMcount  [mod_num] = delta;
6169                 if (b->core.flag & BAM_FREVERSE) {
6170                     state->MM   [mod_num] = cp+1;
6171                     state->MMend[mod_num] = cp_end;
6172                     state->ML   [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
6173                 } else {
6174                     state->MM   [mod_num] = cp_end;
6175                     state->MMend[mod_num] = NULL;
6176                     state->ML   [mod_num] = ml ? ml+n : NULL;
6177                 }
6178 
6179                 if (++mod_num >= MAX_BASE_MOD) {
6180                     hts_log_error("Too many base modification types");
6181                     return -1;
6182                 }
6183                 ms++; n++;
6184             }
6185 
6186             // Skip modification deltas
6187             if (ml) {
6188                 if (b->core.flag & BAM_FREVERSE) {
6189                     ml += ndelta*stride;
6190                 } else {
6191                     while (*cp && *cp != ';') {
6192                         if (*cp == ',')
6193                             ml+=stride;
6194                         cp++;
6195                     }
6196                 }
6197                 if (ml > ml_end) {
6198                     hts_log_error("Insufficient number of entries in ML tag");
6199                     return -1;
6200                 }
6201             } else {
6202                 // cp_end already known if FREVERSE
6203                 if (cp_end && (b->core.flag & BAM_FREVERSE))
6204                     cp = cp_end;
6205                 else
6206                     while (*cp && *cp != ';')
6207                         cp++;
6208             }
6209             if (!*cp) {
6210                 hts_log_error("Hit end of MM tag. Missing semicolon?");
6211                 return -1;
6212             }
6213         }
6214     }
6215 
6216     state->nmods = mod_num;
6217 
6218     return 0;
6219 }
6220 
6221 /*
6222  * Fills out mods[] with the base modifications found.
6223  * Returns the number found (0 if none), which may be more than
6224  * the size of n_mods if more were found than reported.
6225  * Returns <= -1 on error.
6226  *
6227  * This always marches left to right along sequence, irrespective of
6228  * reverse flag or modification strand.
6229  */
bam_mods_at_next_pos(const bam1_t * b,hts_base_mod_state * state,hts_base_mod * mods,int n_mods)6230 int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
6231                          hts_base_mod *mods, int n_mods) {
6232     if (b->core.flag & BAM_FREVERSE) {
6233         if (state->seq_pos < 0)
6234             return -1;
6235     } else {
6236         if (state->seq_pos >= b->core.l_qseq)
6237             return -1;
6238     }
6239 
6240     int i, j, n = 0;
6241     unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos);
6242     state->seq_pos++;
6243     if (b->core.flag & BAM_FREVERSE)
6244         base = seqi_rc[base];
6245 
6246     for (i = 0; i < state->nmods; i++) {
6247         if (state->canonical[i] != base && state->canonical[i] != 15/*N*/)
6248             continue;
6249 
6250         if (state->MMcount[i]-- > 0)
6251             continue;
6252 
6253         char *MMptr = state->MM[i];
6254         if (n < n_mods) {
6255             mods[n].modified_base = state->type[i];
6256             mods[n].canonical_base = seq_nt16_str[state->canonical[i]];
6257             mods[n].strand = state->strand[i];
6258             mods[n].qual = state->ML[i] ? *state->ML[i] : -1;
6259         }
6260         n++;
6261         if (state->ML[i])
6262             state->ML[i] += (b->core.flag & BAM_FREVERSE)
6263                 ? -state->MLstride[i]
6264                 : +state->MLstride[i];
6265 
6266         if (b->core.flag & BAM_FREVERSE) {
6267             // process MM list backwards
6268             char *cp;
6269             for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
6270                 if (*cp == ',')
6271                     break;
6272             state->MMend[i] = cp;
6273             if (cp != state->MM[i])
6274                 state->MMcount[i] = strtol(cp+1, NULL, 10);
6275             else
6276                 state->MMcount[i] = INT_MAX;
6277         } else {
6278             if (*state->MM[i] == ',')
6279                 state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
6280             else
6281                 state->MMcount[i] = INT_MAX;
6282         }
6283 
6284         // Multiple mods at the same coords.
6285         for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
6286             if (n < n_mods) {
6287                 mods[n].modified_base = state->type[j];
6288                 mods[n].canonical_base = seq_nt16_str[state->canonical[j]];
6289                 mods[n].strand = state->strand[j];
6290                 mods[n].qual = state->ML[j] ? *state->ML[j] : -1;
6291             }
6292             n++;
6293             state->MMcount[j] = state->MMcount[i];
6294             state->MM[j]      = state->MM[i];
6295             if (state->ML[j])
6296                 state->ML[j] += (b->core.flag & BAM_FREVERSE)
6297                     ? -state->MLstride[j]
6298                     : +state->MLstride[j];
6299         }
6300         i = j-1;
6301     }
6302 
6303     return n;
6304 }
6305 
6306 /*
6307  * Looks for the next location with a base modification.
6308  */
bam_next_basemod(const bam1_t * b,hts_base_mod_state * state,hts_base_mod * mods,int n_mods,int * pos)6309 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
6310                      hts_base_mod *mods, int n_mods, int *pos) {
6311     if (state->seq_pos >= b->core.l_qseq)
6312         return 0;
6313 
6314     // Look through state->MMcount arrays to see when the next lowest is
6315     // per base type;
6316     int next[16], freq[16] = {0}, i;
6317     memset(next, 0x7f, 16*sizeof(*next));
6318     if (b->core.flag & BAM_FREVERSE) {
6319         for (i = 0; i < state->nmods; i++) {
6320             if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i])
6321                 next[seqi_rc[state->canonical[i]]] = state->MMcount[i];
6322         }
6323     } else {
6324         for (i = 0; i < state->nmods; i++) {
6325             if (next[state->canonical[i]] > state->MMcount[i])
6326                 next[state->canonical[i]] = state->MMcount[i];
6327         }
6328     }
6329 
6330     // Now step through the sequence counting off base types.
6331     for (i = state->seq_pos; i < b->core.l_qseq; i++) {
6332         unsigned char bc = bam_seqi(bam_get_seq(b), i);
6333         if (next[bc] <= freq[bc] || next[15] <= freq[15])
6334             break;
6335         freq[bc]++;
6336         if (bc != 15) // N
6337             freq[15]++;
6338     }
6339     *pos = state->seq_pos = i;
6340 
6341     if (i >= b->core.l_qseq) {
6342         // Check for more MM elements than bases present.
6343         for (i = 0; i < state->nmods; i++) {
6344             if (!(b->core.flag & BAM_FREVERSE) &&
6345                 state->MMcount[i] < 0x7f000000) {
6346                 hts_log_warning("MM tag refers to bases beyond sequence length");
6347                 return -1;
6348             }
6349         }
6350         return 0;
6351     }
6352 
6353     if (b->core.flag & BAM_FREVERSE) {
6354         for (i = 0; i < state->nmods; i++)
6355             state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
6356     } else {
6357         for (i = 0; i < state->nmods; i++)
6358             state->MMcount[i] -= freq[state->canonical[i]];
6359     }
6360 
6361     int r = bam_mods_at_next_pos(b, state, mods, n_mods);
6362     return r > 0 ? r : 0;
6363 }
6364 
6365 /*
6366  * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos.
6367  * This can only march forwards along the read, but can do so by more than
6368  * one base-pair.
6369  *
6370  * This makes it useful for calling from pileup iterators where qpos may
6371  * start part way through a read for the first occurrence of that record.
6372  */
bam_mods_at_qpos(const bam1_t * b,int qpos,hts_base_mod_state * state,hts_base_mod * mods,int n_mods)6373 int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
6374                     hts_base_mod *mods, int n_mods) {
6375     // FIXME: for now this is inefficient in implementation.
6376     int r = 0;
6377     while (state->seq_pos <= qpos)
6378         if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0)
6379             break;
6380 
6381     return r;
6382 }
6383