1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28
29 #include <strings.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <zlib.h>
35 #include <assert.h>
36 #include <signal.h>
37 #include <inttypes.h>
38 #include <unistd.h>
39
40 // Suppress deprecation message for cigar_tab, which we initialise
41 #include "htslib/hts_defs.h"
42 #undef HTS_DEPRECATED
43 #define HTS_DEPRECATED(message)
44
45 #include "htslib/sam.h"
46 #include "htslib/bgzf.h"
47 #include "cram/cram.h"
48 #include "hts_internal.h"
49 #include "sam_internal.h"
50 #include "htslib/hfile.h"
51 #include "htslib/hts_endian.h"
52 #include "htslib/hts_expr.h"
53 #include "header.h"
54
55 #include "htslib/khash.h"
56 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
57 KHASH_SET_INIT_INT(tag)
58
59 #ifndef EFTYPE
60 #define EFTYPE ENOEXEC
61 #endif
62 #ifndef EOVERFLOW
63 #define EOVERFLOW ERANGE
64 #endif
65
66 /**********************
67 *** BAM header I/O ***
68 **********************/
69
70 HTSLIB_EXPORT
71 const int8_t bam_cigar_table[256] = {
72 // 0 .. 47
73 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
74 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
75 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
76
77 // 48 .. 63 (including =)
78 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1,
79
80 // 64 .. 79 (including MIDNHB)
81 -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1,
82 BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
83
84 // 80 .. 95 (including SPX)
85 BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1,
86 BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1,
87
88 // 96 .. 127
89 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
90 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
91
92 // 128 .. 255
93 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
94 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
95 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
96 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
97 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
98 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
99 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
100 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
101 };
102
sam_hdr_init()103 sam_hdr_t *sam_hdr_init()
104 {
105 sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
106 if (bh == NULL) return NULL;
107
108 bh->cigar_tab = bam_cigar_table;
109 return bh;
110 }
111
sam_hdr_destroy(sam_hdr_t * bh)112 void sam_hdr_destroy(sam_hdr_t *bh)
113 {
114 int32_t i;
115
116 if (bh == NULL) return;
117
118 if (bh->ref_count > 0) {
119 --bh->ref_count;
120 return;
121 }
122
123 if (bh->target_name) {
124 for (i = 0; i < bh->n_targets; ++i)
125 free(bh->target_name[i]);
126 free(bh->target_name);
127 free(bh->target_len);
128 }
129 free(bh->text);
130 if (bh->hrecs)
131 sam_hrecs_free(bh->hrecs);
132 if (bh->sdict)
133 kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
134 free(bh);
135 }
136
137 // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
138 // references before sam_hdr_t::hrecs is populated
sam_hdr_dup_sdict(const sam_hdr_t * h0,sam_hdr_t * h)139 int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
140 {
141 const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
142 khash_t(s2i) *dest_long_refs = kh_init(s2i);
143 int i;
144 if (!dest_long_refs) return -1;
145
146 for (i = 0; i < h->n_targets; i++) {
147 int ret;
148 khiter_t ksrc, kdest;
149 if (h->target_len[i] < UINT32_MAX) continue;
150 ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
151 if (ksrc == kh_end(src_long_refs)) continue;
152 kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
153 if (ret < 0) {
154 kh_destroy(s2i, dest_long_refs);
155 return -1;
156 }
157 kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
158 }
159
160 h->sdict = dest_long_refs;
161 return 0;
162 }
163
sam_hdr_dup(const sam_hdr_t * h0)164 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
165 {
166 if (h0 == NULL) return NULL;
167 sam_hdr_t *h;
168 if ((h = sam_hdr_init()) == NULL) return NULL;
169 // copy the simple data
170 h->n_targets = 0;
171 h->ignore_sam_err = h0->ignore_sam_err;
172 h->l_text = 0;
173
174 // Then the pointery stuff
175
176 if (!h0->hrecs) {
177 h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
178 if (!h->target_len) goto fail;
179 h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
180 if (!h->target_name) goto fail;
181
182 int i;
183 for (i = 0; i < h0->n_targets; ++i) {
184 h->target_len[i] = h0->target_len[i];
185 h->target_name[i] = strdup(h0->target_name[i]);
186 if (!h->target_name[i]) break;
187 }
188 h->n_targets = i;
189 if (i < h0->n_targets) goto fail;
190
191 if (h0->sdict) {
192 if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
193 }
194 }
195
196 if (h0->hrecs) {
197 kstring_t tmp = { 0, 0, NULL };
198 if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
199 free(ks_release(&tmp));
200 goto fail;
201 }
202
203 h->l_text = tmp.l;
204 h->text = ks_release(&tmp);
205
206 if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
207 goto fail;
208 } else {
209 h->l_text = h0->l_text;
210 h->text = malloc(h->l_text + 1);
211 if (!h->text) goto fail;
212 memcpy(h->text, h0->text, h->l_text);
213 h->text[h->l_text] = '\0';
214 }
215
216 return h;
217
218 fail:
219 sam_hdr_destroy(h);
220 return NULL;
221 }
222
bam_hdr_read(BGZF * fp)223 sam_hdr_t *bam_hdr_read(BGZF *fp)
224 {
225 sam_hdr_t *h;
226 uint8_t buf[4];
227 int magic_len, has_EOF;
228 int32_t i, name_len, num_names = 0;
229 size_t bufsize;
230 ssize_t bytes;
231 // check EOF
232 has_EOF = bgzf_check_EOF(fp);
233 if (has_EOF < 0) {
234 perror("[W::bam_hdr_read] bgzf_check_EOF");
235 } else if (has_EOF == 0) {
236 hts_log_warning("EOF marker is absent. The input is probably truncated");
237 }
238 // read "BAM1"
239 magic_len = bgzf_read(fp, buf, 4);
240 if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
241 hts_log_error("Invalid BAM binary header");
242 return 0;
243 }
244 h = sam_hdr_init();
245 if (!h) goto nomem;
246
247 // read plain text and the number of reference sequences
248 bytes = bgzf_read(fp, buf, 4);
249 if (bytes != 4) goto read_err;
250 h->l_text = le_to_u32(buf);
251
252 bufsize = h->l_text + 1;
253 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
254 h->text = (char*)malloc(bufsize);
255 if (!h->text) goto nomem;
256 h->text[h->l_text] = 0; // make sure it is NULL terminated
257 bytes = bgzf_read(fp, h->text, h->l_text);
258 if (bytes != h->l_text) goto read_err;
259
260 bytes = bgzf_read(fp, &h->n_targets, 4);
261 if (bytes != 4) goto read_err;
262 if (fp->is_be) ed_swap_4p(&h->n_targets);
263
264 if (h->n_targets < 0) goto invalid;
265
266 // read reference sequence names and lengths
267 if (h->n_targets > 0) {
268 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
269 if (!h->target_name) goto nomem;
270 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
271 if (!h->target_len) goto nomem;
272 }
273 else {
274 h->target_name = NULL;
275 h->target_len = NULL;
276 }
277
278 for (i = 0; i != h->n_targets; ++i) {
279 bytes = bgzf_read(fp, &name_len, 4);
280 if (bytes != 4) goto read_err;
281 if (fp->is_be) ed_swap_4p(&name_len);
282 if (name_len <= 0) goto invalid;
283
284 h->target_name[i] = (char*)malloc(name_len);
285 if (!h->target_name[i]) goto nomem;
286 num_names++;
287
288 bytes = bgzf_read(fp, h->target_name[i], name_len);
289 if (bytes != name_len) goto read_err;
290
291 if (h->target_name[i][name_len - 1] != '\0') {
292 /* Fix missing NUL-termination. Is this being too nice?
293 We could alternatively bail out with an error. */
294 char *new_name;
295 if (name_len == INT32_MAX) goto invalid;
296 new_name = realloc(h->target_name[i], name_len + 1);
297 if (new_name == NULL) goto nomem;
298 h->target_name[i] = new_name;
299 h->target_name[i][name_len] = '\0';
300 }
301
302 bytes = bgzf_read(fp, &h->target_len[i], 4);
303 if (bytes != 4) goto read_err;
304 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
305 }
306 return h;
307
308 nomem:
309 hts_log_error("Out of memory");
310 goto clean;
311
312 read_err:
313 if (bytes < 0) {
314 hts_log_error("Error reading BGZF stream");
315 } else {
316 hts_log_error("Truncated BAM header");
317 }
318 goto clean;
319
320 invalid:
321 hts_log_error("Invalid BAM binary header");
322
323 clean:
324 if (h != NULL) {
325 h->n_targets = num_names; // ensure we free only allocated target_names
326 sam_hdr_destroy(h);
327 }
328 return NULL;
329 }
330
bam_hdr_write(BGZF * fp,const sam_hdr_t * h)331 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
332 {
333 int32_t i, name_len, x;
334 kstring_t hdr_ks = { 0, 0, NULL };
335 char *text;
336 uint32_t l_text;
337
338 if (!h) return -1;
339
340 if (h->hrecs) {
341 if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
342 if (hdr_ks.l > INT32_MAX) {
343 hts_log_error("Header too long for BAM format");
344 free(hdr_ks.s);
345 return -1;
346 }
347 text = hdr_ks.s;
348 l_text = hdr_ks.l;
349 } else {
350 if (h->l_text > INT32_MAX) {
351 hts_log_error("Header too long for BAM format");
352 return -1;
353 }
354 text = h->text;
355 l_text = h->l_text;
356 }
357 // write "BAM1"
358 if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
359 // write plain text and the number of reference sequences
360 if (fp->is_be) {
361 x = ed_swap_4(l_text);
362 if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
363 if (l_text) {
364 if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
365 }
366 x = ed_swap_4(h->n_targets);
367 if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
368 } else {
369 if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
370 if (l_text) {
371 if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
372 }
373 if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
374 }
375 free(hdr_ks.s);
376 // write sequence names and lengths
377 for (i = 0; i != h->n_targets; ++i) {
378 char *p = h->target_name[i];
379 name_len = strlen(p) + 1;
380 if (fp->is_be) {
381 x = ed_swap_4(name_len);
382 if (bgzf_write(fp, &x, 4) < 0) return -1;
383 } else {
384 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
385 }
386 if (bgzf_write(fp, p, name_len) < 0) return -1;
387 if (fp->is_be) {
388 x = ed_swap_4(h->target_len[i]);
389 if (bgzf_write(fp, &x, 4) < 0) return -1;
390 } else {
391 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
392 }
393 }
394 if (bgzf_flush(fp) < 0) return -1;
395 return 0;
396 }
397
sam_parse_region(sam_hdr_t * h,const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,int flags)398 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
399 hts_pos_t *beg, hts_pos_t *end, int flags) {
400 return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
401 }
402
403 /*************************
404 *** BAM alignment I/O ***
405 *************************/
406
bam_init1()407 bam1_t *bam_init1()
408 {
409 return (bam1_t*)calloc(1, sizeof(bam1_t));
410 }
411
sam_realloc_bam_data(bam1_t * b,size_t desired)412 int sam_realloc_bam_data(bam1_t *b, size_t desired)
413 {
414 uint32_t new_m_data;
415 uint8_t *new_data;
416 new_m_data = desired;
417 kroundup32(new_m_data);
418 if (new_m_data < desired) {
419 errno = ENOMEM; // Not strictly true but we can't store the size
420 return -1;
421 }
422 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
423 new_data = realloc(b->data, new_m_data);
424 } else {
425 if ((new_data = malloc(new_m_data)) != NULL) {
426 if (b->l_data > 0)
427 memcpy(new_data, b->data,
428 b->l_data < b->m_data ? b->l_data : b->m_data);
429 bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
430 }
431 }
432 if (!new_data) return -1;
433 b->data = new_data;
434 b->m_data = new_m_data;
435 return 0;
436 }
437
bam_destroy1(bam1_t * b)438 void bam_destroy1(bam1_t *b)
439 {
440 if (b == 0) return;
441 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
442 free(b->data);
443 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
444 // In case of reuse
445 b->data = NULL;
446 b->m_data = 0;
447 b->l_data = 0;
448 }
449 }
450
451 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
452 free(b);
453 }
454
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)455 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
456 {
457 if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
458 memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
459 memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
460 bdst->l_data = bsrc->l_data;
461 bdst->id = bsrc->id;
462 return bdst;
463 }
464
bam_dup1(const bam1_t * bsrc)465 bam1_t *bam_dup1(const bam1_t *bsrc)
466 {
467 if (bsrc == NULL) return NULL;
468 bam1_t *bdst = bam_init1();
469 if (bdst == NULL) return NULL;
470 if (bam_copy1(bdst, bsrc) == NULL) {
471 bam_destroy1(bdst);
472 return NULL;
473 }
474 return bdst;
475 }
476
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,hts_pos_t * rlen,hts_pos_t * qlen)477 static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
478 hts_pos_t *rlen, hts_pos_t *qlen)
479 {
480 int k;
481 *rlen = *qlen = 0;
482 for (k = 0; k < n_cigar; ++k) {
483 int type = bam_cigar_type(bam_cigar_op(cigar[k]));
484 int len = bam_cigar_oplen(cigar[k]);
485 if (type & 1) *qlen += len;
486 if (type & 2) *rlen += len;
487 }
488 }
489
subtract_check_underflow(size_t length,size_t * limit)490 static int subtract_check_underflow(size_t length, size_t *limit)
491 {
492 if (length <= *limit) {
493 *limit -= length;
494 return 0;
495 }
496
497 return -1;
498 }
499
bam_set1(bam1_t * bam,size_t l_qname,const char * qname,uint16_t flag,int32_t tid,hts_pos_t pos,uint8_t mapq,size_t n_cigar,const uint32_t * cigar,int32_t mtid,hts_pos_t mpos,hts_pos_t isize,size_t l_seq,const char * seq,const char * qual,size_t l_aux)500 int bam_set1(bam1_t *bam,
501 size_t l_qname, const char *qname,
502 uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
503 size_t n_cigar, const uint32_t *cigar,
504 int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
505 size_t l_seq, const char *seq, const char *qual,
506 size_t l_aux)
507 {
508 // use a default qname "*" if none is provided
509 if (l_qname == 0) {
510 l_qname = 1;
511 qname = "*";
512 }
513
514 // note: the qname is stored nul terminated and padded as described in the
515 // documentation for the bam1_t struct.
516 size_t qname_nuls = 4 - l_qname % 4;
517
518 // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
519 // can't use bam_endpos() directly as some fields not yet set up.
520 hts_pos_t rlen = 0, qlen = 0;
521 if (!(flag & BAM_FUNMAP)) {
522 bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
523 }
524 if (rlen == 0) {
525 rlen = 1;
526 }
527
528 // validate parameters
529 if (l_qname > 254) {
530 hts_log_error("Query name too long");
531 errno = EINVAL;
532 return -1;
533 }
534 if (HTS_POS_MAX - rlen <= pos) {
535 hts_log_error("Read ends beyond highest supported position");
536 errno = EINVAL;
537 return -1;
538 }
539 if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
540 hts_log_error("Mapped query must have a CIGAR");
541 errno = EINVAL;
542 return -1;
543 }
544 if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
545 hts_log_error("CIGAR and query sequence are of different length");
546 errno = EINVAL;
547 return -1;
548 }
549
550 size_t limit = INT32_MAX;
551 int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
552 u += subtract_check_underflow(n_cigar * 4, &limit);
553 u += subtract_check_underflow((l_seq + 1) / 2, &limit);
554 u += subtract_check_underflow(l_seq, &limit);
555 u += subtract_check_underflow(l_aux, &limit);
556 if (u != 0) {
557 hts_log_error("Size overflow");
558 errno = EINVAL;
559 return -1;
560 }
561
562 // re-allocate the data buffer as needed.
563 size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
564 if (realloc_bam_data(bam, data_len + l_aux) < 0) {
565 return -1;
566 }
567
568 bam->l_data = (int)data_len;
569 bam->core.pos = pos;
570 bam->core.tid = tid;
571 bam->core.bin = bam_reg2bin(pos, pos + rlen);
572 bam->core.qual = mapq;
573 bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
574 bam->core.flag = flag;
575 bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
576 bam->core.n_cigar = (uint32_t)n_cigar;
577 bam->core.l_qseq = (int32_t)l_seq;
578 bam->core.mtid = mtid;
579 bam->core.mpos = mpos;
580 bam->core.isize = isize;
581
582 uint8_t *cp = bam->data;
583 strncpy((char *)cp, qname, l_qname);
584 int i;
585 for (i = 0; i < qname_nuls; i++) {
586 cp[l_qname + i] = '\0';
587 }
588 cp += l_qname + qname_nuls;
589
590 if (n_cigar > 0) {
591 memcpy(cp, cigar, n_cigar * 4);
592 }
593 cp += n_cigar * 4;
594
595 for (i = 0; i + 1 < l_seq; i += 2) {
596 *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]];
597 }
598 for (; i < l_seq; i++) {
599 *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
600 }
601
602 if (qual) {
603 memcpy(cp, qual, l_seq);
604 }
605 else {
606 memset(cp, '\xff', l_seq);
607 }
608
609 return (int)data_len;
610 }
611
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)612 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
613 {
614 int k;
615 hts_pos_t l;
616 for (k = l = 0; k < n_cigar; ++k)
617 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
618 l += bam_cigar_oplen(cigar[k]);
619 return l;
620 }
621
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)622 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
623 {
624 int k;
625 hts_pos_t l;
626 for (k = l = 0; k < n_cigar; ++k)
627 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
628 l += bam_cigar_oplen(cigar[k]);
629 return l;
630 }
631
bam_endpos(const bam1_t * b)632 hts_pos_t bam_endpos(const bam1_t *b)
633 {
634 hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
635 if (rlen == 0) rlen = 1;
636 return b->core.pos + rlen;
637 }
638
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)639 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
640 {
641 bam1_core_t *c = &b->core;
642 uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
643 uint8_t *CG;
644
645 // test where there is a real CIGAR in the CG tag to move
646 if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
647 cigar0 = bam_get_cigar(b);
648 if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
649 fake_bytes = c->n_cigar * 4;
650 int saved_errno = errno;
651 CG = bam_aux_get(b, "CG");
652 if (!CG) {
653 if (errno != ENOENT) return -1; // Bad aux data
654 errno = saved_errno; // restore errno on expected no-CG-tag case
655 return 0;
656 }
657 if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
658 return 0; // not of type B,I
659 CG_len = le_to_u32(CG + 2);
660 if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
661
662 // move from the CG tag to the right position
663 cigar_st = (uint8_t*)cigar0 - b->data;
664 c->n_cigar = CG_len;
665 n_cigar4 = c->n_cigar * 4;
666 CG_st = CG - b->data - 2;
667 CG_en = CG_st + 8 + n_cigar4;
668 if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
669 b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
670 memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
671 memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
672 if (ori_len > CG_en) // move data after the CG tag
673 memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
674 b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
675 if (recal_bin)
676 b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
677 if (give_warning)
678 hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
679 return 1;
680 }
681
aux_type2size(uint8_t type)682 static inline int aux_type2size(uint8_t type)
683 {
684 switch (type) {
685 case 'A': case 'c': case 'C':
686 return 1;
687 case 's': case 'S':
688 return 2;
689 case 'i': case 'I': case 'f':
690 return 4;
691 case 'd':
692 return 8;
693 case 'Z': case 'H': case 'B':
694 return type;
695 default:
696 return 0;
697 }
698 }
699
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)700 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
701 {
702 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
703 uint32_t i;
704 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
705 }
706
707 // Fix bad records where qname is not terminated correctly.
fixup_missing_qname_nul(bam1_t * b)708 static int fixup_missing_qname_nul(bam1_t *b) {
709 bam1_core_t *c = &b->core;
710
711 // Note this is called before c->l_extranul is added to c->l_qname
712 if (c->l_extranul > 0) {
713 b->data[c->l_qname++] = '\0';
714 c->l_extranul--;
715 } else {
716 if (b->l_data > INT_MAX - 4) return -1;
717 if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
718 b->l_data += 4;
719 b->data[c->l_qname++] = '\0';
720 c->l_extranul = 3;
721 }
722 return 0;
723 }
724
725 /*
726 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
727 * in multi-threaded handling. This may be worth considering for htslib2.
728 */
bam_read1(BGZF * fp,bam1_t * b)729 int bam_read1(BGZF *fp, bam1_t *b)
730 {
731 bam1_core_t *c = &b->core;
732 int32_t block_len, ret, i;
733 uint32_t x[8], new_l_data;
734
735 b->l_data = 0;
736
737 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
738 if (ret == 0) return -1; // normal end-of-file
739 else return -2; // truncated
740 }
741 if (fp->is_be)
742 ed_swap_4p(&block_len);
743 if (block_len < 32) return -4; // block_len includes core data
744 if (bgzf_read(fp, x, 32) != 32) return -3;
745 if (fp->is_be) {
746 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
747 }
748 c->tid = x[0]; c->pos = (int32_t)x[1];
749 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
750 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
751 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
752 c->l_qseq = x[4];
753 c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
754
755 new_l_data = block_len - 32 + c->l_extranul;
756 if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
757 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
758 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
759 return -4;
760 if (realloc_bam_data(b, new_l_data) < 0) return -4;
761 b->l_data = new_l_data;
762
763 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
764 if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
765 if (fixup_missing_qname_nul(b) < 0) return -4;
766 }
767 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
768 c->l_qname += c->l_extranul;
769 if (b->l_data < c->l_qname ||
770 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
771 return -4;
772 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
773 if (bam_tag2cigar(b, 0, 0) < 0)
774 return -4;
775
776 if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
777 hts_pos_t rlen, qlen;
778 bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
779 if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
780 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
781 // Sanity check for broken CIGAR alignments
782 if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
783 hts_log_error("CIGAR and query sequence lengths differ for %s",
784 bam_get_qname(b));
785 return -4;
786 }
787 }
788
789 return 4 + block_len;
790 }
791
bam_write1(BGZF * fp,const bam1_t * b)792 int bam_write1(BGZF *fp, const bam1_t *b)
793 {
794 const bam1_core_t *c = &b->core;
795 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
796 int i, ok;
797 if (c->l_qname - c->l_extranul > 255) {
798 hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
799 errno = EOVERFLOW;
800 return -1;
801 }
802 if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
803 if (c->pos > INT_MAX ||
804 c->mpos > INT_MAX ||
805 c->isize < INT_MIN || c->isize > INT_MAX) {
806 hts_log_error("Positional data is too large for BAM format");
807 return -1;
808 }
809 x[0] = c->tid;
810 x[1] = c->pos;
811 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
812 if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
813 else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
814 x[4] = c->l_qseq;
815 x[5] = c->mtid;
816 x[6] = c->mpos;
817 x[7] = c->isize;
818 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
819 if (fp->is_be) {
820 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
821 y = block_len;
822 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
823 swap_data(c, b->l_data, b->data, 1);
824 } else {
825 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
826 }
827 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
828 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
829 if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
830 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
831 } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
832 uint8_t buf[8];
833 uint32_t cigar_st, cigar_en, cigar[2];
834 hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
835 if (cigreflen >= (1<<28)) {
836 // Length of reference covered is greater than the biggest
837 // CIGAR operation currently allowed.
838 hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
839 " cannot be written in BAM. Try writing SAM or CRAM instead.\n",
840 bam_get_qname(b), c->n_cigar, cigreflen);
841 return -1;
842 }
843 cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
844 cigar_en = cigar_st + c->n_cigar * 4;
845 cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
846 cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
847 u32_to_le(cigar[0], buf);
848 u32_to_le(cigar[1], buf + 4);
849 if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
850 if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
851 if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
852 u32_to_le(c->n_cigar, buf);
853 if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
854 if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
855 }
856 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
857 return ok? 4 + block_len : -1;
858 }
859
860 /*
861 * Write a BAM file and append to the in-memory index simultaneously.
862 */
bam_write_idx1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)863 static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
864 BGZF *bfp = fp->fp.bgzf;
865
866 if (!fp->idx)
867 return bam_write1(bfp, b);
868
869 uint32_t block_len = b->l_data - b->core.l_extranul + 32;
870 if (bgzf_flush_try(bfp, 4 + block_len) < 0)
871 return -1;
872 if (!bfp->mt)
873 hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
874 else
875 bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
876
877 int ret = bam_write1(bfp, b);
878 if (ret < 0)
879 return -1;
880
881 if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
882 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
883 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
884 ret = -1;
885 }
886
887 return ret;
888 }
889
890 /*
891 * Set the qname in a BAM record
892 */
bam_set_qname(bam1_t * rec,const char * qname)893 int bam_set_qname(bam1_t *rec, const char *qname)
894 {
895 if (!rec) return -1;
896 if (!qname || !*qname) return -1;
897
898 size_t old_len = rec->core.l_qname;
899 size_t new_len = strlen(qname) + 1;
900 if (new_len < 1 || new_len > 255) return -1;
901
902 int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
903
904 size_t new_data_len = rec->l_data - old_len + new_len + extranul;
905 if (realloc_bam_data(rec, new_data_len) < 0) return -1;
906
907 // Make room
908 if (new_len + extranul != rec->core.l_qname)
909 memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
910 // Copy in new name and pad if needed
911 memcpy(rec->data, qname, new_len);
912 int n;
913 for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
914
915 rec->l_data = new_data_len;
916 rec->core.l_qname = new_len + extranul;
917 rec->core.l_extranul = extranul;
918
919 return 0;
920 }
921
922 /********************
923 *** BAM indexing ***
924 ********************/
925
sam_index(htsFile * fp,int min_shift)926 static hts_idx_t *sam_index(htsFile *fp, int min_shift)
927 {
928 int n_lvls, i, fmt, ret;
929 bam1_t *b;
930 hts_idx_t *idx;
931 sam_hdr_t *h;
932 h = sam_hdr_read(fp);
933 if (h == NULL) return NULL;
934 if (min_shift > 0) {
935 hts_pos_t max_len = 0, s;
936 for (i = 0; i < h->n_targets; ++i) {
937 hts_pos_t len = sam_hdr_tid2len(h, i);
938 if (max_len < len) max_len = len;
939 }
940 max_len += 256;
941 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
942 fmt = HTS_FMT_CSI;
943 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
944 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
945 b = bam_init1();
946 while ((ret = sam_read1(fp, h, b)) >= 0) {
947 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
948 if (ret < 0) { // unsorted or doesn't fit
949 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
950 goto err;
951 }
952 }
953 if (ret < -1) goto err; // corrupted BAM file
954
955 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
956 sam_hdr_destroy(h);
957 bam_destroy1(b);
958 return idx;
959
960 err:
961 bam_destroy1(b);
962 hts_idx_destroy(idx);
963 return NULL;
964 }
965
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)966 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
967 {
968 hts_idx_t *idx;
969 htsFile *fp;
970 int ret = 0;
971
972 if ((fp = hts_open(fn, "r")) == 0) return -2;
973 if (nthreads)
974 hts_set_threads(fp, nthreads);
975
976 switch (fp->format.format) {
977 case cram:
978
979 ret = cram_index_build(fp->fp.cram, fn, fnidx);
980 break;
981
982 case bam:
983 case sam:
984 if (fp->format.compression != bgzf) {
985 hts_log_error("%s file \"%s\" not BGZF compressed",
986 fp->format.format == bam ? "BAM" : "SAM", fn);
987 ret = -1;
988 break;
989 }
990 idx = sam_index(fp, min_shift);
991 if (idx) {
992 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
993 if (ret < 0) ret = -4;
994 hts_idx_destroy(idx);
995 }
996 else ret = -1;
997 break;
998
999 default:
1000 ret = -3;
1001 break;
1002 }
1003 hts_close(fp);
1004
1005 return ret;
1006 }
1007
sam_index_build2(const char * fn,const char * fnidx,int min_shift)1008 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1009 {
1010 return sam_index_build3(fn, fnidx, min_shift, 0);
1011 }
1012
sam_index_build(const char * fn,int min_shift)1013 int sam_index_build(const char *fn, int min_shift)
1014 {
1015 return sam_index_build3(fn, NULL, min_shift, 0);
1016 }
1017
1018 // Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1019 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)1020 int bam_index_build(const char *fn, int min_shift)
1021 {
1022 return sam_index_build2(fn, NULL, min_shift);
1023 }
1024
1025 // Initialise fp->idx for the current format type.
1026 // This must be called after the header has been written but no other data.
sam_idx_init(htsFile * fp,sam_hdr_t * h,int min_shift,const char * fnidx)1027 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1028 fp->fnidx = fnidx;
1029 if (fp->format.format == bam || fp->format.format == bcf ||
1030 (fp->format.format == sam && fp->format.compression == bgzf)) {
1031 int n_lvls, fmt = HTS_FMT_CSI;
1032 if (min_shift > 0) {
1033 int64_t max_len = 0, s;
1034 int i;
1035 for (i = 0; i < h->n_targets; ++i)
1036 if (max_len < h->target_len[i]) max_len = h->target_len[i];
1037 max_len += 256;
1038 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1039
1040 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1041
1042 fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1043 return fp->idx ? 0 : -1;
1044 }
1045
1046 if (fp->format.format == cram) {
1047 fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1048 return fp->fp.cram->idxfp ? 0 : -1;
1049 }
1050
1051 return -1;
1052 }
1053
1054 // Finishes an index. Call after the last record has been written.
1055 // Returns 0 on success, <0 on failure.
sam_idx_save(htsFile * fp)1056 int sam_idx_save(htsFile *fp) {
1057 if (fp->format.format == bam || fp->format.format == bcf ||
1058 fp->format.format == vcf || fp->format.format == sam) {
1059 int ret;
1060 if ((ret = sam_state_destroy(fp)) < 0) {
1061 errno = -ret;
1062 return -1;
1063 }
1064 if (bgzf_flush(fp->fp.bgzf) < 0)
1065 return -1;
1066 hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1067
1068 if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1069 return -1;
1070
1071 return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1072
1073 } else if (fp->format.format == cram) {
1074 // flushed and closed by cram_close
1075 }
1076
1077 return 0;
1078 }
1079
sam_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1080 static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1081 {
1082 htsFile *fp = (htsFile *)fpv;
1083 bam1_t *b = bv;
1084 fp->line.l = 0;
1085 int ret = sam_read1(fp, fp->bam_header, b);
1086 if (ret >= 0) {
1087 *tid = b->core.tid;
1088 *beg = b->core.pos;
1089 *end = bam_endpos(b);
1090 }
1091 return ret;
1092 }
1093
1094 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_readrec_rest(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1095 static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1096 {
1097 htsFile *fp = (htsFile *)fpv;
1098 bam1_t *b = bv;
1099 fp->line.l = 0;
1100 int ret = sam_read1(fp, fp->bam_header, b);
1101 return ret;
1102 }
1103
1104 // Internal (for now) func used by bam_sym_lookup. This is copied from
1105 // samtools/bam.c.
bam_get_library(const bam_hdr_t * h,const bam1_t * b)1106 static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1107 {
1108 const char *rg;
1109 kstring_t lib = { 0, 0, NULL };
1110 rg = (char *)bam_aux_get(b, "RG");
1111
1112 if (!rg)
1113 return NULL;
1114 else
1115 rg++;
1116
1117 if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0)
1118 return NULL;
1119
1120 static char LB_text[1024];
1121 int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1122
1123 memcpy(LB_text, lib.s, len);
1124 LB_text[len] = 0;
1125
1126 free(lib.s);
1127
1128 return LB_text;
1129 }
1130
1131
1132 // Bam record pointer and SAM header combined
1133 typedef struct {
1134 const sam_hdr_t *h;
1135 const bam1_t *b;
1136 } hb_pair;
1137
1138 // Looks up variable names in str and replaces them with their value.
1139 // Also supports aux tags.
1140 //
1141 // Note the expression parser deliberately overallocates str size so it
1142 // is safe to use memcmp over strcmp.
bam_sym_lookup(void * data,char * str,char ** end,hts_expr_val_t * res)1143 static int bam_sym_lookup(void *data, char *str, char **end,
1144 hts_expr_val_t *res) {
1145 hb_pair *hb = (hb_pair *)data;
1146 const bam1_t *b = hb->b;
1147
1148 res->is_str = 0;
1149 switch(*str) {
1150 case 'c':
1151 if (memcmp(str, "cigar", 5) == 0) {
1152 *end = str+5;
1153 res->is_str = 1;
1154 ks_clear(&res->s);
1155 uint32_t *cigar = bam_get_cigar(b);
1156 int i, n = b->core.n_cigar, r = 0;
1157 if (n) {
1158 for (i = 0; i < n; i++) {
1159 r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1160 r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1161 }
1162 r |= kputs("", &res->s) < 0;
1163 } else {
1164 r |= kputs("*", &res->s) < 0;
1165 }
1166 return r ? -1 : 0;
1167 }
1168 break;
1169
1170 case 'e':
1171 if (memcmp(str, "endpos", 6) == 0) {
1172 *end = str+6;
1173 res->d = bam_endpos(b);
1174 return 0;
1175 }
1176 break;
1177
1178 case 'f':
1179 if (memcmp(str, "flag", 4) == 0) {
1180 str = *end = str+4;
1181 if (*str != '.') {
1182 res->d = b->core.flag;
1183 return 0;
1184 } else {
1185 str++;
1186 if (!memcmp(str, "paired", 6)) {
1187 *end = str+6;
1188 res->d = b->core.flag & BAM_FPAIRED;
1189 return 0;
1190 } else if (!memcmp(str, "proper_pair", 11)) {
1191 *end = str+11;
1192 res->d = b->core.flag & BAM_FPROPER_PAIR;
1193 return 0;
1194 } else if (!memcmp(str, "unmap", 5)) {
1195 *end = str+5;
1196 res->d = b->core.flag & BAM_FUNMAP;
1197 return 0;
1198 } else if (!memcmp(str, "munmap", 6)) {
1199 *end = str+6;
1200 res->d = b->core.flag & BAM_FMUNMAP;
1201 return 0;
1202 } else if (!memcmp(str, "reverse", 7)) {
1203 *end = str+7;
1204 res->d = b->core.flag & BAM_FREVERSE;
1205 return 0;
1206 } else if (!memcmp(str, "mreverse", 8)) {
1207 *end = str+8;
1208 res->d = b->core.flag & BAM_FMREVERSE;
1209 return 0;
1210 } else if (!memcmp(str, "read1", 5)) {
1211 *end = str+5;
1212 res->d = b->core.flag & BAM_FREAD1;
1213 return 0;
1214 } else if (!memcmp(str, "read2", 5)) {
1215 *end = str+5;
1216 res->d = b->core.flag & BAM_FREAD2;
1217 return 0;
1218 } else if (!memcmp(str, "secondary", 9)) {
1219 *end = str+9;
1220 res->d = b->core.flag & BAM_FSECONDARY;
1221 return 0;
1222 } else if (!memcmp(str, "qcfail", 6)) {
1223 *end = str+6;
1224 res->d = b->core.flag & BAM_FQCFAIL;
1225 return 0;
1226 } else if (!memcmp(str, "dup", 3)) {
1227 *end = str+3;
1228 res->d = b->core.flag & BAM_FDUP;
1229 return 0;
1230 } else if (!memcmp(str, "supplementary", 13)) {
1231 *end = str+13;
1232 res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1233 return 0;
1234 } else {
1235 hts_log_error("Unrecognised flag string");
1236 return -1;
1237 }
1238 }
1239 }
1240 break;
1241
1242 case 'l':
1243 if (memcmp(str, "library", 7) == 0) {
1244 *end = str+7;
1245 res->is_str = 1;
1246 const char *lib = bam_get_library(hb->h, b);
1247 kputs(lib ? lib : "", ks_clear(&res->s));
1248 return 0;
1249 }
1250 break;
1251
1252 case 'm':
1253 if (memcmp(str, "mapq", 4) == 0) {
1254 *end = str+4;
1255 res->d = b->core.qual;
1256 return 0;
1257 } else if (memcmp(str, "mpos", 4) == 0) {
1258 *end = str+4;
1259 res->d = b->core.mpos+1;
1260 return 0;
1261 } else if (memcmp(str, "mrname", 6) == 0) {
1262 *end = str+6;
1263 res->is_str = 1;
1264 const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1265 kputs(rn ? rn : "*", ks_clear(&res->s));
1266 return 0;
1267 } else if (memcmp(str, "mrefid", 6) == 0) {
1268 *end = str+6;
1269 res->d = b->core.mtid;
1270 return 0;
1271 }
1272 break;
1273
1274 case 'n':
1275 if (memcmp(str, "ncigar", 6) == 0) {
1276 *end = str+6;
1277 res->d = b->core.n_cigar;
1278 return 0;
1279 }
1280 break;
1281
1282 case 'p':
1283 if (memcmp(str, "pos", 3) == 0) {
1284 *end = str+3;
1285 res->d = b->core.pos+1;
1286 return 0;
1287 } else if (memcmp(str, "pnext", 5) == 0) {
1288 *end = str+5;
1289 res->d = b->core.mpos+1;
1290 return 0;
1291 }
1292 break;
1293
1294 case 'q':
1295 if (memcmp(str, "qlen", 4) == 0) {
1296 *end = str+4;
1297 res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1298 return 0;
1299 } else if (memcmp(str, "qname", 5) == 0) {
1300 *end = str+5;
1301 res->is_str = 1;
1302 kputs(bam_get_qname(b), ks_clear(&res->s));
1303 return 0;
1304 } else if (memcmp(str, "qual", 4) == 0) {
1305 *end = str+4;
1306 ks_clear(&res->s);
1307 if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1308 return -1;
1309 memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1310 res->s.l = b->core.l_qseq;
1311 res->is_str = 1;
1312 return 0;
1313 }
1314 break;
1315
1316 case 'r':
1317 if (memcmp(str, "rlen", 4) == 0) {
1318 *end = str+4;
1319 res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1320 return 0;
1321 } else if (memcmp(str, "rname", 5) == 0) {
1322 *end = str+5;
1323 res->is_str = 1;
1324 const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1325 kputs(rn ? rn : "*", ks_clear(&res->s));
1326 return 0;
1327 } else if (memcmp(str, "rnext", 5) == 0) {
1328 *end = str+5;
1329 res->is_str = 1;
1330 const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1331 kputs(rn ? rn : "*", ks_clear(&res->s));
1332 return 0;
1333 } else if (memcmp(str, "refid", 5) == 0) {
1334 *end = str+5;
1335 res->d = b->core.tid;
1336 return 0;
1337 }
1338 break;
1339
1340 case 's':
1341 if (memcmp(str, "seq", 3) == 0) {
1342 *end = str+3;
1343 ks_clear(&res->s);
1344 if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1345 return -1;
1346 nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1347 res->s.s[b->core.l_qseq] = 0;
1348 res->s.l = b->core.l_qseq;
1349 res->is_str = 1;
1350 return 0;
1351 }
1352 break;
1353
1354 case 't':
1355 if (memcmp(str, "tlen", 4) == 0) {
1356 *end = str+4;
1357 res->d = b->core.isize;
1358 return 0;
1359 }
1360 break;
1361
1362 case '[':
1363 if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1364 /* aux tags */
1365 *end = str+4;
1366
1367 uint8_t *aux = bam_aux_get(b, str+1);
1368 if (aux) {
1369 // we define the truth of a tag to be its presence, even if 0.
1370 res->is_true = 1;
1371 switch (*aux) {
1372 case 'Z':
1373 case 'H':
1374 res->is_str = 1;
1375 kputs((char *)aux+1, ks_clear(&res->s));
1376 break;
1377
1378 case 'A':
1379 res->is_str = 1;
1380 kputsn((char *)aux+1, 1, ks_clear(&res->s));
1381 break;
1382
1383 case 'i': case 'I':
1384 case 's': case 'S':
1385 case 'c': case 'C':
1386 res->is_str = 0;
1387 res->d = bam_aux2i(aux);
1388 break;
1389
1390 case 'f':
1391 case 'd':
1392 res->is_str = 0;
1393 res->d = bam_aux2f(aux);
1394 break;
1395
1396 default:
1397 hts_log_error("Aux type '%c not yet supported by filters",
1398 *aux);
1399 return -1;
1400 }
1401 return 0;
1402
1403 } else {
1404 // hence absent tags are always false (and strings)
1405 res->is_str = 1;
1406 res->s.l = 0;
1407 res->d = 0;
1408 res->is_true = 0;
1409 return 0;
1410 }
1411 }
1412 break;
1413 }
1414
1415 // All successful matches in switch should return 0.
1416 // So if we didn't match, it's a parse error.
1417 return -1;
1418 }
1419
1420 // Returns 1 when accepted by the filter, 0 if not, -1 on error.
sam_passes_filter(const sam_hdr_t * h,const bam1_t * b,hts_filter_t * filt)1421 int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1422 {
1423 hb_pair hb = {h, b};
1424 hts_expr_val_t res;
1425 if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) {
1426 hts_log_error("Couldn't process filter expression");
1427 hts_expr_val_free(&res);
1428 return -1;
1429 }
1430
1431 int t = res.is_true;
1432 hts_expr_val_free(&res);
1433
1434 return t;
1435 }
1436
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1437 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1438 {
1439 htsFile *fp = fpv;
1440 bam1_t *b = bv;
1441 int pass_filter, ret;
1442
1443 do {
1444 ret = cram_get_bam_seq(fp->fp.cram, &b);
1445 if (ret < 0)
1446 return cram_eof(fp->fp.cram) ? -1 : -2;
1447
1448 if (bam_tag2cigar(b, 1, 1) < 0)
1449 return -2;
1450
1451 *tid = b->core.tid;
1452 *beg = b->core.pos;
1453 *end = bam_endpos(b);
1454
1455 if (fp->filter) {
1456 pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1457 if (pass_filter < 0)
1458 return -2;
1459 } else {
1460 pass_filter = 1;
1461 }
1462 } while (pass_filter == 0);
1463
1464 return ret;
1465 }
1466
cram_pseek(void * fp,int64_t offset,int whence)1467 static int cram_pseek(void *fp, int64_t offset, int whence)
1468 {
1469 cram_fd *fd = (cram_fd *)fp;
1470
1471 if ((0 != cram_seek(fd, offset, SEEK_SET))
1472 && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1473 return -1;
1474
1475 fd->curr_position = offset;
1476
1477 if (fd->ctr) {
1478 cram_free_container(fd->ctr);
1479 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1480 cram_free_container(fd->ctr_mt);
1481
1482 fd->ctr = NULL;
1483 fd->ctr_mt = NULL;
1484 fd->ooc = 0;
1485 }
1486
1487 return 0;
1488 }
1489
1490 /*
1491 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1492 * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1493 * container previously fetched. It was designed like this to integrate with the functionality
1494 * of the iterator stepping logic.
1495 */
1496
cram_ptell(void * fp)1497 static int64_t cram_ptell(void *fp)
1498 {
1499 cram_fd *fd = (cram_fd *)fp;
1500 cram_container *c;
1501 cram_slice *s;
1502 int64_t ret = -1L;
1503
1504 if (fd) {
1505 if ((c = fd->ctr) != NULL) {
1506 if ((s = c->slice) != NULL && s->max_rec) {
1507 if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1508 fd->curr_position += c->offset + c->length;
1509 }
1510 }
1511 ret = fd->curr_position;
1512 }
1513
1514 return ret;
1515 }
1516
bam_pseek(void * fp,int64_t offset,int whence)1517 static int bam_pseek(void *fp, int64_t offset, int whence)
1518 {
1519 BGZF *fd = (BGZF *)fp;
1520
1521 return bgzf_seek(fd, offset, whence);
1522 }
1523
bam_ptell(void * fp)1524 static int64_t bam_ptell(void *fp)
1525 {
1526 BGZF *fd = (BGZF *)fp;
1527 if (!fd)
1528 return -1L;
1529
1530 return bgzf_tell(fd);
1531 }
1532
1533
1534
index_load(htsFile * fp,const char * fn,const char * fnidx,int flags)1535 static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1536 {
1537 switch (fp->format.format) {
1538 case bam:
1539 case sam:
1540 return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1541
1542 case cram: {
1543 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1544
1545 // Cons up a fake "index" just pointing at the associated cram_fd:
1546 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1547 if (idx == NULL) return NULL;
1548 idx->fmt = HTS_FMT_CRAI;
1549 idx->cram = fp->fp.cram;
1550 return (hts_idx_t *) idx;
1551 }
1552
1553 default:
1554 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1555 }
1556 }
1557
sam_index_load3(htsFile * fp,const char * fn,const char * fnidx,int flags)1558 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1559 {
1560 return index_load(fp, fn, fnidx, flags);
1561 }
1562
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)1563 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1564 return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1565 }
1566
sam_index_load(htsFile * fp,const char * fn)1567 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1568 {
1569 return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1570 }
1571
cram_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)1572 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1573 {
1574 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1575 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1576 if (iter == NULL) return NULL;
1577
1578 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1579 // the readrec function:
1580 iter->is_cram = 1;
1581 iter->read_rest = 1;
1582 iter->off = NULL;
1583 iter->bins.a = NULL;
1584 iter->readrec = readrec;
1585
1586 if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1587 cram_range r = { tid, beg+1, end };
1588 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1589
1590 iter->curr_off = 0;
1591 // The following fields are not required by hts_itr_next(), but are
1592 // filled in in case user code wants to look at them.
1593 iter->tid = tid;
1594 iter->beg = beg;
1595 iter->end = end;
1596
1597 switch (ret) {
1598 case 0:
1599 break;
1600
1601 case -2:
1602 // No data vs this ref, so mark iterator as completed.
1603 // Same as HTS_IDX_NONE.
1604 iter->finished = 1;
1605 break;
1606
1607 default:
1608 free(iter);
1609 return NULL;
1610 }
1611 }
1612 else switch (tid) {
1613 case HTS_IDX_REST:
1614 iter->curr_off = 0;
1615 break;
1616 case HTS_IDX_NONE:
1617 iter->curr_off = 0;
1618 iter->finished = 1;
1619 break;
1620 default:
1621 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1622 abort();
1623 break;
1624 }
1625
1626 return iter;
1627 }
1628
sam_itr_queryi(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1629 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1630 {
1631 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1632 if (idx == NULL)
1633 return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1634 else if (cidx->fmt == HTS_FMT_CRAI)
1635 return cram_itr_query(idx, tid, beg, end, sam_readrec);
1636 else
1637 return hts_itr_query(idx, tid, beg, end, sam_readrec);
1638 }
1639
cram_name2id(void * fdv,const char * ref)1640 static int cram_name2id(void *fdv, const char *ref)
1641 {
1642 cram_fd *fd = (cram_fd *) fdv;
1643 return sam_hdr_name2tid(fd->header, ref);
1644 }
1645
sam_itr_querys(const hts_idx_t * idx,sam_hdr_t * hdr,const char * region)1646 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1647 {
1648 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1649 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1650 cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1651 sam_readrec);
1652 }
1653
sam_itr_regarray(const hts_idx_t * idx,sam_hdr_t * hdr,char ** regarray,unsigned int regcount)1654 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1655 {
1656 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1657 hts_reglist_t *r_list = NULL;
1658 int r_count = 0;
1659
1660 if (!cidx || !hdr)
1661 return NULL;
1662
1663 hts_itr_t *itr = NULL;
1664 if (cidx->fmt == HTS_FMT_CRAI) {
1665 r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1666 if (!r_list)
1667 return NULL;
1668 itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1669 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1670 } else {
1671 r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1672 if (!r_list)
1673 return NULL;
1674 itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1675 hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1676 }
1677
1678 if (!itr)
1679 hts_reglist_free(r_list, r_count);
1680
1681 return itr;
1682 }
1683
sam_itr_regions(const hts_idx_t * idx,sam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)1684 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1685 {
1686 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1687
1688 if(!cidx || !hdr || !reglist)
1689 return NULL;
1690
1691 if (cidx->fmt == HTS_FMT_CRAI)
1692 return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1693 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1694 else
1695 return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1696 hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1697 }
1698
1699 /**********************
1700 *** SAM header I/O ***
1701 **********************/
1702
1703 #include "htslib/kseq.h"
1704 #include "htslib/kstring.h"
1705
sam_hdr_parse(size_t l_text,const char * text)1706 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1707 {
1708 sam_hdr_t *bh = sam_hdr_init();
1709 if (!bh) return NULL;
1710
1711 if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1712 sam_hdr_destroy(bh);
1713 return NULL;
1714 }
1715
1716 return bh;
1717 }
1718
valid_sam_header_type(const char * s)1719 static int valid_sam_header_type(const char *s) {
1720 if (s[0] != '@') return 0;
1721 switch (s[1]) {
1722 case 'H':
1723 return s[2] == 'D' && s[3] == '\t';
1724 case 'S':
1725 return s[2] == 'Q' && s[3] == '\t';
1726 case 'R':
1727 case 'P':
1728 return s[2] == 'G' && s[3] == '\t';
1729 case 'C':
1730 return s[2] == 'O';
1731 }
1732 return 0;
1733 }
1734
1735 // Minimal sanitisation of a header to ensure.
1736 // - null terminated string.
1737 // - all lines start with @ (also implies no blank lines).
1738 //
1739 // Much more could be done, but currently is not, including:
1740 // - checking header types are known (HD, SQ, etc).
1741 // - syntax (eg checking tab separated fields).
1742 // - validating n_targets matches @SQ records.
1743 // - validating target lengths against @SQ records.
sam_hdr_sanitise(sam_hdr_t * h)1744 static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1745 if (!h)
1746 return NULL;
1747
1748 // Special case for empty headers.
1749 if (h->l_text == 0)
1750 return h;
1751
1752 size_t i;
1753 unsigned int lnum = 0;
1754 char *cp = h->text, last = '\n';
1755 for (i = 0; i < h->l_text; i++) {
1756 // NB: l_text excludes terminating nul. This finds early ones.
1757 if (cp[i] == 0)
1758 break;
1759
1760 // Error on \n[^@], including duplicate newlines
1761 if (last == '\n') {
1762 lnum++;
1763 if (cp[i] != '@') {
1764 hts_log_error("Malformed SAM header at line %u", lnum);
1765 sam_hdr_destroy(h);
1766 return NULL;
1767 }
1768 }
1769
1770 last = cp[i];
1771 }
1772
1773 if (i < h->l_text) { // Early nul found. Complain if not just padding.
1774 size_t j = i;
1775 while (j < h->l_text && cp[j] == '\0') j++;
1776 if (j < h->l_text)
1777 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1778 }
1779
1780 // Add trailing newline and/or trailing nul if required.
1781 if (last != '\n') {
1782 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1783
1784 if (h->l_text < 2 || i >= h->l_text - 2) {
1785 if (h->l_text >= SIZE_MAX - 2) {
1786 hts_log_error("No room for extra newline");
1787 sam_hdr_destroy(h);
1788 return NULL;
1789 }
1790
1791 cp = realloc(h->text, (size_t) h->l_text+2);
1792 if (!cp) {
1793 sam_hdr_destroy(h);
1794 return NULL;
1795 }
1796 h->text = cp;
1797 }
1798 cp[i++] = '\n';
1799
1800 // l_text may be larger already due to multiple nul padding
1801 if (h->l_text < i)
1802 h->l_text = i;
1803 cp[h->l_text] = '\0';
1804 }
1805
1806 return h;
1807 }
1808
known_stderr(const char * tool,const char * advice)1809 static void known_stderr(const char *tool, const char *advice) {
1810 hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
1811 hts_log_warning("%s", advice);
1812 }
1813
warn_if_known_stderr(const char * line)1814 static void warn_if_known_stderr(const char *line) {
1815 if (strstr(line, "M::bwa_idx_load_from_disk") != NULL)
1816 known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
1817 else if (strstr(line, "M::mem_pestat") != NULL)
1818 known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
1819 else if (strstr(line, "loaded/built the index") != NULL)
1820 known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
1821 }
1822
sam_hdr_create(htsFile * fp)1823 static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1824 kstring_t str = { 0, 0, NULL };
1825 khint_t k;
1826 sam_hdr_t* h = sam_hdr_init();
1827 const char *q, *r;
1828 char* sn = NULL;
1829 khash_t(s2i) *d = kh_init(s2i);
1830 khash_t(s2i) *long_refs = NULL;
1831 if (!h || !d)
1832 goto error;
1833
1834 int ret, has_SQ = 0;
1835 int next_c = '@';
1836 while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1837 if (fp->line.s[0] != '@')
1838 break;
1839
1840 if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1841 has_SQ = 1;
1842 hts_pos_t ln = -1;
1843 for (q = fp->line.s + 4;; ++q) {
1844 if (strncmp(q, "SN:", 3) == 0) {
1845 q += 3;
1846 for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1847
1848 if (sn) {
1849 hts_log_warning("SQ header line has more than one SN: tag");
1850 free(sn);
1851 }
1852 sn = (char*)calloc(r - q + 1, 1);
1853 if (!sn)
1854 goto error;
1855
1856 strncpy(sn, q, r - q);
1857 q = r;
1858 } else {
1859 if (strncmp(q, "LN:", 3) == 0)
1860 ln = strtoll(q + 3, (char**)&q, 10);
1861 }
1862
1863 while (*q != '\t' && *q != '\n' && *q != '\0')
1864 ++q;
1865 if (*q == '\0' || *q == '\n')
1866 break;
1867 }
1868 if (sn) {
1869 if (ln >= 0) {
1870 int absent;
1871 k = kh_put(s2i, d, sn, &absent);
1872 if (absent < 0)
1873 goto error;
1874
1875 if (!absent) {
1876 hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
1877 free(sn);
1878 } else {
1879 sn = NULL;
1880 if (ln >= UINT32_MAX) {
1881 // Stash away ref length that
1882 // doesn't fit in target_len array
1883 int k2;
1884 if (!long_refs) {
1885 long_refs = kh_init(s2i);
1886 if (!long_refs)
1887 goto error;
1888 }
1889 k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1890 if (absent < 0)
1891 goto error;
1892 kh_val(long_refs, k2) = ln;
1893 kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1894 | UINT32_MAX);
1895 } else {
1896 kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1897 }
1898 }
1899 } else {
1900 hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1901 warn_if_known_stderr(fp->line.s);
1902 free(sn);
1903 }
1904 } else {
1905 hts_log_warning("Ignored @SQ line with missing SN: tag");
1906 warn_if_known_stderr(fp->line.s);
1907 }
1908 sn = NULL;
1909 }
1910 else if (!valid_sam_header_type(fp->line.s)) {
1911 hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
1912 warn_if_known_stderr(fp->line.s);
1913 goto error;
1914 }
1915
1916 if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1917 goto error;
1918
1919 if (kputc('\n', &str) < 0)
1920 goto error;
1921
1922 if (fp->is_bgzf) {
1923 next_c = bgzf_peek(fp->fp.bgzf);
1924 } else {
1925 unsigned char nc;
1926 ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1927 next_c = pret > 0 ? nc : pret - 1;
1928 }
1929 if (next_c < -1)
1930 goto error;
1931 }
1932 if (next_c != '@')
1933 fp->line.l = 0;
1934
1935 if (ret < -1)
1936 goto error;
1937
1938 if (!has_SQ && fp->fn_aux) {
1939 kstring_t line = { 0, 0, NULL };
1940
1941 /* The reference index (.fai) is actually needed here */
1942 char *fai_fn = fp->fn_aux;
1943 char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1944 if (fn_delim)
1945 fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1946
1947 hFILE* f = hopen(fai_fn, "r");
1948 int e = 0, absent;
1949 if (f == NULL)
1950 goto error;
1951
1952 while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
1953 char* tab = strchr(line.s, '\t');
1954 hts_pos_t ln;
1955
1956 if (tab == NULL)
1957 continue;
1958
1959 sn = (char*)calloc(tab-line.s+1, 1);
1960 if (!sn) {
1961 e = 1;
1962 break;
1963 }
1964 memcpy(sn, line.s, tab-line.s);
1965 k = kh_put(s2i, d, sn, &absent);
1966 if (absent < 0) {
1967 e = 1;
1968 break;
1969 }
1970
1971 ln = strtoll(tab, NULL, 10);
1972
1973 if (!absent) {
1974 hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
1975 free(sn);
1976 sn = NULL;
1977 } else {
1978 sn = NULL;
1979 if (ln >= UINT32_MAX) {
1980 // Stash away ref length that
1981 // doesn't fit in target_len array
1982 khint_t k2;
1983 int absent = -1;
1984 if (!long_refs) {
1985 long_refs = kh_init(s2i);
1986 if (!long_refs) {
1987 e = 1;
1988 break;
1989 }
1990 }
1991 k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
1992 if (absent < 0) {
1993 e = 1;
1994 break;
1995 }
1996 kh_val(long_refs, k2) = ln;
1997 kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1998 | UINT32_MAX);
1999 } else {
2000 kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2001 }
2002 has_SQ = 1;
2003 }
2004
2005 e |= kputs("@SQ\tSN:", &str) < 0;
2006 e |= kputsn(line.s, tab - line.s, &str) < 0;
2007 e |= kputs("\tLN:", &str) < 0;
2008 e |= kputll(ln, &str) < 0;
2009 e |= kputc('\n', &str) < 0;
2010 if (e)
2011 break;
2012 }
2013
2014 ks_free(&line);
2015 if (hclose(f) != 0) {
2016 hts_log_error("Error on closing %s", fai_fn);
2017 e = 1;
2018 }
2019 if (e)
2020 goto error;
2021 }
2022
2023 if (has_SQ) {
2024 // Populate the targets array
2025 h->n_targets = kh_size(d);
2026
2027 h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
2028 if (!h->target_name) {
2029 h->n_targets = 0;
2030 goto error;
2031 }
2032
2033 h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
2034 if (!h->target_len) {
2035 h->n_targets = 0;
2036 goto error;
2037 }
2038
2039 for (k = kh_begin(d); k != kh_end(d); ++k) {
2040 if (!kh_exist(d, k))
2041 continue;
2042
2043 h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
2044 h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
2045 kh_val(d, k) >>= 32;
2046 }
2047 }
2048
2049 // Repurpose sdict to hold any references longer than UINT32_MAX
2050 h->sdict = long_refs;
2051
2052 kh_destroy(s2i, d);
2053
2054 if (str.l == 0)
2055 kputsn("", 0, &str);
2056 h->l_text = str.l;
2057 h->text = ks_release(&str);
2058 fp->bam_header = sam_hdr_sanitise(h);
2059 fp->bam_header->ref_count = 1;
2060
2061 return fp->bam_header;
2062
2063 error:
2064 if (h && d && (!h->target_name || !h->target_len)) {
2065 for (k = kh_begin(d); k != kh_end(d); ++k)
2066 if (kh_exist(d, k)) free((void *)kh_key(d, k));
2067 }
2068 sam_hdr_destroy(h);
2069 ks_free(&str);
2070 kh_destroy(s2i, d);
2071 kh_destroy(s2i, long_refs);
2072 if (sn) free(sn);
2073 return NULL;
2074 }
2075
sam_hdr_read(htsFile * fp)2076 sam_hdr_t *sam_hdr_read(htsFile *fp)
2077 {
2078 if (!fp) {
2079 errno = EINVAL;
2080 return NULL;
2081 }
2082
2083 switch (fp->format.format) {
2084 case bam:
2085 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
2086
2087 case cram:
2088 return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
2089
2090 case sam:
2091 return sam_hdr_create(fp);
2092
2093 case fastq_format:
2094 case fasta_format:
2095 return sam_hdr_init();
2096
2097 case empty_format:
2098 errno = EPIPE;
2099 return NULL;
2100
2101 default:
2102 errno = EFTYPE;
2103 return NULL;
2104 }
2105 }
2106
sam_hdr_write(htsFile * fp,const sam_hdr_t * h)2107 int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
2108 {
2109 if (!fp || !h) {
2110 errno = EINVAL;
2111 return -1;
2112 }
2113
2114 switch (fp->format.format) {
2115 case binary_format:
2116 fp->format.category = sequence_data;
2117 fp->format.format = bam;
2118 /* fall-through */
2119 case bam:
2120 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
2121 break;
2122
2123 case cram: {
2124 cram_fd *fd = fp->fp.cram;
2125 if (cram_set_header2(fd, h) < 0) return -1;
2126 if (fp->fn_aux)
2127 cram_load_reference(fd, fp->fn_aux);
2128 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
2129 }
2130 break;
2131
2132 case text_format:
2133 fp->format.category = sequence_data;
2134 fp->format.format = sam;
2135 /* fall-through */
2136 case sam: {
2137 if (!h->hrecs && !h->text)
2138 return 0;
2139 char *text;
2140 kstring_t hdr_ks = { 0, 0, NULL };
2141 size_t l_text;
2142 ssize_t bytes;
2143 int r = 0, no_sq = 0;
2144
2145 if (h->hrecs) {
2146 if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2147 return -1;
2148 text = hdr_ks.s;
2149 l_text = hdr_ks.l;
2150 } else {
2151 const char *p = NULL;
2152 do {
2153 const char *q = p == NULL ? h->text : p + 4;
2154 p = strstr(q, "@SQ\t");
2155 } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2156 no_sq = p == NULL;
2157 text = h->text;
2158 l_text = h->l_text;
2159 }
2160
2161 if (fp->is_bgzf) {
2162 bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2163 } else {
2164 bytes = hwrite(fp->fp.hfile, text, l_text);
2165 }
2166 free(hdr_ks.s);
2167 if (bytes != l_text)
2168 return -1;
2169
2170 if (no_sq) {
2171 int i;
2172 for (i = 0; i < h->n_targets; ++i) {
2173 fp->line.l = 0;
2174 r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2175 r |= kputs(h->target_name[i], &fp->line) < 0;
2176 r |= kputsn("\tLN:", 4, &fp->line) < 0;
2177 r |= kputw(h->target_len[i], &fp->line) < 0;
2178 r |= kputc('\n', &fp->line) < 0;
2179 if (r != 0)
2180 return -1;
2181
2182 if (fp->is_bgzf) {
2183 bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2184 } else {
2185 bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2186 }
2187 if (bytes != fp->line.l)
2188 return -1;
2189 }
2190 }
2191 if (fp->is_bgzf) {
2192 if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2193 } else {
2194 if (hflush(fp->fp.hfile) != 0) return -1;
2195 }
2196 }
2197 break;
2198
2199 case fastq_format:
2200 case fasta_format:
2201 // Nothing to output; FASTQ has no file headers.
2202 break;
2203
2204 default:
2205 errno = EBADF;
2206 return -1;
2207 }
2208 return 0;
2209 }
2210
old_sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)2211 static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2212 {
2213 char *p, *q, *beg = NULL, *end = NULL, *newtext;
2214 size_t new_l_text;
2215 if (!h || !key)
2216 return -1;
2217
2218 if (h->l_text > 3) {
2219 if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2220 if ((p = strchr(h->text, '\n')) == 0) return -1;
2221 *p = '\0'; // for strstr call
2222
2223 char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2224
2225 if ((q = strstr(h->text, tmp)) != 0) { // key exists
2226 *p = '\n'; // change back
2227
2228 // mark the key:val
2229 beg = q;
2230 for (q += 4; *q != '\n' && *q != '\t'; ++q);
2231 end = q;
2232
2233 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2234 && strlen(val) == end - beg - 4)
2235 return 0; // val is the same, no need to change
2236
2237 } else {
2238 beg = end = p;
2239 *p = '\n';
2240 }
2241 }
2242 }
2243 if (beg == NULL) { // no @HD
2244 new_l_text = h->l_text;
2245 if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2246 return -1;
2247 new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2248 if (val) {
2249 if (new_l_text > SIZE_MAX - strlen(val) - 5)
2250 return -1;
2251 new_l_text += strlen(val) + 4;
2252 }
2253 newtext = (char*)malloc(new_l_text + 1);
2254 if (!newtext) return -1;
2255
2256 if (val)
2257 snprintf(newtext, new_l_text + 1,
2258 "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2259 else
2260 snprintf(newtext, new_l_text + 1,
2261 "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2262 } else { // has @HD but different or no key
2263 new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2264 if (val) {
2265 if (new_l_text > SIZE_MAX - strlen(val) - 5)
2266 return -1;
2267 new_l_text += strlen(val) + 4;
2268 }
2269 newtext = (char*)malloc(new_l_text + 1);
2270 if (!newtext) return -1;
2271
2272 if (val) {
2273 snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2274 (int) (beg - h->text), h->text, key, val, end);
2275 } else { //delete key
2276 snprintf(newtext, new_l_text + 1, "%.*s%s",
2277 (int) (beg - h->text), h->text, end);
2278 }
2279 }
2280 free(h->text);
2281 h->text = newtext;
2282 h->l_text = new_l_text;
2283 return 0;
2284 }
2285
2286
sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)2287 int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2288 {
2289 if (!h || !key)
2290 return -1;
2291
2292 if (!h->hrecs)
2293 return old_sam_hdr_change_HD(h, key, val);
2294
2295 if (val) {
2296 if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2297 return -1;
2298 } else {
2299 if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2300 return -1;
2301 }
2302 return sam_hdr_rebuild(h);
2303 }
2304 /**********************
2305 *** SAM record I/O ***
2306 **********************/
2307
sam_parse_B_vals(char type,uint32_t n,char * in,char ** end,char * r,bam1_t * b)2308 static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
2309 char *r, bam1_t *b)
2310 {
2311 int orig_l = b->l_data;
2312 char *q = in;
2313 int32_t size;
2314 size_t bytes;
2315 int overflow = 0;
2316
2317 size = aux_type2size(type);
2318 if (size <= 0 || size > 4) {
2319 hts_log_error("Unrecognized type B:%c", type);
2320 return -1;
2321 }
2322
2323 // Ensure space for type + values
2324 bytes = (size_t) n * (size_t) size;
2325 if (bytes / size != n
2326 || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2327 hts_log_error("Out of memory");
2328 return -1;
2329 }
2330
2331 b->data[b->l_data++] = 'B';
2332 b->data[b->l_data++] = type;
2333 i32_to_le(n, b->data + b->l_data);
2334 b->l_data += sizeof(uint32_t);
2335 // This ensures that q always ends up at the next comma after
2336 // reading a number even if it's followed by junk. It
2337 // prevents the possibility of trying to read more than n items.
2338 #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2339 if (type == 'c') {
2340 while (q < r) {
2341 *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
2342 b->l_data++;
2343 skip_to_comma_(q);
2344 }
2345 } else if (type == 'C') {
2346 while (q < r) {
2347 if (*q != '-') {
2348 *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
2349 b->l_data++;
2350 } else {
2351 overflow = 1;
2352 }
2353 skip_to_comma_(q);
2354 }
2355 } else if (type == 's') {
2356 while (q < r) {
2357 i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
2358 b->l_data += 2;
2359 skip_to_comma_(q);
2360 }
2361 } else if (type == 'S') {
2362 while (q < r) {
2363 if (*q != '-') {
2364 u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
2365 b->l_data += 2;
2366 } else {
2367 overflow = 1;
2368 }
2369 skip_to_comma_(q);
2370 }
2371 } else if (type == 'i') {
2372 while (q < r) {
2373 i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
2374 b->l_data += 4;
2375 skip_to_comma_(q);
2376 }
2377 } else if (type == 'I') {
2378 while (q < r) {
2379 if (*q != '-') {
2380 u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
2381 b->l_data += 4;
2382 } else {
2383 overflow = 1;
2384 }
2385 skip_to_comma_(q);
2386 }
2387 } else if (type == 'f') {
2388 while (q < r) {
2389 float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2390 b->l_data += 4;
2391 skip_to_comma_(q);
2392 }
2393 } else {
2394 hts_log_error("Unrecognized type B:%c", type);
2395 return -1;
2396 }
2397
2398 if (!overflow) {
2399 *end = q;
2400 return 0;
2401 } else {
2402 int64_t max = 0, min = 0, val;
2403 // Given type was incorrect. Try to rescue the situation.
2404 q = in;
2405 overflow = 0;
2406 b->l_data = orig_l;
2407 // Find out what range of values is present
2408 while (q < r) {
2409 val = hts_str2int(q + 1, &q, 64, &overflow);
2410 if (max < val) max = val;
2411 if (min > val) min = val;
2412 skip_to_comma_(q);
2413 }
2414 // Retry with appropriate type
2415 if (!overflow) {
2416 if (min < 0) {
2417 if (min >= INT8_MIN && max <= INT8_MAX) {
2418 return sam_parse_B_vals('c', n, in, end, r, b);
2419 } else if (min >= INT16_MIN && max <= INT16_MAX) {
2420 return sam_parse_B_vals('s', n, in, end, r, b);
2421 } else if (min >= INT32_MIN && max <= INT32_MAX) {
2422 return sam_parse_B_vals('i', n, in, end, r, b);
2423 }
2424 } else {
2425 if (max < UINT8_MAX) {
2426 return sam_parse_B_vals('C', n, in, end, r, b);
2427 } else if (max <= UINT16_MAX) {
2428 return sam_parse_B_vals('S', n, in, end, r, b);
2429 } else if (max <= UINT32_MAX) {
2430 return sam_parse_B_vals('I', n, in, end, r, b);
2431 }
2432 }
2433 }
2434 // If here then at least one of the values is too big to store
2435 hts_log_error("Numeric value in B array out of allowed range");
2436 return -1;
2437 }
2438 #undef skip_to_comma_
2439 }
2440
parse_sam_flag(char * v,char ** rv,int * overflow)2441 static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2442 if (*v >= '1' && *v <= '9') {
2443 return hts_str2uint(v, rv, 16, overflow);
2444 }
2445 else if (*v == '0') {
2446 // handle single-digit "0" directly; otherwise it's hex or octal
2447 if (v[1] == '\t') { *rv = v+1; return 0; }
2448 else {
2449 unsigned long val = strtoul(v, rv, 0);
2450 if (val > 65535) { *overflow = 1; return 65535; }
2451 return val;
2452 }
2453 }
2454 else {
2455 // TODO implement symbolic flag letters
2456 *rv = v;
2457 return 0;
2458 }
2459 }
2460
2461 // Parse tag line and append to bam object b.
2462 // Shared by both SAM and FASTQ parsers.
2463 //
2464 // The difference between the two is how lenient we are to recognising
2465 // non-compliant strings. The FASTQ parser glosses over arbitrary
2466 // non-SAM looking strings.
aux_parse(char * start,char * end,bam1_t * b,int lenient,khash_t (tag)* tag_whitelist)2467 static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2468 khash_t(tag) *tag_whitelist) {
2469 int overflow = 0;
2470 int checkpoint;
2471 char logbuf[40];
2472 char *q = start, *p = end;
2473
2474 #define _parse_err(cond, ...) \
2475 do { \
2476 if (cond) { \
2477 if (lenient) { \
2478 while (q < p && !isspace_c(*q)) \
2479 q++; \
2480 while (q < p && isspace_c(*q)) \
2481 q++; \
2482 b->l_data = checkpoint; \
2483 goto loop; \
2484 } else { \
2485 hts_log_error(__VA_ARGS__); \
2486 goto err_ret; \
2487 } \
2488 } \
2489 } while (0)
2490
2491 while (q < p) loop: {
2492 char type;
2493 checkpoint = b->l_data;
2494 if (p - q < 5) {
2495 if (lenient) {
2496 break;
2497 } else {
2498 hts_log_error("Incomplete aux field");
2499 goto err_ret;
2500 }
2501 }
2502 _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2503
2504 if (lenient && (q[2] | q[4]) != ':') {
2505 while (q < p && !isspace_c(*q))
2506 q++;
2507 while (q < p && isspace_c(*q))
2508 q++;
2509 continue;
2510 }
2511
2512 if (tag_whitelist) {
2513 int tt = q[0]*256 + q[1];
2514 if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2515 while (q < p && *q != '\t')
2516 q++;
2517 continue;
2518 }
2519 }
2520
2521 // Copy over id
2522 if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2523 memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2524 q += 3; type = *q++; ++q; // q points to value
2525 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2526 _parse_err(*q <= '\t', "incomplete aux field");
2527
2528 // Ensure enough space for a double + type allocated.
2529 if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2530
2531 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2532 b->data[b->l_data++] = 'A';
2533 b->data[b->l_data++] = *q++;
2534 } else if (type == 'i' || type == 'I') {
2535 if (*q == '-') {
2536 int32_t x = hts_str2int(q, &q, 32, &overflow);
2537 if (x >= INT8_MIN) {
2538 b->data[b->l_data++] = 'c';
2539 b->data[b->l_data++] = x;
2540 } else if (x >= INT16_MIN) {
2541 b->data[b->l_data++] = 's';
2542 i16_to_le(x, b->data + b->l_data);
2543 b->l_data += 2;
2544 } else {
2545 b->data[b->l_data++] = 'i';
2546 i32_to_le(x, b->data + b->l_data);
2547 b->l_data += 4;
2548 }
2549 } else {
2550 uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2551 if (x <= UINT8_MAX) {
2552 b->data[b->l_data++] = 'C';
2553 b->data[b->l_data++] = x;
2554 } else if (x <= UINT16_MAX) {
2555 b->data[b->l_data++] = 'S';
2556 u16_to_le(x, b->data + b->l_data);
2557 b->l_data += 2;
2558 } else {
2559 b->data[b->l_data++] = 'I';
2560 u32_to_le(x, b->data + b->l_data);
2561 b->l_data += 4;
2562 }
2563 }
2564 } else if (type == 'f') {
2565 b->data[b->l_data++] = 'f';
2566 float_to_le(strtod(q, &q), b->data + b->l_data);
2567 b->l_data += sizeof(float);
2568 } else if (type == 'd') {
2569 b->data[b->l_data++] = 'd';
2570 double_to_le(strtod(q, &q), b->data + b->l_data);
2571 b->l_data += sizeof(double);
2572 } else if (type == 'Z' || type == 'H') {
2573 char *end = strchr(q, '\t');
2574 if (!end) end = q + strlen(q);
2575 _parse_err(type == 'H' && ((end-q)&1) != 0,
2576 "hex field does not have an even number of digits");
2577 b->data[b->l_data++] = type;
2578 if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2579 memcpy(b->data + b->l_data, q, end - q);
2580 b->l_data += end - q;
2581 b->data[b->l_data++] = '\0';
2582 q = end;
2583 } else if (type == 'B') {
2584 uint32_t n;
2585 char *r;
2586 type = *q++; // q points to the first ',' following the typing byte
2587 _parse_err(*q && *q != ',' && *q != '\t',
2588 "B aux field type not followed by ','");
2589
2590 for (r = q, n = 0; *r > '\t'; ++r)
2591 if (*r == ',') ++n;
2592
2593 if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2594 goto err_ret;
2595 } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2596
2597 while (*q > '\t') { q++; } // Skip any junk to next tab
2598 q++;
2599 }
2600
2601 _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2602 #undef _parse_err
2603
2604 return 0;
2605
2606 err_ret:
2607 return -2;
2608 }
2609
sam_parse1(kstring_t * s,sam_hdr_t * h,bam1_t * b)2610 int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2611 {
2612 #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2613
2614 #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2615
2616 // Macro that operates on 64-bits at a time.
2617 #define COPY_MINUS_N(to,from,n,l,failed) \
2618 do { \
2619 uint64_u *from8 = (uint64_u *)(from); \
2620 uint64_u *to8 = (uint64_u *)(to); \
2621 uint64_t uflow = 0; \
2622 size_t l8 = (l)>>3, i; \
2623 for (i = 0; i < l8; i++) { \
2624 to8[i] = from8[i] - (n)*0x0101010101010101UL; \
2625 uflow |= to8[i]; \
2626 } \
2627 for (i<<=3; i < (l); ++i) { \
2628 to[i] = from[i] - (n); \
2629 uflow |= to[i]; \
2630 } \
2631 failed = (uflow & 0x8080808080808080UL) > 0; \
2632 } while (0)
2633
2634 #else
2635
2636 // Basic version which operates a byte at a time
2637 #define COPY_MINUS_N(to,from,n,l,failed) do { \
2638 uint8_t uflow = 0; \
2639 for (i = 0; i < (l); ++i) { \
2640 (to)[i] = (from)[i] - (n); \
2641 uflow |= (uint8_t) (to)[i]; \
2642 } \
2643 failed = (uflow & 0x80) > 0; \
2644 } while (0)
2645
2646 #endif
2647
2648 #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2649 #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2650 #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2651
2652 uint8_t *t;
2653
2654 char *p = s->s, *q;
2655 int i, overflow = 0;
2656 char logbuf[40];
2657 hts_pos_t cigreflen;
2658 bam1_core_t *c = &b->core;
2659
2660 b->l_data = 0;
2661 memset(c, 0, 32);
2662
2663 // qname
2664 q = _read_token(p);
2665
2666 _parse_warn(p - q <= 1, "empty query name");
2667 _parse_err(p - q > 255, "query name too long");
2668 // resize large enough for name + extranul
2669 if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2670 memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2671
2672 c->l_extranul = (4 - (b->l_data & 3)) & 3;
2673 memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2674 b->l_data += c->l_extranul;
2675
2676 c->l_qname = p - q + c->l_extranul;
2677
2678 // flag
2679 c->flag = parse_sam_flag(p, &p, &overflow);
2680 if (*p++ != '\t') goto err_ret; // malformated flag
2681
2682 // chr
2683 q = _read_token(p);
2684 if (strcmp(q, "*")) {
2685 _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2686 c->tid = bam_name2id(h, q);
2687 _parse_err(c->tid < -1, "failed to parse header");
2688 _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2689 } else c->tid = -1;
2690
2691 // pos
2692 c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2693 if (*p++ != '\t') goto err_ret;
2694 if (c->pos < 0 && c->tid >= 0) {
2695 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2696 c->tid = -1;
2697 }
2698 if (c->tid < 0) c->flag |= BAM_FUNMAP;
2699
2700 // mapq
2701 c->qual = hts_str2uint(p, &p, 8, &overflow);
2702 if (*p++ != '\t') goto err_ret;
2703 // cigar
2704 if (*p != '*') {
2705 uint32_t *cigar = NULL;
2706 int old_l_data = b->l_data;
2707 int n_cigar = bam_parse_cigar(p, &p, b);
2708 if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2709 cigar = (uint32_t *)(b->data + old_l_data);
2710 c->n_cigar = n_cigar;
2711
2712 // can't use bam_endpos() directly as some fields not yet set up
2713 cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2714 if (cigreflen == 0) cigreflen = 1;
2715 } else {
2716 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2717 c->flag |= BAM_FUNMAP;
2718 q = _read_token(p);
2719 cigreflen = 1;
2720 }
2721 _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2722 "read ends beyond highest supported position");
2723 c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2724 // mate chr
2725 q = _read_token(p);
2726 if (strcmp(q, "=") == 0) {
2727 c->mtid = c->tid;
2728 } else if (strcmp(q, "*") == 0) {
2729 c->mtid = -1;
2730 } else {
2731 c->mtid = bam_name2id(h, q);
2732 _parse_err(c->mtid < -1, "failed to parse header");
2733 _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2734 }
2735 // mpos
2736 c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2737 if (*p++ != '\t') goto err_ret;
2738 if (c->mpos < 0 && c->mtid >= 0) {
2739 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2740 c->mtid = -1;
2741 }
2742 // tlen
2743 c->isize = hts_str2int(p, &p, 64, &overflow);
2744 if (*p++ != '\t') goto err_ret;
2745 // seq
2746 q = _read_token(p);
2747 if (strcmp(q, "*")) {
2748 _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2749 c->l_qseq = p - q - 1;
2750 hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2751 _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2752 i = (c->l_qseq + 1) >> 1;
2753 _get_mem(uint8_t, &t, b, i);
2754
2755 unsigned int lqs2 = c->l_qseq&~1, i;
2756 for (i = 0; i < lqs2; i+=2)
2757 t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2758 for (; i < c->l_qseq; ++i)
2759 t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2760 } else c->l_qseq = 0;
2761 // qual
2762 _get_mem(uint8_t, &t, b, c->l_qseq);
2763 if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2764 memset(t, 0xff, c->l_qseq);
2765 p += 2;
2766 } else {
2767 int failed = 0;
2768 _parse_err(s->l - (p - s->s) < c->l_qseq
2769 || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2770 "SEQ and QUAL are of different length");
2771 COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2772 _parse_err(failed, "invalid QUAL character");
2773 p += c->l_qseq + 1;
2774 }
2775
2776 // aux
2777 if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2778 goto err_ret;
2779
2780 if (bam_tag2cigar(b, 1, 1) < 0)
2781 return -2;
2782 return 0;
2783
2784 #undef _parse_warn
2785 #undef _parse_err
2786 #undef _get_mem
2787 #undef _read_token
2788 err_ret:
2789 return -2;
2790 }
2791
read_ncigar(const char * q)2792 static uint32_t read_ncigar(const char *q) {
2793 uint32_t n_cigar = 0;
2794 for (; *q && *q != '\t'; ++q)
2795 if (!isdigit_c(*q)) ++n_cigar;
2796 if (!n_cigar) {
2797 hts_log_error("No CIGAR operations");
2798 return 0;
2799 }
2800 if (n_cigar >= 2147483647) {
2801 hts_log_error("Too many CIGAR operations");
2802 return 0;
2803 }
2804
2805 return n_cigar;
2806 }
2807
2808 /*! @function
2809 @abstract Parse a CIGAR string into preallocated a uint32_t array
2810 @param in [in] pointer to the source string
2811 @param a_cigar [out] address of the destination uint32_t buffer
2812 @return number of processed input characters; 0 on error
2813 */
parse_cigar(const char * in,uint32_t * a_cigar,uint32_t n_cigar)2814 static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2815 int i, overflow = 0;
2816 const char *p = in;
2817 for (i = 0; i < n_cigar; i++) {
2818 uint32_t len;
2819 int op;
2820 char *q;
2821 len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2822 if (q == p) {
2823 hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2824 return 0;
2825 }
2826 if (overflow) {
2827 hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2828 return 0;
2829 }
2830 p = q;
2831 op = bam_cigar_table[(unsigned char)*p++];
2832 if (op < 0) {
2833 hts_log_error("Unrecognized CIGAR operator");
2834 return 0;
2835 }
2836 a_cigar[i] = len;
2837 a_cigar[i] |= op;
2838 }
2839
2840 return p-in;
2841 }
2842
sam_parse_cigar(const char * in,char ** end,uint32_t ** a_cigar,size_t * a_mem)2843 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2844 size_t n_cigar = 0;
2845 int diff;
2846
2847 if (!in || !a_cigar || !a_mem) {
2848 hts_log_error("NULL pointer arguments");
2849 return -1;
2850 }
2851 if (end) *end = (char *)in;
2852
2853 if (*in == '*') {
2854 if (end) (*end)++;
2855 return 0;
2856 }
2857 n_cigar = read_ncigar(in);
2858 if (!n_cigar) return 0;
2859 if (n_cigar > *a_mem) {
2860 uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2861 if (a_tmp) {
2862 *a_cigar = a_tmp;
2863 *a_mem = n_cigar;
2864 } else {
2865 hts_log_error("Memory allocation error");
2866 return -1;
2867 }
2868 }
2869
2870 if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2871 if (end) *end = (char *)in+diff;
2872
2873 return n_cigar;
2874 }
2875
bam_parse_cigar(const char * in,char ** end,bam1_t * b)2876 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2877 size_t n_cigar = 0;
2878 int diff;
2879
2880 if (!in || !b) {
2881 hts_log_error("NULL pointer arguments");
2882 return -1;
2883 }
2884 if (end) *end = (char *)in;
2885
2886 if (*in == '*') {
2887 if (end) (*end)++;
2888 return 0;
2889 }
2890 n_cigar = read_ncigar(in);
2891 if (!n_cigar) return 0;
2892 if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
2893 hts_log_error("Memory allocation error");
2894 return -1;
2895 }
2896
2897 if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
2898 b->l_data += (n_cigar * sizeof(uint32_t));
2899 if (end) *end = (char *)in+diff;
2900
2901 return n_cigar;
2902 }
2903
2904 /*
2905 * -----------------------------------------------------------------------------
2906 * SAM threading
2907 */
2908 // Size of SAM text block (reading)
2909 #define NM 240000
2910 // Number of BAM records (writing)
2911 #define NB 1000
2912
2913 struct SAM_state;
2914
2915 // Output job - a block of BAM records
2916 typedef struct sp_bams {
2917 struct sp_bams *next;
2918 int serial;
2919
2920 bam1_t *bams;
2921 int nbams, abams; // used and alloc
2922
2923 struct SAM_state *fd;
2924 } sp_bams;
2925
2926 // Input job - a block of SAM text
2927 typedef struct sp_lines {
2928 struct sp_lines *next;
2929 int serial;
2930
2931 char *data;
2932 int data_size;
2933 int alloc;
2934
2935 struct SAM_state *fd;
2936 sp_bams *bams;
2937 } sp_lines;
2938
2939 enum sam_cmd {
2940 SAM_NONE = 0,
2941 SAM_CLOSE,
2942 SAM_CLOSE_DONE,
2943 };
2944
2945 typedef struct SAM_state {
2946 sam_hdr_t *h;
2947
2948 hts_tpool *p;
2949 int own_pool;
2950 pthread_mutex_t lines_m;
2951 hts_tpool_process *q;
2952 pthread_t dispatcher;
2953 int dispatcher_set;
2954
2955 sp_lines *lines;
2956 sp_bams *bams;
2957
2958 sp_bams *curr_bam;
2959 int curr_idx;
2960 int serial;
2961
2962 // Be warned: moving these mutexes around in this struct can reduce
2963 // threading performance by up to 70%!
2964 pthread_mutex_t command_m;
2965 pthread_cond_t command_c;
2966 enum sam_cmd command;
2967
2968 // One of the E* errno codes
2969 int errcode;
2970
2971 htsFile *fp;
2972 } SAM_state;
2973
2974 // Returns a SAM_state struct from a generic hFILE.
2975 //
2976 // Returns NULL on failure.
sam_state_create(htsFile * fp)2977 static SAM_state *sam_state_create(htsFile *fp) {
2978 // Ideally sam_open wouldn't be a #define to hts_open but instead would
2979 // be a redirect call with an additional 'S' mode. This in turn would
2980 // correctly set the designed format to sam instead of a generic
2981 // text_format.
2982 if (fp->format.format != sam && fp->format.format != text_format)
2983 return NULL;
2984
2985 SAM_state *fd = calloc(1, sizeof(*fd));
2986 if (!fd)
2987 return NULL;
2988
2989 fp->state = fd;
2990 fd->fp = fp;
2991
2992 return fd;
2993 }
2994
2995 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
2996 static void *sam_format_worker(void *arg);
2997
sam_state_err(SAM_state * fd,int errcode)2998 static void sam_state_err(SAM_state *fd, int errcode) {
2999 pthread_mutex_lock(&fd->command_m);
3000 if (!fd->errcode)
3001 fd->errcode = errcode;
3002 pthread_mutex_unlock(&fd->command_m);
3003 }
3004
sam_free_sp_bams(sp_bams * b)3005 static void sam_free_sp_bams(sp_bams *b) {
3006 if (!b)
3007 return;
3008
3009 if (b->bams) {
3010 int i;
3011 for (i = 0; i < b->abams; i++) {
3012 if (b->bams[i].data)
3013 free(b->bams[i].data);
3014 }
3015 free(b->bams);
3016 }
3017 free(b);
3018 }
3019
3020 // Destroys the state produce by sam_state_create.
sam_state_destroy(htsFile * fp)3021 int sam_state_destroy(htsFile *fp) {
3022 int ret = 0;
3023
3024 if (!fp->state)
3025 return 0;
3026
3027 SAM_state *fd = fp->state;
3028 if (fd->p) {
3029 if (fd->h) {
3030 // Notify sam_dispatcher we're closing
3031 pthread_mutex_lock(&fd->command_m);
3032 if (fd->command != SAM_CLOSE_DONE)
3033 fd->command = SAM_CLOSE;
3034 pthread_cond_signal(&fd->command_c);
3035 ret = -fd->errcode;
3036 if (fd->q)
3037 hts_tpool_wake_dispatch(fd->q); // unstick the reader
3038
3039 if (!fp->is_write && fd->q && fd->dispatcher_set) {
3040 for (;;) {
3041 // Avoid deadlocks with dispatcher
3042 if (fd->command == SAM_CLOSE_DONE)
3043 break;
3044 hts_tpool_wake_dispatch(fd->q);
3045 pthread_mutex_unlock(&fd->command_m);
3046 usleep(10000);
3047 pthread_mutex_lock(&fd->command_m);
3048 }
3049 }
3050 pthread_mutex_unlock(&fd->command_m);
3051
3052 if (fp->is_write) {
3053 // Dispatch the last partial block.
3054 sp_bams *gb = fd->curr_bam;
3055 if (!ret && gb && gb->nbams > 0 && fd->q)
3056 ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3057
3058 // Flush and drain output
3059 if (fd->q)
3060 hts_tpool_process_flush(fd->q);
3061 pthread_mutex_lock(&fd->command_m);
3062 if (!ret) ret = -fd->errcode;
3063 pthread_mutex_unlock(&fd->command_m);
3064
3065 while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3066 usleep(10000);
3067 pthread_mutex_lock(&fd->command_m);
3068 ret = -fd->errcode;
3069 // not empty but shutdown implies error
3070 if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3071 ret = EIO;
3072 pthread_mutex_unlock(&fd->command_m);
3073 }
3074 if (fd->q)
3075 hts_tpool_process_shutdown(fd->q);
3076 }
3077
3078 // Wait for it to acknowledge
3079 if (fd->dispatcher_set)
3080 pthread_join(fd->dispatcher, NULL);
3081 if (!ret) ret = -fd->errcode;
3082 }
3083
3084 // Tidy up memory
3085 if (fd->q)
3086 hts_tpool_process_destroy(fd->q);
3087
3088 if (fd->own_pool && fp->format.compression == no_compression) {
3089 hts_tpool_destroy(fd->p);
3090 fd->p = NULL;
3091 }
3092 pthread_mutex_destroy(&fd->lines_m);
3093 pthread_mutex_destroy(&fd->command_m);
3094 pthread_cond_destroy(&fd->command_c);
3095
3096 sp_lines *l = fd->lines;
3097 while (l) {
3098 sp_lines *n = l->next;
3099 free(l->data);
3100 free(l);
3101 l = n;
3102 }
3103
3104 sp_bams *b = fd->bams;
3105 while (b) {
3106 if (fd->curr_bam == b)
3107 fd->curr_bam = NULL;
3108 sp_bams *n = b->next;
3109 sam_free_sp_bams(b);
3110 b = n;
3111 }
3112
3113 if (fd->curr_bam)
3114 sam_free_sp_bams(fd->curr_bam);
3115
3116 // Decrement counter by one, maybe destroying too.
3117 // This is to permit the caller using bam_hdr_destroy
3118 // before sam_close without triggering decode errors
3119 // in the background threads.
3120 bam_hdr_destroy(fd->h);
3121 }
3122
3123 free(fp->state);
3124 fp->state = NULL;
3125 return ret;
3126 }
3127
3128 // Cleanup function - job for sam_parse_worker; result for sam_format_worker
cleanup_sp_lines(void * arg)3129 static void cleanup_sp_lines(void *arg) {
3130 sp_lines *gl = (sp_lines *)arg;
3131 if (!gl) return;
3132
3133 // Should always be true for lines passed to / from thread workers.
3134 assert(gl->next == NULL);
3135
3136 free(gl->data);
3137 sam_free_sp_bams(gl->bams);
3138 free(gl);
3139 }
3140
3141 // Run from one of the worker threads.
3142 // Convert a passed in array of lines to array of BAMs, returning
3143 // the result back to the thread queue.
sam_parse_worker(void * arg)3144 static void *sam_parse_worker(void *arg) {
3145 sp_lines *gl = (sp_lines *)arg;
3146 sp_bams *gb = NULL;
3147 char *lines = gl->data;
3148 int i;
3149 bam1_t *b;
3150 SAM_state *fd = gl->fd;
3151
3152 // Use a block of BAM structs we had earlier if available.
3153 pthread_mutex_lock(&fd->lines_m);
3154 if (fd->bams) {
3155 gb = fd->bams;
3156 fd->bams = gb->next;
3157 }
3158 pthread_mutex_unlock(&fd->lines_m);
3159
3160 if (gb == NULL) {
3161 gb = calloc(1, sizeof(*gb));
3162 if (!gb) {
3163 return NULL;
3164 }
3165 gb->abams = 100;
3166 gb->bams = b = calloc(gb->abams, sizeof(*b));
3167 if (!gb->bams) {
3168 sam_state_err(fd, ENOMEM);
3169 goto err;
3170 }
3171 gb->nbams = 0;
3172 }
3173 gb->serial = gl->serial;
3174 gb->next = NULL;
3175
3176 b = (bam1_t *)gb->bams;
3177 if (!b) {
3178 sam_state_err(fd, ENOMEM);
3179 goto err;
3180 }
3181
3182 i = 0;
3183 char *cp = lines, *cp_end = lines + gl->data_size;
3184 while (cp < cp_end) {
3185 if (i >= gb->abams) {
3186 int old_abams = gb->abams;
3187 gb->abams *= 2;
3188 b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3189 if (!b) {
3190 gb->abams /= 2;
3191 sam_state_err(fd, ENOMEM);
3192 goto err;
3193 }
3194 memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3195 gb->bams = b;
3196 }
3197
3198 // Ideally we'd get sam_parse1 to return the number of
3199 // bytes decoded and to be able to stop on newline as
3200 // well as \0.
3201 //
3202 // We can then avoid the additional strchr loop.
3203 // It's around 6% of our CPU cost, albeit threadable.
3204 //
3205 // However this is an API change so for now we copy.
3206
3207 char *nl = strchr(cp, '\n');
3208 char *line_end;
3209 if (nl) {
3210 line_end = nl;
3211 if (line_end > cp && *(line_end - 1) == '\r')
3212 line_end--;
3213 nl++;
3214 } else {
3215 nl = line_end = cp_end;
3216 }
3217 *line_end = '\0';
3218 kstring_t ks = { line_end - cp, gl->alloc, cp };
3219 if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3220 sam_state_err(fd, errno ? errno : EIO);
3221 cleanup_sp_lines(gl);
3222 goto err;
3223 }
3224 cp = nl;
3225 i++;
3226 }
3227 gb->nbams = i;
3228
3229 pthread_mutex_lock(&fd->lines_m);
3230 gl->next = fd->lines;
3231 fd->lines = gl;
3232 pthread_mutex_unlock(&fd->lines_m);
3233 return gb;
3234
3235 err:
3236 sam_free_sp_bams(gb);
3237 return NULL;
3238 }
3239
sam_parse_eof(void * arg)3240 static void *sam_parse_eof(void *arg) {
3241 return NULL;
3242 }
3243
3244 // Cleanup function - result for sam_parse_worker; job for sam_format_worker
cleanup_sp_bams(void * arg)3245 static void cleanup_sp_bams(void *arg) {
3246 sam_free_sp_bams((sp_bams *) arg);
3247 }
3248
3249 // Runs in its own thread.
3250 // Reads a block of text (SAM) and sends a new job to the thread queue to
3251 // translate this to BAM.
sam_dispatcher_read(void * vp)3252 static void *sam_dispatcher_read(void *vp) {
3253 htsFile *fp = vp;
3254 kstring_t line = {0};
3255 int line_frag = 0;
3256 SAM_state *fd = fp->state;
3257 sp_lines *l = NULL;
3258
3259 // Pre-allocate buffer for left-over bits of line (exact size doesn't
3260 // matter as it will grow if necessary).
3261 if (ks_resize(&line, 1000) < 0)
3262 goto err;
3263
3264 for (;;) {
3265 // Check for command
3266 pthread_mutex_lock(&fd->command_m);
3267 switch (fd->command) {
3268
3269 case SAM_CLOSE:
3270 pthread_cond_signal(&fd->command_c);
3271 pthread_mutex_unlock(&fd->command_m);
3272 hts_tpool_process_shutdown(fd->q);
3273 goto tidyup;
3274
3275 default:
3276 break;
3277 }
3278 pthread_mutex_unlock(&fd->command_m);
3279
3280 pthread_mutex_lock(&fd->lines_m);
3281 if (fd->lines) {
3282 // reuse existing line buffer
3283 l = fd->lines;
3284 fd->lines = l->next;
3285 }
3286 pthread_mutex_unlock(&fd->lines_m);
3287
3288 if (l == NULL) {
3289 // none to reuse, to create a new one
3290 l = calloc(1, sizeof(*l));
3291 if (!l)
3292 goto err;
3293 l->alloc = NM;
3294 l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3295 if (!l->data) {
3296 free(l);
3297 l = NULL;
3298 goto err;
3299 }
3300 l->fd = fd;
3301 }
3302 l->next = NULL;
3303
3304 if (l->alloc < line_frag+NM/2) {
3305 char *rp = realloc(l->data, line_frag+NM/2 +8);
3306 if (!rp)
3307 goto err;
3308 l->alloc = line_frag+NM/2;
3309 l->data = rp;
3310 }
3311 memcpy(l->data, line.s, line_frag);
3312
3313 l->data_size = line_frag;
3314 ssize_t nbytes;
3315 longer_line:
3316 if (fp->is_bgzf)
3317 nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3318 else
3319 nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3320 if (nbytes < 0) {
3321 sam_state_err(fd, errno ? errno : EIO);
3322 goto err;
3323 } else if (nbytes == 0)
3324 break; // EOF
3325 l->data_size += nbytes;
3326
3327 // trim to last \n. Maybe \r\n, but that's still fine
3328 if (nbytes == l->alloc - line_frag) {
3329 char *cp_end = l->data + l->data_size;
3330 char *cp = cp_end-1;
3331
3332 while (cp > (char *)l->data && *cp != '\n')
3333 cp--;
3334
3335 // entire buffer is part of a single line
3336 if (cp == l->data) {
3337 line_frag = l->data_size;
3338 char *rp = realloc(l->data, l->alloc * 2 + 8);
3339 if (!rp)
3340 goto err;
3341 l->alloc *= 2;
3342 l->data = rp;
3343 assert(l->alloc >= l->data_size);
3344 assert(l->alloc >= line_frag);
3345 assert(l->alloc >= l->alloc - line_frag);
3346 goto longer_line;
3347 }
3348 cp++;
3349
3350 // line holds the remainder of our line.
3351 if (ks_resize(&line, cp_end - cp) < 0)
3352 goto err;
3353 memcpy(line.s, cp, cp_end - cp);
3354 line_frag = cp_end - cp;
3355 l->data_size = l->alloc - line_frag;
3356 } else {
3357 // out of buffer
3358 line_frag = 0;
3359 }
3360
3361 l->serial = fd->serial++;
3362 //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3363 if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3364 cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3365 goto err;
3366 pthread_mutex_lock(&fd->command_m);
3367 if (fd->command == SAM_CLOSE) {
3368 pthread_mutex_unlock(&fd->command_m);
3369 l = NULL;
3370 goto tidyup;
3371 }
3372 l = NULL; // Now "owned" by sam_parse_worker()
3373 pthread_mutex_unlock(&fd->command_m);
3374 }
3375
3376 if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3377 goto err;
3378
3379 // At EOF, wait for close request.
3380 // (In future if we add support for seek, this is where we need to catch it.)
3381 for (;;) {
3382 pthread_mutex_lock(&fd->command_m);
3383 if (fd->command == SAM_NONE)
3384 pthread_cond_wait(&fd->command_c, &fd->command_m);
3385 switch (fd->command) {
3386 case SAM_CLOSE:
3387 pthread_cond_signal(&fd->command_c);
3388 pthread_mutex_unlock(&fd->command_m);
3389 hts_tpool_process_shutdown(fd->q);
3390 goto tidyup;
3391
3392 default:
3393 pthread_mutex_unlock(&fd->command_m);
3394 break;
3395 }
3396 }
3397
3398 tidyup:
3399 pthread_mutex_lock(&fd->command_m);
3400 fd->command = SAM_CLOSE_DONE;
3401 pthread_cond_signal(&fd->command_c);
3402 pthread_mutex_unlock(&fd->command_m);
3403
3404 if (l) {
3405 pthread_mutex_lock(&fd->lines_m);
3406 l->next = fd->lines;
3407 fd->lines = l;
3408 pthread_mutex_unlock(&fd->lines_m);
3409 }
3410 free(line.s);
3411
3412 return NULL;
3413
3414 err:
3415 sam_state_err(fd, errno ? errno : ENOMEM);
3416 hts_tpool_process_shutdown(fd->q);
3417 goto tidyup;
3418 }
3419
3420 // Runs in its own thread.
3421 // Takes encoded blocks of SAM off the thread results queue and writes them
3422 // to our output stream.
sam_dispatcher_write(void * vp)3423 static void *sam_dispatcher_write(void *vp) {
3424 htsFile *fp = vp;
3425 SAM_state *fd = fp->state;
3426 hts_tpool_result *r;
3427
3428 // Iterates until result queue is shutdown, where it returns NULL.
3429 while ((r = hts_tpool_next_result_wait(fd->q))) {
3430 sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3431 if (!gl) {
3432 sam_state_err(fd, ENOMEM);
3433 goto err;
3434 }
3435
3436 if (fp->idx) {
3437 sp_bams *gb = gl->bams;
3438 int i = 0, count = 0;
3439 while (i < gl->data_size) {
3440 int j = i;
3441 while (i < gl->data_size && gl->data[i] != '\n')
3442 i++;
3443 if (i < gl->data_size)
3444 i++;
3445
3446 if (fp->is_bgzf) {
3447 if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3448 goto err;
3449 } else {
3450 if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3451 goto err;
3452 }
3453
3454 bam1_t *b = &gb->bams[count++];
3455 if (fp->format.compression == bgzf) {
3456 if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3457 b->core.tid, b->core.pos, bam_endpos(b),
3458 bgzf_tell(fp->fp.bgzf),
3459 !(b->core.flag&BAM_FUNMAP)) < 0) {
3460 sam_state_err(fd, errno ? errno : ENOMEM);
3461 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3462 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3463 goto err;
3464 }
3465 } else {
3466 if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3467 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3468 sam_state_err(fd, errno ? errno : ENOMEM);
3469 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3470 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3471 goto err;
3472 }
3473 }
3474 }
3475
3476 assert(count == gb->nbams);
3477
3478 // Add bam array to free-list
3479 pthread_mutex_lock(&fd->lines_m);
3480 gb->next = fd->bams;
3481 fd->bams = gl->bams;
3482 gl->bams = NULL;
3483 pthread_mutex_unlock(&fd->lines_m);
3484 } else {
3485 if (fp->is_bgzf) {
3486 if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size)
3487 goto err;
3488 } else {
3489 if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3490 goto err;
3491 }
3492 }
3493
3494 hts_tpool_delete_result(r, 0);
3495
3496 // Also updated by main thread
3497 pthread_mutex_lock(&fd->lines_m);
3498 gl->next = fd->lines;
3499 fd->lines = gl;
3500 pthread_mutex_unlock(&fd->lines_m);
3501 }
3502
3503 sam_state_err(fd, 0); // success
3504 hts_tpool_process_shutdown(fd->q);
3505 return NULL;
3506
3507 err:
3508 sam_state_err(fd, errno ? errno : EIO);
3509 return (void *)-1;
3510 }
3511
3512 // Run from one of the worker threads.
3513 // Convert a passed in array of BAMs (sp_bams) and converts to a block
3514 // of text SAM records (sp_lines).
sam_format_worker(void * arg)3515 static void *sam_format_worker(void *arg) {
3516 sp_bams *gb = (sp_bams *)arg;
3517 sp_lines *gl = NULL;
3518 int i;
3519 SAM_state *fd = gb->fd;
3520 htsFile *fp = fd->fp;
3521
3522 // Use a block of SAM strings we had earlier if available.
3523 pthread_mutex_lock(&fd->lines_m);
3524 if (fd->lines) {
3525 gl = fd->lines;
3526 fd->lines = gl->next;
3527 }
3528 pthread_mutex_unlock(&fd->lines_m);
3529
3530 if (gl == NULL) {
3531 gl = calloc(1, sizeof(*gl));
3532 if (!gl) {
3533 sam_state_err(fd, ENOMEM);
3534 return NULL;
3535 }
3536 gl->alloc = gl->data_size = 0;
3537 gl->data = NULL;
3538 }
3539 gl->serial = gb->serial;
3540 gl->next = NULL;
3541
3542 kstring_t ks = {0, gl->alloc, gl->data};
3543
3544 for (i = 0; i < gb->nbams; i++) {
3545 if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3546 sam_state_err(fd, errno ? errno : EIO);
3547 goto err;
3548 }
3549 kputc('\n', &ks);
3550 }
3551
3552 pthread_mutex_lock(&fd->lines_m);
3553 gl->data_size = ks.l;
3554 gl->alloc = ks.m;
3555 gl->data = ks.s;
3556
3557 if (fp->idx) {
3558 // Keep hold of the bam array a little longer as
3559 // sam_dispatcher_write needs to use them for building the index.
3560 gl->bams = gb;
3561 } else {
3562 // Add bam array to free-list
3563 gb->next = fd->bams;
3564 fd->bams = gb;
3565 }
3566 pthread_mutex_unlock(&fd->lines_m);
3567
3568 return gl;
3569
3570 err:
3571 // Possible race between this and fd->curr_bam.
3572 // Easier to not free and leave it on the input list so it
3573 // gets freed there instead?
3574 // sam_free_sp_bams(gb);
3575 if (gl) {
3576 free(gl->data);
3577 free(gl);
3578 }
3579 return NULL;
3580 }
3581
sam_set_thread_pool(htsFile * fp,htsThreadPool * p)3582 int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3583 if (fp->state)
3584 return 0;
3585
3586 if (!(fp->state = sam_state_create(fp)))
3587 return -1;
3588 SAM_state *fd = (SAM_state *)fp->state;
3589
3590 pthread_mutex_init(&fd->lines_m, NULL);
3591 pthread_mutex_init(&fd->command_m, NULL);
3592 pthread_cond_init(&fd->command_c, NULL);
3593 fd->p = p->pool;
3594 int qsize = p->qsize;
3595 if (!qsize)
3596 qsize = 2*hts_tpool_size(fd->p);
3597 fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3598 if (!fd->q) {
3599 sam_state_destroy(fp);
3600 return -1;
3601 }
3602
3603 if (fp->format.compression == bgzf)
3604 return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3605
3606 return 0;
3607 }
3608
sam_set_threads(htsFile * fp,int nthreads)3609 int sam_set_threads(htsFile *fp, int nthreads) {
3610 if (nthreads <= 0)
3611 return 0;
3612
3613 htsThreadPool p;
3614 p.pool = hts_tpool_init(nthreads);
3615 p.qsize = nthreads*2;
3616
3617 int ret = sam_set_thread_pool(fp, &p);
3618 if (ret < 0)
3619 return ret;
3620
3621 SAM_state *fd = (SAM_state *)fp->state;
3622 fd->own_pool = 1;
3623
3624 return 0;
3625 }
3626
3627 typedef struct {
3628 kstring_t name;
3629 kstring_t comment; // NB: pointer into name, do not free
3630 kstring_t seq;
3631 kstring_t qual;
3632 int casava;
3633 int aux;
3634 int rnum;
3635 char BC[3]; // aux tag ID for barcode
3636 khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3637 char nprefix;
3638 int sra_names;
3639 } fastq_state;
3640
3641 // Initialise fastq state.
3642 // Name char of '@' or '>' distinguishes fastq vs fasta variant
fastq_state_init(int name_char)3643 static fastq_state *fastq_state_init(int name_char) {
3644 fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3645 if (!x)
3646 return NULL;
3647 strcpy(x->BC, "BC");
3648 x->nprefix = name_char;
3649
3650 return x;
3651 }
3652
fastq_state_destroy(htsFile * fp)3653 void fastq_state_destroy(htsFile *fp) {
3654 if (fp->state) {
3655 fastq_state *x = (fastq_state *)fp->state;
3656 if (x->tags)
3657 kh_destroy(tag, x->tags);
3658 ks_free(&x->name);
3659 ks_free(&x->seq);
3660 ks_free(&x->qual);
3661 free(fp->state);
3662 }
3663 }
3664
fastq_state_set(samFile * fp,enum hts_fmt_option opt,...)3665 int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3666 va_list args;
3667
3668 if (!fp)
3669 return -1;
3670 if (!fp->state)
3671 if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3672 ? '@' : '>')))
3673 return -1;
3674
3675 fastq_state *x = (fastq_state *)fp->state;
3676
3677 switch (opt) {
3678 case FASTQ_OPT_CASAVA:
3679 x->casava = 1;
3680 break;
3681
3682 case FASTQ_OPT_NAME2:
3683 x->sra_names = 1;
3684 break;
3685
3686 case FASTQ_OPT_AUX: {
3687 va_start(args, opt);
3688 x->aux = 1;
3689 char *tag = va_arg(args, char *);
3690 va_end(args);
3691 if (tag && strcmp(tag, "1") != 0) {
3692 if (!x->tags)
3693 if (!(x->tags = kh_init(tag)))
3694 return -1;
3695
3696 size_t i, tlen = strlen(tag);
3697 for (i = 0; i+3 <= tlen+1; i += 3) {
3698 if (tag[i+0] == ',' || tag[i+1] == ',' ||
3699 !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3700 hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3701 break;
3702 }
3703 int ret, tcode = tag[i+0]*256 + tag[i+1];
3704 kh_put(tag, x->tags, tcode, &ret);
3705 if (ret < 0)
3706 return -1;
3707 }
3708 }
3709 break;
3710 }
3711
3712 case FASTQ_OPT_BARCODE: {
3713 va_start(args, opt);
3714 char *bc = va_arg(args, char *);
3715 va_end(args);
3716 strncpy(x->BC, bc, 2);
3717 x->BC[2] = 0;
3718 break;
3719 }
3720
3721 case FASTQ_OPT_RNUM:
3722 x->rnum = 1;
3723 break;
3724
3725 default:
3726 break;
3727 }
3728 return 0;
3729 }
3730
fastq_parse1(htsFile * fp,bam1_t * b)3731 static int fastq_parse1(htsFile *fp, bam1_t *b) {
3732 fastq_state *x = (fastq_state *)fp->state;
3733 size_t i, l;
3734 int ret = 0;
3735
3736 if (fp->format.format == fasta_format && fp->line.s) {
3737 // For FASTA we've already read the >name line; steal it
3738 // Not the most efficient, but we don't optimise for fasta reading.
3739 if (fp->line.l == 0)
3740 return -1; // EOF
3741
3742 free(x->name.s);
3743 x->name = fp->line;
3744 fp->line.l = fp->line.m = 0;
3745 fp->line.s = NULL;
3746 } else {
3747 // Read a FASTQ format entry.
3748 ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3749 if (ret == -1)
3750 return -1; // EOF
3751 else if (ret < -1)
3752 return ret; // ERR
3753 }
3754
3755 // Name
3756
3757 if (*x->name.s != x->nprefix)
3758 return -2;
3759
3760 // Reverse the SRA strangeness of putting the run_name.number before
3761 // the read name.
3762 i = 0;
3763 char *name = x->name.s+1;
3764 if (x->sra_names) {
3765 char *cp = strpbrk(x->name.s, " \t");
3766 if (cp) {
3767 while (*cp == ' ' || *cp == '\t')
3768 cp++;
3769 *--cp = '@';
3770 i = cp - x->name.s;
3771 name = cp+1;
3772 }
3773 }
3774
3775 l = x->name.l;
3776 char *s = x->name.s;
3777 while (i < l && !isspace_c(s[i]))
3778 i++;
3779 if (i < l) {
3780 s[i] = 0;
3781 x->name.l = i++;
3782 }
3783
3784 // Comment; a kstring struct, but pointer into name line. (Do not free)
3785 while (i < l && isspace_c(s[i]))
3786 i++;
3787 x->comment.s = s+i;
3788 x->comment.l = l - i;
3789
3790 // Seq
3791 x->seq.l = 0;
3792 for (;;) {
3793 if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3794 if (fp->format.format == fastq_format || ret < -1)
3795 return -2;
3796 if (*fp->line.s == (fp->format.format == fastq_format ? '+' : '>')
3797 || ret == -1)
3798 break;
3799 if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3800 return -2;
3801 }
3802
3803 // Qual
3804 if (fp->format.format == fastq_format) {
3805 size_t remainder = x->seq.l;
3806 x->qual.l = 0;
3807 do {
3808 if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3809 return -2;
3810 if (fp->line.l > remainder)
3811 return -2;
3812 if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
3813 return -2;
3814 remainder -= fp->line.l;
3815 } while (remainder > 0);
3816
3817 // Decr qual
3818 for (i = 0; i < x->qual.l; i++)
3819 x->qual.s[i] -= '!';
3820 }
3821
3822 int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
3823 if (x->name.l > 2 &&
3824 x->name.s[x->name.l-2] == '/' &&
3825 isdigit_c(x->name.s[x->name.l-1])) {
3826 switch(x->name.s[x->name.l-1]) {
3827 case '1': flag |= BAM_FREAD1 | pflag; break;
3828 case '2': flag |= BAM_FREAD2 | pflag; break;
3829 default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
3830 }
3831 x->name.s[x->name.l-=2] = 0;
3832 }
3833
3834 // Convert to BAM
3835 ret = bam_set1(b,
3836 x->name.s + x->name.l - name, name,
3837 flag,
3838 -1, -1, 0, // ref '*', pos, mapq,
3839 0, NULL, // no cigar,
3840 -1, -1, 0, // mate
3841 x->seq.l, x->seq.s, x->qual.s,
3842 0);
3843
3844 // Identify Illumina CASAVA strings.
3845 // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
3846 char *barcode = NULL;
3847 int barcode_len = 0;
3848 kstring_t *kc = &x->comment;
3849 char *endptr;
3850 if (x->casava &&
3851 // \d:[YN]:\d+:[ACGTN]+
3852 kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
3853 strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
3854 && *endptr == ':') {
3855
3856 // read num
3857 switch(kc->s[0]) {
3858 case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
3859 case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
3860 default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
3861 }
3862
3863 if (kc->s[2] == 'Y')
3864 b->core.flag |= BAM_FQCFAIL;
3865
3866 // Barcode, maybe numeric in which case we skip it
3867 if (!isdigit_c(endptr[1])) {
3868 barcode = endptr+1;
3869 for (i = barcode - kc->s; i < kc->l; i++)
3870 if (isspace_c(kc->s[i]))
3871 break;
3872
3873 kc->s[i] = 0;
3874 barcode_len = i+1-(barcode - kc->s);
3875 }
3876 }
3877
3878 if (ret >= 0 && barcode_len)
3879 if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
3880 ret = -2;
3881
3882 if (!x->aux)
3883 return ret;
3884
3885 // Identify any SAM style aux tags in comments too.
3886 if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
3887 ret = -2;
3888
3889 return ret;
3890 }
3891
3892 // Internal component of sam_read1 below
sam_read1_bam(htsFile * fp,sam_hdr_t * h,bam1_t * b)3893 static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
3894 int ret = bam_read1(fp->fp.bgzf, b);
3895 if (h && ret >= 0) {
3896 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
3897 b->core.mtid >= h->n_targets || b->core.mtid < -1) {
3898 errno = ERANGE;
3899 return -3;
3900 }
3901 }
3902 return ret;
3903 }
3904
3905 // Internal component of sam_read1 below
sam_read1_cram(htsFile * fp,sam_hdr_t * h,bam1_t ** b)3906 static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
3907 int ret = cram_get_bam_seq(fp->fp.cram, b);
3908 if (ret < 0)
3909 return cram_eof(fp->fp.cram) ? -1 : -2;
3910
3911 if (bam_tag2cigar(*b, 1, 1) < 0)
3912 return -2;
3913
3914 return ret;
3915 }
3916
3917 // Internal component of sam_read1 below
sam_read1_sam(htsFile * fp,sam_hdr_t * h,bam1_t * b)3918 static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
3919 int ret;
3920
3921 // Consume 1st line after header parsing as it wasn't using peek
3922 if (fp->line.l != 0) {
3923 ret = sam_parse1(&fp->line, h, b);
3924 fp->line.l = 0;
3925 return ret;
3926 }
3927
3928 if (fp->state) {
3929 SAM_state *fd = (SAM_state *)fp->state;
3930
3931 if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
3932 // We don't support multi-threaded SAM parsing with seeks yet.
3933 int ret;
3934 if ((ret = sam_state_destroy(fp)) < 0) {
3935 errno = -ret;
3936 return -2;
3937 }
3938 if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
3939 return -1;
3940 fp->fp.bgzf->seeked = 0;
3941 goto err_recover;
3942 }
3943
3944 if (!fd->h) {
3945 fd->h = h;
3946 fd->h->ref_count++;
3947 // Ensure hrecs is initialised now as we don't want multiple
3948 // threads trying to do this simultaneously.
3949 if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
3950 return -2;
3951
3952 // We can only do this once we've got a header
3953 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
3954 fp) != 0)
3955 return -2;
3956 fd->dispatcher_set = 1;
3957 }
3958
3959 if (fd->h != h) {
3960 hts_log_error("SAM multi-threaded decoding does not support changing header");
3961 return -1;
3962 }
3963
3964 sp_bams *gb = fd->curr_bam;
3965 if (!gb) {
3966 if (fd->errcode) {
3967 // In case reader failed
3968 errno = fd->errcode;
3969 return -2;
3970 }
3971 hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
3972 if (!r)
3973 return -2;
3974 fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
3975 hts_tpool_delete_result(r, 0);
3976 }
3977 if (!gb)
3978 return fd->errcode ? -2 : -1;
3979 bam1_t *b_array = (bam1_t *)gb->bams;
3980 if (fd->curr_idx < gb->nbams)
3981 if (!bam_copy1(b, &b_array[fd->curr_idx++]))
3982 return -2;
3983 if (fd->curr_idx == gb->nbams) {
3984 pthread_mutex_lock(&fd->lines_m);
3985 gb->next = fd->bams;
3986 fd->bams = gb;
3987 pthread_mutex_unlock(&fd->lines_m);
3988
3989 fd->curr_bam = NULL;
3990 fd->curr_idx = 0;
3991 }
3992
3993 ret = 0;
3994
3995 } else {
3996 err_recover:
3997 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3998 if (ret < 0) return ret;
3999
4000 ret = sam_parse1(&fp->line, h, b);
4001 fp->line.l = 0;
4002 if (ret < 0) {
4003 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4004 if (h->ignore_sam_err) goto err_recover;
4005 }
4006 }
4007
4008 return ret;
4009 }
4010
4011 // Returns 0 on success,
4012 // -1 on EOF,
4013 // <-1 on error
sam_read1(htsFile * fp,sam_hdr_t * h,bam1_t * b)4014 int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4015 {
4016 int ret, pass_filter;
4017
4018 do {
4019 switch (fp->format.format) {
4020 case bam:
4021 ret = sam_read1_bam(fp, h, b);
4022 break;
4023
4024 case cram:
4025 ret = sam_read1_cram(fp, h, &b);
4026 break;
4027
4028 case sam:
4029 ret = sam_read1_sam(fp, h, b);
4030 break;
4031
4032 case fasta_format:
4033 case fastq_format: {
4034 fastq_state *x = (fastq_state *)fp->state;
4035 if (!x) {
4036 if (!(fp->state = fastq_state_init(fp->format.format
4037 == fastq_format ? '@' : '>')))
4038 return -2;
4039 }
4040
4041 return fastq_parse1(fp, b);
4042 }
4043
4044 case empty_format:
4045 errno = EPIPE;
4046 return -3;
4047
4048 default:
4049 errno = EFTYPE;
4050 return -3;
4051 }
4052
4053 pass_filter = (ret >= 0 && fp->filter)
4054 ? sam_passes_filter(h, b, fp->filter)
4055 : 1;
4056 } while (pass_filter == 0);
4057
4058 return pass_filter < 0 ? -2 : ret;
4059 }
4060
4061
sam_format1_append(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)4062 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4063 {
4064 int i, r = 0;
4065 uint8_t *s, *end;
4066 const bam1_core_t *c = &b->core;
4067
4068 if (c->l_qname == 0)
4069 return -1;
4070 r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4071 r |= kputc_('\t', str); // query name
4072 r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4073 if (c->tid >= 0) { // chr
4074 r |= kputs(h->target_name[c->tid] , str);
4075 r |= kputc_('\t', str);
4076 } else r |= kputsn_("*\t", 2, str);
4077 r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4078 r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4079 if (c->n_cigar) { // cigar
4080 uint32_t *cigar = bam_get_cigar(b);
4081 for (i = 0; i < c->n_cigar; ++i) {
4082 r |= kputw(bam_cigar_oplen(cigar[i]), str);
4083 r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4084 }
4085 } else r |= kputc_('*', str);
4086 r |= kputc_('\t', str);
4087 if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4088 else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4089 else {
4090 r |= kputs(h->target_name[c->mtid], str);
4091 r |= kputc_('\t', str);
4092 }
4093 r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4094 r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4095 if (c->l_qseq) { // seq and qual
4096 uint8_t *s = bam_get_seq(b);
4097 if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4098 char *cp = str->s + str->l;
4099
4100 // Sequence, 2 bases at a time
4101 nibble2base(s, cp, c->l_qseq);
4102 cp[c->l_qseq] = '\t';
4103 cp += c->l_qseq+1;
4104
4105 // Quality
4106 s = bam_get_qual(b);
4107 i = 0;
4108 if (s[0] == 0xff) {
4109 cp[i++] = '*';
4110 } else {
4111 // local copy of c->l_qseq to aid unrolling
4112 uint32_t lqseq = c->l_qseq;
4113 for (i = 0; i < lqseq; ++i)
4114 cp[i]=s[i]+33;
4115 }
4116 cp[i] = 0;
4117 cp += i;
4118 str->l = cp - str->s;
4119 } else r |= kputsn_("*\t*", 3, str);
4120
4121 s = bam_get_aux(b); // aux
4122 end = b->data + b->l_data;
4123
4124 while (end - s >= 4) {
4125 r |= kputc_('\t', str);
4126 if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4127 goto bad_aux;
4128 }
4129 r |= kputsn("", 0, str); // nul terminate
4130 if (r < 0) goto mem_err;
4131
4132 return str->l;
4133
4134 bad_aux:
4135 hts_log_error("Corrupted aux data for read %.*s",
4136 b->core.l_qname, bam_get_qname(b));
4137 errno = EINVAL;
4138 return -1;
4139
4140 mem_err:
4141 hts_log_error("Out of memory");
4142 errno = ENOMEM;
4143 return -1;
4144 }
4145
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)4146 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4147 {
4148 str->l = 0;
4149 return sam_format1_append(h, b, str);
4150 }
4151
4152 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
fastq_format1(fastq_state * x,const bam1_t * b,kstring_t * str)4153 int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4154 {
4155 unsigned flag = b->core.flag;
4156 int i, e = 0, len = b->core.l_qseq;
4157 uint8_t *seq, *qual;
4158
4159 str->l = 0;
4160
4161 if (len == 0) return 0;
4162
4163 // Name
4164 if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4165 return -1;
4166
4167 // /1 or /2 suffix
4168 if (x && x->rnum && (flag & BAM_FPAIRED)) {
4169 int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4170 if (r12 == BAM_FREAD1) {
4171 if (kputs("/1", str) == EOF)
4172 return -1;
4173 } else if (r12 == BAM_FREAD2) {
4174 if (kputs("/2", str) == EOF)
4175 return -1;
4176 }
4177 }
4178
4179 // Illumina CASAVA tag.
4180 // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4181 if (x && x->casava) {
4182 int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4183 char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4184 uint8_t *bc = bam_aux_get(b, x->BC);
4185 if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4186 bc ? (char *)bc+1 : "0") < 0)
4187 return -1;
4188
4189 // Replace any non-alpha with '+'. Ie seq-seq to seq+seq
4190 if (bc) {
4191 int l = strlen((char *)bc+1);
4192 char *c = (char *)str->s + str->l - l;
4193 for (i = 0; i < l; i++)
4194 if (!isalpha_c(c[i]))
4195 c[i] = '+';
4196 }
4197 }
4198
4199 // Aux tags
4200 if (x && x->aux) {
4201 uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4202 while (s && end - s >= 4) {
4203 int tt = s[0]*256 + s[1];
4204 if (x->tags == NULL ||
4205 kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4206 e |= kputc_('\t', str) < 0;
4207 if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4208 return -1;
4209 } else {
4210 s = skip_aux(s+2, end);
4211 }
4212 }
4213 e |= kputsn("", 0, str) < 0; // nul terminate
4214 }
4215
4216 if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4217 e |= kputc_('\n', str) < 0;
4218
4219 // Seq line
4220 seq = bam_get_seq(b);
4221 if (flag & BAM_FREVERSE)
4222 for (i = len-1; i >= 0; i--)
4223 e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4224 else
4225 for (i = 0; i < len; i++)
4226 e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4227
4228
4229 // Qual line
4230 if (x->nprefix == '@') {
4231 kputsn("\n+\n", 3, str);
4232 qual = bam_get_qual(b);
4233 if (qual[0] == 0xff)
4234 for (i = 0; i < len; i++)
4235 e |= kputc_('B', str) < 0;
4236 else if (flag & BAM_FREVERSE)
4237 for (i = len-1; i >= 0; i--)
4238 e |= kputc_(33 + qual[i], str) < 0;
4239 else
4240 for (i = 0; i < len; i++)
4241 e |= kputc_(33 + qual[i], str) < 0;
4242
4243 }
4244 e |= kputc('\n', str) < 0;
4245
4246 return e ? -1 : str->l;
4247 }
4248
4249 // Sadly we need to be able to modify the bam_hdr here so we can
4250 // reference count the structure.
sam_write1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)4251 int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4252 {
4253 switch (fp->format.format) {
4254 case binary_format:
4255 fp->format.category = sequence_data;
4256 fp->format.format = bam;
4257 /* fall-through */
4258 case bam:
4259 return bam_write_idx1(fp, h, b);
4260
4261 case cram:
4262 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4263
4264 case text_format:
4265 fp->format.category = sequence_data;
4266 fp->format.format = sam;
4267 /* fall-through */
4268 case sam:
4269 if (fp->state) {
4270 SAM_state *fd = (SAM_state *)fp->state;
4271
4272 // Threaded output
4273 if (!fd->h) {
4274 // NB: discard const. We don't actually modify sam_hdr_t here,
4275 // just data pointed to by it (which is a bit weasely still),
4276 // but out cached pointer must be non-const as we want to
4277 // destroy it later on and sam_hdr_destroy takes non-const.
4278 //
4279 // We do this because some tools do sam_hdr_destroy; sam_close
4280 // while others do sam_close; sam_hdr_destroy. The former is
4281 // an issue as we need the header still when flushing.
4282 fd->h = (sam_hdr_t *)h;
4283 fd->h->ref_count++;
4284
4285 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4286 fp) != 0)
4287 return -2;
4288 fd->dispatcher_set = 1;
4289 }
4290
4291 if (fd->h != h) {
4292 hts_log_error("SAM multi-threaded decoding does not support changing header");
4293 return -2;
4294 }
4295
4296 // Find a suitable BAM array to copy to
4297 sp_bams *gb = fd->curr_bam;
4298 if (!gb) {
4299 pthread_mutex_lock(&fd->lines_m);
4300 if (fd->bams) {
4301 fd->curr_bam = gb = fd->bams;
4302 fd->bams = gb->next;
4303 gb->next = NULL;
4304 gb->nbams = 0;
4305 pthread_mutex_unlock(&fd->lines_m);
4306 } else {
4307 pthread_mutex_unlock(&fd->lines_m);
4308 if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4309 if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) {
4310 free(gb);
4311 return -1;
4312 }
4313 gb->nbams = 0;
4314 gb->abams = NB;
4315 gb->fd = fd;
4316 fd->curr_idx = 0;
4317 fd->curr_bam = gb;
4318 }
4319 }
4320
4321 if (!bam_copy1(&gb->bams[gb->nbams++], b))
4322 return -2;
4323
4324 // Dispatch if full
4325 if (gb->nbams == NB) {
4326 gb->serial = fd->serial++;
4327 //fprintf(stderr, "Dispatch another %d bams\n", NB);
4328 pthread_mutex_lock(&fd->command_m);
4329 if (fd->errcode != 0) {
4330 pthread_mutex_unlock(&fd->command_m);
4331 return -fd->errcode;
4332 }
4333 if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4334 cleanup_sp_bams,
4335 cleanup_sp_lines, 0) < 0) {
4336 pthread_mutex_unlock(&fd->command_m);
4337 return -1;
4338 }
4339 pthread_mutex_unlock(&fd->command_m);
4340 fd->curr_bam = NULL;
4341 }
4342
4343 // Dummy value as we don't know how long it really is.
4344 // We could track file sizes via a SAM_state field, but I don't think
4345 // it is necessary.
4346 return 1;
4347 } else {
4348 if (sam_format1(h, b, &fp->line) < 0) return -1;
4349 kputc('\n', &fp->line);
4350 if (fp->is_bgzf) {
4351 if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4352 } else {
4353 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4354 }
4355
4356 if (fp->idx) {
4357 if (fp->format.compression == bgzf) {
4358 if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4359 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4360 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4361 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4362 return -1;
4363 }
4364 } else {
4365 if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4366 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4367 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4368 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4369 return -1;
4370 }
4371 }
4372 }
4373
4374 return fp->line.l;
4375 }
4376
4377
4378 case fasta_format:
4379 case fastq_format: {
4380 fastq_state *x = (fastq_state *)fp->state;
4381 if (!x) {
4382 if (!(fp->state = fastq_state_init(fp->format.format
4383 == fastq_format ? '@' : '>')))
4384 return -2;
4385 }
4386
4387 if (fastq_format1(fp->state, b, &fp->line) < 0)
4388 return -1;
4389 if (fp->is_bgzf) {
4390 if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4391 return -1;
4392 } else {
4393 if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4394 return -1;
4395 }
4396 return fp->line.l;
4397 }
4398
4399 default:
4400 errno = EBADF;
4401 return -1;
4402 }
4403 }
4404
4405 /************************
4406 *** Auxiliary fields ***
4407 ************************/
4408 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)4409 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4410 int tsz = aux_type2size(type);
4411
4412 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4413
4414 switch (tsz) {
4415 case 'H': case 'Z': case 1: // Trivial
4416 memcpy(out, in, len);
4417 break;
4418
4419 #define aux_val_to_le(type_t, store_le) do { \
4420 type_t v; \
4421 size_t i; \
4422 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4423 memcpy(&v, in + i, sizeof(type_t)); \
4424 store_le(v, out); \
4425 } \
4426 } while (0)
4427
4428 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4429 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4430 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4431
4432 #undef aux_val_to_le
4433
4434 case 'B': { // Recurse!
4435 uint32_t n;
4436 if (len < 5) return -1;
4437 memcpy(&n, in + 1, 4);
4438 out[0] = in[0];
4439 u32_to_le(n, out + 1);
4440 return aux_to_le(in[0], out + 5, in + 5, len - 5);
4441 }
4442
4443 default: // Unknown type code
4444 return -1;
4445 }
4446
4447
4448
4449 return 0;
4450 }
4451 #endif
4452
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)4453 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4454 {
4455 uint32_t new_len;
4456
4457 assert(b->l_data >= 0);
4458 new_len = b->l_data + 3 + len;
4459 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4460
4461 if (realloc_bam_data(b, new_len) < 0) return -1;
4462
4463 b->data[b->l_data] = tag[0];
4464 b->data[b->l_data + 1] = tag[1];
4465 b->data[b->l_data + 2] = type;
4466
4467 #ifdef HTS_LITTLE_ENDIAN
4468 memcpy(b->data + b->l_data + 3, data, len);
4469 #else
4470 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4471 errno = EINVAL;
4472 return -1;
4473 }
4474 #endif
4475
4476 b->l_data = new_len;
4477
4478 return 0;
4479
4480 nomem:
4481 errno = ENOMEM;
4482 return -1;
4483 }
4484
skip_aux(uint8_t * s,uint8_t * end)4485 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4486 {
4487 int size;
4488 uint32_t n;
4489 if (s >= end) return end;
4490 size = aux_type2size(*s); ++s; // skip type
4491 switch (size) {
4492 case 'Z':
4493 case 'H':
4494 while (s < end && *s) ++s;
4495 return s < end ? s + 1 : end;
4496 case 'B':
4497 if (end - s < 5) return NULL;
4498 size = aux_type2size(*s); ++s;
4499 n = le_to_u32(s);
4500 s += 4;
4501 if (size == 0 || end - s < size * n) return NULL;
4502 return s + size * n;
4503 case 0:
4504 return NULL;
4505 default:
4506 if (end - s < size) return NULL;
4507 return s + size;
4508 }
4509 }
4510
bam_aux_get(const bam1_t * b,const char tag[2])4511 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4512 {
4513 uint8_t *s, *end, *t = (uint8_t *) tag;
4514 uint16_t y = (uint16_t) t[0]<<8 | t[1];
4515 s = bam_get_aux(b);
4516 end = b->data + b->l_data;
4517 while (s != NULL && end - s >= 3) {
4518 uint16_t x = (uint16_t) s[0]<<8 | s[1];
4519 s += 2;
4520 if (x == y) {
4521 // Check the tag value is valid and complete
4522 uint8_t *e = skip_aux(s, end);
4523 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
4524 goto bad_aux; // Unterminated string
4525 }
4526 if (e != NULL) {
4527 return s;
4528 } else {
4529 goto bad_aux;
4530 }
4531 }
4532 s = skip_aux(s, end);
4533 }
4534 if (s == NULL) goto bad_aux;
4535 errno = ENOENT;
4536 return NULL;
4537
4538 bad_aux:
4539 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4540 errno = EINVAL;
4541 return NULL;
4542 }
4543
4544 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)4545 int bam_aux_del(bam1_t *b, uint8_t *s)
4546 {
4547 uint8_t *p, *aux;
4548 int l_aux = bam_get_l_aux(b);
4549 aux = bam_get_aux(b);
4550 p = s - 2;
4551 s = skip_aux(s, aux + l_aux);
4552 if (s == NULL) goto bad_aux;
4553 memmove(p, s, l_aux - (s - aux));
4554 b->l_data -= s - p;
4555 return 0;
4556
4557 bad_aux:
4558 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4559 errno = EINVAL;
4560 return -1;
4561 }
4562
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)4563 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4564 {
4565 // FIXME: This is not at all efficient!
4566 size_t ln = len >= 0 ? len : strlen(data) + 1;
4567 size_t old_ln = 0;
4568 int need_nul = ln == 0 || data[ln - 1] != '\0';
4569 int save_errno = errno;
4570 int new_tag = 0;
4571 uint8_t *s = bam_aux_get(b,tag), *e;
4572
4573 if (s) { // Replacing existing tag
4574 char type = *s;
4575 if (type != 'Z') {
4576 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4577 errno = EINVAL;
4578 return -1;
4579 }
4580 s++;
4581 e = memchr(s, '\0', b->data + b->l_data - s);
4582 old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4583 s -= 3;
4584 } else {
4585 if (errno != ENOENT) { // Invalid aux data, give up
4586 return -1;
4587 } else { // Tag doesn't exist - put it on the end
4588 errno = save_errno;
4589 s = b->data + b->l_data;
4590 new_tag = 3;
4591 }
4592 }
4593
4594 if (old_ln < ln + need_nul + new_tag) {
4595 ptrdiff_t s_offset = s - b->data;
4596 if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4597 return -1;
4598 s = b->data + s_offset;
4599 }
4600 if (!new_tag) {
4601 memmove(s + 3 + ln + need_nul,
4602 s + 3 + old_ln,
4603 b->l_data - (s + 3 - b->data) - old_ln);
4604 }
4605 b->l_data += new_tag + ln + need_nul - old_ln;
4606
4607 s[0] = tag[0];
4608 s[1] = tag[1];
4609 s[2] = 'Z';
4610 memmove(s+3,data,ln);
4611 if (need_nul) s[3 + ln] = '\0';
4612 return 0;
4613 }
4614
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)4615 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4616 {
4617 uint32_t sz, old_sz = 0, new = 0;
4618 uint8_t *s, type;
4619
4620 if (val < INT32_MIN || val > UINT32_MAX) {
4621 errno = EOVERFLOW;
4622 return -1;
4623 }
4624 if (val < INT16_MIN) { type = 'i'; sz = 4; }
4625 else if (val < INT8_MIN) { type = 's'; sz = 2; }
4626 else if (val < 0) { type = 'c'; sz = 1; }
4627 else if (val < UINT8_MAX) { type = 'C'; sz = 1; }
4628 else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4629 else { type = 'I'; sz = 4; }
4630
4631 s = bam_aux_get(b, tag);
4632 if (s) { // Tag present - how big was the old one?
4633 switch (*s) {
4634 case 'c': case 'C': old_sz = 1; break;
4635 case 's': case 'S': old_sz = 2; break;
4636 case 'i': case 'I': old_sz = 4; break;
4637 default: errno = EINVAL; return -1; // Not an integer
4638 }
4639 } else {
4640 if (errno == ENOENT) { // Tag doesn't exist - add a new one
4641 s = b->data + b->l_data;
4642 new = 1;
4643 } else { // Invalid aux data, give up.
4644 return -1;
4645 }
4646 }
4647
4648 if (new || old_sz < sz) {
4649 // Make room for new tag
4650 ptrdiff_t s_offset = s - b->data;
4651 if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4652 return -1;
4653 s = b->data + s_offset;
4654 if (new) { // Add tag id
4655 *s++ = tag[0];
4656 *s++ = tag[1];
4657 } else { // Shift following data so we have space
4658 memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4659 }
4660 } else {
4661 // Reuse old space. Data value may be bigger than necessary but
4662 // we avoid having to move everything else
4663 sz = old_sz;
4664 type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4665 assert(type > 0);
4666 }
4667 *s++ = type;
4668 #ifdef HTS_LITTLE_ENDIAN
4669 memcpy(s, &val, sz);
4670 #else
4671 switch (sz) {
4672 case 4: u32_to_le(val, s); break;
4673 case 2: u16_to_le(val, s); break;
4674 default: *s = val; break;
4675 }
4676 #endif
4677 b->l_data += (new ? 3 : 0) + sz - old_sz;
4678 return 0;
4679 }
4680
bam_aux_update_float(bam1_t * b,const char tag[2],float val)4681 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
4682 {
4683 uint8_t *s = bam_aux_get(b, tag);
4684 int shrink = 0, new = 0;
4685
4686 if (s) { // Tag present - what was it?
4687 switch (*s) {
4688 case 'f': break;
4689 case 'd': shrink = 1; break;
4690 default: errno = EINVAL; return -1; // Not a float
4691 }
4692 } else {
4693 if (errno == ENOENT) { // Tag doesn't exist - add a new one
4694 new = 1;
4695 } else { // Invalid aux data, give up.
4696 return -1;
4697 }
4698 }
4699
4700 if (new) { // Ensure there's room
4701 if (possibly_expand_bam_data(b, 3 + 4) < 0)
4702 return -1;
4703 s = b->data + b->l_data;
4704 *s++ = tag[0];
4705 *s++ = tag[1];
4706 } else if (shrink) { // Convert non-standard double tag to float
4707 memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
4708 b->l_data -= 4;
4709 }
4710 *s++ = 'f';
4711 float_to_le(val, s);
4712 if (new) b->l_data += 7;
4713
4714 return 0;
4715 }
4716
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)4717 int bam_aux_update_array(bam1_t *b, const char tag[2],
4718 uint8_t type, uint32_t items, void *data)
4719 {
4720 uint8_t *s = bam_aux_get(b, tag);
4721 size_t old_sz = 0, new_sz;
4722 int new = 0;
4723
4724 if (s) { // Tag present
4725 if (*s != 'B') { errno = EINVAL; return -1; }
4726 old_sz = aux_type2size(s[1]);
4727 if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
4728 old_sz *= le_to_u32(s + 2);
4729 } else {
4730 if (errno == ENOENT) { // Tag doesn't exist - add a new one
4731 s = b->data + b->l_data;
4732 new = 1;
4733 } else { // Invalid aux data, give up.
4734 return -1;
4735 }
4736 }
4737
4738 new_sz = aux_type2size(type);
4739 if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
4740 if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
4741 new_sz *= items;
4742
4743 if (new || old_sz < new_sz) {
4744 // Make room for new tag
4745 ptrdiff_t s_offset = s - b->data;
4746 if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
4747 return -1;
4748 s = b->data + s_offset;
4749 }
4750 if (new) { // Add tag id and type
4751 *s++ = tag[0];
4752 *s++ = tag[1];
4753 *s = 'B';
4754 b->l_data += 8 + new_sz;
4755 } else if (old_sz != new_sz) { // shift following data if necessary
4756 memmove(s + 6 + new_sz, s + 6 + old_sz,
4757 b->l_data - ((s + 6 + old_sz) - b->data));
4758 b->l_data -= old_sz;
4759 b->l_data += new_sz;
4760 }
4761
4762 s[1] = type;
4763 u32_to_le(items, s + 2);
4764 #ifdef HTS_LITTLE_ENDIAN
4765 memcpy(s + 6, data, new_sz);
4766 return 0;
4767 #else
4768 return aux_to_le(type, s + 6, data, new_sz);
4769 #endif
4770 }
4771
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)4772 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
4773 uint32_t idx)
4774 {
4775 switch (type) {
4776 case 'c': return le_to_i8(s + idx);
4777 case 'C': return s[idx];
4778 case 's': return le_to_i16(s + 2 * idx);
4779 case 'S': return le_to_u16(s + 2 * idx);
4780 case 'i': return le_to_i32(s + 4 * idx);
4781 case 'I': return le_to_u32(s + 4 * idx);
4782 default:
4783 errno = EINVAL;
4784 return 0;
4785 }
4786 }
4787
bam_aux2i(const uint8_t * s)4788 int64_t bam_aux2i(const uint8_t *s)
4789 {
4790 int type;
4791 type = *s++;
4792 return get_int_aux_val(type, s, 0);
4793 }
4794
bam_aux2f(const uint8_t * s)4795 double bam_aux2f(const uint8_t *s)
4796 {
4797 int type;
4798 type = *s++;
4799 if (type == 'd') return le_to_double(s);
4800 else if (type == 'f') return le_to_float(s);
4801 else return get_int_aux_val(type, s, 0);
4802 }
4803
bam_aux2A(const uint8_t * s)4804 char bam_aux2A(const uint8_t *s)
4805 {
4806 int type;
4807 type = *s++;
4808 if (type == 'A') return *(char*)s;
4809 errno = EINVAL;
4810 return 0;
4811 }
4812
bam_aux2Z(const uint8_t * s)4813 char *bam_aux2Z(const uint8_t *s)
4814 {
4815 int type;
4816 type = *s++;
4817 if (type == 'Z' || type == 'H') return (char*)s;
4818 errno = EINVAL;
4819 return 0;
4820 }
4821
bam_auxB_len(const uint8_t * s)4822 uint32_t bam_auxB_len(const uint8_t *s)
4823 {
4824 if (s[0] != 'B') {
4825 errno = EINVAL;
4826 return 0;
4827 }
4828 return le_to_u32(s + 2);
4829 }
4830
bam_auxB2i(const uint8_t * s,uint32_t idx)4831 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
4832 {
4833 uint32_t len = bam_auxB_len(s);
4834 if (idx >= len) {
4835 errno = ERANGE;
4836 return 0;
4837 }
4838 return get_int_aux_val(s[1], s + 6, idx);
4839 }
4840
bam_auxB2f(const uint8_t * s,uint32_t idx)4841 double bam_auxB2f(const uint8_t *s, uint32_t idx)
4842 {
4843 uint32_t len = bam_auxB_len(s);
4844 if (idx >= len) {
4845 errno = ERANGE;
4846 return 0.0;
4847 }
4848 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
4849 else return get_int_aux_val(s[1], s + 6, idx);
4850 }
4851
sam_open_mode(char * mode,const char * fn,const char * format)4852 int sam_open_mode(char *mode, const char *fn, const char *format)
4853 {
4854 // TODO Parse "bam5" etc for compression level
4855 if (format == NULL) {
4856 // Try to pick a format based on the filename extension
4857 char extension[HTS_MAX_EXT_LEN];
4858 if (find_file_extension(fn, extension) < 0) return -1;
4859 return sam_open_mode(mode, fn, extension);
4860 }
4861 else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
4862 else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
4863 else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
4864 else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
4865 else if (strcasecmp(format, "fastq") == 0 ||
4866 strcasecmp(format, "fq") == 0) strcpy(mode, "f");
4867 else if (strcasecmp(format, "fastq.gz") == 0 ||
4868 strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
4869 else if (strcasecmp(format, "fasta") == 0 ||
4870 strcasecmp(format, "fa") == 0) strcpy(mode, "F");
4871 else if (strcasecmp(format, "fasta.gz") == 0 ||
4872 strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
4873 else return -1;
4874
4875 return 0;
4876 }
4877
4878 // A version of sam_open_mode that can handle ,key=value options.
4879 // The format string is allocated and returned, to be freed by the caller.
4880 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)4881 char *sam_open_mode_opts(const char *fn,
4882 const char *mode,
4883 const char *format)
4884 {
4885 char *mode_opts = malloc((format ? strlen(format) : 1) +
4886 (mode ? strlen(mode) : 1) + 12);
4887 char *opts, *cp;
4888 int format_len;
4889
4890 if (!mode_opts)
4891 return NULL;
4892
4893 strcpy(mode_opts, mode ? mode : "r");
4894 cp = mode_opts + strlen(mode_opts);
4895
4896 if (format == NULL) {
4897 // Try to pick a format based on the filename extension
4898 char extension[HTS_MAX_EXT_LEN];
4899 if (find_file_extension(fn, extension) < 0) {
4900 free(mode_opts);
4901 return NULL;
4902 }
4903 if (sam_open_mode(cp, fn, extension) == 0) {
4904 return mode_opts;
4905 } else {
4906 free(mode_opts);
4907 return NULL;
4908 }
4909 }
4910
4911 if ((opts = strchr(format, ','))) {
4912 format_len = opts-format;
4913 } else {
4914 opts="";
4915 format_len = strlen(format);
4916 }
4917
4918 if (strncmp(format, "bam", format_len) == 0) {
4919 *cp++ = 'b';
4920 } else if (strncmp(format, "cram", format_len) == 0) {
4921 *cp++ = 'c';
4922 } else if (strncmp(format, "cram2", format_len) == 0) {
4923 *cp++ = 'c';
4924 strcpy(cp, ",VERSION=2.1");
4925 cp += 12;
4926 } else if (strncmp(format, "cram3", format_len) == 0) {
4927 *cp++ = 'c';
4928 strcpy(cp, ",VERSION=3.0");
4929 cp += 12;
4930 } else if (strncmp(format, "sam", format_len) == 0) {
4931 ; // format mode=""
4932 } else if (strncmp(format, "sam.gz", format_len) == 0) {
4933 *cp++ = 'z';
4934 } else if (strncmp(format, "fastq", format_len) == 0 ||
4935 strncmp(format, "fq", format_len) == 0) {
4936 *cp++ = 'f';
4937 } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
4938 strncmp(format, "fq.gz", format_len) == 0) {
4939 *cp++ = 'f';
4940 *cp++ = 'z';
4941 } else if (strncmp(format, "fasta", format_len) == 0 ||
4942 strncmp(format, "fa", format_len) == 0) {
4943 *cp++ = 'F';
4944 } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
4945 strncmp(format, "fa", format_len) == 0) {
4946 *cp++ = 'F';
4947 *cp++ = 'z';
4948 } else {
4949 free(mode_opts);
4950 return NULL;
4951 }
4952
4953 strcpy(cp, opts);
4954
4955 return mode_opts;
4956 }
4957
4958 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)4959 int bam_str2flag(const char *str)
4960 {
4961 char *end, *beg = (char*) str;
4962 long int flag = strtol(str, &end, 0);
4963 if ( end!=str ) return flag; // the conversion was successful
4964 flag = 0;
4965 while ( *str )
4966 {
4967 end = beg;
4968 while ( *end && *end!=',' ) end++;
4969 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
4970 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
4971 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
4972 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
4973 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
4974 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
4975 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
4976 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
4977 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
4978 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
4979 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
4980 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
4981 else return -1;
4982 if ( !*end ) break;
4983 beg = end + 1;
4984 }
4985 return flag;
4986 }
4987
bam_flag2str(int flag)4988 char *bam_flag2str(int flag)
4989 {
4990 kstring_t str = {0,0,0};
4991 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
4992 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
4993 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
4994 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
4995 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
4996 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
4997 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
4998 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
4999 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5000 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5001 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5002 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5003 if ( str.l == 0 ) kputsn("", 0, &str);
5004 return str.s;
5005 }
5006
5007
5008 /**************************
5009 *** Pileup and Mpileup ***
5010 **************************/
5011
5012 #if !defined(BAM_NO_PILEUP)
5013
5014 #include <assert.h>
5015
5016 /*******************
5017 *** Memory pool ***
5018 *******************/
5019
5020 typedef struct {
5021 int k, y;
5022 hts_pos_t x, end;
5023 } cstate_t;
5024
5025 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5026
5027 typedef struct __linkbuf_t {
5028 bam1_t b;
5029 hts_pos_t beg, end;
5030 cstate_t s;
5031 struct __linkbuf_t *next;
5032 bam_pileup_cd cd;
5033 } lbnode_t;
5034
5035 typedef struct {
5036 int cnt, n, max;
5037 lbnode_t **buf;
5038 } mempool_t;
5039
mp_init(void)5040 static mempool_t *mp_init(void)
5041 {
5042 mempool_t *mp;
5043 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5044 return mp;
5045 }
mp_destroy(mempool_t * mp)5046 static void mp_destroy(mempool_t *mp)
5047 {
5048 int k;
5049 for (k = 0; k < mp->n; ++k) {
5050 free(mp->buf[k]->b.data);
5051 free(mp->buf[k]);
5052 }
5053 free(mp->buf);
5054 free(mp);
5055 }
mp_alloc(mempool_t * mp)5056 static inline lbnode_t *mp_alloc(mempool_t *mp)
5057 {
5058 ++mp->cnt;
5059 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5060 else return mp->buf[--mp->n];
5061 }
mp_free(mempool_t * mp,lbnode_t * p)5062 static inline void mp_free(mempool_t *mp, lbnode_t *p)
5063 {
5064 --mp->cnt; p->next = 0; // clear lbnode_t::next here
5065 if (mp->n == mp->max) {
5066 mp->max = mp->max? mp->max<<1 : 256;
5067 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5068 }
5069 mp->buf[mp->n++] = p;
5070 }
5071
5072 /**********************
5073 *** CIGAR resolver ***
5074 **********************/
5075
5076 /* s->k: the index of the CIGAR operator that has just been processed.
5077 s->x: the reference coordinate of the start of s->k
5078 s->y: the query coordinate of the start of s->k
5079 */
resolve_cigar2(bam_pileup1_t * p,hts_pos_t pos,cstate_t * s)5080 static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5081 {
5082 #define _cop(c) ((c)&BAM_CIGAR_MASK)
5083 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5084
5085 bam1_t *b = p->b;
5086 bam1_core_t *c = &b->core;
5087 uint32_t *cigar = bam_get_cigar(b);
5088 int k;
5089 // determine the current CIGAR operation
5090 //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5091 if (s->k == -1) { // never processed
5092 p->qpos = 0;
5093 if (c->n_cigar == 1) { // just one operation, save a loop
5094 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5095 } else { // find the first match or deletion
5096 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5097 int op = _cop(cigar[k]);
5098 int l = _cln(cigar[k]);
5099 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5100 op == BAM_CEQUAL || op == BAM_CDIFF) break;
5101 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5102 }
5103 assert(k < c->n_cigar);
5104 s->k = k;
5105 }
5106 } else { // the read has been processed before
5107 int op, l = _cln(cigar[s->k]);
5108 if (pos - s->x >= l) { // jump to the next operation
5109 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5110 op = _cop(cigar[s->k+1]);
5111 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5112 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5113 s->x += l;
5114 ++s->k;
5115 } else { // find the next M/D/N/=/X
5116 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5117 s->x += l;
5118 for (k = s->k + 1; k < c->n_cigar; ++k) {
5119 op = _cop(cigar[k]), l = _cln(cigar[k]);
5120 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5121 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5122 }
5123 s->k = k;
5124 }
5125 assert(s->k < c->n_cigar); // otherwise a bug
5126 } // else, do nothing
5127 }
5128 { // collect pileup information
5129 int op, l;
5130 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5131 p->is_del = p->indel = p->is_refskip = 0;
5132 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5133 int op2 = _cop(cigar[s->k+1]);
5134 int l2 = _cln(cigar[s->k+1]);
5135 if (op2 == BAM_CDEL) p->indel = -(int)l2;
5136 else if (op2 == BAM_CINS) p->indel = l2;
5137 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
5138 int l3 = 0;
5139 for (k = s->k + 2; k < c->n_cigar; ++k) {
5140 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5141 if (op2 == BAM_CINS) l3 += l2;
5142 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5143 }
5144 if (l3 > 0) p->indel = l3;
5145 }
5146 }
5147 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5148 p->qpos = s->y + (pos - s->x);
5149 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5150 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5151 p->is_refskip = (op == BAM_CREF_SKIP);
5152 } // cannot be other operations; otherwise a bug
5153 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5154 }
5155 p->cigar_ind = s->k;
5156 return 1;
5157 }
5158
5159 /*******************************
5160 *** Expansion of insertions ***
5161 *******************************/
5162
5163 /*
5164 * Fills out the kstring with the padded insertion sequence for the current
5165 * location in 'p'. If this is not an insertion site, the string is blank.
5166 *
5167 * This variant handles base modifications, but only when "m" is non-NULL.
5168 *
5169 * Returns the number of inserted base on success, with string length being
5170 * accessable via ins->l;
5171 * -1 on failure.
5172 */
bam_plp_insertion_mod(const bam_pileup1_t * p,hts_base_mod_state * m,kstring_t * ins,int * del_len)5173 int bam_plp_insertion_mod(const bam_pileup1_t *p,
5174 hts_base_mod_state *m,
5175 kstring_t *ins, int *del_len) {
5176 int j, k, indel, nb = 0;
5177 uint32_t *cigar;
5178
5179 if (p->indel <= 0) {
5180 if (ks_resize(ins, 1) < 0)
5181 return -1;
5182 ins->l = 0;
5183 ins->s[0] = '\0';
5184 return 0;
5185 }
5186
5187 if (del_len)
5188 *del_len = 0;
5189
5190 // Measure indel length including pads
5191 indel = 0;
5192 k = p->cigar_ind+1;
5193 cigar = bam_get_cigar(p->b);
5194 while (k < p->b->core.n_cigar) {
5195 switch (cigar[k] & BAM_CIGAR_MASK) {
5196 case BAM_CPAD:
5197 case BAM_CINS:
5198 indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5199 break;
5200 default:
5201 k = p->b->core.n_cigar;
5202 break;
5203 }
5204 k++;
5205 }
5206 nb = ins->l = indel;
5207
5208 // Produce sequence
5209 if (ks_resize(ins, indel+1) < 0)
5210 return -1;
5211 indel = 0;
5212 k = p->cigar_ind+1;
5213 j = 1;
5214 while (k < p->b->core.n_cigar) {
5215 int l, c;
5216 switch (cigar[k] & BAM_CIGAR_MASK) {
5217 case BAM_CPAD:
5218 for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5219 ins->s[indel++] = '*';
5220 break;
5221 case BAM_CINS:
5222 for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5223 c = seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5224 p->qpos + j - p->is_del)];
5225 ins->s[indel++] = c;
5226 int nm;
5227 hts_base_mod mod[256];
5228 if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5229 m, mod, 256)) > 0) {
5230 if (ks_resize(ins, ins->l + nm*16+3) < 0)
5231 return -1;
5232 ins->s[indel++] = '[';
5233 int j;
5234 for (j = 0; j < nm; j++) {
5235 char qual[20];
5236 if (mod[j].qual >= 0)
5237 sprintf(qual, "%d", mod[j].qual);
5238 else
5239 *qual=0;
5240 if (mod[j].modified_base < 0)
5241 // ChEBI
5242 indel += sprintf(&ins->s[indel], "%c(%d)%s",
5243 "+-"[mod[j].strand],
5244 -mod[j].modified_base,
5245 qual);
5246 else
5247 indel += sprintf(&ins->s[indel], "%c%c%s",
5248 "+-"[mod[j].strand],
5249 mod[j].modified_base,
5250 qual);
5251 }
5252 ins->s[indel++] = ']';
5253 }
5254 }
5255 break;
5256 case BAM_CDEL:
5257 // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5258 if (del_len)
5259 *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5260 // fall through
5261 default:
5262 k = p->b->core.n_cigar;
5263 break;
5264 }
5265 k++;
5266 }
5267 ins->s[indel] = '\0';
5268 ins->l = indel; // string length
5269
5270 return nb; // base length
5271 }
5272
5273 /*
5274 * Fills out the kstring with the padded insertion sequence for the current
5275 * location in 'p'. If this is not an insertion site, the string is blank.
5276 *
5277 * This is the original interface with no capability for reporting base
5278 * modifications.
5279 *
5280 * Returns the length of insertion string on success;
5281 * -1 on failure.
5282 */
bam_plp_insertion(const bam_pileup1_t * p,kstring_t * ins,int * del_len)5283 int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5284 return bam_plp_insertion_mod(p, NULL, ins, del_len);
5285 }
5286
5287 /***********************
5288 *** Pileup iterator ***
5289 ***********************/
5290
5291 // Dictionary of overlapping reads
5292 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5293 typedef khash_t(olap_hash) olap_hash_t;
5294
5295 struct bam_plp_s {
5296 mempool_t *mp;
5297 lbnode_t *head, *tail;
5298 int32_t tid, max_tid;
5299 hts_pos_t pos, max_pos;
5300 int is_eof, max_plp, error, maxcnt;
5301 uint64_t id;
5302 bam_pileup1_t *plp;
5303 // for the "auto" interface only
5304 bam1_t *b;
5305 bam_plp_auto_f func;
5306 void *data;
5307 olap_hash_t *overlaps;
5308
5309 // For notification of creation and destruction events
5310 // and associated client-owned pointer.
5311 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5312 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5313 };
5314
bam_plp_init(bam_plp_auto_f func,void * data)5315 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5316 {
5317 bam_plp_t iter;
5318 iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5319 iter->mp = mp_init();
5320 iter->head = iter->tail = mp_alloc(iter->mp);
5321 iter->max_tid = iter->max_pos = -1;
5322 iter->maxcnt = 8000;
5323 if (func) {
5324 iter->func = func;
5325 iter->data = data;
5326 iter->b = bam_init1();
5327 }
5328 return iter;
5329 }
5330
bam_plp_init_overlaps(bam_plp_t iter)5331 int bam_plp_init_overlaps(bam_plp_t iter)
5332 {
5333 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
5334 return iter->overlaps ? 0 : -1;
5335 }
5336
bam_plp_destroy(bam_plp_t iter)5337 void bam_plp_destroy(bam_plp_t iter)
5338 {
5339 lbnode_t *p, *pnext;
5340 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5341 for (p = iter->head; p != NULL; p = pnext) {
5342 pnext = p->next;
5343 mp_free(iter->mp, p);
5344 }
5345 mp_destroy(iter->mp);
5346 if (iter->b) bam_destroy1(iter->b);
5347 free(iter->plp);
5348 free(iter);
5349 }
5350
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))5351 void bam_plp_constructor(bam_plp_t plp,
5352 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5353 plp->plp_construct = func;
5354 }
5355
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))5356 void bam_plp_destructor(bam_plp_t plp,
5357 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5358 plp->plp_destruct = func;
5359 }
5360
5361 //---------------------------------
5362 //--- Tweak overlapping reads
5363 //---------------------------------
5364
5365 /**
5366 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
5367 * cigar_iref2iseq_next() - get the next CMATCH base
5368 * @cigar: pointer to current cigar block (rw)
5369 * @cigar_max: pointer just beyond the last cigar block
5370 * @icig: position within the current cigar block (rw)
5371 * @iseq: position in the sequence (rw)
5372 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5373 *
5374 * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5375 * or -2 on error.
5376 */
cigar_iref2iseq_set(const uint32_t ** cigar,const uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)5377 static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5378 const uint32_t *cigar_max,
5379 hts_pos_t *icig,
5380 hts_pos_t *iseq,
5381 hts_pos_t *iref)
5382 {
5383 hts_pos_t pos = *iref;
5384 if ( pos < 0 ) return -1;
5385 *icig = 0;
5386 *iseq = 0;
5387 *iref = 0;
5388 while ( *cigar<cigar_max )
5389 {
5390 int cig = (**cigar) & BAM_CIGAR_MASK;
5391 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5392
5393 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5394 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5395 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5396 {
5397 pos -= ncig;
5398 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5399 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5400 continue;
5401 }
5402 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5403 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5404 {
5405 pos -= ncig;
5406 if ( pos<0 ) pos = 0;
5407 (*cigar)++; *icig = 0; *iref += ncig;
5408 continue;
5409 }
5410 hts_log_error("Unexpected cigar %d", cig);
5411 return -2;
5412 }
5413 *iseq = -1;
5414 return -1;
5415 }
cigar_iref2iseq_next(const uint32_t ** cigar,const uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)5416 static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5417 const uint32_t *cigar_max,
5418 hts_pos_t *icig,
5419 hts_pos_t *iseq,
5420 hts_pos_t *iref)
5421 {
5422 while ( *cigar < cigar_max )
5423 {
5424 int cig = (**cigar) & BAM_CIGAR_MASK;
5425 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5426
5427 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5428 {
5429 if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; }
5430 (*iseq)++; (*icig)++; (*iref)++;
5431 return BAM_CMATCH;
5432 }
5433 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5434 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5435 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5436 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5437 hts_log_error("Unexpected cigar %d", cig);
5438 return -2;
5439 }
5440 *iseq = -1;
5441 *iref = -1;
5442 return -1;
5443 }
5444
5445 // Given overlapping read 'a' (left) and 'b' (right) on the same
5446 // template, adjust quality values to zero for either a or b.
5447 // Note versions 1.12 and earlier always removed quality from 'b' for
5448 // matching bases. Now we select a or b semi-randomly based on name hash.
5449 // Returns 0 on success,
5450 // -1 on failure
tweak_overlap_quality(bam1_t * a,bam1_t * b)5451 static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5452 {
5453 const uint32_t *a_cigar = bam_get_cigar(a),
5454 *a_cigar_max = a_cigar + a->core.n_cigar;
5455 const uint32_t *b_cigar = bam_get_cigar(b),
5456 *b_cigar_max = b_cigar + b->core.n_cigar;
5457 hts_pos_t a_icig = 0, a_iseq = 0;
5458 hts_pos_t b_icig = 0, b_iseq = 0;
5459 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5460 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
5461
5462 hts_pos_t iref = b->core.pos;
5463 hts_pos_t a_iref = iref - a->core.pos;
5464 hts_pos_t b_iref = iref - b->core.pos;
5465
5466 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5467 &a_icig, &a_iseq, &a_iref);
5468 if ( a_ret<0 )
5469 // no overlap or error
5470 return a_ret<-1 ? -1:0;
5471
5472 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5473 &b_icig, &b_iseq, &b_iref);
5474 if ( b_ret<0 )
5475 // no overlap or error
5476 return b_ret<-1 ? -1:0;
5477
5478 // Determine which seq is the one getting modified qualities.
5479 uint8_t amul, bmul;
5480 if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5481 amul = 1;
5482 bmul = 0;
5483 } else {
5484 amul = 0;
5485 bmul = 1;
5486 }
5487
5488 // Loop over the overlapping region nulling qualities in either
5489 // seq a or b.
5490 int err = 0;
5491 while ( 1 )
5492 {
5493 // Step to next matching reference position in a and b
5494 while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5495 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5496 &a_icig, &a_iseq, &a_iref);
5497 if ( a_ret<0 ) { // done
5498 err = a_ret<-1?-1:0;
5499 break;
5500 }
5501 if ( iref < a_iref + a->core.pos )
5502 iref = a_iref + a->core.pos;
5503
5504 while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5505 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5506 &b_iseq, &b_iref);
5507 if ( b_ret<0 ) { // done
5508 err = b_ret<-1?-1:0;
5509 break;
5510 }
5511 if ( iref < b_iref + b->core.pos )
5512 iref = b_iref + b->core.pos;
5513
5514 iref++;
5515
5516 if ( a_iref+a->core.pos != b_iref+b->core.pos )
5517 // only CMATCH positions, don't know what to do with indels
5518 continue;
5519
5520 if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5521 // Fell off end of sequence, bad CIGAR?
5522 return -1;
5523
5524 // We're finally at the same ref base in both a and b.
5525 // Check if the bases match (confident) or mismatch
5526 // (not so confident).
5527 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5528 // We are very confident about this base. Use sum of quals
5529 int qual = a_qual[a_iseq] + b_qual[b_iseq];
5530 a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5531 b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5532 } else {
5533 // Not so confident about anymore given the mismatch.
5534 // Reduce qual for lowest quality base.
5535 if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5536 // A highest qual base; keep
5537 a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5538 b_qual[b_iseq] = 0;
5539 } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5540 // B highest qual base; keep
5541 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5542 a_qual[a_iseq] = 0;
5543 } else {
5544 // Both equal, so pick randomly
5545 a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5546 b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5547 }
5548 }
5549 }
5550
5551 return err;
5552 }
5553
5554 // Fix overlapping reads. Simple soft-clipping did not give good results.
5555 // Lowering qualities of unwanted bases is more selective and works better.
5556 //
5557 // Returns 0 on success, -1 on failure
overlap_push(bam_plp_t iter,lbnode_t * node)5558 static int overlap_push(bam_plp_t iter, lbnode_t *node)
5559 {
5560 if ( !iter->overlaps ) return 0;
5561
5562 // mapped mates and paired reads only
5563 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5564
5565 // no overlap possible, unless some wild cigar
5566 if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5567 || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5568 && node->b.core.mpos >= node->end) // for those wild cigars
5569 ) return 0;
5570
5571 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5572 if ( kitr==kh_end(iter->overlaps) )
5573 {
5574 // Only add reads where the mate is still to arrive
5575 if (node->b.core.mpos >= node->b.core.pos ||
5576 ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5577 int ret;
5578 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5579 if (ret < 0) return -1;
5580 kh_value(iter->overlaps, kitr) = node;
5581 }
5582 }
5583 else
5584 {
5585 lbnode_t *a = kh_value(iter->overlaps, kitr);
5586 int err = tweak_overlap_quality(&a->b, &node->b);
5587 kh_del(olap_hash, iter->overlaps, kitr);
5588 assert(a->end-1 == a->s.end);
5589 return err;
5590 }
5591 return 0;
5592 }
5593
overlap_remove(bam_plp_t iter,const bam1_t * b)5594 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5595 {
5596 if ( !iter->overlaps ) return;
5597
5598 khiter_t kitr;
5599 if ( b )
5600 {
5601 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5602 if ( kitr!=kh_end(iter->overlaps) )
5603 kh_del(olap_hash, iter->overlaps, kitr);
5604 }
5605 else
5606 {
5607 // remove all
5608 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5609 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5610 }
5611 }
5612
5613
5614
5615 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5616 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
5617 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp64_next(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)5618 const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5619 {
5620 if (iter->error) { *_n_plp = -1; return NULL; }
5621 *_n_plp = 0;
5622 if (iter->is_eof && iter->head == iter->tail) return NULL;
5623 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
5624 int n_plp = 0;
5625 // write iter->plp at iter->pos
5626 lbnode_t **pptr = &iter->head;
5627 while (*pptr != iter->tail) {
5628 lbnode_t *p = *pptr;
5629 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
5630 overlap_remove(iter, &p->b);
5631 if (iter->plp_destruct)
5632 iter->plp_destruct(iter->data, &p->b, &p->cd);
5633 *pptr = p->next; mp_free(iter->mp, p);
5634 }
5635 else {
5636 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
5637 if (n_plp == iter->max_plp) { // then double the capacity
5638 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
5639 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
5640 }
5641 iter->plp[n_plp].b = &p->b;
5642 iter->plp[n_plp].cd = p->cd;
5643 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
5644 }
5645 pptr = &(*pptr)->next;
5646 }
5647 }
5648 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
5649 // update iter->tid and iter->pos
5650 if (iter->head != iter->tail) {
5651 if (iter->tid > iter->head->b.core.tid) {
5652 hts_log_error("Unsorted input. Pileup aborts");
5653 iter->error = 1;
5654 *_n_plp = -1;
5655 return NULL;
5656 }
5657 }
5658 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
5659 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
5660 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
5661 iter->pos = iter->head->beg; // jump to the next position
5662 } else ++iter->pos; // scan contiguously
5663 // return
5664 if (n_plp) return iter->plp;
5665 if (iter->is_eof && iter->head == iter->tail) break;
5666 }
5667 return NULL;
5668 }
5669
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)5670 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
5671 {
5672 hts_pos_t pos64 = 0;
5673 const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
5674 if (pos64 < INT_MAX) {
5675 *_pos = pos64;
5676 } else {
5677 hts_log_error("Position %"PRId64" too large", pos64);
5678 *_pos = INT_MAX;
5679 iter->error = 1;
5680 *_n_plp = -1;
5681 return NULL;
5682 }
5683 return p;
5684 }
5685
bam_plp_push(bam_plp_t iter,const bam1_t * b)5686 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
5687 {
5688 if (iter->error) return -1;
5689 if (b) {
5690 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
5691 // Skip only unmapped reads here, any additional filtering must be done in iter->func
5692 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
5693 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
5694 {
5695 overlap_remove(iter, b);
5696 return 0;
5697 }
5698 if (bam_copy1(&iter->tail->b, b) == NULL)
5699 return -1;
5700 iter->tail->b.id = iter->id++;
5701 iter->tail->beg = b->core.pos;
5702 // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
5703 iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
5704 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
5705 if (b->core.tid < iter->max_tid) {
5706 hts_log_error("The input is not sorted (chromosomes out of order)");
5707 iter->error = 1;
5708 return -1;
5709 }
5710 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
5711 hts_log_error("The input is not sorted (reads out of order)");
5712 iter->error = 1;
5713 return -1;
5714 }
5715 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
5716 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
5717 lbnode_t *next = mp_alloc(iter->mp);
5718 if (!next) {
5719 iter->error = 1;
5720 return -1;
5721 }
5722 if (iter->plp_construct) {
5723 if (iter->plp_construct(iter->data, &iter->tail->b,
5724 &iter->tail->cd) < 0) {
5725 mp_free(iter->mp, next);
5726 iter->error = 1;
5727 return -1;
5728 }
5729 }
5730 if (overlap_push(iter, iter->tail) < 0) {
5731 mp_free(iter->mp, next);
5732 iter->error = 1;
5733 return -1;
5734 }
5735 iter->tail->next = next;
5736 iter->tail = iter->tail->next;
5737 }
5738 } else iter->is_eof = 1;
5739 return 0;
5740 }
5741
bam_plp64_auto(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)5742 const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5743 {
5744 const bam_pileup1_t *plp;
5745 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
5746 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5747 else { // no pileup line can be obtained; read alignments
5748 *_n_plp = 0;
5749 if (iter->is_eof) return 0;
5750 int ret;
5751 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
5752 if (bam_plp_push(iter, iter->b) < 0) {
5753 *_n_plp = -1;
5754 return 0;
5755 }
5756 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5757 // otherwise no pileup line can be returned; read the next alignment.
5758 }
5759 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
5760 if (bam_plp_push(iter, 0) < 0) {
5761 *_n_plp = -1;
5762 return 0;
5763 }
5764 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
5765 return 0;
5766 }
5767 }
5768
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)5769 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
5770 {
5771 hts_pos_t pos64 = 0;
5772 const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
5773 if (pos64 < INT_MAX) {
5774 *_pos = pos64;
5775 } else {
5776 hts_log_error("Position %"PRId64" too large", pos64);
5777 *_pos = INT_MAX;
5778 iter->error = 1;
5779 *_n_plp = -1;
5780 return NULL;
5781 }
5782 return p;
5783 }
5784
bam_plp_reset(bam_plp_t iter)5785 void bam_plp_reset(bam_plp_t iter)
5786 {
5787 overlap_remove(iter, NULL);
5788 iter->max_tid = iter->max_pos = -1;
5789 iter->tid = iter->pos = 0;
5790 iter->is_eof = 0;
5791 while (iter->head != iter->tail) {
5792 lbnode_t *p = iter->head;
5793 iter->head = p->next;
5794 mp_free(iter->mp, p);
5795 }
5796 }
5797
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)5798 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
5799 {
5800 iter->maxcnt = maxcnt;
5801 }
5802
5803 /************************
5804 *** Mpileup iterator ***
5805 ************************/
5806
5807 struct bam_mplp_s {
5808 int n;
5809 int32_t min_tid, *tid;
5810 hts_pos_t min_pos, *pos;
5811 bam_plp_t *iter;
5812 int *n_plp;
5813 const bam_pileup1_t **plp;
5814 };
5815
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)5816 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
5817 {
5818 int i;
5819 bam_mplp_t iter;
5820 iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
5821 iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
5822 iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
5823 iter->n_plp = (int*)calloc(n, sizeof(int));
5824 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
5825 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
5826 iter->n = n;
5827 iter->min_pos = HTS_POS_MAX;
5828 iter->min_tid = (uint32_t)-1;
5829 for (i = 0; i < n; ++i) {
5830 iter->iter[i] = bam_plp_init(func, data[i]);
5831 iter->pos[i] = iter->min_pos;
5832 iter->tid[i] = iter->min_tid;
5833 }
5834 return iter;
5835 }
5836
bam_mplp_init_overlaps(bam_mplp_t iter)5837 int bam_mplp_init_overlaps(bam_mplp_t iter)
5838 {
5839 int i, r = 0;
5840 for (i = 0; i < iter->n; ++i)
5841 r |= bam_plp_init_overlaps(iter->iter[i]);
5842 return r == 0 ? 0 : -1;
5843 }
5844
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)5845 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
5846 {
5847 int i;
5848 for (i = 0; i < iter->n; ++i)
5849 iter->iter[i]->maxcnt = maxcnt;
5850 }
5851
bam_mplp_destroy(bam_mplp_t iter)5852 void bam_mplp_destroy(bam_mplp_t iter)
5853 {
5854 int i;
5855 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
5856 free(iter->iter); free(iter->pos); free(iter->tid);
5857 free(iter->n_plp); free(iter->plp);
5858 free(iter);
5859 }
5860
bam_mplp64_auto(bam_mplp_t iter,int * _tid,hts_pos_t * _pos,int * n_plp,const bam_pileup1_t ** plp)5861 int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
5862 {
5863 int i, ret = 0;
5864 hts_pos_t new_min_pos = HTS_POS_MAX;
5865 uint32_t new_min_tid = (uint32_t)-1;
5866 for (i = 0; i < iter->n; ++i) {
5867 if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
5868 int tid;
5869 hts_pos_t pos;
5870 iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
5871 if ( iter->iter[i]->error ) return -1;
5872 if (iter->plp[i]) {
5873 iter->tid[i] = tid;
5874 iter->pos[i] = pos;
5875 } else {
5876 iter->tid[i] = 0;
5877 iter->pos[i] = 0;
5878 }
5879 }
5880 if (iter->plp[i]) {
5881 if (iter->tid[i] < new_min_tid) {
5882 new_min_tid = iter->tid[i];
5883 new_min_pos = iter->pos[i];
5884 } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
5885 new_min_pos = iter->pos[i];
5886 }
5887 }
5888 }
5889 iter->min_pos = new_min_pos;
5890 iter->min_tid = new_min_tid;
5891 if (new_min_pos == HTS_POS_MAX) return 0;
5892 *_tid = new_min_tid; *_pos = new_min_pos;
5893 for (i = 0; i < iter->n; ++i) {
5894 if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
5895 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
5896 ++ret;
5897 } else n_plp[i] = 0, plp[i] = 0;
5898 }
5899 return ret;
5900 }
5901
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)5902 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
5903 {
5904 hts_pos_t pos64 = 0;
5905 int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
5906 if (ret >= 0) {
5907 if (pos64 < INT_MAX) {
5908 *_pos = pos64;
5909 } else {
5910 hts_log_error("Position %"PRId64" too large", pos64);
5911 *_pos = INT_MAX;
5912 return -1;
5913 }
5914 }
5915 return ret;
5916 }
5917
bam_mplp_reset(bam_mplp_t iter)5918 void bam_mplp_reset(bam_mplp_t iter)
5919 {
5920 int i;
5921 iter->min_pos = HTS_POS_MAX;
5922 iter->min_tid = (uint32_t)-1;
5923 for (i = 0; i < iter->n; ++i) {
5924 bam_plp_reset(iter->iter[i]);
5925 iter->pos[i] = HTS_POS_MAX;
5926 iter->tid[i] = (uint32_t)-1;
5927 iter->n_plp[i] = 0;
5928 iter->plp[i] = NULL;
5929 }
5930 }
5931
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))5932 void bam_mplp_constructor(bam_mplp_t iter,
5933 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
5934 int i;
5935 for (i = 0; i < iter->n; ++i)
5936 bam_plp_constructor(iter->iter[i], func);
5937 }
5938
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))5939 void bam_mplp_destructor(bam_mplp_t iter,
5940 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
5941 int i;
5942 for (i = 0; i < iter->n; ++i)
5943 bam_plp_destructor(iter->iter[i], func);
5944 }
5945
5946 #endif // ~!defined(BAM_NO_PILEUP)
5947
5948 // ---------------------------
5949 // Base Modification retrieval
5950 //
5951 // These operate by recording state in an opaque type, allocated and freed
5952 // via the functions below.
5953 //
5954 // Initially we call bam_parse_basemod to process the tags and record the
5955 // modifications in the state structure, and then functions such as
5956 // bam_next_basemod can iterate over this cached state.
5957
5958 /*
5959 * Base modification are stored in MM/Mm tags as <mod_list> defined as
5960 *
5961 * <mod_list> ::= <mod_chain><mod_list> | ""
5962 * <mod_chain> ::= <canonical_base><strand><mod-list><delta-list>
5963 *
5964 * <canonical_base> ::= "A" | "C" | "G" | "T" | "N".
5965 *
5966 * <strand> ::= "+" | "-".
5967 *
5968 * <mod-list> ::= <simple-mod-list> | <ChEBI-code>
5969 * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod>
5970 * <ChEBI-code> ::= <integer>
5971 * <simple-mod> ::= <letter>
5972 *
5973 * <delta-list> ::= "," <integer> <delta-list> | ";"
5974 *
5975 * We do not allocate additional memory other than the fixed size
5976 * state, thus we track up to 256 pointers to different locations
5977 * within the MM and ML tags. Each pointer is for a distinct
5978 * modification code (simple or ChEBI), meaning some may point to the
5979 * same delta-list when multiple codes are combined together
5980 * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array.
5981 *
5982 * Each numeric in the delta-list is tracked in MMcount[], counted
5983 * down until it hits zero in which case the next delta is fetched.
5984 *
5985 * ML array similarly holds the locations in the quality (ML) tag per
5986 * type, but these are interleaved so C+mhfc,10,15 will have 4 types
5987 * all pointing to the same delta position, but in ML we store
5988 * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML
5989 * also has MLstride indicating how many positions along ML to jump
5990 * each time we consume a base. (4 in our above example, but usually 1
5991 * for the simple case).
5992 *
5993 * One complexity of the base modification system is that mods are
5994 * always stored in the original DNA orientation. This is so that
5995 * tools that may reverse-complement a sequence (eg "samtools fastq -T
5996 * MM,ML") can pass through these modification tags irrespective of
5997 * whether they have any knowledge of their internal workings.
5998 *
5999 * Because we don't wish to allocate extra memory, we cannot simply
6000 * reverse the MM and ML tags. Sadly this means we have to manage the
6001 * reverse complementing ourselves on-the-fly.
6002 * For reversed reads we start at the right end of MM and no longer
6003 * stop at the semicolon. Instead we use MMend[] array to mark the
6004 * termination point.
6005 */
6006 #define MAX_BASE_MOD 256
6007 struct hts_base_mod_state {
6008 int type[MAX_BASE_MOD]; // char or minus-CHEBI
6009 int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15)
6010 char strand[MAX_BASE_MOD]; // strand of modification; + or -
6011 int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod
6012 char *MM[MAX_BASE_MOD]; // next pos delta (string)
6013 char *MMend[MAX_BASE_MOD]; // end of pos-delta string
6014 uint8_t *ML[MAX_BASE_MOD]; // next qual
6015 int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
6016 int seq_pos; // current position along sequence
6017 int nmods; // used array size (0 to MAX_BASE_MOD-1).
6018 };
6019
hts_base_mod_state_alloc(void)6020 hts_base_mod_state *hts_base_mod_state_alloc(void) {
6021 return calloc(1, sizeof(hts_base_mod_state));
6022 }
6023
hts_base_mod_state_free(hts_base_mod_state * state)6024 void hts_base_mod_state_free(hts_base_mod_state *state) {
6025 free(state);
6026 }
6027
6028 /*
6029 * Count frequency of A, C, G, T and N canonical bases in the sequence
6030 */
seq_freq(const bam1_t * b,int freq[16])6031 static void seq_freq(const bam1_t *b, int freq[16]) {
6032 int i;
6033
6034 memset(freq, 0, 16*sizeof(*freq));
6035 uint8_t *seq = bam_get_seq(b);
6036 for (i = 0; i < b->core.l_qseq; i++)
6037 freq[bam_seqi(seq, i)]++;
6038 freq[15] = b->core.l_qseq; // all bases count as N for base mods
6039 }
6040
6041 //0123456789ABCDEF
6042 //=ACMGRSVTWYHKDBN aka seq_nt16_str[]
6043 //=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str
6044 //084C2A6E195D3B7F
6045 static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
6046
6047 /*
6048 * Parse the MM and ML tags to populate the base mod state.
6049 * This structure will have been previously allocated via
6050 * hts_base_mod_state_alloc, but it does not need to be repeatedly
6051 * freed and allocated for each new bam record. (Although obviously
6052 * it requires a new call to this function.)
6053 *
6054 */
bam_parse_basemod(const bam1_t * b,hts_base_mod_state * state)6055 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
6056 // Read MM and ML tags
6057 uint8_t *mm = bam_aux_get(b, "MM");
6058 if (!mm) mm = bam_aux_get(b, "Mm");
6059 if (!mm)
6060 return 0;
6061 if (mm[0] != 'Z') {
6062 hts_log_error("MM tag is not of type Z");
6063 return -1;
6064 }
6065
6066 uint8_t *ml = bam_aux_get(b, "ML");
6067 if (!ml) ml = bam_aux_get(b, "Ml");
6068 if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
6069 hts_log_error("ML tag is not of type B,C");
6070 return -1;
6071 }
6072 uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
6073 if (ml) ml += 6;
6074
6075 state->seq_pos = 0;
6076
6077 // Aggregate freqs of ACGTN if reversed, to get final-delta (later)
6078 int freq[16];
6079 if (b->core.flag & BAM_FREVERSE)
6080 seq_freq(b, freq);
6081
6082 char *cp = (char *)mm+1;
6083 int mod_num = 0;
6084 while (*cp) {
6085 for (; *cp; cp++) {
6086 // cp should be [ACGTNU][+-][^,]*(,\d+)*;
6087 unsigned char btype = *cp++;
6088
6089 if (btype != 'A' && btype != 'C' &&
6090 btype != 'G' && btype != 'T' &&
6091 btype != 'U' && btype != 'N')
6092 return -1;
6093 if (btype == 'U') btype = 'T';
6094
6095 btype = seq_nt16_table[btype];
6096
6097 // Strand
6098 if (*cp != '+' && *cp != '-')
6099 return -1; // malformed
6100 char strand = *cp++;
6101
6102 // List of modification types
6103 char *ms = cp, *me; // mod code start and end
6104 char *cp_end = NULL;
6105 int chebi = 0;
6106 if (isdigit(*cp)) {
6107 chebi = strtol(cp, &cp_end, 10);
6108 cp = cp_end;
6109 ms = cp-1;
6110 } else {
6111 while (*cp && *cp != ',' && *cp != ';')
6112 cp++;
6113 if (*cp == '\0')
6114 return -1;
6115 }
6116 me = cp;
6117
6118 long delta;
6119 int n = 0; // nth symbol in a multi-mod string
6120 int stride = me-ms;
6121 int ndelta = 0;
6122
6123 if (b->core.flag & BAM_FREVERSE) {
6124 // We process the sequence in left to right order,
6125 // but delta is successive count of bases to skip
6126 // counting right to left. This also means the number
6127 // of bases to skip at left edge is unrecorded (as it's
6128 // the remainder).
6129 //
6130 // To output mods in left to right, we step through the
6131 // MM list in reverse and need to identify the left-end
6132 // "remainder" delta.
6133 int total_seq = 0;
6134 for (;;) {
6135 cp += (*cp == ',');
6136 if (*cp == 0 || *cp == ';')
6137 break;
6138
6139 delta = strtol(cp, &cp_end, 10);
6140 if (cp_end == cp) {
6141 hts_log_error("Hit end of MM tag. Missing semicolon?");
6142 return -1;
6143 }
6144
6145 cp = cp_end;
6146 total_seq += delta+1;
6147 ndelta++;
6148 }
6149 delta = freq[seqi_rc[btype]] - total_seq; // remainder
6150 } else {
6151 delta = *cp == ','
6152 ? strtol(cp+1, &cp_end, 10)
6153 : 0;
6154 if (!cp_end) {
6155 // empty list
6156 delta = INT_MAX;
6157 cp_end = cp+1;
6158 }
6159 }
6160 // Now delta is first in list or computed remainder,
6161 // and cp_end is either start or end of the MM list.
6162 while (ms < me) {
6163 state->type [mod_num] = chebi ? -chebi : *ms;
6164 state->strand [mod_num] = (strand == '-');
6165 state->canonical[mod_num] = btype;
6166 state->MLstride [mod_num] = stride;
6167
6168 state->MMcount [mod_num] = delta;
6169 if (b->core.flag & BAM_FREVERSE) {
6170 state->MM [mod_num] = cp+1;
6171 state->MMend[mod_num] = cp_end;
6172 state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
6173 } else {
6174 state->MM [mod_num] = cp_end;
6175 state->MMend[mod_num] = NULL;
6176 state->ML [mod_num] = ml ? ml+n : NULL;
6177 }
6178
6179 if (++mod_num >= MAX_BASE_MOD) {
6180 hts_log_error("Too many base modification types");
6181 return -1;
6182 }
6183 ms++; n++;
6184 }
6185
6186 // Skip modification deltas
6187 if (ml) {
6188 if (b->core.flag & BAM_FREVERSE) {
6189 ml += ndelta*stride;
6190 } else {
6191 while (*cp && *cp != ';') {
6192 if (*cp == ',')
6193 ml+=stride;
6194 cp++;
6195 }
6196 }
6197 if (ml > ml_end) {
6198 hts_log_error("Insufficient number of entries in ML tag");
6199 return -1;
6200 }
6201 } else {
6202 // cp_end already known if FREVERSE
6203 if (cp_end && (b->core.flag & BAM_FREVERSE))
6204 cp = cp_end;
6205 else
6206 while (*cp && *cp != ';')
6207 cp++;
6208 }
6209 if (!*cp) {
6210 hts_log_error("Hit end of MM tag. Missing semicolon?");
6211 return -1;
6212 }
6213 }
6214 }
6215
6216 state->nmods = mod_num;
6217
6218 return 0;
6219 }
6220
6221 /*
6222 * Fills out mods[] with the base modifications found.
6223 * Returns the number found (0 if none), which may be more than
6224 * the size of n_mods if more were found than reported.
6225 * Returns <= -1 on error.
6226 *
6227 * This always marches left to right along sequence, irrespective of
6228 * reverse flag or modification strand.
6229 */
bam_mods_at_next_pos(const bam1_t * b,hts_base_mod_state * state,hts_base_mod * mods,int n_mods)6230 int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
6231 hts_base_mod *mods, int n_mods) {
6232 if (b->core.flag & BAM_FREVERSE) {
6233 if (state->seq_pos < 0)
6234 return -1;
6235 } else {
6236 if (state->seq_pos >= b->core.l_qseq)
6237 return -1;
6238 }
6239
6240 int i, j, n = 0;
6241 unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos);
6242 state->seq_pos++;
6243 if (b->core.flag & BAM_FREVERSE)
6244 base = seqi_rc[base];
6245
6246 for (i = 0; i < state->nmods; i++) {
6247 if (state->canonical[i] != base && state->canonical[i] != 15/*N*/)
6248 continue;
6249
6250 if (state->MMcount[i]-- > 0)
6251 continue;
6252
6253 char *MMptr = state->MM[i];
6254 if (n < n_mods) {
6255 mods[n].modified_base = state->type[i];
6256 mods[n].canonical_base = seq_nt16_str[state->canonical[i]];
6257 mods[n].strand = state->strand[i];
6258 mods[n].qual = state->ML[i] ? *state->ML[i] : -1;
6259 }
6260 n++;
6261 if (state->ML[i])
6262 state->ML[i] += (b->core.flag & BAM_FREVERSE)
6263 ? -state->MLstride[i]
6264 : +state->MLstride[i];
6265
6266 if (b->core.flag & BAM_FREVERSE) {
6267 // process MM list backwards
6268 char *cp;
6269 for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
6270 if (*cp == ',')
6271 break;
6272 state->MMend[i] = cp;
6273 if (cp != state->MM[i])
6274 state->MMcount[i] = strtol(cp+1, NULL, 10);
6275 else
6276 state->MMcount[i] = INT_MAX;
6277 } else {
6278 if (*state->MM[i] == ',')
6279 state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
6280 else
6281 state->MMcount[i] = INT_MAX;
6282 }
6283
6284 // Multiple mods at the same coords.
6285 for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
6286 if (n < n_mods) {
6287 mods[n].modified_base = state->type[j];
6288 mods[n].canonical_base = seq_nt16_str[state->canonical[j]];
6289 mods[n].strand = state->strand[j];
6290 mods[n].qual = state->ML[j] ? *state->ML[j] : -1;
6291 }
6292 n++;
6293 state->MMcount[j] = state->MMcount[i];
6294 state->MM[j] = state->MM[i];
6295 if (state->ML[j])
6296 state->ML[j] += (b->core.flag & BAM_FREVERSE)
6297 ? -state->MLstride[j]
6298 : +state->MLstride[j];
6299 }
6300 i = j-1;
6301 }
6302
6303 return n;
6304 }
6305
6306 /*
6307 * Looks for the next location with a base modification.
6308 */
bam_next_basemod(const bam1_t * b,hts_base_mod_state * state,hts_base_mod * mods,int n_mods,int * pos)6309 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
6310 hts_base_mod *mods, int n_mods, int *pos) {
6311 if (state->seq_pos >= b->core.l_qseq)
6312 return 0;
6313
6314 // Look through state->MMcount arrays to see when the next lowest is
6315 // per base type;
6316 int next[16], freq[16] = {0}, i;
6317 memset(next, 0x7f, 16*sizeof(*next));
6318 if (b->core.flag & BAM_FREVERSE) {
6319 for (i = 0; i < state->nmods; i++) {
6320 if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i])
6321 next[seqi_rc[state->canonical[i]]] = state->MMcount[i];
6322 }
6323 } else {
6324 for (i = 0; i < state->nmods; i++) {
6325 if (next[state->canonical[i]] > state->MMcount[i])
6326 next[state->canonical[i]] = state->MMcount[i];
6327 }
6328 }
6329
6330 // Now step through the sequence counting off base types.
6331 for (i = state->seq_pos; i < b->core.l_qseq; i++) {
6332 unsigned char bc = bam_seqi(bam_get_seq(b), i);
6333 if (next[bc] <= freq[bc] || next[15] <= freq[15])
6334 break;
6335 freq[bc]++;
6336 if (bc != 15) // N
6337 freq[15]++;
6338 }
6339 *pos = state->seq_pos = i;
6340
6341 if (i >= b->core.l_qseq) {
6342 // Check for more MM elements than bases present.
6343 for (i = 0; i < state->nmods; i++) {
6344 if (!(b->core.flag & BAM_FREVERSE) &&
6345 state->MMcount[i] < 0x7f000000) {
6346 hts_log_warning("MM tag refers to bases beyond sequence length");
6347 return -1;
6348 }
6349 }
6350 return 0;
6351 }
6352
6353 if (b->core.flag & BAM_FREVERSE) {
6354 for (i = 0; i < state->nmods; i++)
6355 state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
6356 } else {
6357 for (i = 0; i < state->nmods; i++)
6358 state->MMcount[i] -= freq[state->canonical[i]];
6359 }
6360
6361 int r = bam_mods_at_next_pos(b, state, mods, n_mods);
6362 return r > 0 ? r : 0;
6363 }
6364
6365 /*
6366 * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos.
6367 * This can only march forwards along the read, but can do so by more than
6368 * one base-pair.
6369 *
6370 * This makes it useful for calling from pileup iterators where qpos may
6371 * start part way through a read for the first occurrence of that record.
6372 */
bam_mods_at_qpos(const bam1_t * b,int qpos,hts_base_mod_state * state,hts_base_mod * mods,int n_mods)6373 int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
6374 hts_base_mod *mods, int n_mods) {
6375 // FIXME: for now this is inefficient in implementation.
6376 int r = 0;
6377 while (state->seq_pos <= qpos)
6378 if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0)
6379 break;
6380
6381 return r;
6382 }
6383