1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2020 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28
29 #include <strings.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <zlib.h>
35 #include <assert.h>
36 #include <signal.h>
37 #include <inttypes.h>
38 #include <unistd.h>
39
40 // Suppress deprecation message for cigar_tab, which we initialise
41 #include "htslib/hts_defs.h"
42 #undef HTS_DEPRECATED
43 #define HTS_DEPRECATED(message)
44
45 #include "htslib/sam.h"
46 #include "htslib/bgzf.h"
47 #include "cram/cram.h"
48 #include "hts_internal.h"
49 #include "sam_internal.h"
50 #include "htslib/hfile.h"
51 #include "htslib/hts_endian.h"
52 #include "header.h"
53
54 #include "htslib/khash.h"
55 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
56
57 #ifndef EFTYPE
58 #define EFTYPE ENOEXEC
59 #endif
60 #ifndef EOVERFLOW
61 #define EOVERFLOW ERANGE
62 #endif
63
64 /**********************
65 *** BAM header I/O ***
66 **********************/
67
68 HTSLIB_EXPORT
69 const int8_t bam_cigar_table[256] = {
70 // 0 .. 47
71 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
72 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
73 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
74
75 // 48 .. 63 (including =)
76 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1,
77
78 // 64 .. 79 (including MIDNHB)
79 -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1,
80 BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
81
82 // 80 .. 95 (including SPX)
83 BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1,
84 BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1,
85
86 // 96 .. 127
87 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
89
90 // 128 .. 255
91 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
92 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
93 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
94 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
95 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
96 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
97 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
98 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
99 };
100
sam_hdr_init()101 sam_hdr_t *sam_hdr_init()
102 {
103 sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
104 if (bh == NULL) return NULL;
105
106 bh->cigar_tab = bam_cigar_table;
107 return bh;
108 }
109
sam_hdr_destroy(sam_hdr_t * bh)110 void sam_hdr_destroy(sam_hdr_t *bh)
111 {
112 int32_t i;
113
114 if (bh == NULL) return;
115
116 if (bh->ref_count > 0) {
117 --bh->ref_count;
118 return;
119 }
120
121 if (bh->target_name) {
122 for (i = 0; i < bh->n_targets; ++i)
123 free(bh->target_name[i]);
124 free(bh->target_name);
125 free(bh->target_len);
126 }
127 free(bh->text);
128 if (bh->hrecs)
129 sam_hrecs_free(bh->hrecs);
130 if (bh->sdict)
131 kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
132 free(bh);
133 }
134
135 // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
136 // references before sam_hdr_t::hrecs is populated
sam_hdr_dup_sdict(const sam_hdr_t * h0,sam_hdr_t * h)137 int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
138 {
139 const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
140 khash_t(s2i) *dest_long_refs = kh_init(s2i);
141 int i;
142 if (!dest_long_refs) return -1;
143
144 for (i = 0; i < h->n_targets; i++) {
145 int ret;
146 khiter_t ksrc, kdest;
147 if (h->target_len[i] < UINT32_MAX) continue;
148 ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
149 if (ksrc == kh_end(src_long_refs)) continue;
150 kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
151 if (ret < 0) {
152 kh_destroy(s2i, dest_long_refs);
153 return -1;
154 }
155 kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
156 }
157
158 h->sdict = dest_long_refs;
159 return 0;
160 }
161
sam_hdr_dup(const sam_hdr_t * h0)162 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
163 {
164 if (h0 == NULL) return NULL;
165 sam_hdr_t *h;
166 if ((h = sam_hdr_init()) == NULL) return NULL;
167 // copy the simple data
168 h->n_targets = 0;
169 h->ignore_sam_err = h0->ignore_sam_err;
170 h->l_text = 0;
171
172 // Then the pointery stuff
173
174 if (!h0->hrecs) {
175 h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
176 if (!h->target_len) goto fail;
177 h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
178 if (!h->target_name) goto fail;
179
180 int i;
181 for (i = 0; i < h0->n_targets; ++i) {
182 h->target_len[i] = h0->target_len[i];
183 h->target_name[i] = strdup(h0->target_name[i]);
184 if (!h->target_name[i]) break;
185 }
186 h->n_targets = i;
187 if (i < h0->n_targets) goto fail;
188
189 if (h0->sdict) {
190 if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
191 }
192 }
193
194 if (h0->hrecs) {
195 kstring_t tmp = { 0, 0, NULL };
196 if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
197 free(ks_release(&tmp));
198 goto fail;
199 }
200
201 h->l_text = tmp.l;
202 h->text = ks_release(&tmp);
203
204 if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
205 goto fail;
206 } else {
207 h->l_text = h0->l_text;
208 h->text = malloc(h->l_text + 1);
209 if (!h->text) goto fail;
210 memcpy(h->text, h0->text, h->l_text);
211 h->text[h->l_text] = '\0';
212 }
213
214 return h;
215
216 fail:
217 sam_hdr_destroy(h);
218 return NULL;
219 }
220
bam_hdr_read(BGZF * fp)221 sam_hdr_t *bam_hdr_read(BGZF *fp)
222 {
223 sam_hdr_t *h;
224 uint8_t buf[4];
225 int magic_len, has_EOF;
226 int32_t i, name_len, num_names = 0;
227 size_t bufsize;
228 ssize_t bytes;
229 // check EOF
230 has_EOF = bgzf_check_EOF(fp);
231 if (has_EOF < 0) {
232 perror("[W::bam_hdr_read] bgzf_check_EOF");
233 } else if (has_EOF == 0) {
234 hts_log_warning("EOF marker is absent. The input is probably truncated");
235 }
236 // read "BAM1"
237 magic_len = bgzf_read(fp, buf, 4);
238 if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
239 hts_log_error("Invalid BAM binary header");
240 return 0;
241 }
242 h = sam_hdr_init();
243 if (!h) goto nomem;
244
245 // read plain text and the number of reference sequences
246 bytes = bgzf_read(fp, buf, 4);
247 if (bytes != 4) goto read_err;
248 h->l_text = le_to_u32(buf);
249
250 bufsize = h->l_text + 1;
251 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
252 h->text = (char*)malloc(bufsize);
253 if (!h->text) goto nomem;
254 h->text[h->l_text] = 0; // make sure it is NULL terminated
255 bytes = bgzf_read(fp, h->text, h->l_text);
256 if (bytes != h->l_text) goto read_err;
257
258 bytes = bgzf_read(fp, &h->n_targets, 4);
259 if (bytes != 4) goto read_err;
260 if (fp->is_be) ed_swap_4p(&h->n_targets);
261
262 if (h->n_targets < 0) goto invalid;
263
264 // read reference sequence names and lengths
265 if (h->n_targets > 0) {
266 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
267 if (!h->target_name) goto nomem;
268 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
269 if (!h->target_len) goto nomem;
270 }
271 else {
272 h->target_name = NULL;
273 h->target_len = NULL;
274 }
275
276 for (i = 0; i != h->n_targets; ++i) {
277 bytes = bgzf_read(fp, &name_len, 4);
278 if (bytes != 4) goto read_err;
279 if (fp->is_be) ed_swap_4p(&name_len);
280 if (name_len <= 0) goto invalid;
281
282 h->target_name[i] = (char*)malloc(name_len);
283 if (!h->target_name[i]) goto nomem;
284 num_names++;
285
286 bytes = bgzf_read(fp, h->target_name[i], name_len);
287 if (bytes != name_len) goto read_err;
288
289 if (h->target_name[i][name_len - 1] != '\0') {
290 /* Fix missing NUL-termination. Is this being too nice?
291 We could alternatively bail out with an error. */
292 char *new_name;
293 if (name_len == INT32_MAX) goto invalid;
294 new_name = realloc(h->target_name[i], name_len + 1);
295 if (new_name == NULL) goto nomem;
296 h->target_name[i] = new_name;
297 h->target_name[i][name_len] = '\0';
298 }
299
300 bytes = bgzf_read(fp, &h->target_len[i], 4);
301 if (bytes != 4) goto read_err;
302 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
303 }
304 return h;
305
306 nomem:
307 hts_log_error("Out of memory");
308 goto clean;
309
310 read_err:
311 if (bytes < 0) {
312 hts_log_error("Error reading BGZF stream");
313 } else {
314 hts_log_error("Truncated BAM header");
315 }
316 goto clean;
317
318 invalid:
319 hts_log_error("Invalid BAM binary header");
320
321 clean:
322 if (h != NULL) {
323 h->n_targets = num_names; // ensure we free only allocated target_names
324 sam_hdr_destroy(h);
325 }
326 return NULL;
327 }
328
bam_hdr_write(BGZF * fp,const sam_hdr_t * h)329 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
330 {
331 int32_t i, name_len, x;
332 kstring_t hdr_ks = { 0, 0, NULL };
333 char *text;
334 uint32_t l_text;
335
336 if (!h) return -1;
337
338 if (h->hrecs) {
339 if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
340 if (hdr_ks.l > INT32_MAX) {
341 hts_log_error("Header too long for BAM format");
342 free(hdr_ks.s);
343 return -1;
344 }
345 text = hdr_ks.s;
346 l_text = hdr_ks.l;
347 } else {
348 if (h->l_text > INT32_MAX) {
349 hts_log_error("Header too long for BAM format");
350 return -1;
351 }
352 text = h->text;
353 l_text = h->l_text;
354 }
355 // write "BAM1"
356 if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
357 // write plain text and the number of reference sequences
358 if (fp->is_be) {
359 x = ed_swap_4(l_text);
360 if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
361 if (l_text) {
362 if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
363 }
364 x = ed_swap_4(h->n_targets);
365 if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
366 } else {
367 if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
368 if (l_text) {
369 if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
370 }
371 if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
372 }
373 free(hdr_ks.s);
374 // write sequence names and lengths
375 for (i = 0; i != h->n_targets; ++i) {
376 char *p = h->target_name[i];
377 name_len = strlen(p) + 1;
378 if (fp->is_be) {
379 x = ed_swap_4(name_len);
380 if (bgzf_write(fp, &x, 4) < 0) return -1;
381 } else {
382 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
383 }
384 if (bgzf_write(fp, p, name_len) < 0) return -1;
385 if (fp->is_be) {
386 x = ed_swap_4(h->target_len[i]);
387 if (bgzf_write(fp, &x, 4) < 0) return -1;
388 } else {
389 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
390 }
391 }
392 if (bgzf_flush(fp) < 0) return -1;
393 return 0;
394 }
395
sam_parse_region(sam_hdr_t * h,const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,int flags)396 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
397 hts_pos_t *beg, hts_pos_t *end, int flags) {
398 return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
399 }
400
401 /*************************
402 *** BAM alignment I/O ***
403 *************************/
404
bam_init1()405 bam1_t *bam_init1()
406 {
407 return (bam1_t*)calloc(1, sizeof(bam1_t));
408 }
409
sam_realloc_bam_data(bam1_t * b,size_t desired)410 int sam_realloc_bam_data(bam1_t *b, size_t desired)
411 {
412 uint32_t new_m_data;
413 uint8_t *new_data;
414 new_m_data = desired;
415 kroundup32(new_m_data);
416 if (new_m_data < desired) {
417 errno = ENOMEM; // Not strictly true but we can't store the size
418 return -1;
419 }
420 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
421 new_data = realloc(b->data, new_m_data);
422 } else {
423 if ((new_data = malloc(new_m_data)) != NULL) {
424 if (b->l_data > 0)
425 memcpy(new_data, b->data,
426 b->l_data < b->m_data ? b->l_data : b->m_data);
427 bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
428 }
429 }
430 if (!new_data) return -1;
431 b->data = new_data;
432 b->m_data = new_m_data;
433 return 0;
434 }
435
bam_destroy1(bam1_t * b)436 void bam_destroy1(bam1_t *b)
437 {
438 if (b == 0) return;
439 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
440 free(b->data);
441 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
442 // In case of reuse
443 b->data = NULL;
444 b->m_data = 0;
445 b->l_data = 0;
446 }
447 }
448
449 if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
450 free(b);
451 }
452
bam_copy1(bam1_t * bdst,const bam1_t * bsrc)453 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
454 {
455 if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
456 memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
457 memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
458 bdst->l_data = bsrc->l_data;
459 bdst->id = bsrc->id;
460 return bdst;
461 }
462
bam_dup1(const bam1_t * bsrc)463 bam1_t *bam_dup1(const bam1_t *bsrc)
464 {
465 if (bsrc == NULL) return NULL;
466 bam1_t *bdst = bam_init1();
467 if (bdst == NULL) return NULL;
468 if (bam_copy1(bdst, bsrc) == NULL) {
469 bam_destroy1(bdst);
470 return NULL;
471 }
472 return bdst;
473 }
474
bam_cigar2rqlens(int n_cigar,const uint32_t * cigar,hts_pos_t * rlen,hts_pos_t * qlen)475 static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
476 hts_pos_t *rlen, hts_pos_t *qlen)
477 {
478 int k;
479 *rlen = *qlen = 0;
480 for (k = 0; k < n_cigar; ++k) {
481 int type = bam_cigar_type(bam_cigar_op(cigar[k]));
482 int len = bam_cigar_oplen(cigar[k]);
483 if (type & 1) *qlen += len;
484 if (type & 2) *rlen += len;
485 }
486 }
487
subtract_check_underflow(size_t length,size_t * limit)488 static int subtract_check_underflow(size_t length, size_t *limit)
489 {
490 if (length <= *limit) {
491 *limit -= length;
492 return 0;
493 }
494
495 return -1;
496 }
497
bam_set1(bam1_t * bam,size_t l_qname,const char * qname,uint16_t flag,int32_t tid,hts_pos_t pos,uint8_t mapq,size_t n_cigar,const uint32_t * cigar,int32_t mtid,hts_pos_t mpos,hts_pos_t isize,size_t l_seq,const char * seq,const char * qual,size_t l_aux)498 int bam_set1(bam1_t *bam,
499 size_t l_qname, const char *qname,
500 uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
501 size_t n_cigar, const uint32_t *cigar,
502 int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
503 size_t l_seq, const char *seq, const char *qual,
504 size_t l_aux)
505 {
506 // use a default qname "*" if none is provided
507 if (l_qname == 0) {
508 l_qname = 1;
509 qname = "*";
510 }
511
512 // note: the qname is stored nul terminated and padded as described in the
513 // documentation for the bam1_t struct.
514 size_t qname_nuls = 4 - l_qname % 4;
515
516 // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
517 // can't use bam_endpos() directly as some fields not yet set up.
518 hts_pos_t rlen = 0, qlen = 0;
519 if (!(flag & BAM_FUNMAP)) {
520 bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
521 }
522 if (rlen == 0) {
523 rlen = 1;
524 }
525
526 // validate parameters
527 if (l_qname > 254) {
528 hts_log_error("Query name too long");
529 errno = EINVAL;
530 return -1;
531 }
532 if (HTS_POS_MAX - rlen <= pos) {
533 hts_log_error("Read ends beyond highest supported position");
534 errno = EINVAL;
535 return -1;
536 }
537 if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
538 hts_log_error("Mapped query must have a CIGAR");
539 errno = EINVAL;
540 return -1;
541 }
542 if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
543 hts_log_error("CIGAR and query sequence are of different length");
544 errno = EINVAL;
545 return -1;
546 }
547
548 size_t limit = INT32_MAX;
549 int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
550 u += subtract_check_underflow(n_cigar * 4, &limit);
551 u += subtract_check_underflow((l_seq + 1) / 2, &limit);
552 u += subtract_check_underflow(l_seq, &limit);
553 u += subtract_check_underflow(l_aux, &limit);
554 if (u != 0) {
555 hts_log_error("Size overflow");
556 errno = EINVAL;
557 return -1;
558 }
559
560 // re-allocate the data buffer as needed.
561 size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
562 if (realloc_bam_data(bam, data_len + l_aux) < 0) {
563 return -1;
564 }
565
566 bam->l_data = (int)data_len;
567 bam->core.pos = pos;
568 bam->core.tid = tid;
569 bam->core.bin = bam_reg2bin(pos, pos + rlen);
570 bam->core.qual = mapq;
571 bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
572 bam->core.flag = flag;
573 bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
574 bam->core.n_cigar = (uint32_t)n_cigar;
575 bam->core.l_qseq = (int32_t)l_seq;
576 bam->core.mtid = mtid;
577 bam->core.mpos = mpos;
578 bam->core.isize = isize;
579
580 uint8_t *cp = bam->data;
581 strncpy((char *)cp, qname, l_qname);
582 int i;
583 for (i = 0; i < qname_nuls; i++) {
584 cp[l_qname + i] = '\0';
585 }
586 cp += l_qname + qname_nuls;
587
588 if (n_cigar > 0) {
589 memcpy(cp, cigar, n_cigar * 4);
590 }
591 cp += n_cigar * 4;
592
593 for (i = 0; i + 1 < l_seq; i += 2) {
594 *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]];
595 }
596 for (; i < l_seq; i++) {
597 *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
598 }
599
600 if (qual) {
601 memcpy(cp, qual, l_seq);
602 }
603 else {
604 memset(cp, '\xff', l_seq);
605 }
606
607 return (int)data_len;
608 }
609
bam_cigar2qlen(int n_cigar,const uint32_t * cigar)610 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
611 {
612 int k;
613 hts_pos_t l;
614 for (k = l = 0; k < n_cigar; ++k)
615 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
616 l += bam_cigar_oplen(cigar[k]);
617 return l;
618 }
619
bam_cigar2rlen(int n_cigar,const uint32_t * cigar)620 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
621 {
622 int k;
623 hts_pos_t l;
624 for (k = l = 0; k < n_cigar; ++k)
625 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
626 l += bam_cigar_oplen(cigar[k]);
627 return l;
628 }
629
bam_endpos(const bam1_t * b)630 hts_pos_t bam_endpos(const bam1_t *b)
631 {
632 hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
633 if (rlen == 0) rlen = 1;
634 return b->core.pos + rlen;
635 }
636
bam_tag2cigar(bam1_t * b,int recal_bin,int give_warning)637 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
638 {
639 bam1_core_t *c = &b->core;
640 uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
641 uint8_t *CG;
642
643 // test where there is a real CIGAR in the CG tag to move
644 if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
645 cigar0 = bam_get_cigar(b);
646 if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
647 fake_bytes = c->n_cigar * 4;
648 int saved_errno = errno;
649 CG = bam_aux_get(b, "CG");
650 if (!CG) {
651 if (errno != ENOENT) return -1; // Bad aux data
652 errno = saved_errno; // restore errno on expected no-CG-tag case
653 return 0;
654 }
655 if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
656 CG_len = le_to_u32(CG + 2);
657 if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
658
659 // move from the CG tag to the right position
660 cigar_st = (uint8_t*)cigar0 - b->data;
661 c->n_cigar = CG_len;
662 n_cigar4 = c->n_cigar * 4;
663 CG_st = CG - b->data - 2;
664 CG_en = CG_st + 8 + n_cigar4;
665 if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
666 b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
667 memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
668 memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
669 if (ori_len > CG_en) // move data after the CG tag
670 memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
671 b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
672 if (recal_bin)
673 b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
674 if (give_warning)
675 hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
676 return 1;
677 }
678
aux_type2size(uint8_t type)679 static inline int aux_type2size(uint8_t type)
680 {
681 switch (type) {
682 case 'A': case 'c': case 'C':
683 return 1;
684 case 's': case 'S':
685 return 2;
686 case 'i': case 'I': case 'f':
687 return 4;
688 case 'd':
689 return 8;
690 case 'Z': case 'H': case 'B':
691 return type;
692 default:
693 return 0;
694 }
695 }
696
swap_data(const bam1_core_t * c,int l_data,uint8_t * data,int is_host)697 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
698 {
699 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
700 uint32_t i;
701 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
702 }
703
704 // Fix bad records where qname is not terminated correctly.
fixup_missing_qname_nul(bam1_t * b)705 static int fixup_missing_qname_nul(bam1_t *b) {
706 bam1_core_t *c = &b->core;
707
708 // Note this is called before c->l_extranul is added to c->l_qname
709 if (c->l_extranul > 0) {
710 b->data[c->l_qname++] = '\0';
711 c->l_extranul--;
712 } else {
713 if (b->l_data > INT_MAX - 4) return -1;
714 if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
715 b->l_data += 4;
716 b->data[c->l_qname++] = '\0';
717 c->l_extranul = 3;
718 }
719 return 0;
720 }
721
722 /*
723 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
724 * in multi-threaded handling. This may be worth considering for htslib2.
725 */
bam_read1(BGZF * fp,bam1_t * b)726 int bam_read1(BGZF *fp, bam1_t *b)
727 {
728 bam1_core_t *c = &b->core;
729 int32_t block_len, ret, i;
730 uint32_t x[8], new_l_data;
731
732 b->l_data = 0;
733
734 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
735 if (ret == 0) return -1; // normal end-of-file
736 else return -2; // truncated
737 }
738 if (fp->is_be)
739 ed_swap_4p(&block_len);
740 if (block_len < 32) return -4; // block_len includes core data
741 if (bgzf_read(fp, x, 32) != 32) return -3;
742 if (fp->is_be) {
743 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
744 }
745 c->tid = x[0]; c->pos = (int32_t)x[1];
746 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
747 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
748 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
749 c->l_qseq = x[4];
750 c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
751
752 new_l_data = block_len - 32 + c->l_extranul;
753 if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
754 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
755 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
756 return -4;
757 if (realloc_bam_data(b, new_l_data) < 0) return -4;
758 b->l_data = new_l_data;
759
760 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
761 if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
762 if (fixup_missing_qname_nul(b) < 0) return -4;
763 }
764 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
765 c->l_qname += c->l_extranul;
766 if (b->l_data < c->l_qname ||
767 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
768 return -4;
769 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
770 if (bam_tag2cigar(b, 0, 0) < 0)
771 return -4;
772
773 if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
774 hts_pos_t rlen, qlen;
775 bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
776 if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
777 b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
778 // Sanity check for broken CIGAR alignments
779 if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
780 hts_log_error("CIGAR and query sequence lengths differ for %s",
781 bam_get_qname(b));
782 return -4;
783 }
784 }
785
786 return 4 + block_len;
787 }
788
bam_write1(BGZF * fp,const bam1_t * b)789 int bam_write1(BGZF *fp, const bam1_t *b)
790 {
791 const bam1_core_t *c = &b->core;
792 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
793 int i, ok;
794 if (c->l_qname - c->l_extranul > 255) {
795 hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
796 errno = EOVERFLOW;
797 return -1;
798 }
799 if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
800 if (c->pos > INT_MAX ||
801 c->mpos > INT_MAX ||
802 c->isize < INT_MIN || c->isize > INT_MAX) {
803 hts_log_error("Positional data is too large for BAM format");
804 return -1;
805 }
806 x[0] = c->tid;
807 x[1] = c->pos;
808 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
809 if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
810 else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
811 x[4] = c->l_qseq;
812 x[5] = c->mtid;
813 x[6] = c->mpos;
814 x[7] = c->isize;
815 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
816 if (fp->is_be) {
817 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
818 y = block_len;
819 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
820 swap_data(c, b->l_data, b->data, 1);
821 } else {
822 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
823 }
824 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
825 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
826 if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
827 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
828 } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
829 uint8_t buf[8];
830 uint32_t cigar_st, cigar_en, cigar[2];
831 hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
832 if (cigreflen >= (1<<28)) {
833 // Length of reference covered is greater than the biggest
834 // CIGAR operation currently allowed.
835 hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
836 " cannot be written in BAM. Try writing SAM or CRAM instead.\n",
837 bam_get_qname(b), c->n_cigar, cigreflen);
838 return -1;
839 }
840 cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
841 cigar_en = cigar_st + c->n_cigar * 4;
842 cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
843 cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
844 u32_to_le(cigar[0], buf);
845 u32_to_le(cigar[1], buf + 4);
846 if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
847 if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
848 if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
849 u32_to_le(c->n_cigar, buf);
850 if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
851 if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
852 }
853 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
854 return ok? 4 + block_len : -1;
855 }
856
857 /*
858 * Write a BAM file and append to the in-memory index simultaneously.
859 */
bam_write_idx1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)860 static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
861 BGZF *bfp = fp->fp.bgzf;
862
863 if (!fp->idx)
864 return bam_write1(bfp, b);
865
866 uint32_t block_len = b->l_data - b->core.l_extranul + 32;
867 if (bgzf_flush_try(bfp, 4 + block_len) < 0)
868 return -1;
869 if (!bfp->mt)
870 hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
871 else
872 bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
873
874 int ret = bam_write1(bfp, b);
875 if (ret < 0)
876 return -1;
877
878 if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
879 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
880 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
881 ret = -1;
882 }
883
884 return ret;
885 }
886
887 /*
888 * Set the qname in a BAM record
889 */
bam_set_qname(bam1_t * rec,const char * qname)890 int bam_set_qname(bam1_t *rec, const char *qname)
891 {
892 if (!rec) return -1;
893 if (!qname || !*qname) return -1;
894
895 size_t old_len = rec->core.l_qname;
896 size_t new_len = strlen(qname) + 1;
897 if (new_len < 1 || new_len > 255) return -1;
898
899 int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
900
901 size_t new_data_len = rec->l_data - old_len + new_len + extranul;
902 if (realloc_bam_data(rec, new_data_len) < 0) return -1;
903
904 // Make room
905 if (new_len + extranul != rec->core.l_qname)
906 memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
907 // Copy in new name and pad if needed
908 memcpy(rec->data, qname, new_len);
909 int n;
910 for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
911
912 rec->l_data = new_data_len;
913 rec->core.l_qname = new_len + extranul;
914 rec->core.l_extranul = extranul;
915
916 return 0;
917 }
918
919 /********************
920 *** BAM indexing ***
921 ********************/
922
sam_index(htsFile * fp,int min_shift)923 static hts_idx_t *sam_index(htsFile *fp, int min_shift)
924 {
925 int n_lvls, i, fmt, ret;
926 bam1_t *b;
927 hts_idx_t *idx;
928 sam_hdr_t *h;
929 h = sam_hdr_read(fp);
930 if (h == NULL) return NULL;
931 if (min_shift > 0) {
932 hts_pos_t max_len = 0, s;
933 for (i = 0; i < h->n_targets; ++i) {
934 hts_pos_t len = sam_hdr_tid2len(h, i);
935 if (max_len < len) max_len = len;
936 }
937 max_len += 256;
938 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
939 fmt = HTS_FMT_CSI;
940 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
941 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
942 b = bam_init1();
943 while ((ret = sam_read1(fp, h, b)) >= 0) {
944 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
945 if (ret < 0) { // unsorted or doesn't fit
946 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947 goto err;
948 }
949 }
950 if (ret < -1) goto err; // corrupted BAM file
951
952 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
953 sam_hdr_destroy(h);
954 bam_destroy1(b);
955 return idx;
956
957 err:
958 bam_destroy1(b);
959 hts_idx_destroy(idx);
960 return NULL;
961 }
962
sam_index_build3(const char * fn,const char * fnidx,int min_shift,int nthreads)963 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
964 {
965 hts_idx_t *idx;
966 htsFile *fp;
967 int ret = 0;
968
969 if ((fp = hts_open(fn, "r")) == 0) return -2;
970 if (nthreads)
971 hts_set_threads(fp, nthreads);
972
973 switch (fp->format.format) {
974 case cram:
975
976 ret = cram_index_build(fp->fp.cram, fn, fnidx);
977 break;
978
979 case bam:
980 case sam:
981 if (fp->format.compression != bgzf) {
982 hts_log_error("%s file \"%s\" not BGZF compressed",
983 fp->format.format == bam ? "BAM" : "SAM", fn);
984 ret = -1;
985 break;
986 }
987 idx = sam_index(fp, min_shift);
988 if (idx) {
989 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
990 if (ret < 0) ret = -4;
991 hts_idx_destroy(idx);
992 }
993 else ret = -1;
994 break;
995
996 default:
997 ret = -3;
998 break;
999 }
1000 hts_close(fp);
1001
1002 return ret;
1003 }
1004
sam_index_build2(const char * fn,const char * fnidx,int min_shift)1005 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1006 {
1007 return sam_index_build3(fn, fnidx, min_shift, 0);
1008 }
1009
sam_index_build(const char * fn,int min_shift)1010 int sam_index_build(const char *fn, int min_shift)
1011 {
1012 return sam_index_build3(fn, NULL, min_shift, 0);
1013 }
1014
1015 // Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1016 #undef bam_index_build
bam_index_build(const char * fn,int min_shift)1017 int bam_index_build(const char *fn, int min_shift)
1018 {
1019 return sam_index_build2(fn, NULL, min_shift);
1020 }
1021
1022 // Initialise fp->idx for the current format type.
1023 // This must be called after the header has been written but no other data.
sam_idx_init(htsFile * fp,sam_hdr_t * h,int min_shift,const char * fnidx)1024 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1025 fp->fnidx = fnidx;
1026 if (fp->format.format == bam || fp->format.format == bcf ||
1027 (fp->format.format == sam && fp->format.compression == bgzf)) {
1028 int n_lvls, fmt = HTS_FMT_CSI;
1029 if (min_shift > 0) {
1030 int64_t max_len = 0, s;
1031 int i;
1032 for (i = 0; i < h->n_targets; ++i)
1033 if (max_len < h->target_len[i]) max_len = h->target_len[i];
1034 max_len += 256;
1035 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1036
1037 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1038
1039 fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1040 return fp->idx ? 0 : -1;
1041 }
1042
1043 if (fp->format.format == cram) {
1044 fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1045 return fp->fp.cram->idxfp ? 0 : -1;
1046 }
1047
1048 return -1;
1049 }
1050
1051 // Finishes an index. Call after the last record has been written.
1052 // Returns 0 on success, <0 on failure.
sam_idx_save(htsFile * fp)1053 int sam_idx_save(htsFile *fp) {
1054 if (fp->format.format == bam || fp->format.format == bcf ||
1055 fp->format.format == vcf || fp->format.format == sam) {
1056 int ret;
1057 if ((ret = sam_state_destroy(fp)) < 0) {
1058 errno = -ret;
1059 return -1;
1060 }
1061 if (bgzf_flush(fp->fp.bgzf) < 0)
1062 return -1;
1063 hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1064
1065 if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1066 return -1;
1067
1068 return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1069
1070 } else if (fp->format.format == cram) {
1071 // flushed and closed by cram_close
1072 }
1073
1074 return 0;
1075 }
1076
sam_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1077 static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1078 {
1079 htsFile *fp = (htsFile *)fpv;
1080 bam1_t *b = bv;
1081 fp->line.l = 0;
1082 int ret = sam_read1(fp, fp->bam_header, b);
1083 if (ret >= 0) {
1084 *tid = b->core.tid;
1085 *beg = b->core.pos;
1086 *end = bam_endpos(b);
1087 }
1088 return ret;
1089 }
1090
1091 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
sam_readrec_rest(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1092 static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1093 {
1094 htsFile *fp = (htsFile *)fpv;
1095 bam1_t *b = bv;
1096 fp->line.l = 0;
1097 int ret = sam_read1(fp, fp->bam_header, b);
1098 return ret;
1099 }
1100
cram_readrec(BGZF * ignored,void * fpv,void * bv,int * tid,hts_pos_t * beg,hts_pos_t * end)1101 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1102 {
1103 htsFile *fp = fpv;
1104 bam1_t *b = bv;
1105 int ret = cram_get_bam_seq(fp->fp.cram, &b);
1106 if (ret < 0)
1107 return cram_eof(fp->fp.cram) ? -1 : -2;
1108
1109 if (bam_tag2cigar(b, 1, 1) < 0)
1110 return -2;
1111
1112 *tid = b->core.tid;
1113 *beg = b->core.pos;
1114 *end = bam_endpos(b);
1115
1116 return ret;
1117 }
1118
cram_pseek(void * fp,int64_t offset,int whence)1119 static int cram_pseek(void *fp, int64_t offset, int whence)
1120 {
1121 cram_fd *fd = (cram_fd *)fp;
1122
1123 if ((0 != cram_seek(fd, offset, SEEK_SET))
1124 && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1125 return -1;
1126
1127 fd->curr_position = offset;
1128
1129 if (fd->ctr) {
1130 cram_free_container(fd->ctr);
1131 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1132 cram_free_container(fd->ctr_mt);
1133
1134 fd->ctr = NULL;
1135 fd->ctr_mt = NULL;
1136 fd->ooc = 0;
1137 }
1138
1139 return 0;
1140 }
1141
1142 /*
1143 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1144 * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1145 * container previously fetched. It was designed like this to integrate with the functionality
1146 * of the iterator stepping logic.
1147 */
1148
cram_ptell(void * fp)1149 static int64_t cram_ptell(void *fp)
1150 {
1151 cram_fd *fd = (cram_fd *)fp;
1152 cram_container *c;
1153 cram_slice *s;
1154 int64_t ret = -1L;
1155
1156 if (fd) {
1157 if ((c = fd->ctr) != NULL) {
1158 if ((s = c->slice) != NULL && s->max_rec) {
1159 if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1160 fd->curr_position += c->offset + c->length;
1161 }
1162 }
1163 ret = fd->curr_position;
1164 }
1165
1166 return ret;
1167 }
1168
bam_pseek(void * fp,int64_t offset,int whence)1169 static int bam_pseek(void *fp, int64_t offset, int whence)
1170 {
1171 BGZF *fd = (BGZF *)fp;
1172
1173 return bgzf_seek(fd, offset, whence);
1174 }
1175
bam_ptell(void * fp)1176 static int64_t bam_ptell(void *fp)
1177 {
1178 BGZF *fd = (BGZF *)fp;
1179 if (!fd)
1180 return -1L;
1181
1182 return bgzf_tell(fd);
1183 }
1184
1185
1186
index_load(htsFile * fp,const char * fn,const char * fnidx,int flags)1187 static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1188 {
1189 switch (fp->format.format) {
1190 case bam:
1191 case sam:
1192 return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1193
1194 case cram: {
1195 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1196
1197 // Cons up a fake "index" just pointing at the associated cram_fd:
1198 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1199 if (idx == NULL) return NULL;
1200 idx->fmt = HTS_FMT_CRAI;
1201 idx->cram = fp->fp.cram;
1202 return (hts_idx_t *) idx;
1203 }
1204
1205 default:
1206 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1207 }
1208 }
1209
sam_index_load3(htsFile * fp,const char * fn,const char * fnidx,int flags)1210 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1211 {
1212 return index_load(fp, fn, fnidx, flags);
1213 }
1214
sam_index_load2(htsFile * fp,const char * fn,const char * fnidx)1215 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1216 return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1217 }
1218
sam_index_load(htsFile * fp,const char * fn)1219 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1220 {
1221 return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1222 }
1223
cram_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)1224 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1225 {
1226 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1227 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1228 if (iter == NULL) return NULL;
1229
1230 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1231 // the readrec function:
1232 iter->is_cram = 1;
1233 iter->read_rest = 1;
1234 iter->off = NULL;
1235 iter->bins.a = NULL;
1236 iter->readrec = readrec;
1237
1238 if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1239 cram_range r = { tid, beg+1, end };
1240 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1241
1242 iter->curr_off = 0;
1243 // The following fields are not required by hts_itr_next(), but are
1244 // filled in in case user code wants to look at them.
1245 iter->tid = tid;
1246 iter->beg = beg;
1247 iter->end = end;
1248
1249 switch (ret) {
1250 case 0:
1251 break;
1252
1253 case -2:
1254 // No data vs this ref, so mark iterator as completed.
1255 // Same as HTS_IDX_NONE.
1256 iter->finished = 1;
1257 break;
1258
1259 default:
1260 free(iter);
1261 return NULL;
1262 }
1263 }
1264 else switch (tid) {
1265 case HTS_IDX_REST:
1266 iter->curr_off = 0;
1267 break;
1268 case HTS_IDX_NONE:
1269 iter->curr_off = 0;
1270 iter->finished = 1;
1271 break;
1272 default:
1273 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1274 abort();
1275 break;
1276 }
1277
1278 return iter;
1279 }
1280
sam_itr_queryi(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1281 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1282 {
1283 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1284 if (idx == NULL)
1285 return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1286 else if (cidx->fmt == HTS_FMT_CRAI)
1287 return cram_itr_query(idx, tid, beg, end, sam_readrec);
1288 else
1289 return hts_itr_query(idx, tid, beg, end, sam_readrec);
1290 }
1291
cram_name2id(void * fdv,const char * ref)1292 static int cram_name2id(void *fdv, const char *ref)
1293 {
1294 cram_fd *fd = (cram_fd *) fdv;
1295 return sam_hdr_name2tid(fd->header, ref);
1296 }
1297
sam_itr_querys(const hts_idx_t * idx,sam_hdr_t * hdr,const char * region)1298 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1299 {
1300 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1301 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1302 cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1303 sam_readrec);
1304 }
1305
sam_itr_regarray(const hts_idx_t * idx,sam_hdr_t * hdr,char ** regarray,unsigned int regcount)1306 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1307 {
1308 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1309 hts_reglist_t *r_list = NULL;
1310 int r_count = 0;
1311
1312 if (!cidx || !hdr)
1313 return NULL;
1314
1315 hts_itr_t *itr = NULL;
1316 if (cidx->fmt == HTS_FMT_CRAI) {
1317 r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1318 if (!r_list)
1319 return NULL;
1320 itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1321 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1322 } else {
1323 r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1324 if (!r_list)
1325 return NULL;
1326 itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1327 hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1328 }
1329
1330 if (!itr)
1331 hts_reglist_free(r_list, r_count);
1332
1333 return itr;
1334 }
1335
sam_itr_regions(const hts_idx_t * idx,sam_hdr_t * hdr,hts_reglist_t * reglist,unsigned int regcount)1336 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1337 {
1338 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1339
1340 if(!cidx || !hdr || !reglist)
1341 return NULL;
1342
1343 if (cidx->fmt == HTS_FMT_CRAI)
1344 return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1345 hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1346 else
1347 return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1348 hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1349 }
1350
1351 /**********************
1352 *** SAM header I/O ***
1353 **********************/
1354
1355 #include "htslib/kseq.h"
1356 #include "htslib/kstring.h"
1357
sam_hdr_parse(size_t l_text,const char * text)1358 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1359 {
1360 sam_hdr_t *bh = sam_hdr_init();
1361 if (!bh) return NULL;
1362
1363 if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1364 sam_hdr_destroy(bh);
1365 return NULL;
1366 }
1367
1368 return bh;
1369 }
1370
1371 // Minimal sanitisation of a header to ensure.
1372 // - null terminated string.
1373 // - all lines start with @ (also implies no blank lines).
1374 //
1375 // Much more could be done, but currently is not, including:
1376 // - checking header types are known (HD, SQ, etc).
1377 // - syntax (eg checking tab separated fields).
1378 // - validating n_targets matches @SQ records.
1379 // - validating target lengths against @SQ records.
sam_hdr_sanitise(sam_hdr_t * h)1380 static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1381 if (!h)
1382 return NULL;
1383
1384 // Special case for empty headers.
1385 if (h->l_text == 0)
1386 return h;
1387
1388 size_t i;
1389 unsigned int lnum = 0;
1390 char *cp = h->text, last = '\n';
1391 for (i = 0; i < h->l_text; i++) {
1392 // NB: l_text excludes terminating nul. This finds early ones.
1393 if (cp[i] == 0)
1394 break;
1395
1396 // Error on \n[^@], including duplicate newlines
1397 if (last == '\n') {
1398 lnum++;
1399 if (cp[i] != '@') {
1400 hts_log_error("Malformed SAM header at line %u", lnum);
1401 sam_hdr_destroy(h);
1402 return NULL;
1403 }
1404 }
1405
1406 last = cp[i];
1407 }
1408
1409 if (i < h->l_text) { // Early nul found. Complain if not just padding.
1410 size_t j = i;
1411 while (j < h->l_text && cp[j] == '\0') j++;
1412 if (j < h->l_text)
1413 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1414 }
1415
1416 // Add trailing newline and/or trailing nul if required.
1417 if (last != '\n') {
1418 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1419
1420 if (h->l_text < 2 || i >= h->l_text - 2) {
1421 if (h->l_text >= SIZE_MAX - 2) {
1422 hts_log_error("No room for extra newline");
1423 sam_hdr_destroy(h);
1424 return NULL;
1425 }
1426
1427 cp = realloc(h->text, (size_t) h->l_text+2);
1428 if (!cp) {
1429 sam_hdr_destroy(h);
1430 return NULL;
1431 }
1432 h->text = cp;
1433 }
1434 cp[i++] = '\n';
1435
1436 // l_text may be larger already due to multiple nul padding
1437 if (h->l_text < i)
1438 h->l_text = i;
1439 cp[h->l_text] = '\0';
1440 }
1441
1442 return h;
1443 }
1444
sam_hdr_create(htsFile * fp)1445 static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1446 kstring_t str = { 0, 0, NULL };
1447 khint_t k;
1448 sam_hdr_t* h = sam_hdr_init();
1449 const char *q, *r;
1450 char* sn = NULL;
1451 khash_t(s2i) *d = kh_init(s2i);
1452 khash_t(s2i) *long_refs = NULL;
1453 if (!h || !d)
1454 goto error;
1455
1456 int ret, has_SQ = 0;
1457 int next_c = '@';
1458 while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1459 if (fp->line.s[0] != '@')
1460 break;
1461
1462 if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1463 has_SQ = 1;
1464 hts_pos_t ln = -1;
1465 for (q = fp->line.s + 4;; ++q) {
1466 if (strncmp(q, "SN:", 3) == 0) {
1467 q += 3;
1468 for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1469
1470 if (sn) {
1471 hts_log_warning("SQ header line has more than one SN: tag");
1472 free(sn);
1473 }
1474 sn = (char*)calloc(r - q + 1, 1);
1475 if (!sn)
1476 goto error;
1477
1478 strncpy(sn, q, r - q);
1479 q = r;
1480 } else {
1481 if (strncmp(q, "LN:", 3) == 0)
1482 ln = strtoll(q + 3, (char**)&q, 10);
1483 }
1484
1485 while (*q != '\t' && *q != '\n' && *q != '\0')
1486 ++q;
1487 if (*q == '\0' || *q == '\n')
1488 break;
1489 }
1490 if (sn) {
1491 if (ln >= 0) {
1492 int absent;
1493 k = kh_put(s2i, d, sn, &absent);
1494 if (absent < 0)
1495 goto error;
1496
1497 if (!absent) {
1498 hts_log_warning("Duplicated sequence '%s'", sn);
1499 free(sn);
1500 } else {
1501 if (ln >= UINT32_MAX) {
1502 // Stash away ref length that
1503 // doesn't fit in target_len array
1504 int k2;
1505 if (!long_refs) {
1506 long_refs = kh_init(s2i);
1507 if (!long_refs)
1508 goto error;
1509 }
1510 k2 = kh_put(s2i, long_refs, sn, &absent);
1511 if (absent < 0)
1512 goto error;
1513 kh_val(long_refs, k2) = ln;
1514 kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1515 | UINT32_MAX);
1516 } else {
1517 kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1518 }
1519 }
1520 } else {
1521 hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
1522 free(sn);
1523 }
1524 } else {
1525 hts_log_warning("Ignored @SQ line with missing SN: tag");
1526 }
1527 sn = NULL;
1528 }
1529 if (kputsn(fp->line.s, fp->line.l, &str) < 0)
1530 goto error;
1531
1532 if (kputc('\n', &str) < 0)
1533 goto error;
1534
1535 if (fp->is_bgzf) {
1536 next_c = bgzf_peek(fp->fp.bgzf);
1537 } else {
1538 unsigned char nc;
1539 ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
1540 next_c = pret > 0 ? nc : pret - 1;
1541 }
1542 if (next_c < -1)
1543 goto error;
1544 }
1545 if (next_c != '@')
1546 fp->line.l = 0;
1547
1548 if (ret < -1)
1549 goto error;
1550
1551 if (!has_SQ && fp->fn_aux) {
1552 kstring_t line = { 0, 0, NULL };
1553
1554 /* The reference index (.fai) is actually needed here */
1555 char *fai_fn = fp->fn_aux;
1556 char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
1557 if (fn_delim)
1558 fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
1559
1560 hFILE* f = hopen(fai_fn, "r");
1561 int e = 0, absent;
1562 if (f == NULL)
1563 goto error;
1564
1565 while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
1566 char* tab = strchr(line.s, '\t');
1567 hts_pos_t ln;
1568
1569 if (tab == NULL)
1570 continue;
1571
1572 sn = (char*)calloc(tab-line.s+1, 1);
1573 if (!sn)
1574 break;
1575 memcpy(sn, line.s, tab-line.s);
1576 k = kh_put(s2i, d, sn, &absent);
1577 if (absent < 0)
1578 break;
1579
1580 ln = strtoll(tab, NULL, 10);
1581
1582 if (!absent) {
1583 hts_log_warning("Duplicated sequence '%s'", sn);
1584 free(sn);
1585 } else {
1586 if (ln >= UINT32_MAX) {
1587 // Stash away ref length that
1588 // doesn't fit in target_len array
1589 khint_t k2;
1590 int absent = -1;
1591 if (!long_refs) {
1592 long_refs = kh_init(s2i);
1593 if (!long_refs)
1594 goto error;
1595 }
1596 k2 = kh_put(s2i, long_refs, sn, &absent);
1597 if (absent < 0)
1598 goto error;
1599 kh_val(long_refs, k2) = ln;
1600 kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1601 | UINT32_MAX);
1602 } else {
1603 kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1604 }
1605 has_SQ = 1;
1606 }
1607
1608 e |= kputs("@SQ\tSN:", &str) < 0;
1609 e |= kputsn(line.s, tab - line.s, &str) < 0;
1610 e |= kputs("\tLN:", &str) < 0;
1611 e |= kputll(ln, &str) < 0;
1612 e |= kputc('\n', &str) < 0;
1613 if (e)
1614 break;
1615 }
1616
1617 ks_free(&line);
1618 if (hclose(f) != 0) {
1619 hts_log_error("Error on closing %s", fai_fn);
1620 e = 1;
1621 }
1622 if (e)
1623 goto error;
1624 }
1625
1626 if (has_SQ) {
1627 // Populate the targets array
1628 h->n_targets = kh_size(d);
1629
1630 h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
1631 if (!h->target_name) {
1632 h->n_targets = 0;
1633 goto error;
1634 }
1635
1636 h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
1637 if (!h->target_len) {
1638 h->n_targets = 0;
1639 goto error;
1640 }
1641
1642 for (k = kh_begin(d); k != kh_end(d); ++k) {
1643 if (!kh_exist(d, k))
1644 continue;
1645
1646 h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
1647 h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
1648 kh_val(d, k) >>= 32;
1649 }
1650 }
1651
1652 // Repurpose sdict to hold any references longer than UINT32_MAX
1653 h->sdict = long_refs;
1654
1655 kh_destroy(s2i, d);
1656
1657 if (str.l == 0)
1658 kputsn("", 0, &str);
1659 h->l_text = str.l;
1660 h->text = ks_release(&str);
1661 fp->bam_header = sam_hdr_sanitise(h);
1662 fp->bam_header->ref_count = 1;
1663
1664 return fp->bam_header;
1665
1666 error:
1667 if (h && d && (!h->target_name || !h->target_len)) {
1668 for (k = kh_begin(d); k != kh_end(d); ++k)
1669 if (kh_exist(d, k)) free((void *)kh_key(d, k));
1670 }
1671 sam_hdr_destroy(h);
1672 ks_free(&str);
1673 kh_destroy(s2i, d);
1674 kh_destroy(s2i, long_refs);
1675 if (sn) free(sn);
1676 return NULL;
1677 }
1678
sam_hdr_read(htsFile * fp)1679 sam_hdr_t *sam_hdr_read(htsFile *fp)
1680 {
1681 if (!fp) {
1682 errno = EINVAL;
1683 return NULL;
1684 }
1685
1686 switch (fp->format.format) {
1687 case bam:
1688 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1689
1690 case cram:
1691 return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1692
1693 case sam:
1694 return sam_hdr_create(fp);
1695
1696 case empty_format:
1697 errno = EPIPE;
1698 return NULL;
1699
1700 default:
1701 errno = EFTYPE;
1702 return NULL;
1703 }
1704 }
1705
sam_hdr_write(htsFile * fp,const sam_hdr_t * h)1706 int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1707 {
1708 if (!fp || !h) {
1709 errno = EINVAL;
1710 return -1;
1711 }
1712
1713 if (!h->hrecs && !h->text)
1714 return 0;
1715
1716 switch (fp->format.format) {
1717 case binary_format:
1718 fp->format.category = sequence_data;
1719 fp->format.format = bam;
1720 /* fall-through */
1721 case bam:
1722 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1723 break;
1724
1725 case cram: {
1726 cram_fd *fd = fp->fp.cram;
1727 if (cram_set_header2(fd, h) < 0) return -1;
1728 if (fp->fn_aux)
1729 cram_load_reference(fd, fp->fn_aux);
1730 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1731 }
1732 break;
1733
1734 case text_format:
1735 fp->format.category = sequence_data;
1736 fp->format.format = sam;
1737 /* fall-through */
1738 case sam: {
1739 char *text;
1740 kstring_t hdr_ks = { 0, 0, NULL };
1741 size_t l_text;
1742 ssize_t bytes;
1743 int r = 0, no_sq = 0;
1744
1745 if (h->hrecs) {
1746 if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
1747 return -1;
1748 text = hdr_ks.s;
1749 l_text = hdr_ks.l;
1750 } else {
1751 const char *p = NULL;
1752 do {
1753 const char *q = p == NULL ? h->text : p + 4;
1754 p = strstr(q, "@SQ\t");
1755 } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
1756 no_sq = p == NULL;
1757 text = h->text;
1758 l_text = h->l_text;
1759 }
1760
1761 if (fp->is_bgzf) {
1762 bytes = bgzf_write(fp->fp.bgzf, text, l_text);
1763 } else {
1764 bytes = hwrite(fp->fp.hfile, text, l_text);
1765 }
1766 free(hdr_ks.s);
1767 if (bytes != l_text)
1768 return -1;
1769
1770 if (no_sq) {
1771 int i;
1772 for (i = 0; i < h->n_targets; ++i) {
1773 fp->line.l = 0;
1774 r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
1775 r |= kputs(h->target_name[i], &fp->line) < 0;
1776 r |= kputsn("\tLN:", 4, &fp->line) < 0;
1777 r |= kputw(h->target_len[i], &fp->line) < 0;
1778 r |= kputc('\n', &fp->line) < 0;
1779 if (r != 0)
1780 return -1;
1781
1782 if (fp->is_bgzf) {
1783 bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
1784 } else {
1785 bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
1786 }
1787 if (bytes != fp->line.l)
1788 return -1;
1789 }
1790 }
1791 if (fp->is_bgzf) {
1792 if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
1793 } else {
1794 if (hflush(fp->fp.hfile) != 0) return -1;
1795 }
1796 }
1797 break;
1798
1799 default:
1800 errno = EBADF;
1801 return -1;
1802 }
1803 return 0;
1804 }
1805
old_sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)1806 static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
1807 {
1808 char *p, *q, *beg = NULL, *end = NULL, *newtext;
1809 size_t new_l_text;
1810 if (!h || !key)
1811 return -1;
1812
1813 if (h->l_text > 3) {
1814 if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
1815 if ((p = strchr(h->text, '\n')) == 0) return -1;
1816 *p = '\0'; // for strstr call
1817
1818 char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
1819
1820 if ((q = strstr(h->text, tmp)) != 0) { // key exists
1821 *p = '\n'; // change back
1822
1823 // mark the key:val
1824 beg = q;
1825 for (q += 4; *q != '\n' && *q != '\t'; ++q);
1826 end = q;
1827
1828 if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
1829 && strlen(val) == end - beg - 4)
1830 return 0; // val is the same, no need to change
1831
1832 } else {
1833 beg = end = p;
1834 *p = '\n';
1835 }
1836 }
1837 }
1838 if (beg == NULL) { // no @HD
1839 new_l_text = h->l_text;
1840 if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
1841 return -1;
1842 new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
1843 if (val) {
1844 if (new_l_text > SIZE_MAX - strlen(val) - 5)
1845 return -1;
1846 new_l_text += strlen(val) + 4;
1847 }
1848 newtext = (char*)malloc(new_l_text + 1);
1849 if (!newtext) return -1;
1850
1851 if (val)
1852 snprintf(newtext, new_l_text + 1,
1853 "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
1854 else
1855 snprintf(newtext, new_l_text + 1,
1856 "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
1857 } else { // has @HD but different or no key
1858 new_l_text = (beg - h->text) + (h->text + h->l_text - end);
1859 if (val) {
1860 if (new_l_text > SIZE_MAX - strlen(val) - 5)
1861 return -1;
1862 new_l_text += strlen(val) + 4;
1863 }
1864 newtext = (char*)malloc(new_l_text + 1);
1865 if (!newtext) return -1;
1866
1867 if (val) {
1868 snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
1869 (int) (beg - h->text), h->text, key, val, end);
1870 } else { //delete key
1871 snprintf(newtext, new_l_text + 1, "%.*s%s",
1872 (int) (beg - h->text), h->text, end);
1873 }
1874 }
1875 free(h->text);
1876 h->text = newtext;
1877 h->l_text = new_l_text;
1878 return 0;
1879 }
1880
1881
sam_hdr_change_HD(sam_hdr_t * h,const char * key,const char * val)1882 int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
1883 {
1884 if (!h || !key)
1885 return -1;
1886
1887 if (!h->hrecs)
1888 return old_sam_hdr_change_HD(h, key, val);
1889
1890 if (val) {
1891 if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
1892 return -1;
1893 } else {
1894 if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
1895 return -1;
1896 }
1897 return sam_hdr_rebuild(h);
1898 }
1899 /**********************
1900 *** SAM record I/O ***
1901 **********************/
1902
sam_parse_B_vals(char type,uint32_t n,char * in,char ** end,char * r,bam1_t * b)1903 static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
1904 char *r, bam1_t *b)
1905 {
1906 int orig_l = b->l_data;
1907 char *q = in;
1908 int32_t size;
1909 size_t bytes;
1910 int overflow = 0;
1911
1912 size = aux_type2size(type);
1913 if (size <= 0 || size > 4) {
1914 hts_log_error("Unrecognized type B:%c", type);
1915 return -1;
1916 }
1917
1918 // Ensure space for type + values
1919 bytes = (size_t) n * (size_t) size;
1920 if (bytes / size != n
1921 || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
1922 hts_log_error("Out of memory");
1923 return -1;
1924 }
1925
1926 b->data[b->l_data++] = 'B';
1927 b->data[b->l_data++] = type;
1928 i32_to_le(n, b->data + b->l_data);
1929 b->l_data += sizeof(uint32_t);
1930 // This ensures that q always ends up at the next comma after
1931 // reading a number even if it's followed by junk. It
1932 // prevents the possibility of trying to read more than n items.
1933 #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
1934 if (type == 'c') {
1935 while (q < r) {
1936 *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
1937 b->l_data++;
1938 skip_to_comma_(q);
1939 }
1940 } else if (type == 'C') {
1941 while (q < r) {
1942 if (*q != '-') {
1943 *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
1944 b->l_data++;
1945 } else {
1946 overflow = 1;
1947 }
1948 skip_to_comma_(q);
1949 }
1950 } else if (type == 's') {
1951 while (q < r) {
1952 i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
1953 b->l_data += 2;
1954 skip_to_comma_(q);
1955 }
1956 } else if (type == 'S') {
1957 while (q < r) {
1958 if (*q != '-') {
1959 u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
1960 b->l_data += 2;
1961 } else {
1962 overflow = 1;
1963 }
1964 skip_to_comma_(q);
1965 }
1966 } else if (type == 'i') {
1967 while (q < r) {
1968 i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
1969 b->l_data += 4;
1970 skip_to_comma_(q);
1971 }
1972 } else if (type == 'I') {
1973 while (q < r) {
1974 if (*q != '-') {
1975 u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
1976 b->l_data += 4;
1977 } else {
1978 overflow = 1;
1979 }
1980 skip_to_comma_(q);
1981 }
1982 } else if (type == 'f') {
1983 while (q < r) {
1984 float_to_le(strtod(q + 1, &q), b->data + b->l_data);
1985 b->l_data += 4;
1986 skip_to_comma_(q);
1987 }
1988 } else {
1989 hts_log_error("Unrecognized type B:%c", type);
1990 return -1;
1991 }
1992
1993 if (!overflow) {
1994 *end = q;
1995 return 0;
1996 } else {
1997 int64_t max = 0, min = 0, val;
1998 // Given type was incorrect. Try to rescue the situation.
1999 q = in;
2000 overflow = 0;
2001 b->l_data = orig_l;
2002 // Find out what range of values is present
2003 while (q < r) {
2004 val = hts_str2int(q + 1, &q, 64, &overflow);
2005 if (max < val) max = val;
2006 if (min > val) min = val;
2007 skip_to_comma_(q);
2008 }
2009 // Retry with appropriate type
2010 if (!overflow) {
2011 if (min < 0) {
2012 if (min >= INT8_MIN && max <= INT8_MAX) {
2013 return sam_parse_B_vals('c', n, in, end, r, b);
2014 } else if (min >= INT16_MIN && max <= INT16_MAX) {
2015 return sam_parse_B_vals('s', n, in, end, r, b);
2016 } else if (min >= INT32_MIN && max <= INT32_MAX) {
2017 return sam_parse_B_vals('i', n, in, end, r, b);
2018 }
2019 } else {
2020 if (max < UINT8_MAX) {
2021 return sam_parse_B_vals('C', n, in, end, r, b);
2022 } else if (max <= UINT16_MAX) {
2023 return sam_parse_B_vals('S', n, in, end, r, b);
2024 } else if (max <= UINT32_MAX) {
2025 return sam_parse_B_vals('I', n, in, end, r, b);
2026 }
2027 }
2028 }
2029 // If here then at least one of the values is too big to store
2030 hts_log_error("Numeric value in B array out of allowed range");
2031 return -1;
2032 }
2033 #undef skip_to_comma_
2034 }
2035
parse_sam_flag(char * v,char ** rv,int * overflow)2036 static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2037 if (*v >= '1' && *v <= '9') {
2038 return hts_str2uint(v, rv, 16, overflow);
2039 }
2040 else if (*v == '0') {
2041 // handle single-digit "0" directly; otherwise it's hex or octal
2042 if (v[1] == '\t') { *rv = v+1; return 0; }
2043 else {
2044 unsigned long val = strtoul(v, rv, 0);
2045 if (val > 65535) { *overflow = 1; return 65535; }
2046 return val;
2047 }
2048 }
2049 else {
2050 // TODO implement symbolic flag letters
2051 *rv = v;
2052 return 0;
2053 }
2054 }
2055
sam_parse1(kstring_t * s,sam_hdr_t * h,bam1_t * b)2056 int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2057 {
2058 #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2059
2060 #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2061
2062 // Macro that operates on 64-bits at a time.
2063 #define COPY_MINUS_N(to,from,n,l,failed) \
2064 do { \
2065 uint64_u *from8 = (uint64_u *)(from); \
2066 uint64_u *to8 = (uint64_u *)(to); \
2067 uint64_t uflow = 0; \
2068 size_t l8 = (l)>>3, i; \
2069 for (i = 0; i < l8; i++) { \
2070 to8[i] = from8[i] - (n)*0x0101010101010101UL; \
2071 uflow |= to8[i]; \
2072 } \
2073 for (i<<=3; i < (l); ++i) { \
2074 to[i] = from[i] - (n); \
2075 uflow |= to[i]; \
2076 } \
2077 failed = (uflow & 0x8080808080808080UL) > 0; \
2078 } while (0)
2079
2080 #else
2081
2082 // Basic version which operates a byte at a time
2083 #define COPY_MINUS_N(to,from,n,l,failed) do { \
2084 uint8_t uflow = 0; \
2085 for (i = 0; i < (l); ++i) { \
2086 (to)[i] = (from)[i] - (n); \
2087 uflow |= (uint8_t) (to)[i]; \
2088 } \
2089 failed = (uflow & 0x80) > 0; \
2090 } while (0)
2091
2092 #endif
2093
2094 #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2095 #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2096 #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2097
2098 uint8_t *t;
2099
2100 char *p = s->s, *q;
2101 int i, overflow = 0;
2102 char logbuf[40];
2103 hts_pos_t cigreflen;
2104 bam1_core_t *c = &b->core;
2105
2106 b->l_data = 0;
2107 memset(c, 0, 32);
2108
2109 // qname
2110 q = _read_token(p);
2111
2112 _parse_warn(p - q <= 1, "empty query name");
2113 _parse_err(p - q > 255, "query name too long");
2114 // resize large enough for name + extranul
2115 if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2116 memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2117
2118 c->l_extranul = (4 - (b->l_data & 3)) & 3;
2119 memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2120 b->l_data += c->l_extranul;
2121
2122 c->l_qname = p - q + c->l_extranul;
2123
2124 // flag
2125 c->flag = parse_sam_flag(p, &p, &overflow);
2126 if (*p++ != '\t') goto err_ret; // malformated flag
2127
2128 // chr
2129 q = _read_token(p);
2130 if (strcmp(q, "*")) {
2131 _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2132 c->tid = bam_name2id(h, q);
2133 _parse_err(c->tid < -1, "failed to parse header");
2134 _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2135 } else c->tid = -1;
2136
2137 // pos
2138 c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
2139 if (*p++ != '\t') goto err_ret;
2140 if (c->pos < 0 && c->tid >= 0) {
2141 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2142 c->tid = -1;
2143 }
2144 if (c->tid < 0) c->flag |= BAM_FUNMAP;
2145
2146 // mapq
2147 c->qual = hts_str2uint(p, &p, 8, &overflow);
2148 if (*p++ != '\t') goto err_ret;
2149 // cigar
2150 if (*p != '*') {
2151 uint32_t *cigar = NULL;
2152 int old_l_data = b->l_data;
2153 int n_cigar = bam_parse_cigar(p, &p, b);
2154 if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2155 cigar = (uint32_t *)(b->data + old_l_data);
2156 c->n_cigar = n_cigar;
2157
2158 // can't use bam_endpos() directly as some fields not yet set up
2159 cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2160 if (cigreflen == 0) cigreflen = 1;
2161 } else {
2162 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2163 c->flag |= BAM_FUNMAP;
2164 q = _read_token(p);
2165 cigreflen = 1;
2166 }
2167 _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2168 "read ends beyond highest supported position");
2169 c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2170 // mate chr
2171 q = _read_token(p);
2172 if (strcmp(q, "=") == 0) {
2173 c->mtid = c->tid;
2174 } else if (strcmp(q, "*") == 0) {
2175 c->mtid = -1;
2176 } else {
2177 c->mtid = bam_name2id(h, q);
2178 _parse_err(c->mtid < -1, "failed to parse header");
2179 _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2180 }
2181 // mpos
2182 c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
2183 if (*p++ != '\t') goto err_ret;
2184 if (c->mpos < 0 && c->mtid >= 0) {
2185 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2186 c->mtid = -1;
2187 }
2188 // tlen
2189 c->isize = hts_str2int(p, &p, 64, &overflow);
2190 if (*p++ != '\t') goto err_ret;
2191 // seq
2192 q = _read_token(p);
2193 if (strcmp(q, "*")) {
2194 _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2195 c->l_qseq = p - q - 1;
2196 hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2197 _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2198 i = (c->l_qseq + 1) >> 1;
2199 _get_mem(uint8_t, &t, b, i);
2200
2201 unsigned int lqs2 = c->l_qseq&~1, i;
2202 for (i = 0; i < lqs2; i+=2)
2203 t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2204 for (; i < c->l_qseq; ++i)
2205 t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2206 } else c->l_qseq = 0;
2207 // qual
2208 _get_mem(uint8_t, &t, b, c->l_qseq);
2209 if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2210 memset(t, 0xff, c->l_qseq);
2211 p += 2;
2212 } else {
2213 int failed = 0;
2214 _parse_err(s->l - (p - s->s) < c->l_qseq
2215 || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2216 "SEQ and QUAL are of different length");
2217 COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2218 _parse_err(failed, "invalid QUAL character");
2219 p += c->l_qseq + 1;
2220 }
2221 // aux
2222 q = p;
2223 p = s->s + s->l;
2224 while (q < p) {
2225 char type;
2226 _parse_err(p - q < 5, "incomplete aux field");
2227 _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2228 // Copy over id
2229 if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2230 memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2231 q += 3; type = *q++; ++q; // q points to value
2232 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2233 _parse_err(*q <= '\t', "incomplete aux field");
2234
2235 // Ensure enough space for a double + type allocated.
2236 if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2237
2238 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2239 b->data[b->l_data++] = 'A';
2240 b->data[b->l_data++] = *q++;
2241 } else if (type == 'i' || type == 'I') {
2242 if (*q == '-') {
2243 int32_t x = hts_str2int(q, &q, 32, &overflow);
2244 if (x >= INT8_MIN) {
2245 b->data[b->l_data++] = 'c';
2246 b->data[b->l_data++] = x;
2247 } else if (x >= INT16_MIN) {
2248 b->data[b->l_data++] = 's';
2249 i16_to_le(x, b->data + b->l_data);
2250 b->l_data += 2;
2251 } else {
2252 b->data[b->l_data++] = 'i';
2253 i32_to_le(x, b->data + b->l_data);
2254 b->l_data += 4;
2255 }
2256 } else {
2257 uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2258 if (x <= UINT8_MAX) {
2259 b->data[b->l_data++] = 'C';
2260 b->data[b->l_data++] = x;
2261 } else if (x <= UINT16_MAX) {
2262 b->data[b->l_data++] = 'S';
2263 u16_to_le(x, b->data + b->l_data);
2264 b->l_data += 2;
2265 } else {
2266 b->data[b->l_data++] = 'I';
2267 u32_to_le(x, b->data + b->l_data);
2268 b->l_data += 4;
2269 }
2270 }
2271 } else if (type == 'f') {
2272 b->data[b->l_data++] = 'f';
2273 float_to_le(strtod(q, &q), b->data + b->l_data);
2274 b->l_data += sizeof(float);
2275 } else if (type == 'd') {
2276 b->data[b->l_data++] = 'd';
2277 double_to_le(strtod(q, &q), b->data + b->l_data);
2278 b->l_data += sizeof(double);
2279 } else if (type == 'Z' || type == 'H') {
2280 char *end = strchr(q, '\t');
2281 if (!end) end = q + strlen(q);
2282 _parse_err(type == 'H' && ((end-q)&1) != 0,
2283 "hex field does not have an even number of digits");
2284 b->data[b->l_data++] = type;
2285 if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2286 memcpy(b->data + b->l_data, q, end - q);
2287 b->l_data += end - q;
2288 b->data[b->l_data++] = '\0';
2289 q = end;
2290 } else if (type == 'B') {
2291 uint32_t n;
2292 char *r;
2293 type = *q++; // q points to the first ',' following the typing byte
2294 _parse_err(*q && *q != ',' && *q != '\t',
2295 "B aux field type not followed by ','");
2296
2297 for (r = q, n = 0; *r > '\t'; ++r)
2298 if (*r == ',') ++n;
2299
2300 if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2301 goto err_ret;
2302 } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2303
2304 while (*q > '\t') { q++; } // Skip any junk to next tab
2305 q++;
2306 }
2307
2308 _parse_err(overflow != 0, "numeric value out of allowed range");
2309
2310 if (bam_tag2cigar(b, 1, 1) < 0)
2311 return -2;
2312 return 0;
2313
2314 #undef _parse_warn
2315 #undef _parse_err
2316 #undef _get_mem
2317 #undef _read_token
2318 err_ret:
2319 return -2;
2320 }
2321
read_ncigar(const char * q)2322 static uint32_t read_ncigar(const char *q) {
2323 uint32_t n_cigar = 0;
2324 for (; *q && *q != '\t'; ++q)
2325 if (!isdigit_c(*q)) ++n_cigar;
2326 if (!n_cigar) {
2327 hts_log_error("No CIGAR operations");
2328 return 0;
2329 }
2330 if (n_cigar >= 2147483647) {
2331 hts_log_error("Too many CIGAR operations");
2332 return 0;
2333 }
2334
2335 return n_cigar;
2336 }
2337
2338 /*! @function
2339 @abstract Parse a CIGAR string into preallocated a uint32_t array
2340 @param in [in] pointer to the source string
2341 @param a_cigar [out] address of the destination uint32_t buffer
2342 @return number of processed input characters; 0 on error
2343 */
parse_cigar(const char * in,uint32_t * a_cigar,uint32_t n_cigar)2344 static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2345 int i, overflow = 0;
2346 const char *p = in;
2347 for (i = 0; i < n_cigar; i++) {
2348 uint32_t len;
2349 int op;
2350 char *q;
2351 len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2352 if (q == p) {
2353 hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2354 return 0;
2355 }
2356 if (overflow) {
2357 hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2358 return 0;
2359 }
2360 p = q;
2361 op = bam_cigar_table[(unsigned char)*p++];
2362 if (op < 0) {
2363 hts_log_error("Unrecognized CIGAR operator");
2364 return 0;
2365 }
2366 a_cigar[i] = len;
2367 a_cigar[i] |= op;
2368 }
2369
2370 return p-in;
2371 }
2372
sam_parse_cigar(const char * in,char ** end,uint32_t ** a_cigar,size_t * a_mem)2373 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2374 size_t n_cigar = 0;
2375 int diff;
2376
2377 if (!in || !a_cigar || !a_mem) {
2378 hts_log_error("NULL pointer arguments");
2379 return -1;
2380 }
2381 if (end) *end = (char *)in;
2382
2383 if (*in == '*') {
2384 if (end) (*end)++;
2385 return 0;
2386 }
2387 n_cigar = read_ncigar(in);
2388 if (!n_cigar) return 0;
2389 if (n_cigar > *a_mem) {
2390 uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2391 if (a_tmp) {
2392 *a_cigar = a_tmp;
2393 *a_mem = n_cigar;
2394 } else {
2395 hts_log_error("Memory allocation error");
2396 return -1;
2397 }
2398 }
2399
2400 if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2401 if (end) *end = (char *)in+diff;
2402
2403 return n_cigar;
2404 }
2405
bam_parse_cigar(const char * in,char ** end,bam1_t * b)2406 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2407 size_t n_cigar = 0;
2408 int diff;
2409
2410 if (!in || !b) {
2411 hts_log_error("NULL pointer arguments");
2412 return -1;
2413 }
2414 if (end) *end = (char *)in;
2415
2416 if (*in == '*') {
2417 if (end) (*end)++;
2418 return 0;
2419 }
2420 n_cigar = read_ncigar(in);
2421 if (!n_cigar) return 0;
2422 if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
2423 hts_log_error("Memory allocation error");
2424 return -1;
2425 }
2426
2427 if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
2428 b->l_data += (n_cigar * sizeof(uint32_t));
2429 if (end) *end = (char *)in+diff;
2430
2431 return n_cigar;
2432 }
2433
2434 /*
2435 * -----------------------------------------------------------------------------
2436 * SAM threading
2437 */
2438 // Size of SAM text block (reading)
2439 #define NM 240000
2440 // Number of BAM records (writing)
2441 #define NB 1000
2442
2443 struct SAM_state;
2444
2445 // Output job - a block of BAM records
2446 typedef struct sp_bams {
2447 struct sp_bams *next;
2448 int serial;
2449
2450 bam1_t *bams;
2451 int nbams, abams; // used and alloc
2452
2453 struct SAM_state *fd;
2454 } sp_bams;
2455
2456 // Input job - a block of SAM text
2457 typedef struct sp_lines {
2458 struct sp_lines *next;
2459 int serial;
2460
2461 char *data;
2462 int data_size;
2463 int alloc;
2464
2465 struct SAM_state *fd;
2466 sp_bams *bams;
2467 } sp_lines;
2468
2469 enum sam_cmd {
2470 SAM_NONE = 0,
2471 SAM_CLOSE,
2472 SAM_CLOSE_DONE,
2473 };
2474
2475 typedef struct SAM_state {
2476 sam_hdr_t *h;
2477
2478 hts_tpool *p;
2479 int own_pool;
2480 pthread_mutex_t lines_m;
2481 hts_tpool_process *q;
2482 pthread_t dispatcher;
2483 int dispatcher_set;
2484
2485 sp_lines *lines;
2486 sp_bams *bams;
2487
2488 sp_bams *curr_bam;
2489 int curr_idx;
2490 int serial;
2491
2492 // Be warned: moving these mutexes around in this struct can reduce
2493 // threading performance by up to 70%!
2494 pthread_mutex_t command_m;
2495 pthread_cond_t command_c;
2496 enum sam_cmd command;
2497
2498 // One of the E* errno codes
2499 int errcode;
2500
2501 htsFile *fp;
2502 } SAM_state;
2503
2504 // Returns a SAM_state struct from a generic hFILE.
2505 //
2506 // Returns NULL on failure.
sam_state_create(htsFile * fp)2507 static SAM_state *sam_state_create(htsFile *fp) {
2508 // Ideally sam_open wouldn't be a #define to hts_open but instead would
2509 // be a redirect call with an additional 'S' mode. This in turn would
2510 // correctly set the designed format to sam instead of a generic
2511 // text_format.
2512 if (fp->format.format != sam && fp->format.format != text_format)
2513 return NULL;
2514
2515 SAM_state *fd = calloc(1, sizeof(*fd));
2516 if (!fd)
2517 return NULL;
2518
2519 fp->state = fd;
2520 fd->fp = fp;
2521
2522 return fd;
2523 }
2524
2525 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
2526 static void *sam_format_worker(void *arg);
2527
sam_state_err(SAM_state * fd,int errcode)2528 static void sam_state_err(SAM_state *fd, int errcode) {
2529 pthread_mutex_lock(&fd->command_m);
2530 if (!fd->errcode)
2531 fd->errcode = errcode;
2532 pthread_mutex_unlock(&fd->command_m);
2533 }
2534
sam_free_sp_bams(sp_bams * b)2535 static void sam_free_sp_bams(sp_bams *b) {
2536 if (!b)
2537 return;
2538
2539 if (b->bams) {
2540 int i;
2541 for (i = 0; i < b->abams; i++) {
2542 if (b->bams[i].data)
2543 free(b->bams[i].data);
2544 }
2545 free(b->bams);
2546 }
2547 free(b);
2548 }
2549
2550 // Destroys the state produce by sam_state_create.
sam_state_destroy(htsFile * fp)2551 int sam_state_destroy(htsFile *fp) {
2552 int ret = 0;
2553
2554 if (!fp->state)
2555 return 0;
2556
2557 SAM_state *fd = fp->state;
2558 if (fd->p) {
2559 if (fd->h) {
2560 // Notify sam_dispatcher we're closing
2561 pthread_mutex_lock(&fd->command_m);
2562 if (fd->command != SAM_CLOSE_DONE)
2563 fd->command = SAM_CLOSE;
2564 pthread_cond_signal(&fd->command_c);
2565 ret = -fd->errcode;
2566 if (fd->q)
2567 hts_tpool_wake_dispatch(fd->q); // unstick the reader
2568
2569 if (!fp->is_write && fd->q && fd->dispatcher_set) {
2570 for (;;) {
2571 // Avoid deadlocks with dispatcher
2572 if (fd->command == SAM_CLOSE_DONE)
2573 break;
2574 hts_tpool_wake_dispatch(fd->q);
2575 pthread_mutex_unlock(&fd->command_m);
2576 usleep(10000);
2577 pthread_mutex_lock(&fd->command_m);
2578 }
2579 }
2580 pthread_mutex_unlock(&fd->command_m);
2581
2582 if (fp->is_write) {
2583 // Dispatch the last partial block.
2584 sp_bams *gb = fd->curr_bam;
2585 if (!ret && gb && gb->nbams > 0 && fd->q)
2586 ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
2587
2588 // Flush and drain output
2589 if (fd->q)
2590 hts_tpool_process_flush(fd->q);
2591 pthread_mutex_lock(&fd->command_m);
2592 if (!ret) ret = -fd->errcode;
2593 pthread_mutex_unlock(&fd->command_m);
2594
2595 while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
2596 usleep(10000);
2597 pthread_mutex_lock(&fd->command_m);
2598 ret = -fd->errcode;
2599 // not empty but shutdown implies error
2600 if (hts_tpool_process_is_shutdown(fd->q) && !ret)
2601 ret = EIO;
2602 pthread_mutex_unlock(&fd->command_m);
2603 }
2604 if (fd->q)
2605 hts_tpool_process_shutdown(fd->q);
2606 }
2607
2608 // Wait for it to acknowledge
2609 if (fd->dispatcher_set)
2610 pthread_join(fd->dispatcher, NULL);
2611 if (!ret) ret = -fd->errcode;
2612 }
2613
2614 // Tidy up memory
2615 if (fd->q)
2616 hts_tpool_process_destroy(fd->q);
2617
2618 if (fd->own_pool && fp->format.compression == no_compression) {
2619 hts_tpool_destroy(fd->p);
2620 fd->p = NULL;
2621 }
2622 pthread_mutex_destroy(&fd->lines_m);
2623 pthread_mutex_destroy(&fd->command_m);
2624 pthread_cond_destroy(&fd->command_c);
2625
2626 sp_lines *l = fd->lines;
2627 while (l) {
2628 sp_lines *n = l->next;
2629 free(l->data);
2630 free(l);
2631 l = n;
2632 }
2633
2634 sp_bams *b = fd->bams;
2635 while (b) {
2636 if (fd->curr_bam == b)
2637 fd->curr_bam = NULL;
2638 sp_bams *n = b->next;
2639 sam_free_sp_bams(b);
2640 b = n;
2641 }
2642
2643 if (fd->curr_bam)
2644 sam_free_sp_bams(fd->curr_bam);
2645
2646 // Decrement counter by one, maybe destroying too.
2647 // This is to permit the caller using bam_hdr_destroy
2648 // before sam_close without triggering decode errors
2649 // in the background threads.
2650 bam_hdr_destroy(fd->h);
2651 }
2652
2653 free(fp->state);
2654 fp->state = NULL;
2655 return ret;
2656 }
2657
2658 // Cleanup function - job for sam_parse_worker; result for sam_format_worker
cleanup_sp_lines(void * arg)2659 static void cleanup_sp_lines(void *arg) {
2660 sp_lines *gl = (sp_lines *)arg;
2661 if (!gl) return;
2662
2663 // Should always be true for lines passed to / from thread workers.
2664 assert(gl->next == NULL);
2665
2666 free(gl->data);
2667 sam_free_sp_bams(gl->bams);
2668 free(gl);
2669 }
2670
2671 // Run from one of the worker threads.
2672 // Convert a passed in array of lines to array of BAMs, returning
2673 // the result back to the thread queue.
sam_parse_worker(void * arg)2674 static void *sam_parse_worker(void *arg) {
2675 sp_lines *gl = (sp_lines *)arg;
2676 sp_bams *gb = NULL;
2677 char *lines = gl->data;
2678 int i;
2679 bam1_t *b;
2680 SAM_state *fd = gl->fd;
2681
2682 // Use a block of BAM structs we had earlier if available.
2683 pthread_mutex_lock(&fd->lines_m);
2684 if (fd->bams) {
2685 gb = fd->bams;
2686 fd->bams = gb->next;
2687 }
2688 pthread_mutex_unlock(&fd->lines_m);
2689
2690 if (gb == NULL) {
2691 gb = calloc(1, sizeof(*gb));
2692 if (!gb) {
2693 return NULL;
2694 }
2695 gb->abams = 100;
2696 gb->bams = b = calloc(gb->abams, sizeof(*b));
2697 if (!gb->bams) {
2698 sam_state_err(fd, ENOMEM);
2699 goto err;
2700 }
2701 gb->nbams = 0;
2702 }
2703 gb->serial = gl->serial;
2704 gb->next = NULL;
2705
2706 b = (bam1_t *)gb->bams;
2707 if (!b) {
2708 sam_state_err(fd, ENOMEM);
2709 goto err;
2710 }
2711
2712 i = 0;
2713 char *cp = lines, *cp_end = lines + gl->data_size;
2714 while (cp < cp_end) {
2715 if (i >= gb->abams) {
2716 int old_abams = gb->abams;
2717 gb->abams *= 2;
2718 b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
2719 if (!b) {
2720 gb->abams /= 2;
2721 sam_state_err(fd, ENOMEM);
2722 goto err;
2723 }
2724 memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
2725 gb->bams = b;
2726 }
2727
2728 // Ideally we'd get sam_parse1 to return the number of
2729 // bytes decoded and to be able to stop on newline as
2730 // well as \0.
2731 //
2732 // We can then avoid the additional strchr loop.
2733 // It's around 6% of our CPU cost, albeit threadable.
2734 //
2735 // However this is an API change so for now we copy.
2736
2737 char *nl = strchr(cp, '\n');
2738 char *line_end;
2739 if (nl) {
2740 line_end = nl;
2741 if (line_end > cp && *(line_end - 1) == '\r')
2742 line_end--;
2743 nl++;
2744 } else {
2745 nl = line_end = cp_end;
2746 }
2747 *line_end = '\0';
2748 kstring_t ks = { line_end - cp, gl->alloc, cp };
2749 if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
2750 sam_state_err(fd, errno ? errno : EIO);
2751 cleanup_sp_lines(gl);
2752 goto err;
2753 }
2754 cp = nl;
2755 i++;
2756 }
2757 gb->nbams = i;
2758
2759 pthread_mutex_lock(&fd->lines_m);
2760 gl->next = fd->lines;
2761 fd->lines = gl;
2762 pthread_mutex_unlock(&fd->lines_m);
2763 return gb;
2764
2765 err:
2766 sam_free_sp_bams(gb);
2767 return NULL;
2768 }
2769
sam_parse_eof(void * arg)2770 static void *sam_parse_eof(void *arg) {
2771 return NULL;
2772 }
2773
2774 // Cleanup function - result for sam_parse_worker; job for sam_format_worker
cleanup_sp_bams(void * arg)2775 static void cleanup_sp_bams(void *arg) {
2776 sam_free_sp_bams((sp_bams *) arg);
2777 }
2778
2779 // Runs in its own thread.
2780 // Reads a block of text (SAM) and sends a new job to the thread queue to
2781 // translate this to BAM.
sam_dispatcher_read(void * vp)2782 static void *sam_dispatcher_read(void *vp) {
2783 htsFile *fp = vp;
2784 kstring_t line = {0};
2785 int line_frag = 0;
2786 SAM_state *fd = fp->state;
2787 sp_lines *l = NULL;
2788
2789 // Pre-allocate buffer for left-over bits of line (exact size doesn't
2790 // matter as it will grow if necessary).
2791 if (ks_resize(&line, 1000) < 0)
2792 goto err;
2793
2794 for (;;) {
2795 // Check for command
2796 pthread_mutex_lock(&fd->command_m);
2797 switch (fd->command) {
2798
2799 case SAM_CLOSE:
2800 pthread_cond_signal(&fd->command_c);
2801 pthread_mutex_unlock(&fd->command_m);
2802 hts_tpool_process_shutdown(fd->q);
2803 goto tidyup;
2804
2805 default:
2806 break;
2807 }
2808 pthread_mutex_unlock(&fd->command_m);
2809
2810 pthread_mutex_lock(&fd->lines_m);
2811 if (fd->lines) {
2812 // reuse existing line buffer
2813 l = fd->lines;
2814 fd->lines = l->next;
2815 }
2816 pthread_mutex_unlock(&fd->lines_m);
2817
2818 if (l == NULL) {
2819 // none to reuse, to create a new one
2820 l = calloc(1, sizeof(*l));
2821 if (!l)
2822 goto err;
2823 l->alloc = NM;
2824 l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
2825 if (!l->data) {
2826 free(l);
2827 l = NULL;
2828 goto err;
2829 }
2830 l->fd = fd;
2831 }
2832 l->next = NULL;
2833
2834 if (l->alloc < line_frag+NM/2) {
2835 char *rp = realloc(l->data, line_frag+NM/2 +8);
2836 if (!rp)
2837 goto err;
2838 l->alloc = line_frag+NM/2;
2839 l->data = rp;
2840 }
2841 memcpy(l->data, line.s, line_frag);
2842
2843 l->data_size = line_frag;
2844 ssize_t nbytes;
2845 longer_line:
2846 if (fp->is_bgzf)
2847 nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
2848 else
2849 nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
2850 if (nbytes < 0) {
2851 sam_state_err(fd, errno ? errno : EIO);
2852 goto err;
2853 } else if (nbytes == 0)
2854 break; // EOF
2855 l->data_size += nbytes;
2856
2857 // trim to last \n. Maybe \r\n, but that's still fine
2858 if (nbytes == l->alloc - line_frag) {
2859 char *cp_end = l->data + l->data_size;
2860 char *cp = cp_end-1;
2861
2862 while (cp > (char *)l->data && *cp != '\n')
2863 cp--;
2864
2865 // entire buffer is part of a single line
2866 if (cp == l->data) {
2867 line_frag = l->data_size;
2868 char *rp = realloc(l->data, l->alloc * 2 + 8);
2869 if (!rp)
2870 goto err;
2871 l->alloc *= 2;
2872 l->data = rp;
2873 assert(l->alloc >= l->data_size);
2874 assert(l->alloc >= line_frag);
2875 assert(l->alloc >= l->alloc - line_frag);
2876 goto longer_line;
2877 }
2878 cp++;
2879
2880 // line holds the remainder of our line.
2881 if (ks_resize(&line, cp_end - cp) < 0)
2882 goto err;
2883 memcpy(line.s, cp, cp_end - cp);
2884 line_frag = cp_end - cp;
2885 l->data_size = l->alloc - line_frag;
2886 } else {
2887 // out of buffer
2888 line_frag = 0;
2889 }
2890
2891 l->serial = fd->serial++;
2892 //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
2893 if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
2894 cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
2895 goto err;
2896 pthread_mutex_lock(&fd->command_m);
2897 if (fd->command == SAM_CLOSE) {
2898 pthread_mutex_unlock(&fd->command_m);
2899 l = NULL;
2900 goto tidyup;
2901 }
2902 l = NULL; // Now "owned" by sam_parse_worker()
2903 pthread_mutex_unlock(&fd->command_m);
2904 }
2905
2906 if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
2907 goto err;
2908
2909 // At EOF, wait for close request.
2910 // (In future if we add support for seek, this is where we need to catch it.)
2911 for (;;) {
2912 pthread_mutex_lock(&fd->command_m);
2913 if (fd->command == SAM_NONE)
2914 pthread_cond_wait(&fd->command_c, &fd->command_m);
2915 switch (fd->command) {
2916 case SAM_CLOSE:
2917 pthread_cond_signal(&fd->command_c);
2918 pthread_mutex_unlock(&fd->command_m);
2919 hts_tpool_process_shutdown(fd->q);
2920 goto tidyup;
2921
2922 default:
2923 pthread_mutex_unlock(&fd->command_m);
2924 break;
2925 }
2926 }
2927
2928 tidyup:
2929 pthread_mutex_lock(&fd->command_m);
2930 fd->command = SAM_CLOSE_DONE;
2931 pthread_cond_signal(&fd->command_c);
2932 pthread_mutex_unlock(&fd->command_m);
2933
2934 if (l) {
2935 pthread_mutex_lock(&fd->lines_m);
2936 l->next = fd->lines;
2937 fd->lines = l;
2938 pthread_mutex_unlock(&fd->lines_m);
2939 }
2940 free(line.s);
2941
2942 return NULL;
2943
2944 err:
2945 sam_state_err(fd, errno ? errno : ENOMEM);
2946 hts_tpool_process_shutdown(fd->q);
2947 goto tidyup;
2948 }
2949
2950 // Runs in its own thread.
2951 // Takes encoded blocks of SAM off the thread results queue and writes them
2952 // to our output stream.
sam_dispatcher_write(void * vp)2953 static void *sam_dispatcher_write(void *vp) {
2954 htsFile *fp = vp;
2955 SAM_state *fd = fp->state;
2956 hts_tpool_result *r;
2957
2958 // Iterates until result queue is shutdown, where it returns NULL.
2959 while ((r = hts_tpool_next_result_wait(fd->q))) {
2960 sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
2961 if (!gl) {
2962 sam_state_err(fd, ENOMEM);
2963 goto err;
2964 }
2965
2966 if (fp->idx) {
2967 sp_bams *gb = gl->bams;
2968 int i = 0, count = 0;
2969 while (i < gl->data_size) {
2970 int j = i;
2971 while (i < gl->data_size && gl->data[i] != '\n')
2972 i++;
2973 if (i < gl->data_size)
2974 i++;
2975
2976 if (fp->is_bgzf) {
2977 if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
2978 goto err;
2979 } else {
2980 if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
2981 goto err;
2982 }
2983
2984 bam1_t *b = &gb->bams[count++];
2985 if (fp->format.compression == bgzf) {
2986 if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
2987 b->core.tid, b->core.pos, bam_endpos(b),
2988 bgzf_tell(fp->fp.bgzf),
2989 !(b->core.flag&BAM_FUNMAP)) < 0) {
2990 sam_state_err(fd, errno ? errno : ENOMEM);
2991 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
2992 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
2993 goto err;
2994 }
2995 } else {
2996 if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
2997 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
2998 sam_state_err(fd, errno ? errno : ENOMEM);
2999 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3000 bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3001 goto err;
3002 }
3003 }
3004 }
3005
3006 assert(count == gb->nbams);
3007
3008 // Add bam array to free-list
3009 pthread_mutex_lock(&fd->lines_m);
3010 gb->next = fd->bams;
3011 fd->bams = gl->bams;
3012 gl->bams = NULL;
3013 pthread_mutex_unlock(&fd->lines_m);
3014 } else {
3015 if (fp->is_bgzf) {
3016 if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size)
3017 goto err;
3018 } else {
3019 if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3020 goto err;
3021 }
3022 }
3023
3024 hts_tpool_delete_result(r, 0);
3025
3026 // Also updated by main thread
3027 pthread_mutex_lock(&fd->lines_m);
3028 gl->next = fd->lines;
3029 fd->lines = gl;
3030 pthread_mutex_unlock(&fd->lines_m);
3031 }
3032
3033 sam_state_err(fd, 0); // success
3034 hts_tpool_process_shutdown(fd->q);
3035 return NULL;
3036
3037 err:
3038 sam_state_err(fd, errno ? errno : EIO);
3039 return (void *)-1;
3040 }
3041
3042 // Run from one of the worker threads.
3043 // Convert a passed in array of BAMs (sp_bams) and converts to a block
3044 // of text SAM records (sp_lines).
sam_format_worker(void * arg)3045 static void *sam_format_worker(void *arg) {
3046 sp_bams *gb = (sp_bams *)arg;
3047 sp_lines *gl = NULL;
3048 int i;
3049 SAM_state *fd = gb->fd;
3050 htsFile *fp = fd->fp;
3051
3052 // Use a block of SAM strings we had earlier if available.
3053 pthread_mutex_lock(&fd->lines_m);
3054 if (fd->lines) {
3055 gl = fd->lines;
3056 fd->lines = gl->next;
3057 }
3058 pthread_mutex_unlock(&fd->lines_m);
3059
3060 if (gl == NULL) {
3061 gl = calloc(1, sizeof(*gl));
3062 if (!gl) {
3063 sam_state_err(fd, ENOMEM);
3064 return NULL;
3065 }
3066 gl->alloc = gl->data_size = 0;
3067 gl->data = NULL;
3068 }
3069 gl->serial = gb->serial;
3070 gl->next = NULL;
3071
3072 kstring_t ks = {0, gl->alloc, gl->data};
3073
3074 for (i = 0; i < gb->nbams; i++) {
3075 if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3076 sam_state_err(fd, errno ? errno : EIO);
3077 goto err;
3078 }
3079 kputc('\n', &ks);
3080 }
3081
3082 pthread_mutex_lock(&fd->lines_m);
3083 gl->data_size = ks.l;
3084 gl->alloc = ks.m;
3085 gl->data = ks.s;
3086
3087 if (fp->idx) {
3088 // Keep hold of the bam array a little longer as
3089 // sam_dispatcher_write needs to use them for building the index.
3090 gl->bams = gb;
3091 } else {
3092 // Add bam array to free-list
3093 gb->next = fd->bams;
3094 fd->bams = gb;
3095 }
3096 pthread_mutex_unlock(&fd->lines_m);
3097
3098 return gl;
3099
3100 err:
3101 // Possible race between this and fd->curr_bam.
3102 // Easier to not free and leave it on the input list so it
3103 // gets freed there instead?
3104 // sam_free_sp_bams(gb);
3105 if (gl) {
3106 free(gl->data);
3107 free(gl);
3108 }
3109 return NULL;
3110 }
3111
sam_set_thread_pool(htsFile * fp,htsThreadPool * p)3112 int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3113 if (fp->state)
3114 return 0;
3115
3116 if (!(fp->state = sam_state_create(fp)))
3117 return -1;
3118 SAM_state *fd = (SAM_state *)fp->state;
3119
3120 pthread_mutex_init(&fd->lines_m, NULL);
3121 pthread_mutex_init(&fd->command_m, NULL);
3122 pthread_cond_init(&fd->command_c, NULL);
3123 fd->p = p->pool;
3124 int qsize = p->qsize;
3125 if (!qsize)
3126 qsize = 2*hts_tpool_size(fd->p);
3127 fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3128 if (!fd->q) {
3129 sam_state_destroy(fp);
3130 return -1;
3131 }
3132
3133 if (fp->format.compression == bgzf)
3134 return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3135
3136 return 0;
3137 }
3138
sam_set_threads(htsFile * fp,int nthreads)3139 int sam_set_threads(htsFile *fp, int nthreads) {
3140 if (nthreads <= 0)
3141 return 0;
3142
3143 htsThreadPool p;
3144 p.pool = hts_tpool_init(nthreads);
3145 p.qsize = nthreads*2;
3146
3147 int ret = sam_set_thread_pool(fp, &p);
3148 if (ret < 0)
3149 return ret;
3150
3151 SAM_state *fd = (SAM_state *)fp->state;
3152 fd->own_pool = 1;
3153
3154 return 0;
3155 }
3156
3157 // Returns 0 on success,
3158 // -1 on EOF,
3159 // <-1 on error
sam_read1(htsFile * fp,sam_hdr_t * h,bam1_t * b)3160 int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
3161 {
3162 switch (fp->format.format) {
3163 case bam: {
3164 int r = bam_read1(fp->fp.bgzf, b);
3165 if (h && r >= 0) {
3166 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
3167 b->core.mtid >= h->n_targets || b->core.mtid < -1) {
3168 errno = ERANGE;
3169 return -3;
3170 }
3171 }
3172 return r;
3173 }
3174
3175 case cram: {
3176 int ret = cram_get_bam_seq(fp->fp.cram, &b);
3177 if (ret < 0)
3178 return cram_eof(fp->fp.cram) ? -1 : -2;
3179
3180 if (bam_tag2cigar(b, 1, 1) < 0)
3181 return -2;
3182 return ret;
3183 }
3184
3185 case sam: {
3186 // Consume 1st line after header parsing as it wasn't using peek
3187 if (fp->line.l != 0) {
3188 int ret = sam_parse1(&fp->line, h, b);
3189 fp->line.l = 0;
3190 return ret;
3191 }
3192
3193 if (fp->state) {
3194 SAM_state *fd = (SAM_state *)fp->state;
3195
3196 if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
3197 // We don't support multi-threaded SAM parsing with seeks yet.
3198 int ret;
3199 if ((ret = sam_state_destroy(fp)) < 0) {
3200 errno = -ret;
3201 return -2;
3202 }
3203 if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
3204 return -1;
3205 fp->fp.bgzf->seeked = 0;
3206 goto err_recover;
3207 }
3208
3209 if (!fd->h) {
3210 fd->h = h;
3211 fd->h->ref_count++;
3212 // Ensure hrecs is initialised now as we don't want multiple
3213 // threads trying to do this simultaneously.
3214 if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
3215 return -2;
3216
3217 // We can only do this once we've got a header
3218 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
3219 fp) != 0)
3220 return -2;
3221 fd->dispatcher_set = 1;
3222 }
3223
3224 if (fd->h != h) {
3225 hts_log_error("SAM multi-threaded decoding does not support changing header");
3226 return -1;
3227 }
3228
3229 sp_bams *gb = fd->curr_bam;
3230 if (!gb) {
3231 if (fd->errcode) {
3232 // In case reader failed
3233 errno = fd->errcode;
3234 return -2;
3235 }
3236 hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
3237 if (!r)
3238 return -2;
3239 fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
3240 hts_tpool_delete_result(r, 0);
3241 }
3242 if (!gb)
3243 return fd->errcode ? -2 : -1;
3244 bam1_t *b_array = (bam1_t *)gb->bams;
3245 if (fd->curr_idx < gb->nbams)
3246 if (!bam_copy1(b, &b_array[fd->curr_idx++]))
3247 return -2;
3248 if (fd->curr_idx == gb->nbams) {
3249 pthread_mutex_lock(&fd->lines_m);
3250 gb->next = fd->bams;
3251 fd->bams = gb;
3252 pthread_mutex_unlock(&fd->lines_m);
3253
3254 fd->curr_bam = NULL;
3255 fd->curr_idx = 0;
3256 }
3257
3258 return 0;
3259
3260 } else {
3261 int ret;
3262 err_recover:
3263
3264 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3265 if (ret < 0) return ret;
3266
3267 ret = sam_parse1(&fp->line, h, b);
3268 fp->line.l = 0;
3269 if (ret < 0) {
3270 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
3271 if (h->ignore_sam_err) goto err_recover;
3272 }
3273 return ret;
3274 }
3275 }
3276
3277 case empty_format:
3278 errno = EPIPE;
3279 return -3;
3280
3281 default:
3282 errno = EFTYPE;
3283 return -3;
3284 }
3285 }
3286
3287
sam_format1_append(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)3288 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
3289 {
3290 int i, r = 0;
3291 uint8_t *s, *end;
3292 const bam1_core_t *c = &b->core;
3293
3294 if (c->l_qname == 0)
3295 return -1;
3296 r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
3297 r |= kputc_('\t', str); // query name
3298 r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
3299 if (c->tid >= 0) { // chr
3300 r |= kputs(h->target_name[c->tid] , str);
3301 r |= kputc_('\t', str);
3302 } else r |= kputsn_("*\t", 2, str);
3303 r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
3304 r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
3305 if (c->n_cigar) { // cigar
3306 uint32_t *cigar = bam_get_cigar(b);
3307 for (i = 0; i < c->n_cigar; ++i) {
3308 r |= kputw(bam_cigar_oplen(cigar[i]), str);
3309 r |= kputc_(bam_cigar_opchr(cigar[i]), str);
3310 }
3311 } else r |= kputc_('*', str);
3312 r |= kputc_('\t', str);
3313 if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
3314 else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
3315 else {
3316 r |= kputs(h->target_name[c->mtid], str);
3317 r |= kputc_('\t', str);
3318 }
3319 r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
3320 r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
3321 if (c->l_qseq) { // seq and qual
3322 uint8_t *s = bam_get_seq(b);
3323 if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
3324 char *cp = str->s + str->l;
3325
3326 // Sequence, 2 bases at a time
3327 nibble2base(s, cp, c->l_qseq);
3328 cp[c->l_qseq] = '\t';
3329 cp += c->l_qseq+1;
3330
3331 // Quality
3332 s = bam_get_qual(b);
3333 i = 0;
3334 if (s[0] == 0xff) {
3335 cp[i++] = '*';
3336 } else {
3337 // local copy of c->l_qseq to aid unrolling
3338 uint32_t lqseq = c->l_qseq;
3339 for (i = 0; i < lqseq; ++i)
3340 cp[i]=s[i]+33;
3341 }
3342 cp[i] = 0;
3343 cp += i;
3344 str->l = cp - str->s;
3345 } else r |= kputsn_("*\t*", 3, str);
3346
3347 s = bam_get_aux(b); // aux
3348 end = b->data + b->l_data;
3349
3350 while (end - s >= 4) {
3351 r |= kputc_('\t', str);
3352 if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
3353 goto bad_aux;
3354 }
3355 r |= kputsn("", 0, str); // nul terminate
3356 if (r < 0) goto mem_err;
3357
3358 return str->l;
3359
3360 bad_aux:
3361 hts_log_error("Corrupted aux data for read %.*s",
3362 b->core.l_qname, bam_get_qname(b));
3363 errno = EINVAL;
3364 return -1;
3365
3366 mem_err:
3367 hts_log_error("Out of memory");
3368 errno = ENOMEM;
3369 return -1;
3370 }
3371
sam_format1(const bam_hdr_t * h,const bam1_t * b,kstring_t * str)3372 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
3373 {
3374 str->l = 0;
3375 return sam_format1_append(h, b, str);
3376 }
3377
3378 // Sadly we need to be able to modify the bam_hdr here so we can
3379 // reference count the structure.
sam_write1(htsFile * fp,const sam_hdr_t * h,const bam1_t * b)3380 int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
3381 {
3382 switch (fp->format.format) {
3383 case binary_format:
3384 fp->format.category = sequence_data;
3385 fp->format.format = bam;
3386 /* fall-through */
3387 case bam:
3388 return bam_write_idx1(fp, h, b);
3389
3390 case cram:
3391 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
3392
3393 case text_format:
3394 fp->format.category = sequence_data;
3395 fp->format.format = sam;
3396 /* fall-through */
3397 case sam:
3398 if (fp->state) {
3399 SAM_state *fd = (SAM_state *)fp->state;
3400
3401 // Threaded output
3402 if (!fd->h) {
3403 // NB: discard const. We don't actually modify sam_hdr_t here,
3404 // just data pointed to by it (which is a bit weasely still),
3405 // but out cached pointer must be non-const as we want to
3406 // destroy it later on and sam_hdr_destroy takes non-const.
3407 //
3408 // We do this because some tools do sam_hdr_destroy; sam_close
3409 // while others do sam_close; sam_hdr_destroy. The former is
3410 // an issue as we need the header still when flushing.
3411 fd->h = (sam_hdr_t *)h;
3412 fd->h->ref_count++;
3413
3414 if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
3415 fp) != 0)
3416 return -2;
3417 fd->dispatcher_set = 1;
3418 }
3419
3420 if (fd->h != h) {
3421 hts_log_error("SAM multi-threaded decoding does not support changing header");
3422 return -2;
3423 }
3424
3425 // Find a suitable BAM array to copy to
3426 sp_bams *gb = fd->curr_bam;
3427 if (!gb) {
3428 pthread_mutex_lock(&fd->lines_m);
3429 if (fd->bams) {
3430 fd->curr_bam = gb = fd->bams;
3431 fd->bams = gb->next;
3432 gb->next = NULL;
3433 gb->nbams = 0;
3434 pthread_mutex_unlock(&fd->lines_m);
3435 } else {
3436 pthread_mutex_unlock(&fd->lines_m);
3437 if (!(gb = calloc(1, sizeof(*gb)))) return -1;
3438 if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) {
3439 free(gb);
3440 return -1;
3441 }
3442 gb->nbams = 0;
3443 gb->abams = NB;
3444 gb->fd = fd;
3445 fd->curr_idx = 0;
3446 fd->curr_bam = gb;
3447 }
3448 }
3449
3450 if (!bam_copy1(&gb->bams[gb->nbams++], b))
3451 return -2;
3452
3453 // Dispatch if full
3454 if (gb->nbams == NB) {
3455 gb->serial = fd->serial++;
3456 //fprintf(stderr, "Dispatch another %d bams\n", NB);
3457 pthread_mutex_lock(&fd->command_m);
3458 if (fd->errcode != 0) {
3459 pthread_mutex_unlock(&fd->command_m);
3460 return -fd->errcode;
3461 }
3462 if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
3463 cleanup_sp_bams,
3464 cleanup_sp_lines, 0) < 0) {
3465 pthread_mutex_unlock(&fd->command_m);
3466 return -1;
3467 }
3468 pthread_mutex_unlock(&fd->command_m);
3469 fd->curr_bam = NULL;
3470 }
3471
3472 // Dummy value as we don't know how long it really is.
3473 // We could track file sizes via a SAM_state field, but I don't think
3474 // it is necessary.
3475 return 1;
3476 } else {
3477 if (sam_format1(h, b, &fp->line) < 0) return -1;
3478 kputc('\n', &fp->line);
3479 if (fp->is_bgzf) {
3480 if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
3481 } else {
3482 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
3483 }
3484
3485 if (fp->idx) {
3486 if (fp->format.compression == bgzf) {
3487 if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3488 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3489 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3490 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
3491 return -1;
3492 }
3493 } else {
3494 if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3495 bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3496 hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3497 bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
3498 return -1;
3499 }
3500 }
3501 }
3502
3503 return fp->line.l;
3504 }
3505
3506 default:
3507 errno = EBADF;
3508 return -1;
3509 }
3510 }
3511
3512 /************************
3513 *** Auxiliary fields ***
3514 ************************/
3515 #ifndef HTS_LITTLE_ENDIAN
aux_to_le(char type,uint8_t * out,const uint8_t * in,size_t len)3516 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
3517 int tsz = aux_type2size(type);
3518
3519 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
3520
3521 switch (tsz) {
3522 case 'H': case 'Z': case 1: // Trivial
3523 memcpy(out, in, len);
3524 break;
3525
3526 #define aux_val_to_le(type_t, store_le) do { \
3527 type_t v; \
3528 size_t i; \
3529 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
3530 memcpy(&v, in + i, sizeof(type_t)); \
3531 store_le(v, out); \
3532 } \
3533 } while (0)
3534
3535 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
3536 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
3537 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
3538
3539 #undef aux_val_to_le
3540
3541 case 'B': { // Recurse!
3542 uint32_t n;
3543 if (len < 5) return -1;
3544 memcpy(&n, in + 1, 4);
3545 out[0] = in[0];
3546 u32_to_le(n, out + 1);
3547 return aux_to_le(in[0], out + 5, in + 5, len - 5);
3548 }
3549
3550 default: // Unknown type code
3551 return -1;
3552 }
3553
3554
3555
3556 return 0;
3557 }
3558 #endif
3559
bam_aux_append(bam1_t * b,const char tag[2],char type,int len,const uint8_t * data)3560 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
3561 {
3562 uint32_t new_len;
3563
3564 assert(b->l_data >= 0);
3565 new_len = b->l_data + 3 + len;
3566 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
3567
3568 if (realloc_bam_data(b, new_len) < 0) return -1;
3569
3570 b->data[b->l_data] = tag[0];
3571 b->data[b->l_data + 1] = tag[1];
3572 b->data[b->l_data + 2] = type;
3573
3574 #ifdef HTS_LITTLE_ENDIAN
3575 memcpy(b->data + b->l_data + 3, data, len);
3576 #else
3577 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
3578 errno = EINVAL;
3579 return -1;
3580 }
3581 #endif
3582
3583 b->l_data = new_len;
3584
3585 return 0;
3586
3587 nomem:
3588 errno = ENOMEM;
3589 return -1;
3590 }
3591
skip_aux(uint8_t * s,uint8_t * end)3592 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
3593 {
3594 int size;
3595 uint32_t n;
3596 if (s >= end) return end;
3597 size = aux_type2size(*s); ++s; // skip type
3598 switch (size) {
3599 case 'Z':
3600 case 'H':
3601 while (s < end && *s) ++s;
3602 return s < end ? s + 1 : end;
3603 case 'B':
3604 if (end - s < 5) return NULL;
3605 size = aux_type2size(*s); ++s;
3606 n = le_to_u32(s);
3607 s += 4;
3608 if (size == 0 || end - s < size * n) return NULL;
3609 return s + size * n;
3610 case 0:
3611 return NULL;
3612 default:
3613 if (end - s < size) return NULL;
3614 return s + size;
3615 }
3616 }
3617
bam_aux_get(const bam1_t * b,const char tag[2])3618 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
3619 {
3620 uint8_t *s, *end, *t = (uint8_t *) tag;
3621 uint16_t y = (uint16_t) t[0]<<8 | t[1];
3622 s = bam_get_aux(b);
3623 end = b->data + b->l_data;
3624 while (s != NULL && end - s >= 3) {
3625 uint16_t x = (uint16_t) s[0]<<8 | s[1];
3626 s += 2;
3627 if (x == y) {
3628 // Check the tag value is valid and complete
3629 uint8_t *e = skip_aux(s, end);
3630 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
3631 goto bad_aux; // Unterminated string
3632 }
3633 if (e != NULL) {
3634 return s;
3635 } else {
3636 goto bad_aux;
3637 }
3638 }
3639 s = skip_aux(s, end);
3640 }
3641 if (s == NULL) goto bad_aux;
3642 errno = ENOENT;
3643 return NULL;
3644
3645 bad_aux:
3646 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
3647 errno = EINVAL;
3648 return NULL;
3649 }
3650
3651 // s MUST BE returned by bam_aux_get()
bam_aux_del(bam1_t * b,uint8_t * s)3652 int bam_aux_del(bam1_t *b, uint8_t *s)
3653 {
3654 uint8_t *p, *aux;
3655 int l_aux = bam_get_l_aux(b);
3656 aux = bam_get_aux(b);
3657 p = s - 2;
3658 s = skip_aux(s, aux + l_aux);
3659 if (s == NULL) goto bad_aux;
3660 memmove(p, s, l_aux - (s - aux));
3661 b->l_data -= s - p;
3662 return 0;
3663
3664 bad_aux:
3665 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
3666 errno = EINVAL;
3667 return -1;
3668 }
3669
bam_aux_update_str(bam1_t * b,const char tag[2],int len,const char * data)3670 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
3671 {
3672 // FIXME: This is not at all efficient!
3673 size_t ln = len >= 0 ? len : strlen(data) + 1;
3674 size_t old_ln = 0;
3675 int need_nul = ln == 0 || data[ln - 1] != '\0';
3676 int save_errno = errno;
3677 int new_tag = 0;
3678 uint8_t *s = bam_aux_get(b,tag), *e;
3679
3680 if (s) { // Replacing existing tag
3681 char type = *s;
3682 if (type != 'Z') {
3683 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
3684 errno = EINVAL;
3685 return -1;
3686 }
3687 s++;
3688 e = memchr(s, '\0', b->data + b->l_data - s);
3689 old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
3690 s -= 3;
3691 } else {
3692 if (errno != ENOENT) { // Invalid aux data, give up
3693 return -1;
3694 } else { // Tag doesn't exist - put it on the end
3695 errno = save_errno;
3696 s = b->data + b->l_data;
3697 new_tag = 3;
3698 }
3699 }
3700
3701 if (old_ln < ln + need_nul + new_tag) {
3702 ptrdiff_t s_offset = s - b->data;
3703 if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
3704 return -1;
3705 s = b->data + s_offset;
3706 }
3707 if (!new_tag) {
3708 memmove(s + 3 + ln + need_nul,
3709 s + 3 + old_ln,
3710 b->l_data - (s + 3 - b->data) - old_ln);
3711 }
3712 b->l_data += new_tag + ln + need_nul - old_ln;
3713
3714 s[0] = tag[0];
3715 s[1] = tag[1];
3716 s[2] = 'Z';
3717 memmove(s+3,data,ln);
3718 if (need_nul) s[3 + ln] = '\0';
3719 return 0;
3720 }
3721
bam_aux_update_int(bam1_t * b,const char tag[2],int64_t val)3722 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
3723 {
3724 uint32_t sz, old_sz = 0, new = 0;
3725 uint8_t *s, type;
3726
3727 if (val < INT32_MIN || val > UINT32_MAX) {
3728 errno = EOVERFLOW;
3729 return -1;
3730 }
3731 if (val < INT16_MIN) { type = 'i'; sz = 4; }
3732 else if (val < INT8_MIN) { type = 's'; sz = 2; }
3733 else if (val < 0) { type = 'c'; sz = 1; }
3734 else if (val < UINT8_MAX) { type = 'C'; sz = 1; }
3735 else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
3736 else { type = 'I'; sz = 4; }
3737
3738 s = bam_aux_get(b, tag);
3739 if (s) { // Tag present - how big was the old one?
3740 switch (*s) {
3741 case 'c': case 'C': old_sz = 1; break;
3742 case 's': case 'S': old_sz = 2; break;
3743 case 'i': case 'I': old_sz = 4; break;
3744 default: errno = EINVAL; return -1; // Not an integer
3745 }
3746 } else {
3747 if (errno == ENOENT) { // Tag doesn't exist - add a new one
3748 s = b->data + b->l_data;
3749 new = 1;
3750 } else { // Invalid aux data, give up.
3751 return -1;
3752 }
3753 }
3754
3755 if (new || old_sz < sz) {
3756 // Make room for new tag
3757 ptrdiff_t s_offset = s - b->data;
3758 if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
3759 return -1;
3760 s = b->data + s_offset;
3761 if (new) { // Add tag id
3762 *s++ = tag[0];
3763 *s++ = tag[1];
3764 } else { // Shift following data so we have space
3765 memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
3766 }
3767 } else {
3768 // Reuse old space. Data value may be bigger than necessary but
3769 // we avoid having to move everything else
3770 sz = old_sz;
3771 type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
3772 assert(type > 0);
3773 }
3774 *s++ = type;
3775 #ifdef HTS_LITTLE_ENDIAN
3776 memcpy(s, &val, sz);
3777 #else
3778 switch (sz) {
3779 case 4: u32_to_le(val, s); break;
3780 case 2: u16_to_le(val, s); break;
3781 default: *s = val; break;
3782 }
3783 #endif
3784 b->l_data += (new ? 3 : 0) + sz - old_sz;
3785 return 0;
3786 }
3787
bam_aux_update_float(bam1_t * b,const char tag[2],float val)3788 int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
3789 {
3790 uint8_t *s = bam_aux_get(b, tag);
3791 int shrink = 0, new = 0;
3792
3793 if (s) { // Tag present - what was it?
3794 switch (*s) {
3795 case 'f': break;
3796 case 'd': shrink = 1; break;
3797 default: errno = EINVAL; return -1; // Not a float
3798 }
3799 } else {
3800 if (errno == ENOENT) { // Tag doesn't exist - add a new one
3801 new = 1;
3802 } else { // Invalid aux data, give up.
3803 return -1;
3804 }
3805 }
3806
3807 if (new) { // Ensure there's room
3808 if (possibly_expand_bam_data(b, 3 + 4) < 0)
3809 return -1;
3810 s = b->data + b->l_data;
3811 *s++ = tag[0];
3812 *s++ = tag[1];
3813 } else if (shrink) { // Convert non-standard double tag to float
3814 memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
3815 b->l_data -= 4;
3816 }
3817 *s++ = 'f';
3818 float_to_le(val, s);
3819 if (new) b->l_data += 7;
3820
3821 return 0;
3822 }
3823
bam_aux_update_array(bam1_t * b,const char tag[2],uint8_t type,uint32_t items,void * data)3824 int bam_aux_update_array(bam1_t *b, const char tag[2],
3825 uint8_t type, uint32_t items, void *data)
3826 {
3827 uint8_t *s = bam_aux_get(b, tag);
3828 size_t old_sz = 0, new_sz;
3829 int new = 0;
3830
3831 if (s) { // Tag present
3832 if (*s != 'B') { errno = EINVAL; return -1; }
3833 old_sz = aux_type2size(s[1]);
3834 if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
3835 old_sz *= le_to_u32(s + 2);
3836 } else {
3837 if (errno == ENOENT) { // Tag doesn't exist - add a new one
3838 s = b->data + b->l_data;
3839 new = 1;
3840 } else { // Invalid aux data, give up.
3841 return -1;
3842 }
3843 }
3844
3845 new_sz = aux_type2size(type);
3846 if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
3847 if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
3848 new_sz *= items;
3849
3850 if (new || old_sz < new_sz) {
3851 // Make room for new tag
3852 ptrdiff_t s_offset = s - b->data;
3853 if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
3854 return -1;
3855 s = b->data + s_offset;
3856 }
3857 if (new) { // Add tag id and type
3858 *s++ = tag[0];
3859 *s++ = tag[1];
3860 *s = 'B';
3861 b->l_data += 8 + new_sz;
3862 } else if (old_sz != new_sz) { // shift following data if necessary
3863 memmove(s + 6 + new_sz, s + 6 + old_sz,
3864 b->l_data - ((s + 6 + old_sz) - b->data));
3865 b->l_data -= old_sz;
3866 b->l_data += new_sz;
3867 }
3868
3869 s[1] = type;
3870 u32_to_le(items, s + 2);
3871 #ifdef HTS_LITTLE_ENDIAN
3872 memcpy(s + 6, data, new_sz);
3873 return 0;
3874 #else
3875 return aux_to_le(type, s + 6, data, new_sz);
3876 #endif
3877 }
3878
get_int_aux_val(uint8_t type,const uint8_t * s,uint32_t idx)3879 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
3880 uint32_t idx)
3881 {
3882 switch (type) {
3883 case 'c': return le_to_i8(s + idx);
3884 case 'C': return s[idx];
3885 case 's': return le_to_i16(s + 2 * idx);
3886 case 'S': return le_to_u16(s + 2 * idx);
3887 case 'i': return le_to_i32(s + 4 * idx);
3888 case 'I': return le_to_u32(s + 4 * idx);
3889 default:
3890 errno = EINVAL;
3891 return 0;
3892 }
3893 }
3894
bam_aux2i(const uint8_t * s)3895 int64_t bam_aux2i(const uint8_t *s)
3896 {
3897 int type;
3898 type = *s++;
3899 return get_int_aux_val(type, s, 0);
3900 }
3901
bam_aux2f(const uint8_t * s)3902 double bam_aux2f(const uint8_t *s)
3903 {
3904 int type;
3905 type = *s++;
3906 if (type == 'd') return le_to_double(s);
3907 else if (type == 'f') return le_to_float(s);
3908 else return get_int_aux_val(type, s, 0);
3909 }
3910
bam_aux2A(const uint8_t * s)3911 char bam_aux2A(const uint8_t *s)
3912 {
3913 int type;
3914 type = *s++;
3915 if (type == 'A') return *(char*)s;
3916 errno = EINVAL;
3917 return 0;
3918 }
3919
bam_aux2Z(const uint8_t * s)3920 char *bam_aux2Z(const uint8_t *s)
3921 {
3922 int type;
3923 type = *s++;
3924 if (type == 'Z' || type == 'H') return (char*)s;
3925 errno = EINVAL;
3926 return 0;
3927 }
3928
bam_auxB_len(const uint8_t * s)3929 uint32_t bam_auxB_len(const uint8_t *s)
3930 {
3931 if (s[0] != 'B') {
3932 errno = EINVAL;
3933 return 0;
3934 }
3935 return le_to_u32(s + 2);
3936 }
3937
bam_auxB2i(const uint8_t * s,uint32_t idx)3938 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
3939 {
3940 uint32_t len = bam_auxB_len(s);
3941 if (idx >= len) {
3942 errno = ERANGE;
3943 return 0;
3944 }
3945 return get_int_aux_val(s[1], s + 6, idx);
3946 }
3947
bam_auxB2f(const uint8_t * s,uint32_t idx)3948 double bam_auxB2f(const uint8_t *s, uint32_t idx)
3949 {
3950 uint32_t len = bam_auxB_len(s);
3951 if (idx >= len) {
3952 errno = ERANGE;
3953 return 0.0;
3954 }
3955 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
3956 else return get_int_aux_val(s[1], s + 6, idx);
3957 }
3958
sam_open_mode(char * mode,const char * fn,const char * format)3959 int sam_open_mode(char *mode, const char *fn, const char *format)
3960 {
3961 // TODO Parse "bam5" etc for compression level
3962 if (format == NULL) {
3963 // Try to pick a format based on the filename extension
3964 char extension[HTS_MAX_EXT_LEN];
3965 if (find_file_extension(fn, extension) < 0) return -1;
3966 return sam_open_mode(mode, fn, extension);
3967 }
3968 else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
3969 else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
3970 else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
3971 else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
3972 else return -1;
3973
3974 return 0;
3975 }
3976
3977 // A version of sam_open_mode that can handle ,key=value options.
3978 // The format string is allocated and returned, to be freed by the caller.
3979 // Prefix should be "r" or "w",
sam_open_mode_opts(const char * fn,const char * mode,const char * format)3980 char *sam_open_mode_opts(const char *fn,
3981 const char *mode,
3982 const char *format)
3983 {
3984 char *mode_opts = malloc((format ? strlen(format) : 1) +
3985 (mode ? strlen(mode) : 1) + 12);
3986 char *opts, *cp;
3987 int format_len;
3988
3989 if (!mode_opts)
3990 return NULL;
3991
3992 strcpy(mode_opts, mode ? mode : "r");
3993 cp = mode_opts + strlen(mode_opts);
3994
3995 if (format == NULL) {
3996 // Try to pick a format based on the filename extension
3997 char extension[HTS_MAX_EXT_LEN];
3998 if (find_file_extension(fn, extension) < 0) {
3999 free(mode_opts);
4000 return NULL;
4001 }
4002 if (sam_open_mode(cp, fn, extension) == 0) {
4003 return mode_opts;
4004 } else {
4005 free(mode_opts);
4006 return NULL;
4007 }
4008 }
4009
4010 if ((opts = strchr(format, ','))) {
4011 format_len = opts-format;
4012 } else {
4013 opts="";
4014 format_len = strlen(format);
4015 }
4016
4017 if (strncmp(format, "bam", format_len) == 0) {
4018 *cp++ = 'b';
4019 } else if (strncmp(format, "cram", format_len) == 0) {
4020 *cp++ = 'c';
4021 } else if (strncmp(format, "cram2", format_len) == 0) {
4022 *cp++ = 'c';
4023 strcpy(cp, ",VERSION=2.1");
4024 cp += 12;
4025 } else if (strncmp(format, "cram3", format_len) == 0) {
4026 *cp++ = 'c';
4027 strcpy(cp, ",VERSION=3.0");
4028 cp += 12;
4029 } else if (strncmp(format, "sam", format_len) == 0) {
4030 ; // format mode=""
4031 } else if (strncmp(format, "sam.gz", format_len) == 0) {
4032 *cp++ = 'z';
4033 } else {
4034 free(mode_opts);
4035 return NULL;
4036 }
4037
4038 strcpy(cp, opts);
4039
4040 return mode_opts;
4041 }
4042
4043 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
bam_str2flag(const char * str)4044 int bam_str2flag(const char *str)
4045 {
4046 char *end, *beg = (char*) str;
4047 long int flag = strtol(str, &end, 0);
4048 if ( end!=str ) return flag; // the conversion was successful
4049 flag = 0;
4050 while ( *str )
4051 {
4052 end = beg;
4053 while ( *end && *end!=',' ) end++;
4054 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
4055 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
4056 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
4057 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
4058 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
4059 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
4060 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
4061 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
4062 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
4063 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
4064 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
4065 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
4066 else return -1;
4067 if ( !*end ) break;
4068 beg = end + 1;
4069 }
4070 return flag;
4071 }
4072
bam_flag2str(int flag)4073 char *bam_flag2str(int flag)
4074 {
4075 kstring_t str = {0,0,0};
4076 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
4077 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
4078 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
4079 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
4080 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
4081 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
4082 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
4083 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
4084 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
4085 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
4086 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
4087 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
4088 if ( str.l == 0 ) kputsn("", 0, &str);
4089 return str.s;
4090 }
4091
4092
4093 /**************************
4094 *** Pileup and Mpileup ***
4095 **************************/
4096
4097 #if !defined(BAM_NO_PILEUP)
4098
4099 #include <assert.h>
4100
4101 /*******************
4102 *** Memory pool ***
4103 *******************/
4104
4105 typedef struct {
4106 int k, y;
4107 hts_pos_t x, end;
4108 } cstate_t;
4109
4110 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
4111
4112 typedef struct __linkbuf_t {
4113 bam1_t b;
4114 hts_pos_t beg, end;
4115 cstate_t s;
4116 struct __linkbuf_t *next;
4117 bam_pileup_cd cd;
4118 } lbnode_t;
4119
4120 typedef struct {
4121 int cnt, n, max;
4122 lbnode_t **buf;
4123 } mempool_t;
4124
mp_init(void)4125 static mempool_t *mp_init(void)
4126 {
4127 mempool_t *mp;
4128 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
4129 return mp;
4130 }
mp_destroy(mempool_t * mp)4131 static void mp_destroy(mempool_t *mp)
4132 {
4133 int k;
4134 for (k = 0; k < mp->n; ++k) {
4135 free(mp->buf[k]->b.data);
4136 free(mp->buf[k]);
4137 }
4138 free(mp->buf);
4139 free(mp);
4140 }
mp_alloc(mempool_t * mp)4141 static inline lbnode_t *mp_alloc(mempool_t *mp)
4142 {
4143 ++mp->cnt;
4144 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
4145 else return mp->buf[--mp->n];
4146 }
mp_free(mempool_t * mp,lbnode_t * p)4147 static inline void mp_free(mempool_t *mp, lbnode_t *p)
4148 {
4149 --mp->cnt; p->next = 0; // clear lbnode_t::next here
4150 if (mp->n == mp->max) {
4151 mp->max = mp->max? mp->max<<1 : 256;
4152 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
4153 }
4154 mp->buf[mp->n++] = p;
4155 }
4156
4157 /**********************
4158 *** CIGAR resolver ***
4159 **********************/
4160
4161 /* s->k: the index of the CIGAR operator that has just been processed.
4162 s->x: the reference coordinate of the start of s->k
4163 s->y: the query coordinate of the start of s->k
4164 */
resolve_cigar2(bam_pileup1_t * p,hts_pos_t pos,cstate_t * s)4165 static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
4166 {
4167 #define _cop(c) ((c)&BAM_CIGAR_MASK)
4168 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
4169
4170 bam1_t *b = p->b;
4171 bam1_core_t *c = &b->core;
4172 uint32_t *cigar = bam_get_cigar(b);
4173 int k;
4174 // determine the current CIGAR operation
4175 //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
4176 if (s->k == -1) { // never processed
4177 p->qpos = 0;
4178 if (c->n_cigar == 1) { // just one operation, save a loop
4179 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
4180 } else { // find the first match or deletion
4181 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
4182 int op = _cop(cigar[k]);
4183 int l = _cln(cigar[k]);
4184 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
4185 op == BAM_CEQUAL || op == BAM_CDIFF) break;
4186 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
4187 }
4188 assert(k < c->n_cigar);
4189 s->k = k;
4190 }
4191 } else { // the read has been processed before
4192 int op, l = _cln(cigar[s->k]);
4193 if (pos - s->x >= l) { // jump to the next operation
4194 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
4195 op = _cop(cigar[s->k+1]);
4196 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
4197 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
4198 s->x += l;
4199 ++s->k;
4200 } else { // find the next M/D/N/=/X
4201 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
4202 s->x += l;
4203 for (k = s->k + 1; k < c->n_cigar; ++k) {
4204 op = _cop(cigar[k]), l = _cln(cigar[k]);
4205 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
4206 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
4207 }
4208 s->k = k;
4209 }
4210 assert(s->k < c->n_cigar); // otherwise a bug
4211 } // else, do nothing
4212 }
4213 { // collect pileup information
4214 int op, l;
4215 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
4216 p->is_del = p->indel = p->is_refskip = 0;
4217 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
4218 int op2 = _cop(cigar[s->k+1]);
4219 int l2 = _cln(cigar[s->k+1]);
4220 if (op2 == BAM_CDEL) p->indel = -(int)l2;
4221 else if (op2 == BAM_CINS) p->indel = l2;
4222 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
4223 int l3 = 0;
4224 for (k = s->k + 2; k < c->n_cigar; ++k) {
4225 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
4226 if (op2 == BAM_CINS) l3 += l2;
4227 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
4228 }
4229 if (l3 > 0) p->indel = l3;
4230 }
4231 }
4232 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
4233 p->qpos = s->y + (pos - s->x);
4234 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
4235 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
4236 p->is_refskip = (op == BAM_CREF_SKIP);
4237 } // cannot be other operations; otherwise a bug
4238 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
4239 }
4240 p->cigar_ind = s->k;
4241 return 1;
4242 }
4243
4244 /*******************************
4245 *** Expansion of insertions ***
4246 *******************************/
4247
4248 /*
4249 * Fills out the kstring with the padded insertion sequence for the current
4250 * location in 'p'. If this is not an insertion site, the string is blank.
4251 *
4252 * Returns the length of insertion string on success;
4253 * -1 on failure.
4254 */
bam_plp_insertion(const bam_pileup1_t * p,kstring_t * ins,int * del_len)4255 int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
4256 int j, k, indel;
4257 uint32_t *cigar;
4258
4259 if (p->indel <= 0) {
4260 if (ks_resize(ins, 1) < 0)
4261 return -1;
4262 ins->l = 0;
4263 ins->s[0] = '\0';
4264 return 0;
4265 }
4266
4267 if (del_len)
4268 *del_len = 0;
4269
4270 // Measure indel length including pads
4271 indel = 0;
4272 k = p->cigar_ind+1;
4273 cigar = bam_get_cigar(p->b);
4274 while (k < p->b->core.n_cigar) {
4275 switch (cigar[k] & BAM_CIGAR_MASK) {
4276 case BAM_CPAD:
4277 case BAM_CINS:
4278 indel += (cigar[k] >> BAM_CIGAR_SHIFT);
4279 break;
4280 default:
4281 k = p->b->core.n_cigar;
4282 break;
4283 }
4284 k++;
4285 }
4286 ins->l = indel;
4287
4288 // Produce sequence
4289 if (ks_resize(ins, indel+1) < 0)
4290 return -1;
4291 indel = 0;
4292 k = p->cigar_ind+1;
4293 j = 1;
4294 while (k < p->b->core.n_cigar) {
4295 int l, c;
4296 switch (cigar[k] & BAM_CIGAR_MASK) {
4297 case BAM_CPAD:
4298 for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
4299 ins->s[indel++] = '*';
4300 break;
4301 case BAM_CINS:
4302 for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
4303 c = seq_nt16_str[bam_seqi(bam_get_seq(p->b),
4304 p->qpos + j - p->is_del)];
4305 ins->s[indel++] = c;
4306 }
4307 break;
4308 case BAM_CDEL:
4309 // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
4310 if (del_len)
4311 *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
4312 // fall through
4313 default:
4314 k = p->b->core.n_cigar;
4315 break;
4316 }
4317 k++;
4318 }
4319 ins->s[indel] = '\0';
4320
4321 return indel;
4322 }
4323
4324 /***********************
4325 *** Pileup iterator ***
4326 ***********************/
4327
4328 // Dictionary of overlapping reads
4329 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
4330 typedef khash_t(olap_hash) olap_hash_t;
4331
4332 struct bam_plp_s {
4333 mempool_t *mp;
4334 lbnode_t *head, *tail;
4335 int32_t tid, max_tid;
4336 hts_pos_t pos, max_pos;
4337 int is_eof, max_plp, error, maxcnt;
4338 uint64_t id;
4339 bam_pileup1_t *plp;
4340 // for the "auto" interface only
4341 bam1_t *b;
4342 bam_plp_auto_f func;
4343 void *data;
4344 olap_hash_t *overlaps;
4345
4346 // For notification of creation and destruction events
4347 // and associated client-owned pointer.
4348 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
4349 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
4350 };
4351
bam_plp_init(bam_plp_auto_f func,void * data)4352 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
4353 {
4354 bam_plp_t iter;
4355 iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
4356 iter->mp = mp_init();
4357 iter->head = iter->tail = mp_alloc(iter->mp);
4358 iter->max_tid = iter->max_pos = -1;
4359 iter->maxcnt = 8000;
4360 if (func) {
4361 iter->func = func;
4362 iter->data = data;
4363 iter->b = bam_init1();
4364 }
4365 return iter;
4366 }
4367
bam_plp_init_overlaps(bam_plp_t iter)4368 int bam_plp_init_overlaps(bam_plp_t iter)
4369 {
4370 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
4371 return iter->overlaps ? 0 : -1;
4372 }
4373
bam_plp_destroy(bam_plp_t iter)4374 void bam_plp_destroy(bam_plp_t iter)
4375 {
4376 lbnode_t *p, *pnext;
4377 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
4378 for (p = iter->head; p != NULL; p = pnext) {
4379 pnext = p->next;
4380 mp_free(iter->mp, p);
4381 }
4382 mp_destroy(iter->mp);
4383 if (iter->b) bam_destroy1(iter->b);
4384 free(iter->plp);
4385 free(iter);
4386 }
4387
bam_plp_constructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))4388 void bam_plp_constructor(bam_plp_t plp,
4389 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
4390 plp->plp_construct = func;
4391 }
4392
bam_plp_destructor(bam_plp_t plp,int (* func)(void * data,const bam1_t * b,bam_pileup_cd * cd))4393 void bam_plp_destructor(bam_plp_t plp,
4394 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
4395 plp->plp_destruct = func;
4396 }
4397
4398 //---------------------------------
4399 //--- Tweak overlapping reads
4400 //---------------------------------
4401
4402 /**
4403 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
4404 * cigar_iref2iseq_next() - get the next CMATCH base
4405 * @cigar: pointer to current cigar block (rw)
4406 * @cigar_max: pointer just beyond the last cigar block
4407 * @icig: position within the current cigar block (rw)
4408 * @iseq: position in the sequence (rw)
4409 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
4410 *
4411 * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
4412 * or -2 on error.
4413 */
cigar_iref2iseq_set(uint32_t ** cigar,uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)4414 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref)
4415 {
4416 hts_pos_t pos = *iref;
4417 if ( pos < 0 ) return -1;
4418 *icig = 0;
4419 *iseq = 0;
4420 *iref = 0;
4421 while ( *cigar<cigar_max )
4422 {
4423 int cig = (**cigar) & BAM_CIGAR_MASK;
4424 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
4425
4426 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4427 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
4428 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
4429 {
4430 pos -= ncig;
4431 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
4432 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
4433 continue;
4434 }
4435 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4436 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
4437 {
4438 pos -= ncig;
4439 if ( pos<0 ) pos = 0;
4440 (*cigar)++; *icig = 0; *iref += ncig;
4441 continue;
4442 }
4443 hts_log_error("Unexpected cigar %d", cig);
4444 return -2;
4445 }
4446 *iseq = -1;
4447 return -1;
4448 }
cigar_iref2iseq_next(uint32_t ** cigar,uint32_t * cigar_max,hts_pos_t * icig,hts_pos_t * iseq,hts_pos_t * iref)4449 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref)
4450 {
4451 while ( *cigar < cigar_max )
4452 {
4453 int cig = (**cigar) & BAM_CIGAR_MASK;
4454 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
4455
4456 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
4457 {
4458 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
4459 (*iseq)++; (*icig)++; (*iref)++;
4460 return BAM_CMATCH;
4461 }
4462 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
4463 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4464 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
4465 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
4466 hts_log_error("Unexpected cigar %d", cig);
4467 return -2;
4468 }
4469 *iseq = -1;
4470 *iref = -1;
4471 return -1;
4472 }
4473
tweak_overlap_quality(bam1_t * a,bam1_t * b)4474 static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
4475 {
4476 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
4477 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
4478 hts_pos_t a_icig = 0, a_iseq = 0;
4479 hts_pos_t b_icig = 0, b_iseq = 0;
4480 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
4481 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
4482
4483 hts_pos_t iref = b->core.pos;
4484 hts_pos_t a_iref = iref - a->core.pos;
4485 hts_pos_t b_iref = iref - b->core.pos;
4486 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
4487 if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error
4488 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
4489 if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error
4490
4491 #if DBG
4492 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
4493 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
4494 #endif
4495
4496 int err = 0;
4497 while ( 1 )
4498 {
4499 // Increment reference position
4500 while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
4501 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
4502 if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; } // done
4503 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
4504
4505 while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
4506 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
4507 if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; } // done
4508 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
4509
4510 iref++;
4511 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
4512
4513 if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
4514 return -1; // Fell off end of sequence, bad CIGAR?
4515
4516 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
4517 {
4518 #if DBG
4519 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
4520 #endif
4521 // we are very confident about this base
4522 int qual = a_qual[a_iseq] + b_qual[b_iseq];
4523 a_qual[a_iseq] = qual>200 ? 200 : qual;
4524 b_qual[b_iseq] = 0;
4525 }
4526 else
4527 {
4528 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
4529 {
4530 #if DBG
4531 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
4532 #endif
4533 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
4534 b_qual[b_iseq] = 0;
4535 }
4536 else
4537 {
4538 #if DBG
4539 fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
4540 #endif
4541 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
4542 a_qual[a_iseq] = 0;
4543 }
4544 }
4545 }
4546 #if DBG
4547 fprintf(stderr,"\n");
4548 #endif
4549 return err;
4550 }
4551
4552 // Fix overlapping reads. Simple soft-clipping did not give good results.
4553 // Lowering qualities of unwanted bases is more selective and works better.
4554 //
4555 // Returns 0 on success, -1 on failure
overlap_push(bam_plp_t iter,lbnode_t * node)4556 static int overlap_push(bam_plp_t iter, lbnode_t *node)
4557 {
4558 if ( !iter->overlaps ) return 0;
4559
4560 // mapped mates and paired reads only
4561 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
4562
4563 // no overlap possible, unless some wild cigar
4564 if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
4565 || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
4566 && node->b.core.mpos >= node->end) // for those wild cigars
4567 ) return 0;
4568
4569 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
4570 if ( kitr==kh_end(iter->overlaps) )
4571 {
4572 // Only add reads where the mate is still to arrive
4573 if (node->b.core.mpos >= node->b.core.pos ||
4574 ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
4575 int ret;
4576 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
4577 if (ret < 0) return -1;
4578 kh_value(iter->overlaps, kitr) = node;
4579 }
4580 }
4581 else
4582 {
4583 lbnode_t *a = kh_value(iter->overlaps, kitr);
4584 int err = tweak_overlap_quality(&a->b, &node->b);
4585 kh_del(olap_hash, iter->overlaps, kitr);
4586 assert(a->end-1 == a->s.end);
4587 a->end = bam_endpos(&a->b);
4588 a->s.end = a->end - 1;
4589 return err;
4590 }
4591 return 0;
4592 }
4593
overlap_remove(bam_plp_t iter,const bam1_t * b)4594 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
4595 {
4596 if ( !iter->overlaps ) return;
4597
4598 khiter_t kitr;
4599 if ( b )
4600 {
4601 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
4602 if ( kitr!=kh_end(iter->overlaps) )
4603 kh_del(olap_hash, iter->overlaps, kitr);
4604 }
4605 else
4606 {
4607 // remove all
4608 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
4609 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
4610 }
4611 }
4612
4613
4614
4615 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
4616 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
4617 // buffer yet (the current position is still the maximum position across all buffered reads).
bam_plp64_next(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)4618 const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
4619 {
4620 if (iter->error) { *_n_plp = -1; return NULL; }
4621 *_n_plp = 0;
4622 if (iter->is_eof && iter->head == iter->tail) return NULL;
4623 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
4624 int n_plp = 0;
4625 // write iter->plp at iter->pos
4626 lbnode_t **pptr = &iter->head;
4627 while (*pptr != iter->tail) {
4628 lbnode_t *p = *pptr;
4629 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
4630 overlap_remove(iter, &p->b);
4631 if (iter->plp_destruct)
4632 iter->plp_destruct(iter->data, &p->b, &p->cd);
4633 *pptr = p->next; mp_free(iter->mp, p);
4634 }
4635 else {
4636 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
4637 if (n_plp == iter->max_plp) { // then double the capacity
4638 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
4639 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
4640 }
4641 iter->plp[n_plp].b = &p->b;
4642 iter->plp[n_plp].cd = p->cd;
4643 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
4644 }
4645 pptr = &(*pptr)->next;
4646 }
4647 }
4648 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
4649 // update iter->tid and iter->pos
4650 if (iter->head != iter->tail) {
4651 if (iter->tid > iter->head->b.core.tid) {
4652 hts_log_error("Unsorted input. Pileup aborts");
4653 iter->error = 1;
4654 *_n_plp = -1;
4655 return NULL;
4656 }
4657 }
4658 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
4659 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
4660 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
4661 iter->pos = iter->head->beg; // jump to the next position
4662 } else ++iter->pos; // scan contiguously
4663 // return
4664 if (n_plp) return iter->plp;
4665 if (iter->is_eof && iter->head == iter->tail) break;
4666 }
4667 return NULL;
4668 }
4669
bam_plp_next(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)4670 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
4671 {
4672 hts_pos_t pos64 = 0;
4673 const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
4674 if (pos64 < INT_MAX) {
4675 *_pos = pos64;
4676 } else {
4677 hts_log_error("Position %"PRId64" too large", pos64);
4678 *_pos = INT_MAX;
4679 iter->error = 1;
4680 *_n_plp = -1;
4681 return NULL;
4682 }
4683 return p;
4684 }
4685
bam_plp_push(bam_plp_t iter,const bam1_t * b)4686 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
4687 {
4688 if (iter->error) return -1;
4689 if (b) {
4690 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
4691 // Skip only unmapped reads here, any additional filtering must be done in iter->func
4692 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
4693 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
4694 {
4695 overlap_remove(iter, b);
4696 return 0;
4697 }
4698 if (bam_copy1(&iter->tail->b, b) == NULL)
4699 return -1;
4700 iter->tail->b.id = iter->id++;
4701 iter->tail->beg = b->core.pos;
4702 // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
4703 iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
4704 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
4705 if (b->core.tid < iter->max_tid) {
4706 hts_log_error("The input is not sorted (chromosomes out of order)");
4707 iter->error = 1;
4708 return -1;
4709 }
4710 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
4711 hts_log_error("The input is not sorted (reads out of order)");
4712 iter->error = 1;
4713 return -1;
4714 }
4715 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
4716 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
4717 lbnode_t *next = mp_alloc(iter->mp);
4718 if (!next) {
4719 iter->error = 1;
4720 return -1;
4721 }
4722 if (iter->plp_construct)
4723 iter->plp_construct(iter->data, &iter->tail->b,
4724 &iter->tail->cd);
4725 if (overlap_push(iter, iter->tail) < 0) {
4726 mp_free(iter->mp, next);
4727 iter->error = 1;
4728 return -1;
4729 }
4730 iter->tail->next = next;
4731 iter->tail = iter->tail->next;
4732 }
4733 } else iter->is_eof = 1;
4734 return 0;
4735 }
4736
bam_plp64_auto(bam_plp_t iter,int * _tid,hts_pos_t * _pos,int * _n_plp)4737 const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
4738 {
4739 const bam_pileup1_t *plp;
4740 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
4741 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4742 else { // no pileup line can be obtained; read alignments
4743 *_n_plp = 0;
4744 if (iter->is_eof) return 0;
4745 int ret;
4746 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
4747 if (bam_plp_push(iter, iter->b) < 0) {
4748 *_n_plp = -1;
4749 return 0;
4750 }
4751 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4752 // otherwise no pileup line can be returned; read the next alignment.
4753 }
4754 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
4755 if (bam_plp_push(iter, 0) < 0) {
4756 *_n_plp = -1;
4757 return 0;
4758 }
4759 if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
4760 return 0;
4761 }
4762 }
4763
bam_plp_auto(bam_plp_t iter,int * _tid,int * _pos,int * _n_plp)4764 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
4765 {
4766 hts_pos_t pos64 = 0;
4767 const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
4768 if (pos64 < INT_MAX) {
4769 *_pos = pos64;
4770 } else {
4771 hts_log_error("Position %"PRId64" too large", pos64);
4772 *_pos = INT_MAX;
4773 iter->error = 1;
4774 *_n_plp = -1;
4775 return NULL;
4776 }
4777 return p;
4778 }
4779
bam_plp_reset(bam_plp_t iter)4780 void bam_plp_reset(bam_plp_t iter)
4781 {
4782 overlap_remove(iter, NULL);
4783 iter->max_tid = iter->max_pos = -1;
4784 iter->tid = iter->pos = 0;
4785 iter->is_eof = 0;
4786 while (iter->head != iter->tail) {
4787 lbnode_t *p = iter->head;
4788 iter->head = p->next;
4789 mp_free(iter->mp, p);
4790 }
4791 }
4792
bam_plp_set_maxcnt(bam_plp_t iter,int maxcnt)4793 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
4794 {
4795 iter->maxcnt = maxcnt;
4796 }
4797
4798 /************************
4799 *** Mpileup iterator ***
4800 ************************/
4801
4802 struct bam_mplp_s {
4803 int n;
4804 int32_t min_tid, *tid;
4805 hts_pos_t min_pos, *pos;
4806 bam_plp_t *iter;
4807 int *n_plp;
4808 const bam_pileup1_t **plp;
4809 };
4810
bam_mplp_init(int n,bam_plp_auto_f func,void ** data)4811 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
4812 {
4813 int i;
4814 bam_mplp_t iter;
4815 iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
4816 iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
4817 iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
4818 iter->n_plp = (int*)calloc(n, sizeof(int));
4819 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
4820 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
4821 iter->n = n;
4822 iter->min_pos = HTS_POS_MAX;
4823 iter->min_tid = (uint32_t)-1;
4824 for (i = 0; i < n; ++i) {
4825 iter->iter[i] = bam_plp_init(func, data[i]);
4826 iter->pos[i] = iter->min_pos;
4827 iter->tid[i] = iter->min_tid;
4828 }
4829 return iter;
4830 }
4831
bam_mplp_init_overlaps(bam_mplp_t iter)4832 int bam_mplp_init_overlaps(bam_mplp_t iter)
4833 {
4834 int i, r = 0;
4835 for (i = 0; i < iter->n; ++i)
4836 r |= bam_plp_init_overlaps(iter->iter[i]);
4837 return r == 0 ? 0 : -1;
4838 }
4839
bam_mplp_set_maxcnt(bam_mplp_t iter,int maxcnt)4840 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
4841 {
4842 int i;
4843 for (i = 0; i < iter->n; ++i)
4844 iter->iter[i]->maxcnt = maxcnt;
4845 }
4846
bam_mplp_destroy(bam_mplp_t iter)4847 void bam_mplp_destroy(bam_mplp_t iter)
4848 {
4849 int i;
4850 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
4851 free(iter->iter); free(iter->pos); free(iter->tid);
4852 free(iter->n_plp); free(iter->plp);
4853 free(iter);
4854 }
4855
bam_mplp64_auto(bam_mplp_t iter,int * _tid,hts_pos_t * _pos,int * n_plp,const bam_pileup1_t ** plp)4856 int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
4857 {
4858 int i, ret = 0;
4859 hts_pos_t new_min_pos = HTS_POS_MAX;
4860 uint32_t new_min_tid = (uint32_t)-1;
4861 for (i = 0; i < iter->n; ++i) {
4862 if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
4863 int tid;
4864 hts_pos_t pos;
4865 iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
4866 if ( iter->iter[i]->error ) return -1;
4867 if (iter->plp[i]) {
4868 iter->tid[i] = tid;
4869 iter->pos[i] = pos;
4870 } else {
4871 iter->tid[i] = 0;
4872 iter->pos[i] = 0;
4873 }
4874 }
4875 if (iter->plp[i]) {
4876 if (iter->tid[i] < new_min_tid) {
4877 new_min_tid = iter->tid[i];
4878 new_min_pos = iter->pos[i];
4879 } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
4880 new_min_pos = iter->pos[i];
4881 }
4882 }
4883 }
4884 iter->min_pos = new_min_pos;
4885 iter->min_tid = new_min_tid;
4886 if (new_min_pos == HTS_POS_MAX) return 0;
4887 *_tid = new_min_tid; *_pos = new_min_pos;
4888 for (i = 0; i < iter->n; ++i) {
4889 if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
4890 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
4891 ++ret;
4892 } else n_plp[i] = 0, plp[i] = 0;
4893 }
4894 return ret;
4895 }
4896
bam_mplp_auto(bam_mplp_t iter,int * _tid,int * _pos,int * n_plp,const bam_pileup1_t ** plp)4897 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
4898 {
4899 hts_pos_t pos64 = 0;
4900 int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
4901 if (ret >= 0) {
4902 if (pos64 < INT_MAX) {
4903 *_pos = pos64;
4904 } else {
4905 hts_log_error("Position %"PRId64" too large", pos64);
4906 *_pos = INT_MAX;
4907 return -1;
4908 }
4909 }
4910 return ret;
4911 }
4912
bam_mplp_reset(bam_mplp_t iter)4913 void bam_mplp_reset(bam_mplp_t iter)
4914 {
4915 int i;
4916 iter->min_pos = HTS_POS_MAX;
4917 iter->min_tid = (uint32_t)-1;
4918 for (i = 0; i < iter->n; ++i) {
4919 bam_plp_reset(iter->iter[i]);
4920 iter->pos[i] = HTS_POS_MAX;
4921 iter->tid[i] = (uint32_t)-1;
4922 iter->n_plp[i] = 0;
4923 iter->plp[i] = NULL;
4924 }
4925 }
4926
bam_mplp_constructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))4927 void bam_mplp_constructor(bam_mplp_t iter,
4928 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
4929 int i;
4930 for (i = 0; i < iter->n; ++i)
4931 bam_plp_constructor(iter->iter[i], func);
4932 }
4933
bam_mplp_destructor(bam_mplp_t iter,int (* func)(void * arg,const bam1_t * b,bam_pileup_cd * cd))4934 void bam_mplp_destructor(bam_mplp_t iter,
4935 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
4936 int i;
4937 for (i = 0; i < iter->n; ++i)
4938 bam_plp_destructor(iter->iter[i], func);
4939 }
4940
4941 #endif // ~!defined(BAM_NO_PILEUP)
4942