1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3 Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include <fcntl.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <assert.h>
38
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
44 #include "version.h"
45 #include "hts_internal.h"
46
47 #include "htslib/khash.h"
48 #include "htslib/kseq.h"
49 #include "htslib/ksort.h"
50
51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
52
53 int hts_verbose = 3;
54
hts_version()55 const char *hts_version()
56 {
57 return HTS_VERSION;
58 }
59
60 const unsigned char seq_nt16_table[256] = {
61 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
65 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
66 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
69
70 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
78 };
79
80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
81
82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
83
84 /**********************
85 *** Basic file I/O ***
86 **********************/
87
format_category(enum htsExactFormat fmt)88 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
89 {
90 switch (fmt) {
91 case bam:
92 case sam:
93 case cram:
94 return sequence_data;
95
96 case vcf:
97 case bcf:
98 return variant_data;
99
100 case bai:
101 case crai:
102 case csi:
103 case gzi:
104 case tbi:
105 return index_file;
106
107 case bed:
108 return region_list;
109
110 case json:
111 return unknown_category;
112
113 case unknown_format:
114 case binary_format:
115 case text_format:
116 case format_maximum:
117 break;
118 }
119
120 return unknown_category;
121 }
122
123 // Decompress up to ten or so bytes by peeking at the file, which must be
124 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)125 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
126 {
127 // Typically at most a couple of hundred bytes of input are required
128 // to get a few bytes of output from inflate(), so hopefully this buffer
129 // size suffices in general.
130 unsigned char buffer[512];
131 z_stream zs;
132 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
133
134 if (npeek < 0) return 0;
135
136 zs.zalloc = NULL;
137 zs.zfree = NULL;
138 zs.next_in = buffer;
139 zs.avail_in = npeek;
140 zs.next_out = dest;
141 zs.avail_out = destsize;
142 if (inflateInit2(&zs, 31) != Z_OK) return 0;
143
144 while (zs.total_out < destsize)
145 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
146
147 destsize = zs.total_out;
148 inflateEnd(&zs);
149
150 return destsize;
151 }
152
153 // Parse "x.y" text, taking care because the string is not NUL-terminated
154 // and filling in major/minor only when the digits are followed by a delimiter,
155 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
156 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)157 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
158 {
159 const char *s = (const char *) u;
160 const char *slim = (const char *) ulim;
161 short v;
162
163 fmt->version.major = fmt->version.minor = -1;
164
165 for (v = 0; s < slim && isdigit_c(*s); s++)
166 v = 10 * v + *s - '0';
167
168 if (s < slim) {
169 fmt->version.major = v;
170 if (*s == '.') {
171 s++;
172 for (v = 0; s < slim && isdigit_c(*s); s++)
173 v = 10 * v + *s - '0';
174 if (s < slim)
175 fmt->version.minor = v;
176 }
177 else
178 fmt->version.minor = 0;
179 }
180 }
181
182 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)183 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
184 {
185 const unsigned char *ukey = (const unsigned char *) key;
186
187 while (*ukey)
188 if (u >= ulim) return +1;
189 else if (isspace_c(*u)) u++;
190 else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
191 else u++, ukey++;
192
193 return 0;
194 }
195
hts_detect_format(hFILE * hfile,htsFormat * fmt)196 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
197 {
198 unsigned char s[21];
199 ssize_t len = hpeek(hfile, s, 18);
200 if (len < 0) return -1;
201
202 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
203 // The stream is either gzip-compressed or BGZF-compressed.
204 // Determine which, and decompress the first few bytes.
205 fmt->compression = (len >= 18 && (s[3] & 4) &&
206 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
207 len = decompress_peek(hfile, s, sizeof s);
208 }
209 else {
210 fmt->compression = no_compression;
211 len = hpeek(hfile, s, sizeof s);
212 }
213 if (len < 0) return -1;
214
215 fmt->compression_level = -1;
216 fmt->specific = NULL;
217
218 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
219 fmt->category = sequence_data;
220 fmt->format = cram;
221 fmt->version.major = s[4], fmt->version.minor = s[5];
222 fmt->compression = custom;
223 return 0;
224 }
225 else if (len >= 4 && s[3] <= '\4') {
226 if (memcmp(s, "BAM\1", 4) == 0) {
227 fmt->category = sequence_data;
228 fmt->format = bam;
229 // TODO Decompress enough to pick version from @HD-VN header
230 fmt->version.major = 1, fmt->version.minor = -1;
231 return 0;
232 }
233 else if (memcmp(s, "BAI\1", 4) == 0) {
234 fmt->category = index_file;
235 fmt->format = bai;
236 fmt->version.major = -1, fmt->version.minor = -1;
237 return 0;
238 }
239 else if (memcmp(s, "BCF\4", 4) == 0) {
240 fmt->category = variant_data;
241 fmt->format = bcf;
242 fmt->version.major = 1, fmt->version.minor = -1;
243 return 0;
244 }
245 else if (memcmp(s, "BCF\2", 4) == 0) {
246 fmt->category = variant_data;
247 fmt->format = bcf;
248 fmt->version.major = s[3];
249 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
250 return 0;
251 }
252 else if (memcmp(s, "CSI\1", 4) == 0) {
253 fmt->category = index_file;
254 fmt->format = csi;
255 fmt->version.major = 1, fmt->version.minor = -1;
256 return 0;
257 }
258 else if (memcmp(s, "TBI\1", 4) == 0) {
259 fmt->category = index_file;
260 fmt->format = tbi;
261 fmt->version.major = -1, fmt->version.minor = -1;
262 return 0;
263 }
264 }
265 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
266 fmt->category = variant_data;
267 fmt->format = vcf;
268 if (len >= 21 && s[16] == 'v')
269 parse_version(fmt, &s[17], &s[len]);
270 else
271 fmt->version.major = fmt->version.minor = -1;
272 return 0;
273 }
274 else if (len >= 4 && s[0] == '@' &&
275 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
276 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
277 fmt->category = sequence_data;
278 fmt->format = sam;
279 // @HD-VN is not guaranteed to be the first tag, but then @HD is
280 // not guaranteed to be present at all...
281 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
282 parse_version(fmt, &s[7], &s[len]);
283 else
284 fmt->version.major = 1, fmt->version.minor = -1;
285 return 0;
286 }
287 else if (cmp_nonblank("{\"", s, &s[len]) == 0) {
288 fmt->category = unknown_category;
289 fmt->format = json;
290 fmt->version.major = fmt->version.minor = -1;
291 return 0;
292 }
293 else {
294 // Various possibilities for tab-delimited text:
295 // .crai (gzipped tab-delimited six columns: seqid 5*number)
296 // .bed ([3..12] tab-delimited columns)
297 // .bedpe (>= 10 tab-delimited columns)
298 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
299 // FIXME For now, assume it's SAM
300 fmt->category = sequence_data;
301 fmt->format = sam;
302 fmt->version.major = 1, fmt->version.minor = -1;
303 return 0;
304 }
305
306 fmt->category = unknown_category;
307 fmt->format = unknown_format;
308 fmt->version.major = fmt->version.minor = -1;
309 fmt->compression = no_compression;
310 return 0;
311 }
312
hts_format_description(const htsFormat * format)313 char *hts_format_description(const htsFormat *format)
314 {
315 kstring_t str = { 0, 0, NULL };
316
317 switch (format->format) {
318 case sam: kputs("SAM", &str); break;
319 case bam: kputs("BAM", &str); break;
320 case cram: kputs("CRAM", &str); break;
321 case vcf: kputs("VCF", &str); break;
322 case bcf:
323 if (format->version.major == 1) kputs("Legacy BCF", &str);
324 else kputs("BCF", &str);
325 break;
326 case bai: kputs("BAI", &str); break;
327 case crai: kputs("CRAI", &str); break;
328 case csi: kputs("CSI", &str); break;
329 case tbi: kputs("Tabix", &str); break;
330 case json: kputs("JSON", &str); break;
331 default: kputs("unknown", &str); break;
332 }
333
334 if (format->version.major >= 0) {
335 kputs(" version ", &str);
336 kputw(format->version.major, &str);
337 if (format->version.minor >= 0) {
338 kputc('.', &str);
339 kputw(format->version.minor, &str);
340 }
341 }
342
343 switch (format->compression) {
344 case custom: kputs(" compressed", &str); break;
345 case gzip: kputs(" gzip-compressed", &str); break;
346 case bgzf:
347 switch (format->format) {
348 case bam:
349 case bcf:
350 case csi:
351 case tbi:
352 // These are by definition BGZF, so just use the generic term
353 kputs(" compressed", &str);
354 break;
355 default:
356 kputs(" BGZF-compressed", &str);
357 break;
358 }
359 break;
360 default: break;
361 }
362
363 switch (format->category) {
364 case sequence_data: kputs(" sequence", &str); break;
365 case variant_data: kputs(" variant calling", &str); break;
366 case index_file: kputs(" index", &str); break;
367 case region_list: kputs(" genomic region", &str); break;
368 default: break;
369 }
370
371 if (format->compression == no_compression)
372 switch (format->format) {
373 case sam:
374 case crai:
375 case vcf:
376 case bed:
377 case json:
378 kputs(" text", &str);
379 break;
380
381 default:
382 kputs(" data", &str);
383 break;
384 }
385 else
386 kputs(" data", &str);
387
388 return ks_release(&str);
389 }
390
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)391 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
392 {
393 char smode[102], *cp, *cp2, *mode_c;
394 htsFile *fp = NULL;
395 hFILE *hfile;
396 char fmt_code = '\0';
397
398 strncpy(smode, mode, 100);
399 smode[100]=0;
400 if ((cp = strchr(smode, ',')))
401 *cp = '\0';
402
403 // Migrate format code (b or c) to the end of the smode buffer.
404 for (cp2 = cp = smode; *cp; cp++) {
405 if (*cp == 'b')
406 fmt_code = 'b';
407 else if (*cp == 'c')
408 fmt_code = 'c';
409 else
410 *cp2++ = *cp;
411 }
412 mode_c = cp2;
413 *cp2++ = fmt_code;
414 *cp2++ = 0;
415 *cp2++ = 0;
416
417 // Set or reset the format code if opts->format is used
418 if (fmt && fmt->format != unknown_format)
419 *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
420
421 hfile = hopen(fn, smode);
422 if (hfile == NULL) goto error;
423
424 fp = hts_hopen(hfile, fn, smode);
425 if (fp == NULL) goto error;
426
427 if (fmt && fmt->specific)
428 if (hts_opt_apply(fp, fmt->specific) != 0)
429 goto error;
430
431 return fp;
432
433 error:
434 if (hts_verbose >= 2)
435 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
436
437 if (hfile)
438 hclose_abruptly(hfile);
439
440 return NULL;
441 }
442
hts_open(const char * fn,const char * mode)443 htsFile *hts_open(const char *fn, const char *mode) {
444 return hts_open_format(fn, mode, NULL);
445 }
446
447 /*
448 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
449 * the prefix in lowercase into buf and returning a pointer to the suffix.
450 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
451 * prefix should be one of several known values of maximum length buflen-2.
452 * (If delim is not found, returns a pointer to the '\0'.)
453 */
454 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)455 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
456 {
457 size_t i = 0;
458 while (*str && *str != delim) {
459 if (i < buflen-1) buf[i++] = tolower_c(*str);
460 str++;
461 }
462
463 buf[i] = '\0';
464 return *str? str+1 : str;
465 }
466
467 /*
468 * Parses arg and appends it to the option list.
469 *
470 * Returns 0 on success;
471 * -1 on failure.
472 */
hts_opt_add(hts_opt ** opts,const char * c_arg)473 int hts_opt_add(hts_opt **opts, const char *c_arg) {
474 hts_opt *o, *t;
475 char *val;
476
477 /*
478 * IMPORTANT!!!
479 * If you add another string option here, don't forget to also add
480 * it to the case statement in hts_opt_apply.
481 */
482
483 if (!c_arg)
484 return -1;
485
486 if (!(o = malloc(sizeof(*o))))
487 return -1;
488
489 if (!(o->arg = strdup(c_arg))) {
490 free(o);
491 return -1;
492 }
493
494 if (!(val = strchr(o->arg, '=')))
495 val = "1"; // assume boolean
496 else
497 *val++ = '\0';
498
499 if (strcmp(o->arg, "decode_md") == 0 ||
500 strcmp(o->arg, "DECODE_MD") == 0)
501 o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
502
503 else if (strcmp(o->arg, "verbosity") == 0 ||
504 strcmp(o->arg, "VERBOSITY") == 0)
505 o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
506
507 else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
508 strcmp(o->arg, "SEQS_PER_SLICE") == 0)
509 o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
510
511 else if (strcmp(o->arg, "bases_per_slice") == 0 ||
512 strcmp(o->arg, "BASES_PER_SLICE") == 0)
513 o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
514
515 else if (strcmp(o->arg, "slices_per_container") == 0 ||
516 strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
517 o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
518
519 else if (strcmp(o->arg, "embed_ref") == 0 ||
520 strcmp(o->arg, "EMBED_REF") == 0)
521 o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
522
523 else if (strcmp(o->arg, "no_ref") == 0 ||
524 strcmp(o->arg, "NO_REF") == 0)
525 o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
526
527 else if (strcmp(o->arg, "ignore_md5") == 0 ||
528 strcmp(o->arg, "IGNORE_MD5") == 0)
529 o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
530
531 else if (strcmp(o->arg, "use_bzip2") == 0 ||
532 strcmp(o->arg, "USE_BZIP2") == 0)
533 o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
534
535 else if (strcmp(o->arg, "use_rans") == 0 ||
536 strcmp(o->arg, "USE_RANS") == 0)
537 o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
538
539 else if (strcmp(o->arg, "use_lzma") == 0 ||
540 strcmp(o->arg, "USE_LZMA") == 0)
541 o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
542
543 else if (strcmp(o->arg, "reference") == 0 ||
544 strcmp(o->arg, "REFERENCE") == 0)
545 o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
546
547 else if (strcmp(o->arg, "version") == 0 ||
548 strcmp(o->arg, "VERSION") == 0)
549 o->opt = CRAM_OPT_VERSION, o->val.s =val;
550
551 else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
552 strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
553 o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
554
555 else if (strcmp(o->arg, "nthreads") == 0 ||
556 strcmp(o->arg, "NTHREADS") == 0)
557 o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
558
559 else if (strcmp(o->arg, "cache_size") == 0 ||
560 strcmp(o->arg, "CACHE_SIZE") == 0) {
561 char *endp;
562 o->opt = HTS_OPT_CACHE_SIZE;
563 o->val.i = strtol(val, &endp, 0);
564 // NB: Doesn't support floats, eg 1.5g
565 // TODO: extend hts_parse_decimal? See also samtools sort.
566 switch (*endp) {
567 case 'g': case 'G': o->val.i *= 1024;
568 case 'm': case 'M': o->val.i *= 1024;
569 case 'k': case 'K': o->val.i *= 1024; break;
570 case '\0': break;
571 default:
572 fprintf(stderr, "Unrecognised cache size suffix '%c'\n", *endp);
573 free(o->arg);
574 free(o);
575 return -1;
576 }
577 }
578
579 else if (strcmp(o->arg, "required_fields") == 0 ||
580 strcmp(o->arg, "REQUIRED_FIELDS") == 0)
581 o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
582
583 else if (strcmp(o->arg, "lossy_names") == 0 ||
584 strcmp(o->arg, "LOSSY_NAMES") == 0)
585 o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
586
587 else if (strcmp(o->arg, "name_prefix") == 0 ||
588 strcmp(o->arg, "NAME_PREFIX") == 0)
589 o->opt = CRAM_OPT_PREFIX, o->val.s = val;
590
591 else {
592 fprintf(stderr, "Unknown option '%s'\n", o->arg);
593 free(o->arg);
594 free(o);
595 return -1;
596 }
597
598 o->next = NULL;
599
600 // Append; assumes small list.
601 if (*opts) {
602 t = *opts;
603 while (t->next)
604 t = t->next;
605 t->next = o;
606 } else {
607 *opts = o;
608 }
609
610 return 0;
611 }
612
613 /*
614 * Applies an hts_opt option list to a given htsFile.
615 *
616 * Returns 0 on success
617 * -1 on failure
618 */
hts_opt_apply(htsFile * fp,hts_opt * opts)619 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
620 hts_opt *last = NULL;
621
622 for (; opts; opts = (last=opts)->next) {
623 switch (opts->opt) {
624 case CRAM_OPT_REFERENCE:
625 case CRAM_OPT_VERSION:
626 case CRAM_OPT_PREFIX:
627 if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
628 return -1;
629 break;
630 default:
631 if (hts_set_opt(fp, opts->opt, opts->val.i) != 0)
632 return -1;
633 break;
634 }
635 }
636
637 return 0;
638 }
639
640 /*
641 * Frees an hts_opt list.
642 */
hts_opt_free(hts_opt * opts)643 void hts_opt_free(hts_opt *opts) {
644 hts_opt *last = NULL;
645 while (opts) {
646 opts = (last=opts)->next;
647 free(last->arg);
648 free(last);
649 }
650 }
651
652
653 /*
654 * Tokenise options as (key(=value)?,)*(key(=value)?)?
655 * NB: No provision for ',' appearing in the value!
656 * Add backslashing rules?
657 *
658 * This could be used as part of a general command line option parser or
659 * as a string concatenated onto the file open mode.
660 *
661 * Returns 0 on success
662 * -1 on failure.
663 */
hts_parse_opt_list(htsFormat * fmt,const char * str)664 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
665 while (str && *str) {
666 const char *str_start;
667 int len;
668 char arg[8001];
669
670 while (*str && *str == ',')
671 str++;
672
673 for (str_start = str; *str && *str != ','; str++);
674 len = str - str_start;
675
676 // Produce a nul terminated copy of the option
677 strncpy(arg, str_start, len < 8000 ? len : 8000);
678 arg[len < 8000 ? len : 8000] = '\0';
679
680 if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
681 return -1;
682
683 if (*str)
684 str++;
685 }
686
687 return 0;
688 }
689
690 /*
691 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
692 * followed by a comma separated list of key=value options and splits
693 * these up into the fields of htsFormat struct.
694 *
695 * format is assumed to be already initialised, either to blank
696 * "unknown" values or via previous hts_opt_add calls.
697 *
698 * Returns 0 on success
699 * -1 on failure.
700 */
hts_parse_format(htsFormat * format,const char * str)701 int hts_parse_format(htsFormat *format, const char *str) {
702 char fmt[8];
703 const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
704
705 format->version.minor = 0; // unknown
706 format->version.major = 0; // unknown
707
708 if (strcmp(fmt, "sam") == 0) {
709 format->category = sequence_data;
710 format->format = sam;
711 format->compression = no_compression;;
712 format->compression_level = 0;
713 } else if (strcmp(fmt, "bam") == 0) {
714 format->category = sequence_data;
715 format->format = bam;
716 format->compression = bgzf;
717 format->compression_level = -1;
718 } else if (strcmp(fmt, "cram") == 0) {
719 format->category = sequence_data;
720 format->format = cram;
721 format->compression = custom;
722 format->compression_level = -1;
723 } else if (strcmp(fmt, "vcf") == 0) {
724 format->category = variant_data;
725 format->format = vcf;
726 format->compression = no_compression;;
727 format->compression_level = 0;
728 } else if (strcmp(fmt, "bcf") == 0) {
729 format->category = variant_data;
730 format->format = bcf;
731 format->compression = bgzf;
732 format->compression_level = -1;
733 } else {
734 return -1;
735 }
736
737 return hts_parse_opt_list(format, cp);
738 }
739
740
741 /*
742 * Tokenise options as (key(=value)?,)*(key(=value)?)?
743 * NB: No provision for ',' appearing in the value!
744 * Add backslashing rules?
745 *
746 * This could be used as part of a general command line option parser or
747 * as a string concatenated onto the file open mode.
748 *
749 * Returns 0 on success
750 * -1 on failure.
751 */
hts_process_opts(htsFile * fp,const char * opts)752 static int hts_process_opts(htsFile *fp, const char *opts) {
753 htsFormat fmt;
754
755 fmt.specific = NULL;
756 if (hts_parse_opt_list(&fmt, opts) != 0)
757 return -1;
758
759 if (hts_opt_apply(fp, fmt.specific) != 0) {
760 hts_opt_free(fmt.specific);
761 return -1;
762 }
763
764 hts_opt_free(fmt.specific);
765
766 return 0;
767 }
768
769
hts_hopen(hFILE * hfile,const char * fn,const char * mode)770 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
771 {
772 hFILE *hfile_orig = hfile;
773 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
774 char simple_mode[101], *cp, *opts;
775 simple_mode[100] = '\0';
776
777 if (fp == NULL) goto error;
778
779 fp->fn = strdup(fn);
780 fp->is_be = ed_is_big();
781
782 // Split mode into simple_mode,opts strings
783 if ((cp = strchr(mode, ','))) {
784 strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
785 simple_mode[cp-mode] = '\0';
786 opts = cp+1;
787 } else {
788 strncpy(simple_mode, mode, 100);
789 opts = NULL;
790 }
791
792 if (strchr(simple_mode, 'r')) {
793 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
794
795 if (fp->format.format == json) {
796 hFILE *hfile2 = hopen_json_redirect(hfile, simple_mode);
797 if (hfile2 == NULL) goto error;
798
799 // Build fp against the result of the redirection
800 hfile = hfile2;
801 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
802 }
803 }
804 else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
805 htsFormat *fmt = &fp->format;
806 fp->is_write = 1;
807
808 if (strchr(simple_mode, 'b')) fmt->format = binary_format;
809 else if (strchr(simple_mode, 'c')) fmt->format = cram;
810 else fmt->format = text_format;
811
812 if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
813 else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
814 else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
815 else {
816 // No compression mode specified, set to the default for the format
817 switch (fmt->format) {
818 case binary_format: fmt->compression = bgzf; break;
819 case cram: fmt->compression = custom; break;
820 case text_format: fmt->compression = no_compression; break;
821 default: abort();
822 }
823 }
824
825 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
826 fmt->category = format_category(fmt->format);
827
828 fmt->version.major = fmt->version.minor = -1;
829 fmt->compression_level = -1;
830 fmt->specific = NULL;
831 }
832 else { errno = EINVAL; goto error; }
833
834 switch (fp->format.format) {
835 case binary_format:
836 case bam:
837 case bcf:
838 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
839 if (fp->fp.bgzf == NULL) goto error;
840 fp->is_bin = fp->is_bgzf = 1;
841 break;
842
843 case cram:
844 fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
845 if (fp->fp.cram == NULL) goto error;
846 if (!fp->is_write)
847 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
848 fp->is_cram = 1;
849 break;
850
851 case text_format:
852 case sam:
853 case vcf:
854 if (fp->format.compression != no_compression) {
855 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
856 if (fp->fp.bgzf == NULL) goto error;
857 fp->is_bgzf = 1;
858 }
859 else
860 fp->fp.hfile = hfile;
861 break;
862
863 default:
864 errno = ENOEXEC;
865 goto error;
866 }
867
868 if (opts)
869 hts_process_opts(fp, opts);
870
871 // If redirecting, close the original hFILE now (pedantically we would
872 // instead close it in hts_close(), but this a simplifying optimisation)
873 if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
874
875 return fp;
876
877 error:
878 if (hts_verbose >= 2)
879 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
880
881 // If redirecting, close the failed redirection hFILE that we have opened
882 if (hfile != hfile_orig) hclose_abruptly(hfile);
883
884 if (fp) {
885 free(fp->fn);
886 free(fp->fn_aux);
887 free(fp);
888 }
889 return NULL;
890 }
891
hts_close(htsFile * fp)892 int hts_close(htsFile *fp)
893 {
894 int ret, save;
895
896 switch (fp->format.format) {
897 case binary_format:
898 case bam:
899 case bcf:
900 ret = bgzf_close(fp->fp.bgzf);
901 break;
902
903 case cram:
904 if (!fp->is_write) {
905 switch (cram_eof(fp->fp.cram)) {
906 case 2:
907 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
908 break;
909 case 0: /* not at EOF, but may not have wanted all seqs */
910 default: /* case 1, expected EOF */
911 break;
912 }
913 }
914 ret = cram_close(fp->fp.cram);
915 break;
916
917 case text_format:
918 case sam:
919 case vcf:
920 if (fp->format.compression != no_compression)
921 ret = bgzf_close(fp->fp.bgzf);
922 else
923 ret = hclose(fp->fp.hfile);
924 break;
925
926 default:
927 ret = -1;
928 break;
929 }
930
931 save = errno;
932 free(fp->fn);
933 free(fp->fn_aux);
934 free(fp->line.s);
935 free(fp);
936 errno = save;
937 return ret;
938 }
939
hts_get_format(htsFile * fp)940 const htsFormat *hts_get_format(htsFile *fp)
941 {
942 return fp? &fp->format : NULL;
943 }
944
hts_format_file_extension(const htsFormat * format)945 const char *hts_format_file_extension(const htsFormat *format) {
946 if (!format)
947 return "?";
948
949 switch (format->format) {
950 case sam: return "sam";
951 case bam: return "bam";
952 case bai: return "bai";
953 case cram: return "cram";
954 case crai: return "crai";
955 case vcf: return "vcf";
956 case bcf: return "bcf";
957 case csi: return "csi";
958 case gzi: return "gzi";
959 case tbi: return "tbi";
960 case bed: return "bed";
961 default: return "?";
962 }
963 }
964
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)965 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
966 int r;
967 va_list args;
968
969 switch (opt) {
970 case HTS_OPT_NTHREADS: {
971 va_start(args, opt);
972 int nthreads = va_arg(args, int);
973 va_end(args);
974 return hts_set_threads(fp, nthreads);
975 }
976
977 case HTS_OPT_THREAD_POOL: {
978 va_start(args, opt);
979 htsThreadPool *p = va_arg(args, htsThreadPool *);
980 va_end(args);
981 return hts_set_thread_pool(fp, p);
982 }
983
984 case HTS_OPT_CACHE_SIZE: {
985 va_start(args, opt);
986 int cache_size = va_arg(args, int);
987 va_end(args);
988 hts_set_cache_size(fp, cache_size);
989 return 0;
990 }
991
992 default:
993 break;
994 }
995
996 if (fp->format.format != cram)
997 return 0;
998
999 va_start(args, opt);
1000 r = cram_set_voption(fp->fp.cram, opt, args);
1001 va_end(args);
1002
1003 return r;
1004 }
1005
1006 BGZF *hts_get_bgzfp(htsFile *fp);
1007
hts_set_threads(htsFile * fp,int n)1008 int hts_set_threads(htsFile *fp, int n)
1009 {
1010 if (fp->format.compression == bgzf) {
1011 return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1012 } else if (fp->format.format == cram) {
1013 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1014 }
1015 else return 0;
1016 }
1017
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1018 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1019 if (fp->format.compression == bgzf) {
1020 return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1021 } else if (fp->format.format == cram) {
1022 return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1023 }
1024 else return 0;
1025 }
1026
hts_set_cache_size(htsFile * fp,int n)1027 void hts_set_cache_size(htsFile *fp, int n)
1028 {
1029 if (fp->format.compression == bgzf)
1030 bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1031 }
1032
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1033 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1034 {
1035 free(fp->fn_aux);
1036 if (fn_aux) {
1037 fp->fn_aux = strdup(fn_aux);
1038 if (fp->fn_aux == NULL) return -1;
1039 }
1040 else fp->fn_aux = NULL;
1041
1042 if (fp->format.format == cram)
1043 if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1044 return -1;
1045
1046 return 0;
1047 }
1048
1049 // For VCF/BCF backward sweeper. Not exposing these functions because their
1050 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1051 BGZF *hts_get_bgzfp(htsFile *fp)
1052 {
1053 if (fp->is_bgzf)
1054 return fp->fp.bgzf;
1055 else
1056 return NULL;
1057 }
hts_useek(htsFile * fp,long uoffset,int where)1058 int hts_useek(htsFile *fp, long uoffset, int where)
1059 {
1060 if (fp->is_bgzf)
1061 return bgzf_useek(fp->fp.bgzf, uoffset, where);
1062 else
1063 return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1064 }
hts_utell(htsFile * fp)1065 long hts_utell(htsFile *fp)
1066 {
1067 if (fp->is_bgzf)
1068 return bgzf_utell(fp->fp.bgzf);
1069 else
1070 return htell(fp->fp.hfile);
1071 }
1072
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1073 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1074 {
1075 int ret;
1076 if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1077 fprintf(stderr, "[hts_getline] unexpected delimiter %d\n", delimiter);
1078 abort();
1079 }
1080
1081 switch (fp->format.compression) {
1082 case no_compression:
1083 str->l = 0;
1084 ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile);
1085 if (ret >= 0) ret = str->l;
1086 else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1087 else ret = -1;
1088 break;
1089
1090 case gzip:
1091 case bgzf:
1092 ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1093 break;
1094
1095 default:
1096 abort();
1097 }
1098
1099 ++fp->lineno;
1100 return ret;
1101 }
1102
hts_readlist(const char * string,int is_file,int * _n)1103 char **hts_readlist(const char *string, int is_file, int *_n)
1104 {
1105 int m = 0, n = 0;
1106 char **s = 0;
1107 if ( is_file )
1108 {
1109 BGZF *fp = bgzf_open(string, "r");
1110 if ( !fp ) return NULL;
1111
1112 kstring_t str;
1113 str.s = 0; str.l = str.m = 0;
1114 while (bgzf_getline(fp, '\n', &str) >= 0)
1115 {
1116 if (str.l == 0) continue;
1117 n++;
1118 hts_expand(char*,n,m,s);
1119 s[n-1] = strdup(str.s);
1120 }
1121 bgzf_close(fp);
1122 free(str.s);
1123 }
1124 else
1125 {
1126 const char *q = string, *p = string;
1127 while ( 1 )
1128 {
1129 if (*p == ',' || *p == 0)
1130 {
1131 n++;
1132 hts_expand(char*,n,m,s);
1133 s[n-1] = (char*)calloc(p - q + 1, 1);
1134 strncpy(s[n-1], q, p - q);
1135 q = p + 1;
1136 }
1137 if ( !*p ) break;
1138 p++;
1139 }
1140 }
1141 s = (char**)realloc(s, n * sizeof(char*));
1142 *_n = n;
1143 return s;
1144 }
1145
hts_readlines(const char * fn,int * _n)1146 char **hts_readlines(const char *fn, int *_n)
1147 {
1148 int m = 0, n = 0;
1149 char **s = 0;
1150 BGZF *fp = bgzf_open(fn, "r");
1151 if ( fp ) { // read from file
1152 kstring_t str;
1153 str.s = 0; str.l = str.m = 0;
1154 while (bgzf_getline(fp, '\n', &str) >= 0) {
1155 if (str.l == 0) continue;
1156 if (m == n) {
1157 m = m? m<<1 : 16;
1158 s = (char**)realloc(s, m * sizeof(char*));
1159 }
1160 s[n++] = strdup(str.s);
1161 }
1162 bgzf_close(fp);
1163 s = (char**)realloc(s, n * sizeof(char*));
1164 free(str.s);
1165 } else if (*fn == ':') { // read from string
1166 const char *q, *p;
1167 for (q = p = fn + 1;; ++p)
1168 if (*p == ',' || *p == 0) {
1169 if (m == n) {
1170 m = m? m<<1 : 16;
1171 s = (char**)realloc(s, m * sizeof(char*));
1172 }
1173 s[n] = (char*)calloc(p - q + 1, 1);
1174 strncpy(s[n++], q, p - q);
1175 q = p + 1;
1176 if (*p == 0) break;
1177 }
1178 } else return 0;
1179 s = (char**)realloc(s, n * sizeof(char*));
1180 *_n = n;
1181 return s;
1182 }
1183
1184 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1185 int hts_file_type(const char *fname)
1186 {
1187 int len = strlen(fname);
1188 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1189 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1190 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1191 if ( !strcmp("-",fname) ) return FT_STDIN;
1192
1193 hFILE *f = hopen(fname, "r");
1194 if (f == NULL) return 0;
1195
1196 htsFormat fmt;
1197 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1198 if (hclose(f) < 0) return 0;
1199
1200 switch (fmt.format) {
1201 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1202 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1203 default: return 0;
1204 }
1205 }
1206
hts_check_EOF(htsFile * fp)1207 int hts_check_EOF(htsFile *fp)
1208 {
1209 if (fp->format.compression == bgzf)
1210 return bgzf_check_EOF(hts_get_bgzfp(fp));
1211 else if (fp->format.format == cram)
1212 return cram_check_EOF(fp->fp.cram);
1213 else
1214 return 3;
1215 }
1216
1217
1218 /****************
1219 *** Indexing ***
1220 ****************/
1221
1222 #define HTS_MIN_MARKER_DIST 0x10000
1223
1224 // Finds the special meta bin
1225 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1226 #define META_BIN(idx) ((idx)->n_bins + 1)
1227
1228 #define pair64_lt(a,b) ((a).u < (b).u)
1229
1230 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
1231
1232 typedef struct {
1233 int32_t m, n;
1234 uint64_t loff;
1235 hts_pair64_t *list;
1236 } bins_t;
1237
1238 KHASH_MAP_INIT_INT(bin, bins_t)
1239 typedef khash_t(bin) bidx_t;
1240
1241 typedef struct {
1242 int32_t n, m;
1243 uint64_t *offset;
1244 } lidx_t;
1245
1246 struct __hts_idx_t {
1247 int fmt, min_shift, n_lvls, n_bins;
1248 uint32_t l_meta;
1249 int32_t n, m;
1250 uint64_t n_no_coor;
1251 bidx_t **bidx;
1252 lidx_t *lidx;
1253 uint8_t *meta; // MUST have a terminating NUL on the end
1254 struct {
1255 uint32_t last_bin, save_bin;
1256 int last_coor, last_tid, save_tid, finished;
1257 uint64_t last_off, save_off;
1258 uint64_t off_beg, off_end;
1259 uint64_t n_mapped, n_unmapped;
1260 } z; // keep internal states
1261 };
1262
idx_format_name(int fmt)1263 static char * idx_format_name(int fmt) {
1264 switch (fmt) {
1265 case HTS_FMT_CSI: return "csi";
1266 case HTS_FMT_BAI: return "bai";
1267 case HTS_FMT_TBI: return "tbi";
1268 case HTS_FMT_CRAI: return "crai";
1269 default: return "unknown";
1270 }
1271 }
1272
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1273 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1274 {
1275 khint_t k;
1276 bins_t *l;
1277 int absent;
1278 k = kh_put(bin, b, bin, &absent);
1279 if (absent < 0) return -1; // Out of memory
1280 l = &kh_value(b, k);
1281 if (absent) {
1282 l->m = 1; l->n = 0;
1283 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1284 if (!l->list) {
1285 kh_del(bin, b, k);
1286 return -1;
1287 }
1288 } else if (l->n == l->m) {
1289 uint32_t new_m = l->m ? l->m << 1 : 1;
1290 hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1291 if (!new_list) return -1;
1292 l->list = new_list;
1293 l->m = new_m;
1294 }
1295 l->list[l->n].u = beg;
1296 l->list[l->n++].v = end;
1297 return 0;
1298 }
1299
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1300 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1301 {
1302 int i, beg, end;
1303 beg = _beg >> min_shift;
1304 end = (_end - 1) >> min_shift;
1305 if (l->m < end + 1) {
1306 size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1307 uint64_t *new_offset;
1308
1309 new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1310 if (!new_offset) return -1;
1311
1312 // fill unused memory with (uint64_t)-1
1313 memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1314 l->m = new_m;
1315 l->offset = new_offset;
1316 }
1317 for (i = beg; i <= end; ++i) {
1318 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1319 }
1320 if (l->n < end + 1) l->n = end + 1;
1321 return 0;
1322 }
1323
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1324 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1325 {
1326 hts_idx_t *idx;
1327 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1328 if (idx == NULL) return NULL;
1329 idx->fmt = fmt;
1330 idx->min_shift = min_shift;
1331 idx->n_lvls = n_lvls;
1332 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1333 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
1334 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1335 idx->z.last_coor = 0xffffffffu;
1336 if (n) {
1337 idx->n = idx->m = n;
1338 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1339 if (idx->bidx == NULL) { free(idx); return NULL; }
1340 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1341 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1342 }
1343 return idx;
1344 }
1345
update_loff(hts_idx_t * idx,int i,int free_lidx)1346 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1347 {
1348 bidx_t *bidx = idx->bidx[i];
1349 lidx_t *lidx = &idx->lidx[i];
1350 khint_t k;
1351 int l;
1352 uint64_t offset0 = 0;
1353 if (bidx) {
1354 k = kh_get(bin, bidx, META_BIN(idx));
1355 if (k != kh_end(bidx))
1356 offset0 = kh_val(bidx, k).list[0].u;
1357 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1358 lidx->offset[l] = offset0;
1359 } else l = 1;
1360 for (; l < lidx->n; ++l) // fill missing values
1361 if (lidx->offset[l] == (uint64_t)-1)
1362 lidx->offset[l] = lidx->offset[l-1];
1363 if (bidx == 0) return;
1364 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1365 if (kh_exist(bidx, k))
1366 {
1367 if ( kh_key(bidx, k) < idx->n_bins )
1368 {
1369 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1370 // disable linear index if bot_bin out of bounds
1371 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1372 }
1373 else
1374 kh_val(bidx, k).loff = 0;
1375 }
1376 if (free_lidx) {
1377 free(lidx->offset);
1378 lidx->m = lidx->n = 0;
1379 lidx->offset = 0;
1380 }
1381 }
1382
compress_binning(hts_idx_t * idx,int i)1383 static void compress_binning(hts_idx_t *idx, int i)
1384 {
1385 bidx_t *bidx = idx->bidx[i];
1386 khint_t k;
1387 int l, m;
1388 if (bidx == 0) return;
1389 // merge a bin to its parent if the bin is too small
1390 for (l = idx->n_lvls; l > 0; --l) {
1391 unsigned start = hts_bin_first(l);
1392 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1393 bins_t *p, *q;
1394 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1395 p = &kh_value(bidx, k);
1396 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1397 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1398 khint_t kp;
1399 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1400 if (kp == kh_end(bidx)) continue;
1401 q = &kh_val(bidx, kp);
1402 if (q->n + p->n > q->m) {
1403 q->m = q->n + p->n;
1404 kroundup32(q->m);
1405 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
1406 }
1407 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1408 q->n += p->n;
1409 free(p->list);
1410 kh_del(bin, bidx, k);
1411 }
1412 }
1413 }
1414 k = kh_get(bin, bidx, 0);
1415 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1416 // merge adjacent chunks that start from the same BGZF block
1417 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1418 bins_t *p;
1419 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1420 p = &kh_value(bidx, k);
1421 for (l = 1, m = 0; l < p->n; ++l) {
1422 if (p->list[m].v>>16 >= p->list[l].u>>16) {
1423 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1424 } else p->list[++m] = p->list[l];
1425 }
1426 p->n = m + 1;
1427 }
1428 }
1429
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1430 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1431 {
1432 int i;
1433 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
1434 if (idx->z.save_tid >= 0) {
1435 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1436 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1437 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1438 }
1439 for (i = 0; i < idx->n; ++i) {
1440 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1441 compress_binning(idx, i);
1442 }
1443 idx->z.finished = 1;
1444 }
1445
hts_idx_push(hts_idx_t * idx,int tid,int beg,int end,uint64_t offset,int is_mapped)1446 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
1447 {
1448 int bin;
1449 int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1450 if (tid<0) beg = -1, end = 0;
1451 if (tid >= 0 && (beg > maxpos || end > maxpos)) {
1452 goto pos_too_big;
1453 }
1454 if (tid >= idx->m) { // enlarge the index
1455 uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1456 bidx_t **new_bidx;
1457 lidx_t *new_lidx;
1458
1459 new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1460 if (!new_bidx) return -1;
1461 idx->bidx = new_bidx;
1462
1463 new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1464 if (!new_lidx) return -1;
1465 idx->lidx = new_lidx;
1466
1467 memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1468 memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1469 idx->m = new_m;
1470 }
1471 if (idx->n < tid + 1) idx->n = tid + 1;
1472 if (idx->z.finished) return 0;
1473 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1474 if ( tid>=0 && idx->n_no_coor )
1475 {
1476 if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
1477 return -1;
1478 }
1479 if (tid>=0 && idx->bidx[tid] != 0)
1480 {
1481 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
1482 return -1;
1483 }
1484 idx->z.last_tid = tid;
1485 idx->z.last_bin = 0xffffffffu;
1486 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1487 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions on sequence #%d: %d followed by %d\n", __func__, tid+1, idx->z.last_coor+1, beg+1);
1488 return -1;
1489 }
1490 if ( tid>=0 )
1491 {
1492 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1493 if (is_mapped) {
1494 // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1495 if (beg < 0) beg = 0;
1496 if (end <= 0) end = 1;
1497 // idx->z.last_off points to the start of the current record
1498 if (insert_to_l(&idx->lidx[tid], beg, end,
1499 idx->z.last_off, idx->min_shift) < 0) return -1;
1500 }
1501 }
1502 else idx->n_no_coor++;
1503 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1504 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1505 if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1506 if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1507 idx->z.save_off, idx->z.last_off) < 0) return -1;
1508 }
1509 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1510 idx->z.off_end = idx->z.last_off;
1511 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1512 idx->z.off_beg, idx->z.off_end) < 0) return -1;
1513 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1514 idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1515 idx->z.n_mapped = idx->z.n_unmapped = 0;
1516 idx->z.off_beg = idx->z.off_end;
1517 }
1518 idx->z.save_off = idx->z.last_off;
1519 idx->z.save_bin = idx->z.last_bin = bin;
1520 idx->z.save_tid = tid;
1521 }
1522 if (is_mapped) ++idx->z.n_mapped;
1523 else ++idx->z.n_unmapped;
1524 idx->z.last_off = offset;
1525 idx->z.last_coor = beg;
1526 return 0;
1527
1528 pos_too_big: {
1529 int64_t max = end > beg ? end : beg, s = 1 << 14;
1530 int n_lvls = 0;
1531 while (max > s) {
1532 n_lvls++;
1533 s <<= 3;
1534 }
1535
1536 if (hts_verbose >= 1) {
1537 if (idx->fmt == HTS_FMT_CSI) {
1538 fprintf(stderr,
1539 "[E::%s] Region %d..%d cannot be stored in a csi index "
1540 "with min_shift = %d, n_lvls = %d. Try using "
1541 " min_shift = 14, n_lvls >= %d\n",
1542 __func__, beg, end,
1543 idx->min_shift, idx->n_lvls, n_lvls);
1544 } else {
1545 fprintf(stderr,
1546 "[E::%s] Region %d..%d cannot be stored in a %s index. "
1547 "Try using a csi index with min_shift = 14, "
1548 "n_lvls >= %d\n",
1549 __func__, beg, end, idx_format_name(idx->fmt), n_lvls);
1550 }
1551 }
1552 errno = ERANGE;
1553 return -1;
1554 }
1555 }
1556
hts_idx_destroy(hts_idx_t * idx)1557 void hts_idx_destroy(hts_idx_t *idx)
1558 {
1559 khint_t k;
1560 int i;
1561 if (idx == 0) return;
1562
1563 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1564 if (idx->fmt == HTS_FMT_CRAI) {
1565 hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
1566 cram_index_free(cidx->cram);
1567 free(cidx);
1568 return;
1569 }
1570
1571 for (i = 0; i < idx->m; ++i) {
1572 bidx_t *bidx = idx->bidx[i];
1573 free(idx->lidx[i].offset);
1574 if (bidx == 0) continue;
1575 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1576 if (kh_exist(bidx, k))
1577 free(kh_value(bidx, k).list);
1578 kh_destroy(bin, bidx);
1579 }
1580 free(idx->bidx); free(idx->lidx); free(idx->meta);
1581 free(idx);
1582 }
1583
1584 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1585 // TODO Determine endianness at configure- or compile-time
1586
idx_write_int32(BGZF * fp,int32_t x)1587 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
1588 {
1589 if (ed_is_big()) x = ed_swap_4(x);
1590 return bgzf_write(fp, &x, sizeof x);
1591 }
1592
idx_write_uint32(BGZF * fp,uint32_t x)1593 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
1594 {
1595 if (ed_is_big()) x = ed_swap_4(x);
1596 return bgzf_write(fp, &x, sizeof x);
1597 }
1598
idx_write_uint64(BGZF * fp,uint64_t x)1599 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
1600 {
1601 if (ed_is_big()) x = ed_swap_8(x);
1602 return bgzf_write(fp, &x, sizeof x);
1603 }
1604
swap_bins(bins_t * p)1605 static inline void swap_bins(bins_t *p)
1606 {
1607 int i;
1608 for (i = 0; i < p->n; ++i) {
1609 ed_swap_8p(&p->list[i].u);
1610 ed_swap_8p(&p->list[i].v);
1611 }
1612 }
1613
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)1614 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
1615 {
1616 int32_t i, j;
1617
1618 #define check(ret) if ((ret) < 0) return -1
1619
1620 check(idx_write_int32(fp, idx->n));
1621 if (fmt == HTS_FMT_TBI && idx->l_meta)
1622 check(bgzf_write(fp, idx->meta, idx->l_meta));
1623
1624 for (i = 0; i < idx->n; ++i) {
1625 khint_t k;
1626 bidx_t *bidx = idx->bidx[i];
1627 lidx_t *lidx = &idx->lidx[i];
1628 // write binning index
1629 check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
1630 if (bidx)
1631 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1632 if (kh_exist(bidx, k)) {
1633 bins_t *p = &kh_value(bidx, k);
1634 check(idx_write_uint32(fp, kh_key(bidx, k)));
1635 if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
1636 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1637 check(idx_write_int32(fp, p->n));
1638 for (j = 0; j < p->n; ++j) {
1639 check(idx_write_uint64(fp, p->list[j].u));
1640 check(idx_write_uint64(fp, p->list[j].v));
1641 }
1642 }
1643
1644 // write linear index
1645 if (fmt != HTS_FMT_CSI) {
1646 check(idx_write_int32(fp, lidx->n));
1647 for (j = 0; j < lidx->n; ++j)
1648 check(idx_write_uint64(fp, lidx->offset[j]));
1649 }
1650 }
1651
1652 check(idx_write_uint64(fp, idx->n_no_coor));
1653 return 0;
1654 #undef check
1655 }
1656
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)1657 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1658 {
1659 int ret, save;
1660 char *fnidx = (char*)calloc(1, strlen(fn) + 5);
1661 if (fnidx == NULL) return -1;
1662
1663 strcpy(fnidx, fn);
1664 switch (fmt) {
1665 case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
1666 case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
1667 case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
1668 default: abort();
1669 }
1670
1671 ret = hts_idx_save_as(idx, fn, fnidx, fmt);
1672 save = errno;
1673 free(fnidx);
1674 errno = save;
1675 return ret;
1676 }
1677
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)1678 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
1679 {
1680 BGZF *fp;
1681
1682 #define check(ret) if ((ret) < 0) goto fail
1683
1684 if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
1685
1686 fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
1687 if (fp == NULL) return -1;
1688
1689 if (fmt == HTS_FMT_CSI) {
1690 check(bgzf_write(fp, "CSI\1", 4));
1691 check(idx_write_int32(fp, idx->min_shift));
1692 check(idx_write_int32(fp, idx->n_lvls));
1693 check(idx_write_uint32(fp, idx->l_meta));
1694 if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
1695 } else if (fmt == HTS_FMT_TBI) {
1696 check(bgzf_write(fp, "TBI\1", 4));
1697 } else if (fmt == HTS_FMT_BAI) {
1698 check(bgzf_write(fp, "BAI\1", 4));
1699 } else abort();
1700
1701 check(hts_idx_save_core(idx, fp, fmt));
1702
1703 return bgzf_close(fp);
1704 #undef check
1705
1706 fail:
1707 bgzf_close(fp);
1708 return -1;
1709 }
1710
hts_idx_load_core(hts_idx_t * idx,BGZF * fp,int fmt)1711 static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
1712 {
1713 int32_t i, n, is_be;
1714 is_be = ed_is_big();
1715 if (idx == NULL) return -4;
1716 for (i = 0; i < idx->n; ++i) {
1717 bidx_t *h;
1718 lidx_t *l = &idx->lidx[i];
1719 uint32_t key;
1720 int j, absent;
1721 bins_t *p;
1722 h = idx->bidx[i] = kh_init(bin);
1723 if (bgzf_read(fp, &n, 4) != 4) return -1;
1724 if (is_be) ed_swap_4p(&n);
1725 for (j = 0; j < n; ++j) {
1726 khint_t k;
1727 if (bgzf_read(fp, &key, 4) != 4) return -1;
1728 if (is_be) ed_swap_4p(&key);
1729 k = kh_put(bin, h, key, &absent);
1730 if (absent <= 0) return -3; // Duplicate bin number
1731 p = &kh_val(h, k);
1732 if (fmt == HTS_FMT_CSI) {
1733 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
1734 if (is_be) ed_swap_8p(&p->loff);
1735 } else p->loff = 0;
1736 if (bgzf_read(fp, &p->n, 4) != 4) return -1;
1737 if (is_be) ed_swap_4p(&p->n);
1738 p->m = p->n;
1739 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1740 if (p->list == NULL) return -2;
1741 if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1;
1742 if (is_be) swap_bins(p);
1743 }
1744 if (fmt != HTS_FMT_CSI) { // load linear index
1745 int j;
1746 if (bgzf_read(fp, &l->n, 4) != 4) return -1;
1747 if (is_be) ed_swap_4p(&l->n);
1748 l->m = l->n;
1749 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1750 if (l->offset == NULL) return -2;
1751 if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
1752 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1753 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1754 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1755 update_loff(idx, i, 1);
1756 }
1757 }
1758 if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1759 if (is_be) ed_swap_8p(&idx->n_no_coor);
1760 return 0;
1761 }
1762
hts_idx_load_local(const char * fn)1763 static hts_idx_t *hts_idx_load_local(const char *fn)
1764 {
1765 uint8_t magic[4];
1766 int i, is_be;
1767 hts_idx_t *idx = NULL;
1768 uint8_t *meta = NULL;
1769 BGZF *fp = bgzf_open(fn, "r");
1770 if (fp == NULL) return NULL;
1771 is_be = ed_is_big();
1772 if (bgzf_read(fp, magic, 4) != 4) goto fail;
1773
1774 if (memcmp(magic, "CSI\1", 4) == 0) {
1775 uint32_t x[3], n;
1776 if (bgzf_read(fp, x, 12) != 12) goto fail;
1777 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1778 if (x[2]) {
1779 if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
1780 if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
1781 if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
1782 // Prevent possible strlen past the end in tbx_index_load2
1783 meta[x[2]] = '\0';
1784 }
1785 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1786 if (is_be) ed_swap_4p(&n);
1787 if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
1788 idx->l_meta = x[2];
1789 idx->meta = meta;
1790 meta = NULL;
1791 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
1792 }
1793 else if (memcmp(magic, "TBI\1", 4) == 0) {
1794 uint8_t x[8 * 4];
1795 uint32_t n;
1796 // Read file header
1797 if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
1798 n = le_to_u32(&x[0]); // location of n_ref
1799 if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
1800 n = le_to_u32(&x[7*4]); // location of l_nm
1801 if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
1802 idx->l_meta = 28 + n;
1803 if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
1804 // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1805 // N.B. left in little-endian byte order.
1806 memcpy(idx->meta, &x[1*4], 28);
1807 // Read in sequence names.
1808 if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
1809 // Prevent possible strlen past the end in tbx_index_load2
1810 idx->meta[idx->l_meta] = '\0';
1811 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
1812 }
1813 else if (memcmp(magic, "BAI\1", 4) == 0) {
1814 uint32_t n;
1815 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1816 if (is_be) ed_swap_4p(&n);
1817 idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5);
1818 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
1819 }
1820 else { errno = EINVAL; goto fail; }
1821
1822 bgzf_close(fp);
1823 return idx;
1824
1825 fail:
1826 bgzf_close(fp);
1827 hts_idx_destroy(idx);
1828 free(meta);
1829 return NULL;
1830 }
1831
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)1832 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
1833 int is_copy)
1834 {
1835 uint8_t *new_meta = meta;
1836 if (is_copy) {
1837 size_t l = l_meta;
1838 if (l > SIZE_MAX - 1) {
1839 errno = ENOMEM;
1840 return -1;
1841 }
1842 new_meta = malloc(l + 1);
1843 if (!new_meta) return -1;
1844 memcpy(new_meta, meta, l);
1845 // Prevent possible strlen past the end in tbx_index_load2
1846 meta[l + 1] = '\0';
1847 }
1848 if (idx->meta) free(idx->meta);
1849 idx->l_meta = l_meta;
1850 idx->meta = new_meta;
1851 return 0;
1852 }
1853
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)1854 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
1855 {
1856 *l_meta = idx->l_meta;
1857 return idx->meta;
1858 }
1859
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)1860 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1861 {
1862 if ( !idx->n )
1863 {
1864 *n = 0;
1865 return NULL;
1866 }
1867
1868 int tid = 0, i;
1869 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1870 for (i=0; i<idx->n; i++)
1871 {
1872 bidx_t *bidx = idx->bidx[i];
1873 if ( !bidx ) continue;
1874 names[tid++] = getid(hdr,i);
1875 }
1876 *n = tid;
1877 return names;
1878 }
1879
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)1880 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1881 {
1882 if ( idx->fmt == HTS_FMT_CRAI ) {
1883 *mapped = 0; *unmapped = 0;
1884 return -1;
1885 }
1886
1887 bidx_t *h = idx->bidx[tid];
1888 khint_t k = kh_get(bin, h, META_BIN(idx));
1889 if (k != kh_end(h)) {
1890 *mapped = kh_val(h, k).list[1].u;
1891 *unmapped = kh_val(h, k).list[1].v;
1892 return 0;
1893 } else {
1894 *mapped = 0; *unmapped = 0;
1895 return -1;
1896 }
1897 }
1898
hts_idx_get_n_no_coor(const hts_idx_t * idx)1899 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1900 {
1901 return idx->n_no_coor;
1902 }
1903
1904 /****************
1905 *** Iterator ***
1906 ****************/
1907
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)1908 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1909 {
1910 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1911 if (beg >= end) return 0;
1912 if (end >= 1LL<<s) end = 1LL<<s;
1913 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1914 int b, e, n, i;
1915 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1916 if (itr->bins.n + n > itr->bins.m) {
1917 itr->bins.m = itr->bins.n + n;
1918 kroundup32(itr->bins.m);
1919 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1920 }
1921 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1922 }
1923 return itr->bins.n;
1924 }
1925
hts_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)1926 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
1927 {
1928 int i, n_off, l, bin;
1929 hts_pair64_t *off;
1930 khint_t k;
1931 bidx_t *bidx;
1932 uint64_t min_off, max_off;
1933 hts_itr_t *iter = 0;
1934 if (tid < 0) {
1935 int finished0 = 0;
1936 uint64_t off0 = (uint64_t)-1;
1937 khint_t k;
1938 switch (tid) {
1939 case HTS_IDX_START:
1940 // Find the smallest offset, note that sequence ids may not be ordered sequentially
1941 for (i=0; i<idx->n; i++)
1942 {
1943 bidx = idx->bidx[i];
1944 k = kh_get(bin, bidx, META_BIN(idx));
1945 if (k == kh_end(bidx)) continue;
1946 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
1947 }
1948 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1949 break;
1950
1951 case HTS_IDX_NOCOOR:
1952 /* No-coor reads sort after all of the mapped reads. The position
1953 is not stored in the index itself, so need to find the end
1954 offset for the last mapped read. A loop is needed here in
1955 case references at the end of the file have no mapped reads,
1956 or sequence ids are not ordered sequentially.
1957 See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
1958 for (i = 0; i < idx->n; i++) {
1959 bidx = idx->bidx[i];
1960 k = kh_get(bin, bidx, META_BIN(idx));
1961 if (k != kh_end(bidx)) {
1962 if (off0==(uint64_t)-1 || off0 < kh_val(bidx, k).list[0].v) {
1963 off0 = kh_val(bidx, k).list[0].v;
1964 }
1965 }
1966 }
1967 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1968 break;
1969
1970 case HTS_IDX_REST:
1971 off0 = 0;
1972 break;
1973
1974 case HTS_IDX_NONE:
1975 finished0 = 1;
1976 off0 = 0;
1977 break;
1978
1979 default:
1980 return 0;
1981 }
1982 if (off0 != (uint64_t)-1) {
1983 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1984 iter->read_rest = 1;
1985 iter->finished = finished0;
1986 iter->curr_off = off0;
1987 iter->readrec = readrec;
1988 return iter;
1989 } else return 0;
1990 }
1991
1992 if (beg < 0) beg = 0;
1993 if (end < beg) return 0;
1994 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
1995
1996 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1997 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
1998 iter->readrec = readrec;
1999
2000 if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2001
2002 // compute min_off
2003 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2004 do {
2005 int first;
2006 k = kh_get(bin, bidx, bin);
2007 if (k != kh_end(bidx)) break;
2008 first = (hts_bin_parent(bin)<<3) + 1;
2009 if (bin > first) --bin;
2010 else bin = hts_bin_parent(bin);
2011 } while (bin);
2012 if (bin == 0) k = kh_get(bin, bidx, bin);
2013 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2014
2015 // compute max_off: a virtual offset from a bin to the right of end
2016 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2017 if (bin >= idx->n_bins) bin = 0;
2018 while (1) {
2019 // search for an extant bin by moving right, but moving up to the
2020 // parent whenever we get to a first child (which also covers falling
2021 // off the RHS, which wraps around and immediately goes up to bin 0)
2022 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2023 if (bin == 0) { max_off = (uint64_t)-1; break; }
2024 k = kh_get(bin, bidx, bin);
2025 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2026 bin++;
2027 }
2028
2029 // retrieve bins
2030 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2031 for (i = n_off = 0; i < iter->bins.n; ++i)
2032 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2033 n_off += kh_value(bidx, k).n;
2034 if (n_off == 0) {
2035 // No overlapping bins means the iterator has already finished.
2036 iter->finished = 1;
2037 return iter;
2038 }
2039 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
2040 for (i = n_off = 0; i < iter->bins.n; ++i) {
2041 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2042 int j;
2043 bins_t *p = &kh_value(bidx, k);
2044 for (j = 0; j < p->n; ++j)
2045 if (p->list[j].v > min_off && p->list[j].u < max_off)
2046 off[n_off++] = p->list[j];
2047 }
2048 }
2049 if (n_off == 0) {
2050 free(off);
2051 iter->finished = 1;
2052 return iter;
2053 }
2054 ks_introsort(_off, n_off, off);
2055 // resolve completely contained adjacent blocks
2056 for (i = 1, l = 0; i < n_off; ++i)
2057 if (off[l].v < off[i].v) off[++l] = off[i];
2058 n_off = l + 1;
2059 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2060 for (i = 1; i < n_off; ++i)
2061 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2062 // merge adjacent blocks
2063 for (i = 1, l = 0; i < n_off; ++i) {
2064 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2065 else off[++l] = off[i];
2066 }
2067 n_off = l + 1;
2068 iter->n_off = n_off; iter->off = off;
2069 return iter;
2070 }
2071
hts_itr_destroy(hts_itr_t * iter)2072 void hts_itr_destroy(hts_itr_t *iter)
2073 {
2074 if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
2075 }
2076
push_digit(long long i,char c)2077 static inline long long push_digit(long long i, char c)
2078 {
2079 // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2080 int digit = c - '0';
2081 return 10 * i + digit;
2082 }
2083
hts_parse_decimal(const char * str,char ** strend,int flags)2084 long long hts_parse_decimal(const char *str, char **strend, int flags)
2085 {
2086 long long n = 0;
2087 int decimals = 0, e = 0, lost = 0;
2088 char sign = '+', esign = '+';
2089 const char *s;
2090
2091 while (isspace_c(*str)) str++;
2092 s = str;
2093
2094 if (*s == '+' || *s == '-') sign = *s++;
2095 while (*s)
2096 if (isdigit_c(*s)) n = push_digit(n, *s++);
2097 else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
2098 else break;
2099
2100 if (*s == '.') {
2101 s++;
2102 while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
2103 }
2104
2105 if (*s == 'E' || *s == 'e') {
2106 s++;
2107 if (*s == '+' || *s == '-') esign = *s++;
2108 while (isdigit_c(*s)) e = push_digit(e, *s++);
2109 if (esign == '-') e = -e;
2110 }
2111
2112 e -= decimals;
2113 while (e > 0) n *= 10, e--;
2114 while (e < 0) lost += n % 10, n /= 10, e++;
2115
2116 if (lost > 0 && hts_verbose >= 3)
2117 fprintf(stderr, "[W::%s] discarding fractional part of %.*s\n",
2118 __func__, (int)(s - str), str);
2119
2120 if (strend) *strend = (char *) s;
2121 else if (*s && hts_verbose >= 2)
2122 fprintf(stderr, "[W::%s] ignoring unknown characters after %.*s[%s]\n",
2123 __func__, (int)(s - str), str, s);
2124
2125 return (sign == '+')? n : -n;
2126 }
2127
hts_parse_reg(const char * s,int * beg,int * end)2128 const char *hts_parse_reg(const char *s, int *beg, int *end)
2129 {
2130 char *hyphen;
2131 const char *colon = strrchr(s, ':');
2132 if (colon == NULL) {
2133 *beg = 0; *end = INT_MAX;
2134 return s + strlen(s);
2135 }
2136
2137 *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2138 if (*beg < 0) *beg = 0;
2139
2140 if (*hyphen == '\0') *end = INT_MAX;
2141 else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2142 else return NULL;
2143
2144 if (*beg >= *end) return NULL;
2145 return colon;
2146 }
2147
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)2148 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
2149 {
2150 int tid, beg, end;
2151 const char *q;
2152
2153 if (strcmp(reg, ".") == 0)
2154 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
2155 else if (strcmp(reg, "*") == 0)
2156 return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
2157
2158 q = hts_parse_reg(reg, &beg, &end);
2159 if (q) {
2160 char tmp_a[1024], *tmp = tmp_a;
2161 if (q - reg + 1 > 1024)
2162 if (!(tmp = malloc(q - reg + 1)))
2163 return NULL;
2164 strncpy(tmp, reg, q - reg);
2165 tmp[q - reg] = 0;
2166 tid = getid(hdr, tmp);
2167 if (tmp != tmp_a)
2168 free(tmp);
2169 }
2170 else {
2171 // not parsable as a region, but possibly a sequence named "foo:a"
2172 tid = getid(hdr, reg);
2173 beg = 0; end = INT_MAX;
2174 }
2175
2176 if (tid < 0) return NULL;
2177 return itr_query(idx, tid, beg, end, readrec);
2178 }
2179
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)2180 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
2181 {
2182 int ret, tid, beg, end;
2183 if (iter == NULL || iter->finished) return -1;
2184 if (iter->read_rest) {
2185 if (iter->curr_off) { // seek to the start
2186 if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
2187 iter->curr_off = 0; // only seek once
2188 }
2189 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
2190 if (ret < 0) iter->finished = 1;
2191 iter->curr_tid = tid;
2192 iter->curr_beg = beg;
2193 iter->curr_end = end;
2194 return ret;
2195 }
2196 // A NULL iter->off should always be accompanied by iter->finished.
2197 assert(iter->off != NULL);
2198 for (;;) {
2199 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2200 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
2201 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2202 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
2203 iter->curr_off = bgzf_tell(fp);
2204 }
2205 ++iter->i;
2206 }
2207 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
2208 iter->curr_off = bgzf_tell(fp);
2209 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
2210 ret = -1; break;
2211 } else if (end > iter->beg && iter->end > beg) {
2212 iter->curr_tid = tid;
2213 iter->curr_beg = beg;
2214 iter->curr_end = end;
2215 return ret;
2216 }
2217 } else break; // end of file or error
2218 }
2219 iter->finished = 1;
2220 return ret;
2221 }
2222
2223 /**********************
2224 *** Retrieve index ***
2225 **********************/
2226
test_and_fetch(const char * fn)2227 static char *test_and_fetch(const char *fn)
2228 {
2229 if (hisremote(fn)) {
2230 const int buf_size = 1 * 1024 * 1024;
2231 hFILE *fp_remote;
2232 FILE *fp;
2233 uint8_t *buf;
2234 int l;
2235 const char *p;
2236 for (p = fn + strlen(fn) - 1; p >= fn; --p)
2237 if (*p == '/') break;
2238 ++p; // p now points to the local file name
2239 // Attempt to open local file first
2240 if ((fp = fopen((char*)p, "rb")) != 0)
2241 {
2242 fclose(fp);
2243 return (char*)p;
2244 }
2245 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2246 if ((fp_remote = hopen(fn, "r")) == 0) return 0;
2247 if ((fp = fopen(p, "w")) == 0) {
2248 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
2249 hclose_abruptly(fp_remote);
2250 return 0;
2251 }
2252 if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
2253 buf = (uint8_t*)calloc(buf_size, 1);
2254 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
2255 free(buf);
2256 fclose(fp);
2257 if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
2258 return (char*)p;
2259 } else {
2260 hFILE *fp;
2261 if ((fp = hopen(fn, "r")) == 0) return 0;
2262 hclose_abruptly(fp);
2263 return (char*)fn;
2264 }
2265 }
2266
hts_idx_getfn(const char * fn,const char * ext)2267 char *hts_idx_getfn(const char *fn, const char *ext)
2268 {
2269 int i, l_fn, l_ext;
2270 char *fnidx, *ret;
2271 l_fn = strlen(fn); l_ext = strlen(ext);
2272 fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
2273 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
2274 if ((ret = test_and_fetch(fnidx)) == 0) {
2275 for (i = l_fn - 1; i > 0; --i)
2276 if (fnidx[i] == '.') break;
2277 strcpy(fnidx + i, ext);
2278 ret = test_and_fetch(fnidx);
2279 }
2280 if (ret == 0) {
2281 free(fnidx);
2282 return 0;
2283 }
2284 l_fn = strlen(ret);
2285 memmove(fnidx, ret, l_fn + 1);
2286 return fnidx;
2287 }
2288
hts_idx_load(const char * fn,int fmt)2289 hts_idx_t *hts_idx_load(const char *fn, int fmt)
2290 {
2291 char *fnidx;
2292 hts_idx_t *idx;
2293 fnidx = hts_idx_getfn(fn, ".csi");
2294 if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
2295 if (fnidx == 0) return 0;
2296
2297 idx = hts_idx_load2(fn, fnidx);
2298 free(fnidx);
2299 return idx;
2300 }
2301
hts_idx_load2(const char * fn,const char * fnidx)2302 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
2303 {
2304 // Check that the index file is up to date, the main file might have changed
2305 struct stat stat_idx,stat_main;
2306 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
2307 {
2308 if ( hts_verbose >= 1 && stat_idx.st_mtime < stat_main.st_mtime )
2309 fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
2310 }
2311
2312 return hts_idx_load_local(fnidx);
2313 }
2314
2315
2316
2317 /**********************
2318 *** Memory ***
2319 **********************/
2320
2321 /* For use with hts_expand macros *only* */
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)2322 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
2323 int clear, void **ptr, const char *func) {
2324 /* If new_m and size are both below this limit, multiplying them
2325 together can't overflow */
2326 const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
2327 void *new_ptr;
2328 size_t bytes, new_m;
2329
2330 new_m = n;
2331 kroundup_size_t(new_m);
2332
2333 bytes = size * new_m;
2334
2335 /* Check for overflow. Both ensure that new_m will fit in m (we make the
2336 pessimistic assumption that m is signed), and that bytes has not
2337 wrapped around. */
2338 if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
2339 || ((size > safe || new_m > safe)
2340 && bytes / new_m != size)) {
2341 errno = ENOMEM;
2342 goto die;
2343 }
2344
2345 new_ptr = realloc(*ptr, bytes);
2346 if (new_ptr == NULL) goto die;
2347
2348 if (clear) {
2349 if (new_m > m) {
2350 memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
2351 }
2352 }
2353
2354 *ptr = new_ptr;
2355
2356 return new_m;
2357
2358 die:
2359 if (hts_verbose > 1)
2360 fprintf(stderr, "[E::%s] %s\n", func, strerror(errno));
2361 exit(1);
2362 }
2363