1 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2 
3     Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4     Copyright (C) 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include <fcntl.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <assert.h>
38 
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
44 #include "version.h"
45 #include "hts_internal.h"
46 
47 #include "htslib/khash.h"
48 #include "htslib/kseq.h"
49 #include "htslib/ksort.h"
50 
51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
52 
53 int hts_verbose = 3;
54 
hts_version()55 const char *hts_version()
56 {
57     return HTS_VERSION;
58 }
59 
60 const unsigned char seq_nt16_table[256] = {
61     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
62     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
63     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64      1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
65     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
66     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
67     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
68     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
69 
70     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
71     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
72     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
78 };
79 
80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
81 
82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
83 
84 /**********************
85  *** Basic file I/O ***
86  **********************/
87 
format_category(enum htsExactFormat fmt)88 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
89 {
90     switch (fmt) {
91     case bam:
92     case sam:
93     case cram:
94         return sequence_data;
95 
96     case vcf:
97     case bcf:
98         return variant_data;
99 
100     case bai:
101     case crai:
102     case csi:
103     case gzi:
104     case tbi:
105         return index_file;
106 
107     case bed:
108         return region_list;
109 
110     case json:
111         return unknown_category;
112 
113     case unknown_format:
114     case binary_format:
115     case text_format:
116     case format_maximum:
117         break;
118     }
119 
120     return unknown_category;
121 }
122 
123 // Decompress up to ten or so bytes by peeking at the file, which must be
124 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)125 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
126 {
127     // Typically at most a couple of hundred bytes of input are required
128     // to get a few bytes of output from inflate(), so hopefully this buffer
129     // size suffices in general.
130     unsigned char buffer[512];
131     z_stream zs;
132     ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
133 
134     if (npeek < 0) return 0;
135 
136     zs.zalloc = NULL;
137     zs.zfree = NULL;
138     zs.next_in = buffer;
139     zs.avail_in = npeek;
140     zs.next_out = dest;
141     zs.avail_out = destsize;
142     if (inflateInit2(&zs, 31) != Z_OK) return 0;
143 
144     while (zs.total_out < destsize)
145         if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
146 
147     destsize = zs.total_out;
148     inflateEnd(&zs);
149 
150     return destsize;
151 }
152 
153 // Parse "x.y" text, taking care because the string is not NUL-terminated
154 // and filling in major/minor only when the digits are followed by a delimiter,
155 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
156 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)157 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
158 {
159     const char *s    = (const char *) u;
160     const char *slim = (const char *) ulim;
161     short v;
162 
163     fmt->version.major = fmt->version.minor = -1;
164 
165     for (v = 0; s < slim && isdigit_c(*s); s++)
166         v = 10 * v + *s - '0';
167 
168     if (s < slim) {
169         fmt->version.major = v;
170         if (*s == '.') {
171             s++;
172             for (v = 0; s < slim && isdigit_c(*s); s++)
173                 v = 10 * v + *s - '0';
174             if (s < slim)
175                 fmt->version.minor = v;
176         }
177         else
178             fmt->version.minor = 0;
179     }
180 }
181 
182 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)183 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
184 {
185     const unsigned char *ukey = (const unsigned char *) key;
186 
187     while (*ukey)
188         if (u >= ulim) return +1;
189         else if (isspace_c(*u)) u++;
190         else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
191         else u++, ukey++;
192 
193     return 0;
194 }
195 
hts_detect_format(hFILE * hfile,htsFormat * fmt)196 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
197 {
198     unsigned char s[21];
199     ssize_t len = hpeek(hfile, s, 18);
200     if (len < 0) return -1;
201 
202     if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
203         // The stream is either gzip-compressed or BGZF-compressed.
204         // Determine which, and decompress the first few bytes.
205         fmt->compression = (len >= 18 && (s[3] & 4) &&
206                             memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
207         len = decompress_peek(hfile, s, sizeof s);
208     }
209     else {
210         fmt->compression = no_compression;
211         len = hpeek(hfile, s, sizeof s);
212     }
213     if (len < 0) return -1;
214 
215     fmt->compression_level = -1;
216     fmt->specific = NULL;
217 
218     if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
219         fmt->category = sequence_data;
220         fmt->format = cram;
221         fmt->version.major = s[4], fmt->version.minor = s[5];
222         fmt->compression = custom;
223         return 0;
224     }
225     else if (len >= 4 && s[3] <= '\4') {
226         if (memcmp(s, "BAM\1", 4) == 0) {
227             fmt->category = sequence_data;
228             fmt->format = bam;
229             // TODO Decompress enough to pick version from @HD-VN header
230             fmt->version.major = 1, fmt->version.minor = -1;
231             return 0;
232         }
233         else if (memcmp(s, "BAI\1", 4) == 0) {
234             fmt->category = index_file;
235             fmt->format = bai;
236             fmt->version.major = -1, fmt->version.minor = -1;
237             return 0;
238         }
239         else if (memcmp(s, "BCF\4", 4) == 0) {
240             fmt->category = variant_data;
241             fmt->format = bcf;
242             fmt->version.major = 1, fmt->version.minor = -1;
243             return 0;
244         }
245         else if (memcmp(s, "BCF\2", 4) == 0) {
246             fmt->category = variant_data;
247             fmt->format = bcf;
248             fmt->version.major = s[3];
249             fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
250             return 0;
251         }
252         else if (memcmp(s, "CSI\1", 4) == 0) {
253             fmt->category = index_file;
254             fmt->format = csi;
255             fmt->version.major = 1, fmt->version.minor = -1;
256             return 0;
257         }
258         else if (memcmp(s, "TBI\1", 4) == 0) {
259             fmt->category = index_file;
260             fmt->format = tbi;
261             fmt->version.major = -1, fmt->version.minor = -1;
262             return 0;
263         }
264     }
265     else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
266         fmt->category = variant_data;
267         fmt->format = vcf;
268         if (len >= 21 && s[16] == 'v')
269             parse_version(fmt, &s[17], &s[len]);
270         else
271             fmt->version.major = fmt->version.minor = -1;
272         return 0;
273     }
274     else if (len >= 4 && s[0] == '@' &&
275              (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
276               memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
277         fmt->category = sequence_data;
278         fmt->format = sam;
279         // @HD-VN is not guaranteed to be the first tag, but then @HD is
280         // not guaranteed to be present at all...
281         if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
282             parse_version(fmt, &s[7], &s[len]);
283         else
284             fmt->version.major = 1, fmt->version.minor = -1;
285         return 0;
286     }
287     else if (cmp_nonblank("{\"", s, &s[len]) == 0) {
288         fmt->category = unknown_category;
289         fmt->format = json;
290         fmt->version.major = fmt->version.minor = -1;
291         return 0;
292     }
293     else {
294         // Various possibilities for tab-delimited text:
295         // .crai   (gzipped tab-delimited six columns: seqid 5*number)
296         // .bed    ([3..12] tab-delimited columns)
297         // .bedpe  (>= 10 tab-delimited columns)
298         // .sam    (tab-delimited >= 11 columns: seqid number seqid...)
299         // FIXME For now, assume it's SAM
300         fmt->category = sequence_data;
301         fmt->format = sam;
302         fmt->version.major = 1, fmt->version.minor = -1;
303         return 0;
304     }
305 
306     fmt->category = unknown_category;
307     fmt->format = unknown_format;
308     fmt->version.major = fmt->version.minor = -1;
309     fmt->compression = no_compression;
310     return 0;
311 }
312 
hts_format_description(const htsFormat * format)313 char *hts_format_description(const htsFormat *format)
314 {
315     kstring_t str = { 0, 0, NULL };
316 
317     switch (format->format) {
318     case sam:   kputs("SAM", &str); break;
319     case bam:   kputs("BAM", &str); break;
320     case cram:  kputs("CRAM", &str); break;
321     case vcf:   kputs("VCF", &str); break;
322     case bcf:
323         if (format->version.major == 1) kputs("Legacy BCF", &str);
324         else kputs("BCF", &str);
325         break;
326     case bai:   kputs("BAI", &str); break;
327     case crai:  kputs("CRAI", &str); break;
328     case csi:   kputs("CSI", &str); break;
329     case tbi:   kputs("Tabix", &str); break;
330     case json:  kputs("JSON", &str); break;
331     default:    kputs("unknown", &str); break;
332     }
333 
334     if (format->version.major >= 0) {
335         kputs(" version ", &str);
336         kputw(format->version.major, &str);
337         if (format->version.minor >= 0) {
338             kputc('.', &str);
339             kputw(format->version.minor, &str);
340         }
341     }
342 
343     switch (format->compression) {
344     case custom: kputs(" compressed", &str); break;
345     case gzip:   kputs(" gzip-compressed", &str); break;
346     case bgzf:
347         switch (format->format) {
348         case bam:
349         case bcf:
350         case csi:
351         case tbi:
352             // These are by definition BGZF, so just use the generic term
353             kputs(" compressed", &str);
354             break;
355         default:
356             kputs(" BGZF-compressed", &str);
357             break;
358         }
359         break;
360     default: break;
361     }
362 
363     switch (format->category) {
364     case sequence_data: kputs(" sequence", &str); break;
365     case variant_data:  kputs(" variant calling", &str); break;
366     case index_file:    kputs(" index", &str); break;
367     case region_list:   kputs(" genomic region", &str); break;
368     default: break;
369     }
370 
371     if (format->compression == no_compression)
372         switch (format->format) {
373         case sam:
374         case crai:
375         case vcf:
376         case bed:
377         case json:
378             kputs(" text", &str);
379             break;
380 
381         default:
382             kputs(" data", &str);
383             break;
384         }
385     else
386         kputs(" data", &str);
387 
388     return ks_release(&str);
389 }
390 
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)391 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
392 {
393     char smode[102], *cp, *cp2, *mode_c;
394     htsFile *fp = NULL;
395     hFILE *hfile;
396     char fmt_code = '\0';
397 
398     strncpy(smode, mode, 100);
399     smode[100]=0;
400     if ((cp = strchr(smode, ',')))
401         *cp = '\0';
402 
403     // Migrate format code (b or c) to the end of the smode buffer.
404     for (cp2 = cp = smode; *cp; cp++) {
405         if (*cp == 'b')
406             fmt_code = 'b';
407         else if (*cp == 'c')
408             fmt_code = 'c';
409         else
410             *cp2++ = *cp;
411     }
412     mode_c = cp2;
413     *cp2++ = fmt_code;
414     *cp2++ = 0;
415     *cp2++ = 0;
416 
417     // Set or reset the format code if opts->format is used
418     if (fmt && fmt->format != unknown_format)
419         *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
420 
421     hfile = hopen(fn, smode);
422     if (hfile == NULL) goto error;
423 
424     fp = hts_hopen(hfile, fn, smode);
425     if (fp == NULL) goto error;
426 
427     if (fmt && fmt->specific)
428         if (hts_opt_apply(fp, fmt->specific) != 0)
429             goto error;
430 
431     return fp;
432 
433 error:
434     if (hts_verbose >= 2)
435         fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
436 
437     if (hfile)
438         hclose_abruptly(hfile);
439 
440     return NULL;
441 }
442 
hts_open(const char * fn,const char * mode)443 htsFile *hts_open(const char *fn, const char *mode) {
444     return hts_open_format(fn, mode, NULL);
445 }
446 
447 /*
448  * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
449  * the prefix in lowercase into buf and returning a pointer to the suffix.
450  * On return, buf is always NUL-terminated; thus assumes that the "keyword"
451  * prefix should be one of several known values of maximum length buflen-2.
452  * (If delim is not found, returns a pointer to the '\0'.)
453  */
454 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)455 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
456 {
457     size_t i = 0;
458     while (*str && *str != delim) {
459         if (i < buflen-1) buf[i++] = tolower_c(*str);
460         str++;
461     }
462 
463     buf[i] = '\0';
464     return *str? str+1 : str;
465 }
466 
467 /*
468  * Parses arg and appends it to the option list.
469  *
470  * Returns 0 on success;
471  *        -1 on failure.
472  */
hts_opt_add(hts_opt ** opts,const char * c_arg)473 int hts_opt_add(hts_opt **opts, const char *c_arg) {
474     hts_opt *o, *t;
475     char *val;
476 
477     /*
478      * IMPORTANT!!!
479      * If you add another string option here, don't forget to also add
480      * it to the case statement in hts_opt_apply.
481      */
482 
483     if (!c_arg)
484         return -1;
485 
486     if (!(o =  malloc(sizeof(*o))))
487         return -1;
488 
489     if (!(o->arg = strdup(c_arg))) {
490         free(o);
491         return -1;
492     }
493 
494     if (!(val = strchr(o->arg, '=')))
495         val = "1"; // assume boolean
496     else
497         *val++ = '\0';
498 
499     if (strcmp(o->arg, "decode_md") == 0 ||
500         strcmp(o->arg, "DECODE_MD") == 0)
501         o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
502 
503     else if (strcmp(o->arg, "verbosity") == 0 ||
504              strcmp(o->arg, "VERBOSITY") == 0)
505         o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
506 
507     else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
508              strcmp(o->arg, "SEQS_PER_SLICE") == 0)
509         o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
510 
511     else if (strcmp(o->arg, "bases_per_slice") == 0 ||
512              strcmp(o->arg, "BASES_PER_SLICE") == 0)
513         o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
514 
515     else if (strcmp(o->arg, "slices_per_container") == 0 ||
516              strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
517         o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
518 
519     else if (strcmp(o->arg, "embed_ref") == 0 ||
520              strcmp(o->arg, "EMBED_REF") == 0)
521         o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
522 
523     else if (strcmp(o->arg, "no_ref") == 0 ||
524              strcmp(o->arg, "NO_REF") == 0)
525         o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
526 
527     else if (strcmp(o->arg, "ignore_md5") == 0 ||
528              strcmp(o->arg, "IGNORE_MD5") == 0)
529         o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
530 
531     else if (strcmp(o->arg, "use_bzip2") == 0 ||
532              strcmp(o->arg, "USE_BZIP2") == 0)
533         o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
534 
535     else if (strcmp(o->arg, "use_rans") == 0 ||
536              strcmp(o->arg, "USE_RANS") == 0)
537         o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
538 
539     else if (strcmp(o->arg, "use_lzma") == 0 ||
540              strcmp(o->arg, "USE_LZMA") == 0)
541         o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
542 
543     else if (strcmp(o->arg, "reference") == 0 ||
544              strcmp(o->arg, "REFERENCE") == 0)
545         o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
546 
547     else if (strcmp(o->arg, "version") == 0 ||
548              strcmp(o->arg, "VERSION") == 0)
549         o->opt = CRAM_OPT_VERSION, o->val.s =val;
550 
551     else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
552              strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
553         o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
554 
555     else if (strcmp(o->arg, "nthreads") == 0 ||
556              strcmp(o->arg, "NTHREADS") == 0)
557         o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
558 
559     else if (strcmp(o->arg, "cache_size") == 0 ||
560              strcmp(o->arg, "CACHE_SIZE") == 0) {
561         char *endp;
562         o->opt = HTS_OPT_CACHE_SIZE;
563         o->val.i = strtol(val, &endp, 0);
564         // NB: Doesn't support floats, eg 1.5g
565         // TODO: extend hts_parse_decimal? See also samtools sort.
566         switch (*endp) {
567         case 'g': case 'G': o->val.i *= 1024;
568         case 'm': case 'M': o->val.i *= 1024;
569         case 'k': case 'K': o->val.i *= 1024; break;
570         case '\0': break;
571         default:
572             fprintf(stderr, "Unrecognised cache size suffix '%c'\n", *endp);
573             free(o->arg);
574             free(o);
575             return -1;
576         }
577     }
578 
579     else if (strcmp(o->arg, "required_fields") == 0 ||
580              strcmp(o->arg, "REQUIRED_FIELDS") == 0)
581         o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
582 
583     else if (strcmp(o->arg, "lossy_names") == 0 ||
584              strcmp(o->arg, "LOSSY_NAMES") == 0)
585         o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
586 
587     else if (strcmp(o->arg, "name_prefix") == 0 ||
588              strcmp(o->arg, "NAME_PREFIX") == 0)
589         o->opt = CRAM_OPT_PREFIX, o->val.s = val;
590 
591     else {
592         fprintf(stderr, "Unknown option '%s'\n", o->arg);
593         free(o->arg);
594         free(o);
595         return -1;
596     }
597 
598     o->next = NULL;
599 
600     // Append; assumes small list.
601     if (*opts) {
602         t = *opts;
603         while (t->next)
604             t = t->next;
605         t->next = o;
606     } else {
607         *opts = o;
608     }
609 
610     return 0;
611 }
612 
613 /*
614  * Applies an hts_opt option list to a given htsFile.
615  *
616  * Returns 0 on success
617  *        -1 on failure
618  */
hts_opt_apply(htsFile * fp,hts_opt * opts)619 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
620     hts_opt *last = NULL;
621 
622     for (; opts;  opts = (last=opts)->next) {
623         switch (opts->opt) {
624             case CRAM_OPT_REFERENCE:
625             case CRAM_OPT_VERSION:
626             case CRAM_OPT_PREFIX:
627                 if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
628                     return -1;
629                 break;
630             default:
631                 if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
632                     return -1;
633                 break;
634         }
635     }
636 
637     return 0;
638 }
639 
640 /*
641  * Frees an hts_opt list.
642  */
hts_opt_free(hts_opt * opts)643 void hts_opt_free(hts_opt *opts) {
644     hts_opt *last = NULL;
645     while (opts) {
646         opts = (last=opts)->next;
647         free(last->arg);
648         free(last);
649     }
650 }
651 
652 
653 /*
654  * Tokenise options as (key(=value)?,)*(key(=value)?)?
655  * NB: No provision for ',' appearing in the value!
656  * Add backslashing rules?
657  *
658  * This could be used as part of a general command line option parser or
659  * as a string concatenated onto the file open mode.
660  *
661  * Returns 0 on success
662  *        -1 on failure.
663  */
hts_parse_opt_list(htsFormat * fmt,const char * str)664 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
665     while (str && *str) {
666         const char *str_start;
667         int len;
668         char arg[8001];
669 
670         while (*str && *str == ',')
671             str++;
672 
673         for (str_start = str; *str && *str != ','; str++);
674         len = str - str_start;
675 
676         // Produce a nul terminated copy of the option
677         strncpy(arg, str_start, len < 8000 ? len : 8000);
678         arg[len < 8000 ? len : 8000] = '\0';
679 
680         if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
681             return -1;
682 
683         if (*str)
684             str++;
685     }
686 
687     return 0;
688 }
689 
690 /*
691  * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
692  * followed by a comma separated list of key=value options and splits
693  * these up into the fields of htsFormat struct.
694  *
695  * format is assumed to be already initialised, either to blank
696  * "unknown" values or via previous hts_opt_add calls.
697  *
698  * Returns 0 on success
699  *        -1 on failure.
700  */
hts_parse_format(htsFormat * format,const char * str)701 int hts_parse_format(htsFormat *format, const char *str) {
702     char fmt[8];
703     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
704 
705     format->version.minor = 0; // unknown
706     format->version.major = 0; // unknown
707 
708     if (strcmp(fmt, "sam") == 0) {
709         format->category          = sequence_data;
710         format->format            = sam;
711         format->compression       = no_compression;;
712         format->compression_level = 0;
713     } else if (strcmp(fmt, "bam") == 0) {
714         format->category          = sequence_data;
715         format->format            = bam;
716         format->compression       = bgzf;
717         format->compression_level = -1;
718     } else if (strcmp(fmt, "cram") == 0) {
719         format->category          = sequence_data;
720         format->format            = cram;
721         format->compression       = custom;
722         format->compression_level = -1;
723     } else if (strcmp(fmt, "vcf") == 0) {
724         format->category          = variant_data;
725         format->format            = vcf;
726         format->compression       = no_compression;;
727         format->compression_level = 0;
728     } else if (strcmp(fmt, "bcf") == 0) {
729         format->category          = variant_data;
730         format->format            = bcf;
731         format->compression       = bgzf;
732         format->compression_level = -1;
733     } else {
734         return -1;
735     }
736 
737     return hts_parse_opt_list(format, cp);
738 }
739 
740 
741 /*
742  * Tokenise options as (key(=value)?,)*(key(=value)?)?
743  * NB: No provision for ',' appearing in the value!
744  * Add backslashing rules?
745  *
746  * This could be used as part of a general command line option parser or
747  * as a string concatenated onto the file open mode.
748  *
749  * Returns 0 on success
750  *        -1 on failure.
751  */
hts_process_opts(htsFile * fp,const char * opts)752 static int hts_process_opts(htsFile *fp, const char *opts) {
753     htsFormat fmt;
754 
755     fmt.specific = NULL;
756     if (hts_parse_opt_list(&fmt, opts) != 0)
757         return -1;
758 
759     if (hts_opt_apply(fp, fmt.specific) != 0) {
760         hts_opt_free(fmt.specific);
761         return -1;
762     }
763 
764     hts_opt_free(fmt.specific);
765 
766     return 0;
767 }
768 
769 
hts_hopen(hFILE * hfile,const char * fn,const char * mode)770 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
771 {
772     hFILE *hfile_orig = hfile;
773     htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
774     char simple_mode[101], *cp, *opts;
775     simple_mode[100] = '\0';
776 
777     if (fp == NULL) goto error;
778 
779     fp->fn = strdup(fn);
780     fp->is_be = ed_is_big();
781 
782     // Split mode into simple_mode,opts strings
783     if ((cp = strchr(mode, ','))) {
784         strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
785         simple_mode[cp-mode] = '\0';
786         opts = cp+1;
787     } else {
788         strncpy(simple_mode, mode, 100);
789         opts = NULL;
790     }
791 
792     if (strchr(simple_mode, 'r')) {
793         if (hts_detect_format(hfile, &fp->format) < 0) goto error;
794 
795         if (fp->format.format == json) {
796             hFILE *hfile2 = hopen_json_redirect(hfile, simple_mode);
797             if (hfile2 == NULL) goto error;
798 
799             // Build fp against the result of the redirection
800             hfile = hfile2;
801             if (hts_detect_format(hfile, &fp->format) < 0) goto error;
802         }
803     }
804     else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
805         htsFormat *fmt = &fp->format;
806         fp->is_write = 1;
807 
808         if (strchr(simple_mode, 'b')) fmt->format = binary_format;
809         else if (strchr(simple_mode, 'c')) fmt->format = cram;
810         else fmt->format = text_format;
811 
812         if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
813         else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
814         else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
815         else {
816             // No compression mode specified, set to the default for the format
817             switch (fmt->format) {
818             case binary_format: fmt->compression = bgzf; break;
819             case cram: fmt->compression = custom; break;
820             case text_format: fmt->compression = no_compression; break;
821             default: abort();
822             }
823         }
824 
825         // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
826         fmt->category = format_category(fmt->format);
827 
828         fmt->version.major = fmt->version.minor = -1;
829         fmt->compression_level = -1;
830         fmt->specific = NULL;
831     }
832     else { errno = EINVAL; goto error; }
833 
834     switch (fp->format.format) {
835     case binary_format:
836     case bam:
837     case bcf:
838         fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
839         if (fp->fp.bgzf == NULL) goto error;
840         fp->is_bin = fp->is_bgzf = 1;
841         break;
842 
843     case cram:
844         fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
845         if (fp->fp.cram == NULL) goto error;
846         if (!fp->is_write)
847             cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
848         fp->is_cram = 1;
849         break;
850 
851     case text_format:
852     case sam:
853     case vcf:
854         if (fp->format.compression != no_compression) {
855             fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
856             if (fp->fp.bgzf == NULL) goto error;
857             fp->is_bgzf = 1;
858         }
859         else
860             fp->fp.hfile = hfile;
861         break;
862 
863     default:
864         errno = ENOEXEC;
865         goto error;
866     }
867 
868     if (opts)
869         hts_process_opts(fp, opts);
870 
871     // If redirecting, close the original hFILE now (pedantically we would
872     // instead close it in hts_close(), but this a simplifying optimisation)
873     if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
874 
875     return fp;
876 
877 error:
878     if (hts_verbose >= 2)
879         fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
880 
881     // If redirecting, close the failed redirection hFILE that we have opened
882     if (hfile != hfile_orig) hclose_abruptly(hfile);
883 
884     if (fp) {
885         free(fp->fn);
886         free(fp->fn_aux);
887         free(fp);
888     }
889     return NULL;
890 }
891 
hts_close(htsFile * fp)892 int hts_close(htsFile *fp)
893 {
894     int ret, save;
895 
896     switch (fp->format.format) {
897     case binary_format:
898     case bam:
899     case bcf:
900         ret = bgzf_close(fp->fp.bgzf);
901         break;
902 
903     case cram:
904         if (!fp->is_write) {
905             switch (cram_eof(fp->fp.cram)) {
906             case 2:
907                 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
908                 break;
909             case 0:  /* not at EOF, but may not have wanted all seqs */
910             default: /* case 1, expected EOF */
911                 break;
912             }
913         }
914         ret = cram_close(fp->fp.cram);
915         break;
916 
917     case text_format:
918     case sam:
919     case vcf:
920         if (fp->format.compression != no_compression)
921             ret = bgzf_close(fp->fp.bgzf);
922         else
923             ret = hclose(fp->fp.hfile);
924         break;
925 
926     default:
927         ret = -1;
928         break;
929     }
930 
931     save = errno;
932     free(fp->fn);
933     free(fp->fn_aux);
934     free(fp->line.s);
935     free(fp);
936     errno = save;
937     return ret;
938 }
939 
hts_get_format(htsFile * fp)940 const htsFormat *hts_get_format(htsFile *fp)
941 {
942     return fp? &fp->format : NULL;
943 }
944 
hts_format_file_extension(const htsFormat * format)945 const char *hts_format_file_extension(const htsFormat *format) {
946     if (!format)
947         return "?";
948 
949     switch (format->format) {
950     case sam:  return "sam";
951     case bam:  return "bam";
952     case bai:  return "bai";
953     case cram: return "cram";
954     case crai: return "crai";
955     case vcf:  return "vcf";
956     case bcf:  return "bcf";
957     case csi:  return "csi";
958     case gzi:  return "gzi";
959     case tbi:  return "tbi";
960     case bed:  return "bed";
961     default:   return "?";
962     }
963 }
964 
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)965 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
966     int r;
967     va_list args;
968 
969     switch (opt) {
970     case HTS_OPT_NTHREADS: {
971         va_start(args, opt);
972         int nthreads = va_arg(args, int);
973         va_end(args);
974         return hts_set_threads(fp, nthreads);
975     }
976 
977     case HTS_OPT_THREAD_POOL: {
978         va_start(args, opt);
979         htsThreadPool *p = va_arg(args, htsThreadPool *);
980         va_end(args);
981         return hts_set_thread_pool(fp, p);
982     }
983 
984     case HTS_OPT_CACHE_SIZE: {
985         va_start(args, opt);
986         int cache_size = va_arg(args, int);
987         va_end(args);
988         hts_set_cache_size(fp, cache_size);
989         return 0;
990     }
991 
992     default:
993         break;
994     }
995 
996     if (fp->format.format != cram)
997         return 0;
998 
999     va_start(args, opt);
1000     r = cram_set_voption(fp->fp.cram, opt, args);
1001     va_end(args);
1002 
1003     return r;
1004 }
1005 
1006 BGZF *hts_get_bgzfp(htsFile *fp);
1007 
hts_set_threads(htsFile * fp,int n)1008 int hts_set_threads(htsFile *fp, int n)
1009 {
1010     if (fp->format.compression == bgzf) {
1011         return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1012     } else if (fp->format.format == cram) {
1013         return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1014     }
1015     else return 0;
1016 }
1017 
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1018 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1019     if (fp->format.compression == bgzf) {
1020         return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1021     } else if (fp->format.format == cram) {
1022         return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1023     }
1024     else return 0;
1025 }
1026 
hts_set_cache_size(htsFile * fp,int n)1027 void hts_set_cache_size(htsFile *fp, int n)
1028 {
1029     if (fp->format.compression == bgzf)
1030         bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1031 }
1032 
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1033 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1034 {
1035     free(fp->fn_aux);
1036     if (fn_aux) {
1037         fp->fn_aux = strdup(fn_aux);
1038         if (fp->fn_aux == NULL) return -1;
1039     }
1040     else fp->fn_aux = NULL;
1041 
1042     if (fp->format.format == cram)
1043         if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1044             return -1;
1045 
1046     return 0;
1047 }
1048 
1049 // For VCF/BCF backward sweeper. Not exposing these functions because their
1050 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1051 BGZF *hts_get_bgzfp(htsFile *fp)
1052 {
1053     if (fp->is_bgzf)
1054         return fp->fp.bgzf;
1055     else
1056         return NULL;
1057 }
hts_useek(htsFile * fp,long uoffset,int where)1058 int hts_useek(htsFile *fp, long uoffset, int where)
1059 {
1060     if (fp->is_bgzf)
1061         return bgzf_useek(fp->fp.bgzf, uoffset, where);
1062     else
1063         return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1064 }
hts_utell(htsFile * fp)1065 long hts_utell(htsFile *fp)
1066 {
1067     if (fp->is_bgzf)
1068         return bgzf_utell(fp->fp.bgzf);
1069     else
1070         return htell(fp->fp.hfile);
1071 }
1072 
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1073 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1074 {
1075     int ret;
1076     if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1077         fprintf(stderr, "[hts_getline] unexpected delimiter %d\n", delimiter);
1078         abort();
1079     }
1080 
1081     switch (fp->format.compression) {
1082     case no_compression:
1083         str->l = 0;
1084         ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile);
1085         if (ret >= 0) ret = str->l;
1086         else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1087         else ret = -1;
1088         break;
1089 
1090     case gzip:
1091     case bgzf:
1092         ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1093         break;
1094 
1095     default:
1096         abort();
1097     }
1098 
1099     ++fp->lineno;
1100     return ret;
1101 }
1102 
hts_readlist(const char * string,int is_file,int * _n)1103 char **hts_readlist(const char *string, int is_file, int *_n)
1104 {
1105     int m = 0, n = 0;
1106     char **s = 0;
1107     if ( is_file )
1108     {
1109         BGZF *fp = bgzf_open(string, "r");
1110         if ( !fp ) return NULL;
1111 
1112         kstring_t str;
1113         str.s = 0; str.l = str.m = 0;
1114         while (bgzf_getline(fp, '\n', &str) >= 0)
1115         {
1116             if (str.l == 0) continue;
1117             n++;
1118             hts_expand(char*,n,m,s);
1119             s[n-1] = strdup(str.s);
1120         }
1121         bgzf_close(fp);
1122         free(str.s);
1123     }
1124     else
1125     {
1126         const char *q = string, *p = string;
1127         while ( 1 )
1128         {
1129             if (*p == ',' || *p == 0)
1130             {
1131                 n++;
1132                 hts_expand(char*,n,m,s);
1133                 s[n-1] = (char*)calloc(p - q + 1, 1);
1134                 strncpy(s[n-1], q, p - q);
1135                 q = p + 1;
1136             }
1137             if ( !*p ) break;
1138             p++;
1139         }
1140     }
1141     s = (char**)realloc(s, n * sizeof(char*));
1142     *_n = n;
1143     return s;
1144 }
1145 
hts_readlines(const char * fn,int * _n)1146 char **hts_readlines(const char *fn, int *_n)
1147 {
1148     int m = 0, n = 0;
1149     char **s = 0;
1150     BGZF *fp = bgzf_open(fn, "r");
1151     if ( fp ) { // read from file
1152         kstring_t str;
1153         str.s = 0; str.l = str.m = 0;
1154         while (bgzf_getline(fp, '\n', &str) >= 0) {
1155             if (str.l == 0) continue;
1156             if (m == n) {
1157                 m = m? m<<1 : 16;
1158                 s = (char**)realloc(s, m * sizeof(char*));
1159             }
1160             s[n++] = strdup(str.s);
1161         }
1162         bgzf_close(fp);
1163         s = (char**)realloc(s, n * sizeof(char*));
1164         free(str.s);
1165     } else if (*fn == ':') { // read from string
1166         const char *q, *p;
1167         for (q = p = fn + 1;; ++p)
1168             if (*p == ',' || *p == 0) {
1169                 if (m == n) {
1170                     m = m? m<<1 : 16;
1171                     s = (char**)realloc(s, m * sizeof(char*));
1172                 }
1173                 s[n] = (char*)calloc(p - q + 1, 1);
1174                 strncpy(s[n++], q, p - q);
1175                 q = p + 1;
1176                 if (*p == 0) break;
1177             }
1178     } else return 0;
1179     s = (char**)realloc(s, n * sizeof(char*));
1180     *_n = n;
1181     return s;
1182 }
1183 
1184 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1185 int hts_file_type(const char *fname)
1186 {
1187     int len = strlen(fname);
1188     if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1189     if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1190     if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1191     if ( !strcmp("-",fname) ) return FT_STDIN;
1192 
1193     hFILE *f = hopen(fname, "r");
1194     if (f == NULL) return 0;
1195 
1196     htsFormat fmt;
1197     if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1198     if (hclose(f) < 0) return 0;
1199 
1200     switch (fmt.format) {
1201     case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1202     case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1203     default:  return 0;
1204     }
1205 }
1206 
hts_check_EOF(htsFile * fp)1207 int hts_check_EOF(htsFile *fp)
1208 {
1209     if (fp->format.compression == bgzf)
1210         return bgzf_check_EOF(hts_get_bgzfp(fp));
1211     else if (fp->format.format == cram)
1212         return cram_check_EOF(fp->fp.cram);
1213     else
1214         return 3;
1215 }
1216 
1217 
1218 /****************
1219  *** Indexing ***
1220  ****************/
1221 
1222 #define HTS_MIN_MARKER_DIST 0x10000
1223 
1224 // Finds the special meta bin
1225 //  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1226 #define META_BIN(idx) ((idx)->n_bins + 1)
1227 
1228 #define pair64_lt(a,b) ((a).u < (b).u)
1229 
1230 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
1231 
1232 typedef struct {
1233     int32_t m, n;
1234     uint64_t loff;
1235     hts_pair64_t *list;
1236 } bins_t;
1237 
1238 KHASH_MAP_INIT_INT(bin, bins_t)
1239 typedef khash_t(bin) bidx_t;
1240 
1241 typedef struct {
1242     int32_t n, m;
1243     uint64_t *offset;
1244 } lidx_t;
1245 
1246 struct __hts_idx_t {
1247     int fmt, min_shift, n_lvls, n_bins;
1248     uint32_t l_meta;
1249     int32_t n, m;
1250     uint64_t n_no_coor;
1251     bidx_t **bidx;
1252     lidx_t *lidx;
1253     uint8_t *meta; // MUST have a terminating NUL on the end
1254     struct {
1255         uint32_t last_bin, save_bin;
1256         int last_coor, last_tid, save_tid, finished;
1257         uint64_t last_off, save_off;
1258         uint64_t off_beg, off_end;
1259         uint64_t n_mapped, n_unmapped;
1260     } z; // keep internal states
1261 };
1262 
idx_format_name(int fmt)1263 static char * idx_format_name(int fmt) {
1264     switch (fmt) {
1265         case HTS_FMT_CSI: return "csi";
1266         case HTS_FMT_BAI: return "bai";
1267         case HTS_FMT_TBI: return "tbi";
1268         case HTS_FMT_CRAI: return "crai";
1269         default: return "unknown";
1270     }
1271 }
1272 
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1273 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1274 {
1275     khint_t k;
1276     bins_t *l;
1277     int absent;
1278     k = kh_put(bin, b, bin, &absent);
1279     if (absent < 0) return -1; // Out of memory
1280     l = &kh_value(b, k);
1281     if (absent) {
1282         l->m = 1; l->n = 0;
1283         l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1284         if (!l->list) {
1285             kh_del(bin, b, k);
1286             return -1;
1287         }
1288     } else if (l->n == l->m) {
1289         uint32_t new_m = l->m ? l->m << 1 : 1;
1290         hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1291         if (!new_list) return -1;
1292         l->list = new_list;
1293         l->m = new_m;
1294     }
1295     l->list[l->n].u = beg;
1296     l->list[l->n++].v = end;
1297     return 0;
1298 }
1299 
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1300 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1301 {
1302     int i, beg, end;
1303     beg = _beg >> min_shift;
1304     end = (_end - 1) >> min_shift;
1305     if (l->m < end + 1) {
1306         size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1307         uint64_t *new_offset;
1308 
1309         new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1310         if (!new_offset) return -1;
1311 
1312         // fill unused memory with (uint64_t)-1
1313         memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1314         l->m = new_m;
1315         l->offset = new_offset;
1316     }
1317     for (i = beg; i <= end; ++i) {
1318         if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1319     }
1320     if (l->n < end + 1) l->n = end + 1;
1321     return 0;
1322 }
1323 
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1324 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1325 {
1326     hts_idx_t *idx;
1327     idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1328     if (idx == NULL) return NULL;
1329     idx->fmt = fmt;
1330     idx->min_shift = min_shift;
1331     idx->n_lvls = n_lvls;
1332     idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1333     idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
1334     idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1335     idx->z.last_coor = 0xffffffffu;
1336     if (n) {
1337         idx->n = idx->m = n;
1338         idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1339         if (idx->bidx == NULL) { free(idx); return NULL; }
1340         idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1341         if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1342     }
1343     return idx;
1344 }
1345 
update_loff(hts_idx_t * idx,int i,int free_lidx)1346 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1347 {
1348     bidx_t *bidx = idx->bidx[i];
1349     lidx_t *lidx = &idx->lidx[i];
1350     khint_t k;
1351     int l;
1352     uint64_t offset0 = 0;
1353     if (bidx) {
1354         k = kh_get(bin, bidx, META_BIN(idx));
1355         if (k != kh_end(bidx))
1356             offset0 = kh_val(bidx, k).list[0].u;
1357         for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1358             lidx->offset[l] = offset0;
1359     } else l = 1;
1360     for (; l < lidx->n; ++l) // fill missing values
1361         if (lidx->offset[l] == (uint64_t)-1)
1362             lidx->offset[l] = lidx->offset[l-1];
1363     if (bidx == 0) return;
1364     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1365         if (kh_exist(bidx, k))
1366         {
1367             if ( kh_key(bidx, k) < idx->n_bins )
1368             {
1369                 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1370                 // disable linear index if bot_bin out of bounds
1371                 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1372             }
1373             else
1374                 kh_val(bidx, k).loff = 0;
1375         }
1376     if (free_lidx) {
1377         free(lidx->offset);
1378         lidx->m = lidx->n = 0;
1379         lidx->offset = 0;
1380     }
1381 }
1382 
compress_binning(hts_idx_t * idx,int i)1383 static void compress_binning(hts_idx_t *idx, int i)
1384 {
1385     bidx_t *bidx = idx->bidx[i];
1386     khint_t k;
1387     int l, m;
1388     if (bidx == 0) return;
1389     // merge a bin to its parent if the bin is too small
1390     for (l = idx->n_lvls; l > 0; --l) {
1391         unsigned start = hts_bin_first(l);
1392         for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1393             bins_t *p, *q;
1394             if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1395             p = &kh_value(bidx, k);
1396             if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1397             if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1398                 khint_t kp;
1399                 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1400                 if (kp == kh_end(bidx)) continue;
1401                 q = &kh_val(bidx, kp);
1402                 if (q->n + p->n > q->m) {
1403                     q->m = q->n + p->n;
1404                     kroundup32(q->m);
1405                     q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
1406                 }
1407                 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1408                 q->n += p->n;
1409                 free(p->list);
1410                 kh_del(bin, bidx, k);
1411             }
1412         }
1413     }
1414     k = kh_get(bin, bidx, 0);
1415     if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1416     // merge adjacent chunks that start from the same BGZF block
1417     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1418         bins_t *p;
1419         if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1420         p = &kh_value(bidx, k);
1421         for (l = 1, m = 0; l < p->n; ++l) {
1422             if (p->list[m].v>>16 >= p->list[l].u>>16) {
1423                 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1424             } else p->list[++m] = p->list[l];
1425         }
1426         p->n = m + 1;
1427     }
1428 }
1429 
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1430 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1431 {
1432     int i;
1433     if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
1434     if (idx->z.save_tid >= 0) {
1435         insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1436         insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1437         insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1438     }
1439     for (i = 0; i < idx->n; ++i) {
1440         update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1441         compress_binning(idx, i);
1442     }
1443     idx->z.finished = 1;
1444 }
1445 
hts_idx_push(hts_idx_t * idx,int tid,int beg,int end,uint64_t offset,int is_mapped)1446 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
1447 {
1448     int bin;
1449     int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1450     if (tid<0) beg = -1, end = 0;
1451     if (tid >= 0 && (beg > maxpos || end > maxpos)) {
1452         goto pos_too_big;
1453     }
1454     if (tid >= idx->m) { // enlarge the index
1455         uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1456         bidx_t **new_bidx;
1457         lidx_t *new_lidx;
1458 
1459         new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1460         if (!new_bidx) return -1;
1461         idx->bidx = new_bidx;
1462 
1463         new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1464         if (!new_lidx) return -1;
1465         idx->lidx = new_lidx;
1466 
1467         memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1468         memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1469         idx->m = new_m;
1470     }
1471     if (idx->n < tid + 1) idx->n = tid + 1;
1472     if (idx->z.finished) return 0;
1473     if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1474         if ( tid>=0 && idx->n_no_coor )
1475         {
1476             if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
1477             return -1;
1478         }
1479         if (tid>=0 && idx->bidx[tid] != 0)
1480         {
1481             if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
1482             return -1;
1483         }
1484         idx->z.last_tid = tid;
1485         idx->z.last_bin = 0xffffffffu;
1486     } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1487         if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions on sequence #%d: %d followed by %d\n", __func__, tid+1, idx->z.last_coor+1, beg+1);
1488         return -1;
1489     }
1490     if ( tid>=0 )
1491     {
1492         if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1493         if (is_mapped) {
1494             // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1495             if (beg < 0)  beg = 0;
1496             if (end <= 0) end = 1;
1497             // idx->z.last_off points to the start of the current record
1498             if (insert_to_l(&idx->lidx[tid], beg, end,
1499                             idx->z.last_off, idx->min_shift) < 0) return -1;
1500         }
1501     }
1502     else idx->n_no_coor++;
1503     bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1504     if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1505         if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1506             if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1507                             idx->z.save_off, idx->z.last_off) < 0) return -1;
1508         }
1509         if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1510             idx->z.off_end = idx->z.last_off;
1511             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1512                             idx->z.off_beg, idx->z.off_end) < 0) return -1;
1513             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1514                             idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1515             idx->z.n_mapped = idx->z.n_unmapped = 0;
1516             idx->z.off_beg = idx->z.off_end;
1517         }
1518         idx->z.save_off = idx->z.last_off;
1519         idx->z.save_bin = idx->z.last_bin = bin;
1520         idx->z.save_tid = tid;
1521     }
1522     if (is_mapped) ++idx->z.n_mapped;
1523     else ++idx->z.n_unmapped;
1524     idx->z.last_off = offset;
1525     idx->z.last_coor = beg;
1526     return 0;
1527 
1528  pos_too_big: {
1529         int64_t max = end > beg ? end : beg, s = 1 << 14;
1530         int n_lvls = 0;
1531         while (max > s) {
1532             n_lvls++;
1533             s <<= 3;
1534         }
1535 
1536         if (hts_verbose >= 1) {
1537             if (idx->fmt == HTS_FMT_CSI) {
1538                 fprintf(stderr,
1539                         "[E::%s] Region %d..%d cannot be stored in a csi index "
1540                         "with min_shift = %d, n_lvls = %d.  Try using "
1541                         " min_shift = 14, n_lvls >= %d\n",
1542                         __func__, beg, end,
1543                         idx->min_shift, idx->n_lvls, n_lvls);
1544             } else {
1545                 fprintf(stderr,
1546                         "[E::%s] Region %d..%d cannot be stored in a %s index. "
1547                         "Try using a csi index with min_shift = 14, "
1548                         "n_lvls >= %d\n",
1549                         __func__, beg, end, idx_format_name(idx->fmt), n_lvls);
1550             }
1551         }
1552         errno = ERANGE;
1553         return -1;
1554     }
1555 }
1556 
hts_idx_destroy(hts_idx_t * idx)1557 void hts_idx_destroy(hts_idx_t *idx)
1558 {
1559     khint_t k;
1560     int i;
1561     if (idx == 0) return;
1562 
1563     // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1564     if (idx->fmt == HTS_FMT_CRAI) {
1565         hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
1566         cram_index_free(cidx->cram);
1567         free(cidx);
1568         return;
1569     }
1570 
1571     for (i = 0; i < idx->m; ++i) {
1572         bidx_t *bidx = idx->bidx[i];
1573         free(idx->lidx[i].offset);
1574         if (bidx == 0) continue;
1575         for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1576             if (kh_exist(bidx, k))
1577                 free(kh_value(bidx, k).list);
1578         kh_destroy(bin, bidx);
1579     }
1580     free(idx->bidx); free(idx->lidx); free(idx->meta);
1581     free(idx);
1582 }
1583 
1584 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1585 // TODO Determine endianness at configure- or compile-time
1586 
idx_write_int32(BGZF * fp,int32_t x)1587 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
1588 {
1589     if (ed_is_big()) x = ed_swap_4(x);
1590     return bgzf_write(fp, &x, sizeof x);
1591 }
1592 
idx_write_uint32(BGZF * fp,uint32_t x)1593 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
1594 {
1595     if (ed_is_big()) x = ed_swap_4(x);
1596     return bgzf_write(fp, &x, sizeof x);
1597 }
1598 
idx_write_uint64(BGZF * fp,uint64_t x)1599 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
1600 {
1601     if (ed_is_big()) x = ed_swap_8(x);
1602     return bgzf_write(fp, &x, sizeof x);
1603 }
1604 
swap_bins(bins_t * p)1605 static inline void swap_bins(bins_t *p)
1606 {
1607     int i;
1608     for (i = 0; i < p->n; ++i) {
1609         ed_swap_8p(&p->list[i].u);
1610         ed_swap_8p(&p->list[i].v);
1611     }
1612 }
1613 
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)1614 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
1615 {
1616     int32_t i, j;
1617 
1618     #define check(ret) if ((ret) < 0) return -1
1619 
1620     check(idx_write_int32(fp, idx->n));
1621     if (fmt == HTS_FMT_TBI && idx->l_meta)
1622         check(bgzf_write(fp, idx->meta, idx->l_meta));
1623 
1624     for (i = 0; i < idx->n; ++i) {
1625         khint_t k;
1626         bidx_t *bidx = idx->bidx[i];
1627         lidx_t *lidx = &idx->lidx[i];
1628         // write binning index
1629         check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
1630         if (bidx)
1631             for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1632                 if (kh_exist(bidx, k)) {
1633                     bins_t *p = &kh_value(bidx, k);
1634                     check(idx_write_uint32(fp, kh_key(bidx, k)));
1635                     if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
1636                     //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1637                     check(idx_write_int32(fp, p->n));
1638                     for (j = 0; j < p->n; ++j) {
1639                         check(idx_write_uint64(fp, p->list[j].u));
1640                         check(idx_write_uint64(fp, p->list[j].v));
1641                     }
1642                 }
1643 
1644         // write linear index
1645         if (fmt != HTS_FMT_CSI) {
1646             check(idx_write_int32(fp, lidx->n));
1647             for (j = 0; j < lidx->n; ++j)
1648                 check(idx_write_uint64(fp, lidx->offset[j]));
1649         }
1650     }
1651 
1652     check(idx_write_uint64(fp, idx->n_no_coor));
1653     return 0;
1654     #undef check
1655 }
1656 
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)1657 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1658 {
1659     int ret, save;
1660     char *fnidx = (char*)calloc(1, strlen(fn) + 5);
1661     if (fnidx == NULL) return -1;
1662 
1663     strcpy(fnidx, fn);
1664     switch (fmt) {
1665     case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
1666     case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
1667     case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
1668     default: abort();
1669     }
1670 
1671     ret = hts_idx_save_as(idx, fn, fnidx, fmt);
1672     save = errno;
1673     free(fnidx);
1674     errno = save;
1675     return ret;
1676 }
1677 
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)1678 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
1679 {
1680     BGZF *fp;
1681 
1682     #define check(ret) if ((ret) < 0) goto fail
1683 
1684     if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
1685 
1686     fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
1687     if (fp == NULL) return -1;
1688 
1689     if (fmt == HTS_FMT_CSI) {
1690         check(bgzf_write(fp, "CSI\1", 4));
1691         check(idx_write_int32(fp, idx->min_shift));
1692         check(idx_write_int32(fp, idx->n_lvls));
1693         check(idx_write_uint32(fp, idx->l_meta));
1694         if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
1695     } else if (fmt == HTS_FMT_TBI) {
1696         check(bgzf_write(fp, "TBI\1", 4));
1697     } else if (fmt == HTS_FMT_BAI) {
1698         check(bgzf_write(fp, "BAI\1", 4));
1699     } else abort();
1700 
1701     check(hts_idx_save_core(idx, fp, fmt));
1702 
1703     return bgzf_close(fp);
1704     #undef check
1705 
1706 fail:
1707     bgzf_close(fp);
1708     return -1;
1709 }
1710 
hts_idx_load_core(hts_idx_t * idx,BGZF * fp,int fmt)1711 static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
1712 {
1713     int32_t i, n, is_be;
1714     is_be = ed_is_big();
1715     if (idx == NULL) return -4;
1716     for (i = 0; i < idx->n; ++i) {
1717         bidx_t *h;
1718         lidx_t *l = &idx->lidx[i];
1719         uint32_t key;
1720         int j, absent;
1721         bins_t *p;
1722         h = idx->bidx[i] = kh_init(bin);
1723         if (bgzf_read(fp, &n, 4) != 4) return -1;
1724         if (is_be) ed_swap_4p(&n);
1725         for (j = 0; j < n; ++j) {
1726             khint_t k;
1727             if (bgzf_read(fp, &key, 4) != 4) return -1;
1728             if (is_be) ed_swap_4p(&key);
1729             k = kh_put(bin, h, key, &absent);
1730             if (absent <= 0) return -3; // Duplicate bin number
1731             p = &kh_val(h, k);
1732             if (fmt == HTS_FMT_CSI) {
1733                 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
1734                 if (is_be) ed_swap_8p(&p->loff);
1735             } else p->loff = 0;
1736             if (bgzf_read(fp, &p->n, 4) != 4) return -1;
1737             if (is_be) ed_swap_4p(&p->n);
1738             p->m = p->n;
1739             p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1740             if (p->list == NULL) return -2;
1741             if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1;
1742             if (is_be) swap_bins(p);
1743         }
1744         if (fmt != HTS_FMT_CSI) { // load linear index
1745             int j;
1746             if (bgzf_read(fp, &l->n, 4) != 4) return -1;
1747             if (is_be) ed_swap_4p(&l->n);
1748             l->m = l->n;
1749             l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1750             if (l->offset == NULL) return -2;
1751             if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
1752             if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1753             for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1754                 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1755             update_loff(idx, i, 1);
1756         }
1757     }
1758     if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1759     if (is_be) ed_swap_8p(&idx->n_no_coor);
1760     return 0;
1761 }
1762 
hts_idx_load_local(const char * fn)1763 static hts_idx_t *hts_idx_load_local(const char *fn)
1764 {
1765     uint8_t magic[4];
1766     int i, is_be;
1767     hts_idx_t *idx = NULL;
1768     uint8_t *meta = NULL;
1769     BGZF *fp = bgzf_open(fn, "r");
1770     if (fp == NULL) return NULL;
1771     is_be = ed_is_big();
1772     if (bgzf_read(fp, magic, 4) != 4) goto fail;
1773 
1774     if (memcmp(magic, "CSI\1", 4) == 0) {
1775         uint32_t x[3], n;
1776         if (bgzf_read(fp, x, 12) != 12) goto fail;
1777         if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1778         if (x[2]) {
1779             if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
1780             if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
1781             if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
1782             // Prevent possible strlen past the end in tbx_index_load2
1783             meta[x[2]] = '\0';
1784         }
1785         if (bgzf_read(fp, &n, 4) != 4) goto fail;
1786         if (is_be) ed_swap_4p(&n);
1787         if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
1788         idx->l_meta = x[2];
1789         idx->meta = meta;
1790         meta = NULL;
1791         if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
1792     }
1793     else if (memcmp(magic, "TBI\1", 4) == 0) {
1794         uint8_t x[8 * 4];
1795         uint32_t n;
1796         // Read file header
1797         if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
1798         n = le_to_u32(&x[0]); // location of n_ref
1799         if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
1800         n = le_to_u32(&x[7*4]); // location of l_nm
1801         if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
1802         idx->l_meta = 28 + n;
1803         if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
1804         // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1805         // N.B. left in little-endian byte order.
1806         memcpy(idx->meta, &x[1*4], 28);
1807         // Read in sequence names.
1808         if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
1809         // Prevent possible strlen past the end in tbx_index_load2
1810         idx->meta[idx->l_meta] = '\0';
1811         if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
1812     }
1813     else if (memcmp(magic, "BAI\1", 4) == 0) {
1814         uint32_t n;
1815         if (bgzf_read(fp, &n, 4) != 4) goto fail;
1816         if (is_be) ed_swap_4p(&n);
1817         idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5);
1818         if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
1819     }
1820     else { errno = EINVAL; goto fail; }
1821 
1822     bgzf_close(fp);
1823     return idx;
1824 
1825 fail:
1826     bgzf_close(fp);
1827     hts_idx_destroy(idx);
1828     free(meta);
1829     return NULL;
1830 }
1831 
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)1832 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
1833                       int is_copy)
1834 {
1835     uint8_t *new_meta = meta;
1836     if (is_copy) {
1837         size_t l = l_meta;
1838         if (l > SIZE_MAX - 1) {
1839             errno = ENOMEM;
1840             return -1;
1841         }
1842         new_meta = malloc(l + 1);
1843         if (!new_meta) return -1;
1844         memcpy(new_meta, meta, l);
1845         // Prevent possible strlen past the end in tbx_index_load2
1846         meta[l + 1] = '\0';
1847     }
1848     if (idx->meta) free(idx->meta);
1849     idx->l_meta = l_meta;
1850     idx->meta = new_meta;
1851     return 0;
1852 }
1853 
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)1854 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
1855 {
1856     *l_meta = idx->l_meta;
1857     return idx->meta;
1858 }
1859 
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)1860 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1861 {
1862     if ( !idx->n )
1863     {
1864         *n = 0;
1865         return NULL;
1866     }
1867 
1868     int tid = 0, i;
1869     const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1870     for (i=0; i<idx->n; i++)
1871     {
1872         bidx_t *bidx = idx->bidx[i];
1873         if ( !bidx ) continue;
1874         names[tid++] = getid(hdr,i);
1875     }
1876     *n = tid;
1877     return names;
1878 }
1879 
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)1880 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1881 {
1882     if ( idx->fmt == HTS_FMT_CRAI ) {
1883         *mapped = 0; *unmapped = 0;
1884         return -1;
1885     }
1886 
1887     bidx_t *h = idx->bidx[tid];
1888     khint_t k = kh_get(bin, h, META_BIN(idx));
1889     if (k != kh_end(h)) {
1890         *mapped = kh_val(h, k).list[1].u;
1891         *unmapped = kh_val(h, k).list[1].v;
1892         return 0;
1893     } else {
1894         *mapped = 0; *unmapped = 0;
1895         return -1;
1896     }
1897 }
1898 
hts_idx_get_n_no_coor(const hts_idx_t * idx)1899 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1900 {
1901     return idx->n_no_coor;
1902 }
1903 
1904 /****************
1905  *** Iterator ***
1906  ****************/
1907 
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)1908 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1909 {
1910     int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1911     if (beg >= end) return 0;
1912     if (end >= 1LL<<s) end = 1LL<<s;
1913     for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1914         int b, e, n, i;
1915         b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1916         if (itr->bins.n + n > itr->bins.m) {
1917             itr->bins.m = itr->bins.n + n;
1918             kroundup32(itr->bins.m);
1919             itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1920         }
1921         for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1922     }
1923     return itr->bins.n;
1924 }
1925 
hts_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)1926 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
1927 {
1928     int i, n_off, l, bin;
1929     hts_pair64_t *off;
1930     khint_t k;
1931     bidx_t *bidx;
1932     uint64_t min_off, max_off;
1933     hts_itr_t *iter = 0;
1934     if (tid < 0) {
1935         int finished0 = 0;
1936         uint64_t off0 = (uint64_t)-1;
1937         khint_t k;
1938         switch (tid) {
1939         case HTS_IDX_START:
1940             // Find the smallest offset, note that sequence ids may not be ordered sequentially
1941             for (i=0; i<idx->n; i++)
1942             {
1943                 bidx = idx->bidx[i];
1944                 k = kh_get(bin, bidx, META_BIN(idx));
1945                 if (k == kh_end(bidx)) continue;
1946                 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
1947             }
1948             if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1949             break;
1950 
1951         case HTS_IDX_NOCOOR:
1952             /* No-coor reads sort after all of the mapped reads.  The position
1953                is not stored in the index itself, so need to find the end
1954                offset for the last mapped read.  A loop is needed here in
1955                case references at the end of the file have no mapped reads,
1956                or sequence ids are not ordered sequentially.
1957                See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
1958             for (i = 0; i < idx->n; i++) {
1959                 bidx = idx->bidx[i];
1960                 k = kh_get(bin, bidx, META_BIN(idx));
1961                 if (k != kh_end(bidx)) {
1962                     if (off0==(uint64_t)-1 || off0 < kh_val(bidx, k).list[0].v) {
1963                         off0 = kh_val(bidx, k).list[0].v;
1964                     }
1965                 }
1966             }
1967             if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1968             break;
1969 
1970         case HTS_IDX_REST:
1971             off0 = 0;
1972             break;
1973 
1974         case HTS_IDX_NONE:
1975             finished0 = 1;
1976             off0 = 0;
1977             break;
1978 
1979         default:
1980             return 0;
1981         }
1982         if (off0 != (uint64_t)-1) {
1983             iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1984             iter->read_rest = 1;
1985             iter->finished = finished0;
1986             iter->curr_off = off0;
1987             iter->readrec = readrec;
1988             return iter;
1989         } else return 0;
1990     }
1991 
1992     if (beg < 0) beg = 0;
1993     if (end < beg) return 0;
1994     if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
1995 
1996     iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1997     iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
1998     iter->readrec = readrec;
1999 
2000     if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2001 
2002     // compute min_off
2003     bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2004     do {
2005         int first;
2006         k = kh_get(bin, bidx, bin);
2007         if (k != kh_end(bidx)) break;
2008         first = (hts_bin_parent(bin)<<3) + 1;
2009         if (bin > first) --bin;
2010         else bin = hts_bin_parent(bin);
2011     } while (bin);
2012     if (bin == 0) k = kh_get(bin, bidx, bin);
2013     min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2014 
2015     // compute max_off: a virtual offset from a bin to the right of end
2016     bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2017     if (bin >= idx->n_bins) bin = 0;
2018     while (1) {
2019         // search for an extant bin by moving right, but moving up to the
2020         // parent whenever we get to a first child (which also covers falling
2021         // off the RHS, which wraps around and immediately goes up to bin 0)
2022         while (bin % 8 == 1) bin = hts_bin_parent(bin);
2023         if (bin == 0) { max_off = (uint64_t)-1; break; }
2024         k = kh_get(bin, bidx, bin);
2025         if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2026         bin++;
2027     }
2028 
2029     // retrieve bins
2030     reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2031     for (i = n_off = 0; i < iter->bins.n; ++i)
2032         if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2033             n_off += kh_value(bidx, k).n;
2034     if (n_off == 0) {
2035         // No overlapping bins means the iterator has already finished.
2036         iter->finished = 1;
2037         return iter;
2038     }
2039     off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
2040     for (i = n_off = 0; i < iter->bins.n; ++i) {
2041         if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2042             int j;
2043             bins_t *p = &kh_value(bidx, k);
2044             for (j = 0; j < p->n; ++j)
2045                 if (p->list[j].v > min_off && p->list[j].u < max_off)
2046                     off[n_off++] = p->list[j];
2047         }
2048     }
2049     if (n_off == 0) {
2050         free(off);
2051         iter->finished = 1;
2052         return iter;
2053     }
2054     ks_introsort(_off, n_off, off);
2055     // resolve completely contained adjacent blocks
2056     for (i = 1, l = 0; i < n_off; ++i)
2057         if (off[l].v < off[i].v) off[++l] = off[i];
2058     n_off = l + 1;
2059     // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2060     for (i = 1; i < n_off; ++i)
2061         if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2062     // merge adjacent blocks
2063     for (i = 1, l = 0; i < n_off; ++i) {
2064         if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2065         else off[++l] = off[i];
2066     }
2067     n_off = l + 1;
2068     iter->n_off = n_off; iter->off = off;
2069     return iter;
2070 }
2071 
hts_itr_destroy(hts_itr_t * iter)2072 void hts_itr_destroy(hts_itr_t *iter)
2073 {
2074     if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
2075 }
2076 
push_digit(long long i,char c)2077 static inline long long push_digit(long long i, char c)
2078 {
2079     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2080     int digit = c - '0';
2081     return 10 * i + digit;
2082 }
2083 
hts_parse_decimal(const char * str,char ** strend,int flags)2084 long long hts_parse_decimal(const char *str, char **strend, int flags)
2085 {
2086     long long n = 0;
2087     int decimals = 0, e = 0, lost = 0;
2088     char sign = '+', esign = '+';
2089     const char *s;
2090 
2091     while (isspace_c(*str)) str++;
2092     s = str;
2093 
2094     if (*s == '+' || *s == '-') sign = *s++;
2095     while (*s)
2096         if (isdigit_c(*s)) n = push_digit(n, *s++);
2097         else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
2098         else break;
2099 
2100     if (*s == '.') {
2101         s++;
2102         while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
2103     }
2104 
2105     if (*s == 'E' || *s == 'e') {
2106         s++;
2107         if (*s == '+' || *s == '-') esign = *s++;
2108         while (isdigit_c(*s)) e = push_digit(e, *s++);
2109         if (esign == '-') e = -e;
2110     }
2111 
2112     e -= decimals;
2113     while (e > 0) n *= 10, e--;
2114     while (e < 0) lost += n % 10, n /= 10, e++;
2115 
2116     if (lost > 0 && hts_verbose >= 3)
2117         fprintf(stderr, "[W::%s] discarding fractional part of %.*s\n",
2118                 __func__, (int)(s - str), str);
2119 
2120     if (strend) *strend = (char *) s;
2121     else if (*s && hts_verbose >= 2)
2122         fprintf(stderr, "[W::%s] ignoring unknown characters after %.*s[%s]\n",
2123                 __func__, (int)(s - str), str, s);
2124 
2125     return (sign == '+')? n : -n;
2126 }
2127 
hts_parse_reg(const char * s,int * beg,int * end)2128 const char *hts_parse_reg(const char *s, int *beg, int *end)
2129 {
2130     char *hyphen;
2131     const char *colon = strrchr(s, ':');
2132     if (colon == NULL) {
2133         *beg = 0; *end = INT_MAX;
2134         return s + strlen(s);
2135     }
2136 
2137     *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2138     if (*beg < 0) *beg = 0;
2139 
2140     if (*hyphen == '\0') *end = INT_MAX;
2141     else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2142     else return NULL;
2143 
2144     if (*beg >= *end) return NULL;
2145     return colon;
2146 }
2147 
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)2148 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
2149 {
2150     int tid, beg, end;
2151     const char *q;
2152 
2153     if (strcmp(reg, ".") == 0)
2154         return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
2155     else if (strcmp(reg, "*") == 0)
2156         return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
2157 
2158     q = hts_parse_reg(reg, &beg, &end);
2159     if (q) {
2160         char tmp_a[1024], *tmp = tmp_a;
2161         if (q - reg + 1 > 1024)
2162             if (!(tmp = malloc(q - reg + 1)))
2163                 return NULL;
2164         strncpy(tmp, reg, q - reg);
2165         tmp[q - reg] = 0;
2166         tid = getid(hdr, tmp);
2167         if (tmp != tmp_a)
2168             free(tmp);
2169     }
2170     else {
2171         // not parsable as a region, but possibly a sequence named "foo:a"
2172         tid = getid(hdr, reg);
2173         beg = 0; end = INT_MAX;
2174     }
2175 
2176     if (tid < 0) return NULL;
2177     return itr_query(idx, tid, beg, end, readrec);
2178 }
2179 
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)2180 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
2181 {
2182     int ret, tid, beg, end;
2183     if (iter == NULL || iter->finished) return -1;
2184     if (iter->read_rest) {
2185         if (iter->curr_off) { // seek to the start
2186             if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
2187             iter->curr_off = 0; // only seek once
2188         }
2189         ret = iter->readrec(fp, data, r, &tid, &beg, &end);
2190         if (ret < 0) iter->finished = 1;
2191         iter->curr_tid = tid;
2192         iter->curr_beg = beg;
2193         iter->curr_end = end;
2194         return ret;
2195     }
2196     // A NULL iter->off should always be accompanied by iter->finished.
2197     assert(iter->off != NULL);
2198     for (;;) {
2199         if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2200             if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
2201             if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2202                 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
2203                 iter->curr_off = bgzf_tell(fp);
2204             }
2205             ++iter->i;
2206         }
2207         if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
2208             iter->curr_off = bgzf_tell(fp);
2209             if (tid != iter->tid || beg >= iter->end) { // no need to proceed
2210                 ret = -1; break;
2211             } else if (end > iter->beg && iter->end > beg) {
2212                 iter->curr_tid = tid;
2213                 iter->curr_beg = beg;
2214                 iter->curr_end = end;
2215                 return ret;
2216             }
2217         } else break; // end of file or error
2218     }
2219     iter->finished = 1;
2220     return ret;
2221 }
2222 
2223 /**********************
2224  *** Retrieve index ***
2225  **********************/
2226 
test_and_fetch(const char * fn)2227 static char *test_and_fetch(const char *fn)
2228 {
2229     if (hisremote(fn)) {
2230         const int buf_size = 1 * 1024 * 1024;
2231         hFILE *fp_remote;
2232         FILE *fp;
2233         uint8_t *buf;
2234         int l;
2235         const char *p;
2236         for (p = fn + strlen(fn) - 1; p >= fn; --p)
2237             if (*p == '/') break;
2238         ++p; // p now points to the local file name
2239         // Attempt to open local file first
2240         if ((fp = fopen((char*)p, "rb")) != 0)
2241         {
2242             fclose(fp);
2243             return (char*)p;
2244         }
2245         // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2246         if ((fp_remote = hopen(fn, "r")) == 0) return 0;
2247         if ((fp = fopen(p, "w")) == 0) {
2248             if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
2249             hclose_abruptly(fp_remote);
2250             return 0;
2251         }
2252         if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
2253         buf = (uint8_t*)calloc(buf_size, 1);
2254         while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
2255         free(buf);
2256         fclose(fp);
2257         if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
2258         return (char*)p;
2259     } else {
2260         hFILE *fp;
2261         if ((fp = hopen(fn, "r")) == 0) return 0;
2262         hclose_abruptly(fp);
2263         return (char*)fn;
2264     }
2265 }
2266 
hts_idx_getfn(const char * fn,const char * ext)2267 char *hts_idx_getfn(const char *fn, const char *ext)
2268 {
2269     int i, l_fn, l_ext;
2270     char *fnidx, *ret;
2271     l_fn = strlen(fn); l_ext = strlen(ext);
2272     fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
2273     strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
2274     if ((ret = test_and_fetch(fnidx)) == 0) {
2275         for (i = l_fn - 1; i > 0; --i)
2276             if (fnidx[i] == '.') break;
2277         strcpy(fnidx + i, ext);
2278         ret = test_and_fetch(fnidx);
2279     }
2280     if (ret == 0) {
2281         free(fnidx);
2282         return 0;
2283     }
2284     l_fn = strlen(ret);
2285     memmove(fnidx, ret, l_fn + 1);
2286     return fnidx;
2287 }
2288 
hts_idx_load(const char * fn,int fmt)2289 hts_idx_t *hts_idx_load(const char *fn, int fmt)
2290 {
2291     char *fnidx;
2292     hts_idx_t *idx;
2293     fnidx = hts_idx_getfn(fn, ".csi");
2294     if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
2295     if (fnidx == 0) return 0;
2296 
2297     idx = hts_idx_load2(fn, fnidx);
2298     free(fnidx);
2299     return idx;
2300 }
2301 
hts_idx_load2(const char * fn,const char * fnidx)2302 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
2303 {
2304     // Check that the index file is up to date, the main file might have changed
2305     struct stat stat_idx,stat_main;
2306     if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
2307     {
2308         if ( hts_verbose >= 1 && stat_idx.st_mtime < stat_main.st_mtime )
2309             fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
2310     }
2311 
2312     return hts_idx_load_local(fnidx);
2313 }
2314 
2315 
2316 
2317 /**********************
2318  ***     Memory     ***
2319  **********************/
2320 
2321 /* For use with hts_expand macros *only* */
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)2322 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
2323                           int clear, void **ptr, const char *func) {
2324     /* If new_m and size are both below this limit, multiplying them
2325        together can't overflow */
2326     const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
2327     void *new_ptr;
2328     size_t bytes, new_m;
2329 
2330     new_m = n;
2331     kroundup_size_t(new_m);
2332 
2333     bytes = size * new_m;
2334 
2335     /* Check for overflow.  Both ensure that new_m will fit in m (we make the
2336        pessimistic assumption that m is signed), and that bytes has not
2337        wrapped around. */
2338     if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
2339         || ((size > safe || new_m > safe)
2340             && bytes / new_m != size)) {
2341         errno = ENOMEM;
2342         goto die;
2343     }
2344 
2345     new_ptr = realloc(*ptr, bytes);
2346     if (new_ptr == NULL) goto die;
2347 
2348     if (clear) {
2349         if (new_m > m) {
2350             memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
2351         }
2352     }
2353 
2354     *ptr = new_ptr;
2355 
2356     return new_m;
2357 
2358  die:
2359     if (hts_verbose > 1)
2360         fprintf(stderr, "[E::%s] %s\n", func, strerror(errno));
2361     exit(1);
2362 }
2363