1 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2 
3     Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4     Copyright (C) 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include <fcntl.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <assert.h>
38 
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
44 #include "version.h"
45 #include "hts_internal.h"
46 #include "hfile_internal.h"
47 #include "htslib/hts_os.h" // drand48
48 
49 #include "htslib/khash.h"
50 #include "htslib/kseq.h"
51 #include "htslib/ksort.h"
52 
53 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
54 
55 int hts_verbose = HTS_LOG_WARNING;
56 
hts_version()57 const char *hts_version()
58 {
59     return HTS_VERSION;
60 }
61 
62 const unsigned char seq_nt16_table[256] = {
63     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
65     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
66      1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
67     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
68     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
69     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
70     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
71 
72     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
79     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
80 };
81 
82 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
83 
84 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
85 
86 /**********************
87  *** Basic file I/O ***
88  **********************/
89 
format_category(enum htsExactFormat fmt)90 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
91 {
92     switch (fmt) {
93     case bam:
94     case sam:
95     case cram:
96         return sequence_data;
97 
98     case vcf:
99     case bcf:
100         return variant_data;
101 
102     case bai:
103     case crai:
104     case csi:
105     case gzi:
106     case tbi:
107         return index_file;
108 
109     case bed:
110         return region_list;
111 
112     case htsget:
113         return unknown_category;
114 
115     case unknown_format:
116     case binary_format:
117     case text_format:
118     case format_maximum:
119         break;
120     }
121 
122     return unknown_category;
123 }
124 
125 // Decompress up to ten or so bytes by peeking at the file, which must be
126 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)127 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
128 {
129     // Typically at most a couple of hundred bytes of input are required
130     // to get a few bytes of output from inflate(), so hopefully this buffer
131     // size suffices in general.
132     unsigned char buffer[512];
133     z_stream zs;
134     ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
135 
136     if (npeek < 0) return 0;
137 
138     zs.zalloc = NULL;
139     zs.zfree = NULL;
140     zs.next_in = buffer;
141     zs.avail_in = npeek;
142     zs.next_out = dest;
143     zs.avail_out = destsize;
144     if (inflateInit2(&zs, 31) != Z_OK) return 0;
145 
146     while (zs.total_out < destsize)
147         if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
148 
149     destsize = zs.total_out;
150     inflateEnd(&zs);
151 
152     return destsize;
153 }
154 
155 // Parse "x.y" text, taking care because the string is not NUL-terminated
156 // and filling in major/minor only when the digits are followed by a delimiter,
157 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
158 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)159 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
160 {
161     const char *s    = (const char *) u;
162     const char *slim = (const char *) ulim;
163     short v;
164 
165     fmt->version.major = fmt->version.minor = -1;
166 
167     for (v = 0; s < slim && isdigit_c(*s); s++)
168         v = 10 * v + *s - '0';
169 
170     if (s < slim) {
171         fmt->version.major = v;
172         if (*s == '.') {
173             s++;
174             for (v = 0; s < slim && isdigit_c(*s); s++)
175                 v = 10 * v + *s - '0';
176             if (s < slim)
177                 fmt->version.minor = v;
178         }
179         else
180             fmt->version.minor = 0;
181     }
182 }
183 
184 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)185 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
186 {
187     const unsigned char *ukey = (const unsigned char *) key;
188 
189     while (*ukey)
190         if (u >= ulim) return +1;
191         else if (isspace_c(*u)) u++;
192         else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
193         else u++, ukey++;
194 
195     return 0;
196 }
197 
hts_detect_format(hFILE * hfile,htsFormat * fmt)198 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
199 {
200     unsigned char s[32];
201     ssize_t len = hpeek(hfile, s, 18);
202     if (len < 0) return -1;
203 
204     if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
205         // The stream is either gzip-compressed or BGZF-compressed.
206         // Determine which, and decompress the first few bytes.
207         fmt->compression = (len >= 18 && (s[3] & 4) &&
208                             memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
209         len = decompress_peek(hfile, s, sizeof s);
210     }
211     else {
212         fmt->compression = no_compression;
213         len = hpeek(hfile, s, sizeof s);
214     }
215     if (len < 0) return -1;
216 
217     fmt->compression_level = -1;
218     fmt->specific = NULL;
219 
220     if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
221         fmt->category = sequence_data;
222         fmt->format = cram;
223         fmt->version.major = s[4], fmt->version.minor = s[5];
224         fmt->compression = custom;
225         return 0;
226     }
227     else if (len >= 4 && s[3] <= '\4') {
228         if (memcmp(s, "BAM\1", 4) == 0) {
229             fmt->category = sequence_data;
230             fmt->format = bam;
231             // TODO Decompress enough to pick version from @HD-VN header
232             fmt->version.major = 1, fmt->version.minor = -1;
233             return 0;
234         }
235         else if (memcmp(s, "BAI\1", 4) == 0) {
236             fmt->category = index_file;
237             fmt->format = bai;
238             fmt->version.major = -1, fmt->version.minor = -1;
239             return 0;
240         }
241         else if (memcmp(s, "BCF\4", 4) == 0) {
242             fmt->category = variant_data;
243             fmt->format = bcf;
244             fmt->version.major = 1, fmt->version.minor = -1;
245             return 0;
246         }
247         else if (memcmp(s, "BCF\2", 4) == 0) {
248             fmt->category = variant_data;
249             fmt->format = bcf;
250             fmt->version.major = s[3];
251             fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
252             return 0;
253         }
254         else if (memcmp(s, "CSI\1", 4) == 0) {
255             fmt->category = index_file;
256             fmt->format = csi;
257             fmt->version.major = 1, fmt->version.minor = -1;
258             return 0;
259         }
260         else if (memcmp(s, "TBI\1", 4) == 0) {
261             fmt->category = index_file;
262             fmt->format = tbi;
263             fmt->version.major = -1, fmt->version.minor = -1;
264             return 0;
265         }
266     }
267     else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
268         fmt->category = variant_data;
269         fmt->format = vcf;
270         if (len >= 21 && s[16] == 'v')
271             parse_version(fmt, &s[17], &s[len]);
272         else
273             fmt->version.major = fmt->version.minor = -1;
274         return 0;
275     }
276     else if (len >= 4 && s[0] == '@' &&
277              (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
278               memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
279         fmt->category = sequence_data;
280         fmt->format = sam;
281         // @HD-VN is not guaranteed to be the first tag, but then @HD is
282         // not guaranteed to be present at all...
283         if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
284             parse_version(fmt, &s[7], &s[len]);
285         else
286             fmt->version.major = 1, fmt->version.minor = -1;
287         return 0;
288     }
289     else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
290         fmt->category = unknown_category;
291         fmt->format = htsget;
292         fmt->version.major = fmt->version.minor = -1;
293         return 0;
294     }
295     else {
296         // Various possibilities for tab-delimited text:
297         // .crai   (gzipped tab-delimited six columns: seqid 5*number)
298         // .bed    ([3..12] tab-delimited columns)
299         // .bedpe  (>= 10 tab-delimited columns)
300         // .sam    (tab-delimited >= 11 columns: seqid number seqid...)
301         // FIXME For now, assume it's SAM
302         fmt->category = sequence_data;
303         fmt->format = sam;
304         fmt->version.major = 1, fmt->version.minor = -1;
305         return 0;
306     }
307 
308     fmt->category = unknown_category;
309     fmt->format = unknown_format;
310     fmt->version.major = fmt->version.minor = -1;
311     fmt->compression = no_compression;
312     return 0;
313 }
314 
hts_format_description(const htsFormat * format)315 char *hts_format_description(const htsFormat *format)
316 {
317     kstring_t str = { 0, 0, NULL };
318 
319     switch (format->format) {
320     case sam:   kputs("SAM", &str); break;
321     case bam:   kputs("BAM", &str); break;
322     case cram:  kputs("CRAM", &str); break;
323     case vcf:   kputs("VCF", &str); break;
324     case bcf:
325         if (format->version.major == 1) kputs("Legacy BCF", &str);
326         else kputs("BCF", &str);
327         break;
328     case bai:   kputs("BAI", &str); break;
329     case crai:  kputs("CRAI", &str); break;
330     case csi:   kputs("CSI", &str); break;
331     case tbi:   kputs("Tabix", &str); break;
332     case htsget: kputs("htsget", &str); break;
333     default:    kputs("unknown", &str); break;
334     }
335 
336     if (format->version.major >= 0) {
337         kputs(" version ", &str);
338         kputw(format->version.major, &str);
339         if (format->version.minor >= 0) {
340             kputc('.', &str);
341             kputw(format->version.minor, &str);
342         }
343     }
344 
345     switch (format->compression) {
346     case custom: kputs(" compressed", &str); break;
347     case gzip:   kputs(" gzip-compressed", &str); break;
348     case bgzf:
349         switch (format->format) {
350         case bam:
351         case bcf:
352         case csi:
353         case tbi:
354             // These are by definition BGZF, so just use the generic term
355             kputs(" compressed", &str);
356             break;
357         default:
358             kputs(" BGZF-compressed", &str);
359             break;
360         }
361         break;
362     default: break;
363     }
364 
365     switch (format->category) {
366     case sequence_data: kputs(" sequence", &str); break;
367     case variant_data:  kputs(" variant calling", &str); break;
368     case index_file:    kputs(" index", &str); break;
369     case region_list:   kputs(" genomic region", &str); break;
370     default: break;
371     }
372 
373     if (format->compression == no_compression)
374         switch (format->format) {
375         case sam:
376         case crai:
377         case vcf:
378         case bed:
379         case htsget:
380             kputs(" text", &str);
381             break;
382 
383         default:
384             kputs(" data", &str);
385             break;
386         }
387     else
388         kputs(" data", &str);
389 
390     return ks_release(&str);
391 }
392 
hts_open_format_impl(const char * fn,const char * mode,const htsFormat * fmt,hFILE * hf)393 static htsFile *hts_open_format_impl(const char *fn, const char *mode, const htsFormat *fmt, hFILE* hf)
394 {
395     char smode[102], *cp, *cp2, *mode_c;
396     htsFile *fp = NULL;
397     hFILE *hfile;
398     char fmt_code = '\0';
399 
400     strncpy(smode, mode, 100);
401     smode[100]=0;
402     if ((cp = strchr(smode, ',')))
403         *cp = '\0';
404 
405     // Migrate format code (b or c) to the end of the smode buffer.
406     for (cp2 = cp = smode; *cp; cp++) {
407         if (*cp == 'b')
408             fmt_code = 'b';
409         else if (*cp == 'c')
410             fmt_code = 'c';
411         else
412             *cp2++ = *cp;
413     }
414     mode_c = cp2;
415     *cp2++ = fmt_code;
416     *cp2++ = 0;
417     *cp2++ = 0;
418 
419     // Set or reset the format code if opts->format is used
420     if (fmt && fmt->format != unknown_format)
421         *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
422 
423     hfile = hf ? hf : hopen(fn, smode);
424     if (hfile == NULL) goto error;
425 
426     fp = hts_hopen(hfile, fn, smode);
427     if (fp == NULL) goto error;
428 
429     if (fmt && fmt->specific)
430         if (hts_opt_apply(fp, fmt->specific) != 0)
431             goto error;
432 
433     return fp;
434 
435 error:
436     hts_log_error("Failed to open file %s", fn);
437 
438     if (hfile)
439         hclose_abruptly(hfile);
440 
441     return NULL;
442 }
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)443 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
444 {
445 	return hts_open_format_impl(fn, mode, fmt, NULL);
446 }
447 
hts_open_callback(const char * fn,hFILE_callback_ops * ops,const char * mode)448 htsFile *hts_open_callback(const char* fn, hFILE_callback_ops* ops, const char* mode)
449 {
450 	if(NULL == ops) return NULL;
451 	hFILE* fp = hopen_callback(*ops, mode);
452 	return hts_open_format_impl(fn ? fn : "-", mode, NULL, fp);
453 }
454 
hts_open(const char * fn,const char * mode)455 htsFile *hts_open(const char *fn, const char *mode) {
456     return hts_open_format(fn, mode, NULL);
457 }
458 
459 /*
460  * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
461  * the prefix in lowercase into buf and returning a pointer to the suffix.
462  * On return, buf is always NUL-terminated; thus assumes that the "keyword"
463  * prefix should be one of several known values of maximum length buflen-2.
464  * (If delim is not found, returns a pointer to the '\0'.)
465  */
466 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)467 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
468 {
469     size_t i = 0;
470     while (*str && *str != delim) {
471         if (i < buflen-1) buf[i++] = tolower_c(*str);
472         str++;
473     }
474 
475     buf[i] = '\0';
476     return *str? str+1 : str;
477 }
478 
479 /*
480  * Parses arg and appends it to the option list.
481  *
482  * Returns 0 on success;
483  *        -1 on failure.
484  */
hts_opt_add(hts_opt ** opts,const char * c_arg)485 int hts_opt_add(hts_opt **opts, const char *c_arg) {
486     hts_opt *o, *t;
487     char *val;
488 
489     /*
490      * IMPORTANT!!!
491      * If you add another string option here, don't forget to also add
492      * it to the case statement in hts_opt_apply.
493      */
494 
495     if (!c_arg)
496         return -1;
497 
498     if (!(o =  malloc(sizeof(*o))))
499         return -1;
500 
501     if (!(o->arg = strdup(c_arg))) {
502         free(o);
503         return -1;
504     }
505 
506     if (!(val = strchr(o->arg, '=')))
507         val = "1"; // assume boolean
508     else
509         *val++ = '\0';
510 
511     if (strcmp(o->arg, "decode_md") == 0 ||
512         strcmp(o->arg, "DECODE_MD") == 0)
513         o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
514 
515     else if (strcmp(o->arg, "verbosity") == 0 ||
516              strcmp(o->arg, "VERBOSITY") == 0)
517         o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
518 
519     else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
520              strcmp(o->arg, "SEQS_PER_SLICE") == 0)
521         o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
522 
523     else if (strcmp(o->arg, "bases_per_slice") == 0 ||
524              strcmp(o->arg, "BASES_PER_SLICE") == 0)
525         o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
526 
527     else if (strcmp(o->arg, "slices_per_container") == 0 ||
528              strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
529         o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
530 
531     else if (strcmp(o->arg, "embed_ref") == 0 ||
532              strcmp(o->arg, "EMBED_REF") == 0)
533         o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
534 
535     else if (strcmp(o->arg, "no_ref") == 0 ||
536              strcmp(o->arg, "NO_REF") == 0)
537         o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
538 
539     else if (strcmp(o->arg, "ignore_md5") == 0 ||
540              strcmp(o->arg, "IGNORE_MD5") == 0)
541         o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
542 
543     else if (strcmp(o->arg, "use_bzip2") == 0 ||
544              strcmp(o->arg, "USE_BZIP2") == 0)
545         o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
546 
547     else if (strcmp(o->arg, "use_rans") == 0 ||
548              strcmp(o->arg, "USE_RANS") == 0)
549         o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
550 
551     else if (strcmp(o->arg, "use_lzma") == 0 ||
552              strcmp(o->arg, "USE_LZMA") == 0)
553         o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
554 
555     else if (strcmp(o->arg, "reference") == 0 ||
556              strcmp(o->arg, "REFERENCE") == 0)
557         o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
558 
559     else if (strcmp(o->arg, "version") == 0 ||
560              strcmp(o->arg, "VERSION") == 0)
561         o->opt = CRAM_OPT_VERSION, o->val.s =val;
562 
563     else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
564              strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
565         o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
566 
567     else if (strcmp(o->arg, "nthreads") == 0 ||
568              strcmp(o->arg, "NTHREADS") == 0)
569         o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
570 
571     else if (strcmp(o->arg, "cache_size") == 0 ||
572              strcmp(o->arg, "CACHE_SIZE") == 0) {
573         char *endp;
574         o->opt = HTS_OPT_CACHE_SIZE;
575         o->val.i = strtol(val, &endp, 0);
576         // NB: Doesn't support floats, eg 1.5g
577         // TODO: extend hts_parse_decimal? See also samtools sort.
578         switch (*endp) {
579         case 'g': case 'G': o->val.i *= 1024;
580         case 'm': case 'M': o->val.i *= 1024;
581         case 'k': case 'K': o->val.i *= 1024; break;
582         case '\0': break;
583         default:
584             hts_log_error("Unrecognised cache size suffix '%c'", *endp);
585             free(o->arg);
586             free(o);
587             return -1;
588         }
589     }
590 
591     else if (strcmp(o->arg, "required_fields") == 0 ||
592              strcmp(o->arg, "REQUIRED_FIELDS") == 0)
593         o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
594 
595     else if (strcmp(o->arg, "lossy_names") == 0 ||
596              strcmp(o->arg, "LOSSY_NAMES") == 0)
597         o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
598 
599     else if (strcmp(o->arg, "name_prefix") == 0 ||
600              strcmp(o->arg, "NAME_PREFIX") == 0)
601         o->opt = CRAM_OPT_PREFIX, o->val.s = val;
602 
603     else if (strcmp(o->arg, "store_md") == 0 ||
604              strcmp(o->arg, "store_md") == 0)
605         o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
606 
607     else if (strcmp(o->arg, "store_nm") == 0 ||
608              strcmp(o->arg, "store_nm") == 0)
609         o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
610 
611     else if (strcmp(o->arg, "block_size") == 0 ||
612              strcmp(o->arg, "BLOCK_SIZE") == 0)
613         o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
614 
615     else if (strcmp(o->arg, "level") == 0 ||
616              strcmp(o->arg, "LEVEL") == 0)
617         o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
618 
619     else {
620         hts_log_error("Unknown option '%s'", o->arg);
621         free(o->arg);
622         free(o);
623         return -1;
624     }
625 
626     o->next = NULL;
627 
628     // Append; assumes small list.
629     if (*opts) {
630         t = *opts;
631         while (t->next)
632             t = t->next;
633         t->next = o;
634     } else {
635         *opts = o;
636     }
637 
638     return 0;
639 }
640 
641 /*
642  * Applies an hts_opt option list to a given htsFile.
643  *
644  * Returns 0 on success
645  *        -1 on failure
646  */
hts_opt_apply(htsFile * fp,hts_opt * opts)647 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
648     hts_opt *last = NULL;
649 
650     for (; opts;  opts = (last=opts)->next) {
651         switch (opts->opt) {
652             case CRAM_OPT_REFERENCE:
653             case CRAM_OPT_VERSION:
654             case CRAM_OPT_PREFIX:
655                 if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
656                     return -1;
657                 break;
658             default:
659                 if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
660                     return -1;
661                 break;
662         }
663     }
664 
665     return 0;
666 }
667 
668 /*
669  * Frees an hts_opt list.
670  */
hts_opt_free(hts_opt * opts)671 void hts_opt_free(hts_opt *opts) {
672     hts_opt *last = NULL;
673     while (opts) {
674         opts = (last=opts)->next;
675         free(last->arg);
676         free(last);
677     }
678 }
679 
680 
681 /*
682  * Tokenise options as (key(=value)?,)*(key(=value)?)?
683  * NB: No provision for ',' appearing in the value!
684  * Add backslashing rules?
685  *
686  * This could be used as part of a general command line option parser or
687  * as a string concatenated onto the file open mode.
688  *
689  * Returns 0 on success
690  *        -1 on failure.
691  */
hts_parse_opt_list(htsFormat * fmt,const char * str)692 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
693     while (str && *str) {
694         const char *str_start;
695         int len;
696         char arg[8001];
697 
698         while (*str && *str == ',')
699             str++;
700 
701         for (str_start = str; *str && *str != ','; str++);
702         len = str - str_start;
703 
704         // Produce a nul terminated copy of the option
705         strncpy(arg, str_start, len < 8000 ? len : 8000);
706         arg[len < 8000 ? len : 8000] = '\0';
707 
708         if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
709             return -1;
710 
711         if (*str)
712             str++;
713     }
714 
715     return 0;
716 }
717 
718 /*
719  * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
720  * followed by a comma separated list of key=value options and splits
721  * these up into the fields of htsFormat struct.
722  *
723  * format is assumed to be already initialised, either to blank
724  * "unknown" values or via previous hts_opt_add calls.
725  *
726  * Returns 0 on success
727  *        -1 on failure.
728  */
hts_parse_format(htsFormat * format,const char * str)729 int hts_parse_format(htsFormat *format, const char *str) {
730     char fmt[8];
731     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
732 
733     format->version.minor = 0; // unknown
734     format->version.major = 0; // unknown
735 
736     if (strcmp(fmt, "sam") == 0) {
737         format->category          = sequence_data;
738         format->format            = sam;
739         format->compression       = no_compression;;
740         format->compression_level = 0;
741     } else if (strcmp(fmt, "bam") == 0) {
742         format->category          = sequence_data;
743         format->format            = bam;
744         format->compression       = bgzf;
745         format->compression_level = -1;
746     } else if (strcmp(fmt, "cram") == 0) {
747         format->category          = sequence_data;
748         format->format            = cram;
749         format->compression       = custom;
750         format->compression_level = -1;
751     } else if (strcmp(fmt, "vcf") == 0) {
752         format->category          = variant_data;
753         format->format            = vcf;
754         format->compression       = no_compression;;
755         format->compression_level = 0;
756     } else if (strcmp(fmt, "bcf") == 0) {
757         format->category          = variant_data;
758         format->format            = bcf;
759         format->compression       = bgzf;
760         format->compression_level = -1;
761     } else {
762         return -1;
763     }
764 
765     return hts_parse_opt_list(format, cp);
766 }
767 
768 
769 /*
770  * Tokenise options as (key(=value)?,)*(key(=value)?)?
771  * NB: No provision for ',' appearing in the value!
772  * Add backslashing rules?
773  *
774  * This could be used as part of a general command line option parser or
775  * as a string concatenated onto the file open mode.
776  *
777  * Returns 0 on success
778  *        -1 on failure.
779  */
hts_process_opts(htsFile * fp,const char * opts)780 static int hts_process_opts(htsFile *fp, const char *opts) {
781     htsFormat fmt;
782 
783     fmt.specific = NULL;
784     if (hts_parse_opt_list(&fmt, opts) != 0)
785         return -1;
786 
787     if (hts_opt_apply(fp, fmt.specific) != 0) {
788         hts_opt_free(fmt.specific);
789         return -1;
790     }
791 
792     hts_opt_free(fmt.specific);
793 
794     return 0;
795 }
796 
797 
hts_hopen(hFILE * hfile,const char * fn,const char * mode)798 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
799 {
800     hFILE *hfile_orig = hfile;
801     htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
802     char simple_mode[101], *cp, *opts;
803     simple_mode[100] = '\0';
804 
805     if (fp == NULL) goto error;
806 
807     fp->fn = strdup(fn);
808     fp->is_be = ed_is_big();
809 
810     // Split mode into simple_mode,opts strings
811     if ((cp = strchr(mode, ','))) {
812         strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
813         simple_mode[cp-mode] = '\0';
814         opts = cp+1;
815     } else {
816         strncpy(simple_mode, mode, 100);
817         opts = NULL;
818     }
819 
820     if (strchr(simple_mode, 'r')) {
821         if (hts_detect_format(hfile, &fp->format) < 0) goto error;
822 
823         if (fp->format.format == htsget) {
824             hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
825             if (hfile2 == NULL) goto error;
826 
827             // Build fp against the result of the redirection
828             hfile = hfile2;
829             if (hts_detect_format(hfile, &fp->format) < 0) goto error;
830         }
831     }
832     else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
833         htsFormat *fmt = &fp->format;
834         fp->is_write = 1;
835 
836         if (strchr(simple_mode, 'b')) fmt->format = binary_format;
837         else if (strchr(simple_mode, 'c')) fmt->format = cram;
838         else fmt->format = text_format;
839 
840         if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
841         else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
842         else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
843         else {
844             // No compression mode specified, set to the default for the format
845             switch (fmt->format) {
846             case binary_format: fmt->compression = bgzf; break;
847             case cram: fmt->compression = custom; break;
848             case text_format: fmt->compression = no_compression; break;
849             default: abort();
850             }
851         }
852 
853         // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
854         fmt->category = format_category(fmt->format);
855 
856         fmt->version.major = fmt->version.minor = -1;
857         fmt->compression_level = -1;
858         fmt->specific = NULL;
859     }
860     else { errno = EINVAL; goto error; }
861 
862     switch (fp->format.format) {
863     case binary_format:
864     case bam:
865     case bcf:
866         fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
867         if (fp->fp.bgzf == NULL) goto error;
868         fp->is_bin = fp->is_bgzf = 1;
869         break;
870 
871     case cram:
872         fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
873         if (fp->fp.cram == NULL) goto error;
874         if (!fp->is_write)
875             cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
876         fp->is_cram = 1;
877         break;
878 
879     case text_format:
880     case sam:
881     case vcf:
882         if (fp->format.compression != no_compression) {
883             fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
884             if (fp->fp.bgzf == NULL) goto error;
885             fp->is_bgzf = 1;
886         }
887         else
888             fp->fp.hfile = hfile;
889         break;
890 
891     default:
892         errno = ENOEXEC;
893         goto error;
894     }
895 
896     if (opts)
897         hts_process_opts(fp, opts);
898 
899     // If redirecting, close the original hFILE now (pedantically we would
900     // instead close it in hts_close(), but this a simplifying optimisation)
901     if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
902 
903     return fp;
904 
905 error:
906     hts_log_error("Failed to open file %s", fn);
907 
908     // If redirecting, close the failed redirection hFILE that we have opened
909     if (hfile != hfile_orig) hclose_abruptly(hfile);
910 
911     if (fp) {
912         free(fp->fn);
913         free(fp->fn_aux);
914         free(fp);
915     }
916     return NULL;
917 }
918 
hts_close(htsFile * fp)919 int hts_close(htsFile *fp)
920 {
921     int ret, save;
922 
923     switch (fp->format.format) {
924     case binary_format:
925     case bam:
926     case bcf:
927         ret = bgzf_close(fp->fp.bgzf);
928         break;
929 
930     case cram:
931         if (!fp->is_write) {
932             switch (cram_eof(fp->fp.cram)) {
933             case 2:
934                 hts_log_warning("EOF marker is absent. The input is probably truncated");
935                 break;
936             case 0:  /* not at EOF, but may not have wanted all seqs */
937             default: /* case 1, expected EOF */
938                 break;
939             }
940         }
941         ret = cram_close(fp->fp.cram);
942         break;
943 
944     case text_format:
945     case sam:
946     case vcf:
947         if (fp->format.compression != no_compression)
948             ret = bgzf_close(fp->fp.bgzf);
949         else
950             ret = hclose(fp->fp.hfile);
951         break;
952 
953     default:
954         ret = -1;
955         break;
956     }
957 
958     save = errno;
959     free(fp->fn);
960     free(fp->fn_aux);
961     free(fp->line.s);
962     free(fp);
963     errno = save;
964     return ret;
965 }
966 
hts_get_format(htsFile * fp)967 const htsFormat *hts_get_format(htsFile *fp)
968 {
969     return fp? &fp->format : NULL;
970 }
971 
hts_format_file_extension(const htsFormat * format)972 const char *hts_format_file_extension(const htsFormat *format) {
973     if (!format)
974         return "?";
975 
976     switch (format->format) {
977     case sam:  return "sam";
978     case bam:  return "bam";
979     case bai:  return "bai";
980     case cram: return "cram";
981     case crai: return "crai";
982     case vcf:  return "vcf";
983     case bcf:  return "bcf";
984     case csi:  return "csi";
985     case gzi:  return "gzi";
986     case tbi:  return "tbi";
987     case bed:  return "bed";
988     default:   return "?";
989     }
990 }
991 
hts_hfile(htsFile * fp)992 static hFILE *hts_hfile(htsFile *fp) {
993     switch (fp->format.format) {
994     case binary_format: // fall through; still valid if bcf?
995     case bam:          return bgzf_hfile(fp->fp.bgzf);
996     case cram:         return cram_hfile(fp->fp.cram);
997     case text_format:  return fp->fp.hfile;
998     case sam:          return fp->fp.hfile;
999     default:           return NULL;
1000     }
1001 }
1002 
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)1003 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1004     int r;
1005     va_list args;
1006 
1007     switch (opt) {
1008     case HTS_OPT_NTHREADS: {
1009         va_start(args, opt);
1010         int nthreads = va_arg(args, int);
1011         va_end(args);
1012         return hts_set_threads(fp, nthreads);
1013     }
1014 
1015     case HTS_OPT_BLOCK_SIZE: {
1016         hFILE *hf = hts_hfile(fp);
1017 
1018         if (hf) {
1019             va_start(args, opt);
1020             if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1021                 hts_log_warning("Failed to change block size");
1022             va_end(args);
1023         }
1024         else {
1025             // To do - implement for vcf/bcf.
1026             hts_log_warning("Cannot change block size for this format");
1027         }
1028 
1029         return 0;
1030     }
1031 
1032     case HTS_OPT_THREAD_POOL: {
1033         va_start(args, opt);
1034         htsThreadPool *p = va_arg(args, htsThreadPool *);
1035         va_end(args);
1036         return hts_set_thread_pool(fp, p);
1037     }
1038 
1039     case HTS_OPT_CACHE_SIZE: {
1040         va_start(args, opt);
1041         int cache_size = va_arg(args, int);
1042         va_end(args);
1043         hts_set_cache_size(fp, cache_size);
1044         return 0;
1045     }
1046 
1047     case HTS_OPT_COMPRESSION_LEVEL: {
1048         va_start(args, opt);
1049         int level = va_arg(args, int);
1050         va_end(args);
1051         if (fp->is_bgzf)
1052             fp->fp.bgzf->compress_level = level;
1053     }
1054 
1055     default:
1056         break;
1057     }
1058 
1059     if (fp->format.format != cram)
1060         return 0;
1061 
1062     va_start(args, opt);
1063     r = cram_set_voption(fp->fp.cram, opt, args);
1064     va_end(args);
1065 
1066     return r;
1067 }
1068 
1069 BGZF *hts_get_bgzfp(htsFile *fp);
1070 
hts_set_threads(htsFile * fp,int n)1071 int hts_set_threads(htsFile *fp, int n)
1072 {
1073     if (fp->format.compression == bgzf) {
1074         return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1075     } else if (fp->format.format == cram) {
1076         return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1077     }
1078     else return 0;
1079 }
1080 
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1081 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1082     if (fp->format.compression == bgzf) {
1083         return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1084     } else if (fp->format.format == cram) {
1085         return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1086     }
1087     else return 0;
1088 }
1089 
hts_set_cache_size(htsFile * fp,int n)1090 void hts_set_cache_size(htsFile *fp, int n)
1091 {
1092     if (fp->format.compression == bgzf)
1093         bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1094 }
1095 
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1096 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1097 {
1098     free(fp->fn_aux);
1099     if (fn_aux) {
1100         fp->fn_aux = strdup(fn_aux);
1101         if (fp->fn_aux == NULL) return -1;
1102     }
1103     else fp->fn_aux = NULL;
1104 
1105     if (fp->format.format == cram)
1106         if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1107             return -1;
1108 
1109     return 0;
1110 }
1111 
1112 // For VCF/BCF backward sweeper. Not exposing these functions because their
1113 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1114 BGZF *hts_get_bgzfp(htsFile *fp)
1115 {
1116     if (fp->is_bgzf)
1117         return fp->fp.bgzf;
1118     else
1119         return NULL;
1120 }
hts_useek(htsFile * fp,long uoffset,int where)1121 int hts_useek(htsFile *fp, long uoffset, int where)
1122 {
1123     if (fp->is_bgzf)
1124         return bgzf_useek(fp->fp.bgzf, uoffset, where);
1125     else
1126         return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1127 }
hts_utell(htsFile * fp)1128 long hts_utell(htsFile *fp)
1129 {
1130     if (fp->is_bgzf)
1131         return bgzf_utell(fp->fp.bgzf);
1132     else
1133         return htell(fp->fp.hfile);
1134 }
1135 
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1136 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1137 {
1138     int ret;
1139     if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1140         hts_log_error("Unexpected delimiter %d", delimiter);
1141         abort();
1142     }
1143 
1144     switch (fp->format.compression) {
1145     case no_compression:
1146         str->l = 0;
1147         ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile);
1148         if (ret >= 0) ret = str->l;
1149         else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1150         else ret = -1;
1151         break;
1152 
1153     case gzip:
1154     case bgzf:
1155         ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1156         break;
1157 
1158     default:
1159         abort();
1160     }
1161 
1162     ++fp->lineno;
1163     return ret;
1164 }
1165 
hts_readlist(const char * string,int is_file,int * _n)1166 char **hts_readlist(const char *string, int is_file, int *_n)
1167 {
1168     int m = 0, n = 0;
1169     char **s = 0;
1170     if ( is_file )
1171     {
1172         BGZF *fp = bgzf_open(string, "r");
1173         if ( !fp ) return NULL;
1174 
1175         kstring_t str;
1176         str.s = 0; str.l = str.m = 0;
1177         while (bgzf_getline(fp, '\n', &str) >= 0)
1178         {
1179             if (str.l == 0) continue;
1180             n++;
1181             hts_expand(char*,n,m,s);
1182             s[n-1] = strdup(str.s);
1183         }
1184         bgzf_close(fp);
1185         free(str.s);
1186     }
1187     else
1188     {
1189         const char *q = string, *p = string;
1190         while ( 1 )
1191         {
1192             if (*p == ',' || *p == 0)
1193             {
1194                 n++;
1195                 hts_expand(char*,n,m,s);
1196                 s[n-1] = (char*)calloc(p - q + 1, 1);
1197                 strncpy(s[n-1], q, p - q);
1198                 q = p + 1;
1199             }
1200             if ( !*p ) break;
1201             p++;
1202         }
1203     }
1204     s = (char**)realloc(s, n * sizeof(char*));
1205     *_n = n;
1206     return s;
1207 }
1208 
hts_readlines(const char * fn,int * _n)1209 char **hts_readlines(const char *fn, int *_n)
1210 {
1211     int m = 0, n = 0;
1212     char **s = 0;
1213     BGZF *fp = bgzf_open(fn, "r");
1214     if ( fp ) { // read from file
1215         kstring_t str;
1216         str.s = 0; str.l = str.m = 0;
1217         while (bgzf_getline(fp, '\n', &str) >= 0) {
1218             if (str.l == 0) continue;
1219             if (m == n) {
1220                 m = m? m<<1 : 16;
1221                 s = (char**)realloc(s, m * sizeof(char*));
1222             }
1223             s[n++] = strdup(str.s);
1224         }
1225         bgzf_close(fp);
1226         s = (char**)realloc(s, n * sizeof(char*));
1227         free(str.s);
1228     } else if (*fn == ':') { // read from string
1229         const char *q, *p;
1230         for (q = p = fn + 1;; ++p)
1231             if (*p == ',' || *p == 0) {
1232                 if (m == n) {
1233                     m = m? m<<1 : 16;
1234                     s = (char**)realloc(s, m * sizeof(char*));
1235                 }
1236                 s[n] = (char*)calloc(p - q + 1, 1);
1237                 strncpy(s[n++], q, p - q);
1238                 q = p + 1;
1239                 if (*p == 0) break;
1240             }
1241     } else return 0;
1242     s = (char**)realloc(s, n * sizeof(char*));
1243     *_n = n;
1244     return s;
1245 }
1246 
1247 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1248 int hts_file_type(const char *fname)
1249 {
1250     int len = strlen(fname);
1251     if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1252     if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1253     if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1254     if ( !strcmp("-",fname) ) return FT_STDIN;
1255 
1256     hFILE *f = hopen(fname, "r");
1257     if (f == NULL) return 0;
1258 
1259     htsFormat fmt;
1260     if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1261     if (hclose(f) < 0) return 0;
1262 
1263     switch (fmt.format) {
1264     case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1265     case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1266     default:  return 0;
1267     }
1268 }
1269 
hts_check_EOF(htsFile * fp)1270 int hts_check_EOF(htsFile *fp)
1271 {
1272     if (fp->format.compression == bgzf)
1273         return bgzf_check_EOF(hts_get_bgzfp(fp));
1274     else if (fp->format.format == cram)
1275         return cram_check_EOF(fp->fp.cram);
1276     else
1277         return 3;
1278 }
1279 
1280 
1281 /****************
1282  *** Indexing ***
1283  ****************/
1284 
1285 #define HTS_MIN_MARKER_DIST 0x10000
1286 
1287 // Finds the special meta bin
1288 //  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1289 #define META_BIN(idx) ((idx)->n_bins + 1)
1290 
1291 #define pair64_lt(a,b) ((a).u < (b).u)
1292 
1293 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
1294 KSORT_INIT(_off_max, hts_pair64_max_t, pair64_lt)
1295 
1296 typedef struct {
1297     int32_t m, n;
1298     uint64_t loff;
1299     hts_pair64_t *list;
1300 } bins_t;
1301 
1302 KHASH_MAP_INIT_INT(bin, bins_t)
1303 typedef khash_t(bin) bidx_t;
1304 
1305 typedef struct {
1306     int32_t n, m;
1307     uint64_t *offset;
1308 } lidx_t;
1309 
1310 struct __hts_idx_t {
1311     int fmt, min_shift, n_lvls, n_bins;
1312     uint32_t l_meta;
1313     int32_t n, m;
1314     uint64_t n_no_coor;
1315     bidx_t **bidx;
1316     lidx_t *lidx;
1317     uint8_t *meta; // MUST have a terminating NUL on the end
1318     struct {
1319         uint32_t last_bin, save_bin;
1320         int last_coor, last_tid, save_tid, finished;
1321         uint64_t last_off, save_off;
1322         uint64_t off_beg, off_end;
1323         uint64_t n_mapped, n_unmapped;
1324     } z; // keep internal states
1325 };
1326 
idx_format_name(int fmt)1327 static char * idx_format_name(int fmt) {
1328     switch (fmt) {
1329         case HTS_FMT_CSI: return "csi";
1330         case HTS_FMT_BAI: return "bai";
1331         case HTS_FMT_TBI: return "tbi";
1332         case HTS_FMT_CRAI: return "crai";
1333         default: return "unknown";
1334     }
1335 }
1336 
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1337 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1338 {
1339     khint_t k;
1340     bins_t *l;
1341     int absent;
1342     k = kh_put(bin, b, bin, &absent);
1343     if (absent < 0) return -1; // Out of memory
1344     l = &kh_value(b, k);
1345     if (absent) {
1346         l->m = 1; l->n = 0;
1347         l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1348         if (!l->list) {
1349             kh_del(bin, b, k);
1350             return -1;
1351         }
1352     } else if (l->n == l->m) {
1353         uint32_t new_m = l->m ? l->m << 1 : 1;
1354         hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1355         if (!new_list) return -1;
1356         l->list = new_list;
1357         l->m = new_m;
1358     }
1359     l->list[l->n].u = beg;
1360     l->list[l->n++].v = end;
1361     return 0;
1362 }
1363 
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1364 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1365 {
1366     int i, beg, end;
1367     beg = _beg >> min_shift;
1368     end = (_end - 1) >> min_shift;
1369     if (l->m < end + 1) {
1370         size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1371         uint64_t *new_offset;
1372 
1373         new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1374         if (!new_offset) return -1;
1375 
1376         // fill unused memory with (uint64_t)-1
1377         memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1378         l->m = new_m;
1379         l->offset = new_offset;
1380     }
1381     for (i = beg; i <= end; ++i) {
1382         if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1383     }
1384     if (l->n < end + 1) l->n = end + 1;
1385     return 0;
1386 }
1387 
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1388 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1389 {
1390     hts_idx_t *idx;
1391     idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1392     if (idx == NULL) return NULL;
1393     idx->fmt = fmt;
1394     idx->min_shift = min_shift;
1395     idx->n_lvls = n_lvls;
1396     idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1397     idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
1398     idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1399     idx->z.last_coor = 0xffffffffu;
1400     if (n) {
1401         idx->n = idx->m = n;
1402         idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1403         if (idx->bidx == NULL) { free(idx); return NULL; }
1404         idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1405         if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1406     }
1407     return idx;
1408 }
1409 
update_loff(hts_idx_t * idx,int i,int free_lidx)1410 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1411 {
1412     bidx_t *bidx = idx->bidx[i];
1413     lidx_t *lidx = &idx->lidx[i];
1414     khint_t k;
1415     int l;
1416     uint64_t offset0 = 0;
1417     if (bidx) {
1418         k = kh_get(bin, bidx, META_BIN(idx));
1419         if (k != kh_end(bidx))
1420             offset0 = kh_val(bidx, k).list[0].u;
1421         for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1422             lidx->offset[l] = offset0;
1423     } else l = 1;
1424     for (; l < lidx->n; ++l) // fill missing values
1425         if (lidx->offset[l] == (uint64_t)-1)
1426             lidx->offset[l] = lidx->offset[l-1];
1427     if (bidx == 0) return;
1428     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1429         if (kh_exist(bidx, k))
1430         {
1431             if ( kh_key(bidx, k) < idx->n_bins )
1432             {
1433                 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1434                 // disable linear index if bot_bin out of bounds
1435                 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1436             }
1437             else
1438                 kh_val(bidx, k).loff = 0;
1439         }
1440     if (free_lidx) {
1441         free(lidx->offset);
1442         lidx->m = lidx->n = 0;
1443         lidx->offset = 0;
1444     }
1445 }
1446 
compress_binning(hts_idx_t * idx,int i)1447 static void compress_binning(hts_idx_t *idx, int i)
1448 {
1449     bidx_t *bidx = idx->bidx[i];
1450     khint_t k;
1451     int l, m;
1452     if (bidx == 0) return;
1453     // merge a bin to its parent if the bin is too small
1454     for (l = idx->n_lvls; l > 0; --l) {
1455         unsigned start = hts_bin_first(l);
1456         for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1457             bins_t *p, *q;
1458             if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1459             p = &kh_value(bidx, k);
1460             if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1461             if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1462                 khint_t kp;
1463                 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1464                 if (kp == kh_end(bidx)) continue;
1465                 q = &kh_val(bidx, kp);
1466                 if (q->n + p->n > q->m) {
1467                     q->m = q->n + p->n;
1468                     kroundup32(q->m);
1469                     q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
1470                 }
1471                 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1472                 q->n += p->n;
1473                 free(p->list);
1474                 kh_del(bin, bidx, k);
1475             }
1476         }
1477     }
1478     k = kh_get(bin, bidx, 0);
1479     if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1480     // merge adjacent chunks that start from the same BGZF block
1481     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1482         bins_t *p;
1483         if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1484         p = &kh_value(bidx, k);
1485         for (l = 1, m = 0; l < p->n; ++l) {
1486             if (p->list[m].v>>16 >= p->list[l].u>>16) {
1487                 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1488             } else p->list[++m] = p->list[l];
1489         }
1490         p->n = m + 1;
1491     }
1492 }
1493 
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1494 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1495 {
1496     int i;
1497     if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
1498     if (idx->z.save_tid >= 0) {
1499         insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1500         insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1501         insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1502     }
1503     for (i = 0; i < idx->n; ++i) {
1504         update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1505         compress_binning(idx, i);
1506     }
1507     idx->z.finished = 1;
1508 }
1509 
hts_idx_push(hts_idx_t * idx,int tid,int beg,int end,uint64_t offset,int is_mapped)1510 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
1511 {
1512     int bin;
1513     int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1514     if (tid<0) beg = -1, end = 0;
1515     if (tid >= 0 && (beg > maxpos || end > maxpos)) {
1516         goto pos_too_big;
1517     }
1518     if (tid >= idx->m) { // enlarge the index
1519         uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1520         bidx_t **new_bidx;
1521         lidx_t *new_lidx;
1522 
1523         new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1524         if (!new_bidx) return -1;
1525         idx->bidx = new_bidx;
1526 
1527         new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1528         if (!new_lidx) return -1;
1529         idx->lidx = new_lidx;
1530 
1531         memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1532         memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1533         idx->m = new_m;
1534     }
1535     if (idx->n < tid + 1) idx->n = tid + 1;
1536     if (idx->z.finished) return 0;
1537     if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1538         if ( tid>=0 && idx->n_no_coor )
1539         {
1540             hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
1541             return -1;
1542         }
1543         if (tid>=0 && idx->bidx[tid] != 0)
1544         {
1545             hts_log_error("Chromosome blocks not continuous");
1546             return -1;
1547         }
1548         idx->z.last_tid = tid;
1549         idx->z.last_bin = 0xffffffffu;
1550     } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1551         hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1);
1552         return -1;
1553     }
1554     if ( tid>=0 )
1555     {
1556         if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1557         if (is_mapped) {
1558             // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1559             if (beg < 0)  beg = 0;
1560             if (end <= 0) end = 1;
1561             // idx->z.last_off points to the start of the current record
1562             if (insert_to_l(&idx->lidx[tid], beg, end,
1563                             idx->z.last_off, idx->min_shift) < 0) return -1;
1564         }
1565     }
1566     else idx->n_no_coor++;
1567     bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1568     if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1569         if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1570             if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1571                             idx->z.save_off, idx->z.last_off) < 0) return -1;
1572         }
1573         if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1574             idx->z.off_end = idx->z.last_off;
1575             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1576                             idx->z.off_beg, idx->z.off_end) < 0) return -1;
1577             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1578                             idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1579             idx->z.n_mapped = idx->z.n_unmapped = 0;
1580             idx->z.off_beg = idx->z.off_end;
1581         }
1582         idx->z.save_off = idx->z.last_off;
1583         idx->z.save_bin = idx->z.last_bin = bin;
1584         idx->z.save_tid = tid;
1585     }
1586     if (is_mapped) ++idx->z.n_mapped;
1587     else ++idx->z.n_unmapped;
1588     idx->z.last_off = offset;
1589     idx->z.last_coor = beg;
1590     return 0;
1591 
1592  pos_too_big: {
1593         int64_t max = end > beg ? end : beg, s = 1 << 14;
1594         int n_lvls = 0;
1595         while (max > s) {
1596             n_lvls++;
1597             s <<= 3;
1598         }
1599 
1600         if (idx->fmt == HTS_FMT_CSI) {
1601             hts_log_error("Region %d..%d cannot be stored in a csi index "
1602                 "with min_shift = %d, n_lvls = %d. Try using "
1603                 "min_shift = 14, n_lvls >= %d",
1604                 beg, end,
1605                 idx->min_shift, idx->n_lvls,
1606                 n_lvls);
1607         } else {
1608             hts_log_error("Region %d..%d cannot be stored in a %s index. "
1609                 "Try using a csi index with min_shift = 14, "
1610                 "n_lvls >= %d",
1611                 beg, end, idx_format_name(idx->fmt),
1612                 n_lvls);
1613         }
1614         errno = ERANGE;
1615         return -1;
1616     }
1617 }
1618 
hts_idx_destroy(hts_idx_t * idx)1619 void hts_idx_destroy(hts_idx_t *idx)
1620 {
1621     khint_t k;
1622     int i;
1623     if (idx == 0) return;
1624 
1625     // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1626     if (idx->fmt == HTS_FMT_CRAI) {
1627         hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
1628         cram_index_free(cidx->cram);
1629         free(cidx);
1630         return;
1631     }
1632 
1633     for (i = 0; i < idx->m; ++i) {
1634         bidx_t *bidx = idx->bidx[i];
1635         free(idx->lidx[i].offset);
1636         if (bidx == 0) continue;
1637         for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1638             if (kh_exist(bidx, k))
1639                 free(kh_value(bidx, k).list);
1640         kh_destroy(bin, bidx);
1641     }
1642     free(idx->bidx); free(idx->lidx); free(idx->meta);
1643     free(idx);
1644 }
1645 
1646 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1647 // TODO Determine endianness at configure- or compile-time
1648 
idx_write_int32(BGZF * fp,int32_t x)1649 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
1650 {
1651     if (ed_is_big()) x = ed_swap_4(x);
1652     return bgzf_write(fp, &x, sizeof x);
1653 }
1654 
idx_write_uint32(BGZF * fp,uint32_t x)1655 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
1656 {
1657     if (ed_is_big()) x = ed_swap_4(x);
1658     return bgzf_write(fp, &x, sizeof x);
1659 }
1660 
idx_write_uint64(BGZF * fp,uint64_t x)1661 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
1662 {
1663     if (ed_is_big()) x = ed_swap_8(x);
1664     return bgzf_write(fp, &x, sizeof x);
1665 }
1666 
swap_bins(bins_t * p)1667 static inline void swap_bins(bins_t *p)
1668 {
1669     int i;
1670     for (i = 0; i < p->n; ++i) {
1671         ed_swap_8p(&p->list[i].u);
1672         ed_swap_8p(&p->list[i].v);
1673     }
1674 }
1675 
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)1676 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
1677 {
1678     int32_t i, j;
1679 
1680     #define check(ret) if ((ret) < 0) return -1
1681 
1682     check(idx_write_int32(fp, idx->n));
1683     if (fmt == HTS_FMT_TBI && idx->l_meta)
1684         check(bgzf_write(fp, idx->meta, idx->l_meta));
1685 
1686     for (i = 0; i < idx->n; ++i) {
1687         khint_t k;
1688         bidx_t *bidx = idx->bidx[i];
1689         lidx_t *lidx = &idx->lidx[i];
1690         // write binning index
1691         check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
1692         if (bidx)
1693             for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1694                 if (kh_exist(bidx, k)) {
1695                     bins_t *p = &kh_value(bidx, k);
1696                     check(idx_write_uint32(fp, kh_key(bidx, k)));
1697                     if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
1698                     //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1699                     check(idx_write_int32(fp, p->n));
1700                     for (j = 0; j < p->n; ++j) {
1701                         check(idx_write_uint64(fp, p->list[j].u));
1702                         check(idx_write_uint64(fp, p->list[j].v));
1703                     }
1704                 }
1705 
1706         // write linear index
1707         if (fmt != HTS_FMT_CSI) {
1708             check(idx_write_int32(fp, lidx->n));
1709             for (j = 0; j < lidx->n; ++j)
1710                 check(idx_write_uint64(fp, lidx->offset[j]));
1711         }
1712     }
1713 
1714     check(idx_write_uint64(fp, idx->n_no_coor));
1715     return 0;
1716     #undef check
1717 }
1718 
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)1719 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1720 {
1721     int ret, save;
1722     char *fnidx = (char*)calloc(1, strlen(fn) + 5);
1723     if (fnidx == NULL) return -1;
1724 
1725     strcpy(fnidx, fn);
1726     switch (fmt) {
1727     case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
1728     case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
1729     case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
1730     default: abort();
1731     }
1732 
1733     ret = hts_idx_save_as(idx, fn, fnidx, fmt);
1734     save = errno;
1735     free(fnidx);
1736     errno = save;
1737     return ret;
1738 }
1739 
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)1740 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
1741 {
1742     BGZF *fp;
1743 
1744     #define check(ret) if ((ret) < 0) goto fail
1745 
1746     if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
1747 
1748     fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
1749     if (fp == NULL) return -1;
1750 
1751     if (fmt == HTS_FMT_CSI) {
1752         check(bgzf_write(fp, "CSI\1", 4));
1753         check(idx_write_int32(fp, idx->min_shift));
1754         check(idx_write_int32(fp, idx->n_lvls));
1755         check(idx_write_uint32(fp, idx->l_meta));
1756         if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
1757     } else if (fmt == HTS_FMT_TBI) {
1758         check(bgzf_write(fp, "TBI\1", 4));
1759     } else if (fmt == HTS_FMT_BAI) {
1760         check(bgzf_write(fp, "BAI\1", 4));
1761     } else abort();
1762 
1763     check(hts_idx_save_core(idx, fp, fmt));
1764 
1765     return bgzf_close(fp);
1766     #undef check
1767 
1768 fail:
1769     bgzf_close(fp);
1770     return -1;
1771 }
1772 
hts_idx_load_core(hts_idx_t * idx,BGZF * fp,int fmt)1773 static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
1774 {
1775     int32_t i, n, is_be;
1776     is_be = ed_is_big();
1777     if (idx == NULL) return -4;
1778     for (i = 0; i < idx->n; ++i) {
1779         bidx_t *h;
1780         lidx_t *l = &idx->lidx[i];
1781         uint32_t key;
1782         int j, absent;
1783         bins_t *p;
1784         h = idx->bidx[i] = kh_init(bin);
1785         if (bgzf_read(fp, &n, 4) != 4) return -1;
1786         if (is_be) ed_swap_4p(&n);
1787         for (j = 0; j < n; ++j) {
1788             khint_t k;
1789             if (bgzf_read(fp, &key, 4) != 4) return -1;
1790             if (is_be) ed_swap_4p(&key);
1791             k = kh_put(bin, h, key, &absent);
1792             if (absent <= 0) return -3; // Duplicate bin number
1793             p = &kh_val(h, k);
1794             if (fmt == HTS_FMT_CSI) {
1795                 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
1796                 if (is_be) ed_swap_8p(&p->loff);
1797             } else p->loff = 0;
1798             if (bgzf_read(fp, &p->n, 4) != 4) return -1;
1799             if (is_be) ed_swap_4p(&p->n);
1800             p->m = p->n;
1801             p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1802             if (p->list == NULL) return -2;
1803             if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1;
1804             if (is_be) swap_bins(p);
1805         }
1806         if (fmt != HTS_FMT_CSI) { // load linear index
1807             int j;
1808             if (bgzf_read(fp, &l->n, 4) != 4) return -1;
1809             if (is_be) ed_swap_4p(&l->n);
1810             l->m = l->n;
1811             l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1812             if (l->offset == NULL) return -2;
1813             if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
1814             if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1815             for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1816                 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1817             update_loff(idx, i, 1);
1818         }
1819     }
1820     if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1821     if (is_be) ed_swap_8p(&idx->n_no_coor);
1822     return 0;
1823 }
1824 
hts_idx_load_local(const char * fn)1825 static hts_idx_t *hts_idx_load_local(const char *fn)
1826 {
1827     uint8_t magic[4];
1828     int i, is_be;
1829     hts_idx_t *idx = NULL;
1830     uint8_t *meta = NULL;
1831     BGZF *fp = bgzf_open(fn, "r");
1832     if (fp == NULL) return NULL;
1833     is_be = ed_is_big();
1834     if (bgzf_read(fp, magic, 4) != 4) goto fail;
1835 
1836     if (memcmp(magic, "CSI\1", 4) == 0) {
1837         uint32_t x[3], n;
1838         if (bgzf_read(fp, x, 12) != 12) goto fail;
1839         if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1840         if (x[2]) {
1841             if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
1842             if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
1843             if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
1844             // Prevent possible strlen past the end in tbx_index_load2
1845             meta[x[2]] = '\0';
1846         }
1847         if (bgzf_read(fp, &n, 4) != 4) goto fail;
1848         if (is_be) ed_swap_4p(&n);
1849         if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
1850         idx->l_meta = x[2];
1851         idx->meta = meta;
1852         meta = NULL;
1853         if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
1854     }
1855     else if (memcmp(magic, "TBI\1", 4) == 0) {
1856         uint8_t x[8 * 4];
1857         uint32_t n;
1858         // Read file header
1859         if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
1860         n = le_to_u32(&x[0]); // location of n_ref
1861         if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
1862         n = le_to_u32(&x[7*4]); // location of l_nm
1863         if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
1864         idx->l_meta = 28 + n;
1865         if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
1866         // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1867         // N.B. left in little-endian byte order.
1868         memcpy(idx->meta, &x[1*4], 28);
1869         // Read in sequence names.
1870         if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
1871         // Prevent possible strlen past the end in tbx_index_load2
1872         idx->meta[idx->l_meta] = '\0';
1873         if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
1874     }
1875     else if (memcmp(magic, "BAI\1", 4) == 0) {
1876         uint32_t n;
1877         if (bgzf_read(fp, &n, 4) != 4) goto fail;
1878         if (is_be) ed_swap_4p(&n);
1879         idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5);
1880         if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
1881     }
1882     else { errno = EINVAL; goto fail; }
1883 
1884     bgzf_close(fp);
1885     return idx;
1886 
1887 fail:
1888     bgzf_close(fp);
1889     hts_idx_destroy(idx);
1890     free(meta);
1891     return NULL;
1892 }
1893 
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)1894 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
1895                       int is_copy)
1896 {
1897     uint8_t *new_meta = meta;
1898     if (is_copy) {
1899         size_t l = l_meta;
1900         if (l > SIZE_MAX - 1) {
1901             errno = ENOMEM;
1902             return -1;
1903         }
1904         new_meta = malloc(l + 1);
1905         if (!new_meta) return -1;
1906         memcpy(new_meta, meta, l);
1907         // Prevent possible strlen past the end in tbx_index_load2
1908         meta[l + 1] = '\0';
1909     }
1910     if (idx->meta) free(idx->meta);
1911     idx->l_meta = l_meta;
1912     idx->meta = new_meta;
1913     return 0;
1914 }
1915 
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)1916 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
1917 {
1918     *l_meta = idx->l_meta;
1919     return idx->meta;
1920 }
1921 
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)1922 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1923 {
1924     if ( !idx->n )
1925     {
1926         *n = 0;
1927         return NULL;
1928     }
1929 
1930     int tid = 0, i;
1931     const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1932     for (i=0; i<idx->n; i++)
1933     {
1934         bidx_t *bidx = idx->bidx[i];
1935         if ( !bidx ) continue;
1936         names[tid++] = getid(hdr,i);
1937     }
1938     *n = tid;
1939     return names;
1940 }
1941 
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)1942 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1943 {
1944     if ( idx->fmt == HTS_FMT_CRAI ) {
1945         *mapped = 0; *unmapped = 0;
1946         return -1;
1947     }
1948 
1949     bidx_t *h = idx->bidx[tid];
1950     khint_t k = kh_get(bin, h, META_BIN(idx));
1951     if (k != kh_end(h)) {
1952         *mapped = kh_val(h, k).list[1].u;
1953         *unmapped = kh_val(h, k).list[1].v;
1954         return 0;
1955     } else {
1956         *mapped = 0; *unmapped = 0;
1957         return -1;
1958     }
1959 }
1960 
hts_idx_get_n_no_coor(const hts_idx_t * idx)1961 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1962 {
1963     return idx->n_no_coor;
1964 }
1965 
1966 /****************
1967  *** Iterator ***
1968  ****************/
1969 
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)1970 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1971 {
1972     int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1973     if (beg >= end) return 0;
1974     if (end >= 1LL<<s) end = 1LL<<s;
1975     for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1976         int b, e, n, i;
1977         b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1978         if (itr->bins.n + n > itr->bins.m) {
1979             itr->bins.m = itr->bins.n + n;
1980             kroundup32(itr->bins.m);
1981             itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1982         }
1983         for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1984     }
1985     return itr->bins.n;
1986 }
1987 
reg2intervals(hts_itr_multi_t * iter,const hts_idx_t * idx,int tid,int64_t beg,int64_t end,uint64_t min_off,uint64_t max_off,int min_shift,int n_lvls)1988 static inline int reg2intervals(hts_itr_multi_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
1989 {
1990     int l, t, s;
1991     int b, e, i, j;
1992     hts_pair64_max_t *off;
1993     bidx_t *bidx;
1994     khint_t k;
1995 
1996     if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
1997         return -1;
1998 
1999     s = min_shift + (n_lvls<<1) + n_lvls;
2000     if (end >= 1LL<<s)
2001         end = 1LL<<s;
2002 
2003     for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2004         b = t + (beg>>s); e = t + (end>>s);
2005 
2006         for (i = b; i <= e; ++i) {
2007             if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) {
2008                 bins_t *p = &kh_value(bidx, k);
2009 
2010                 if (p->n) {
2011                     off = (hts_pair64_max_t*)realloc(iter->off, (iter->n_off + p->n) * sizeof(hts_pair64_max_t));
2012                     if (!off)
2013                         return -2;
2014 
2015                     iter->off = off;
2016                     for (j = 0; j < p->n; ++j) {
2017                         if (p->list[j].v > min_off && p->list[j].u < max_off) {
2018                             iter->off[iter->n_off].u = p->list[j].u;
2019                             iter->off[iter->n_off].v = p->list[j].v;
2020                             iter->off[iter->n_off].max = ((uint64_t)tid<<32) | (end+1);
2021                             iter->n_off++;
2022                         }
2023                     }
2024                 }
2025             }
2026         }
2027     }
2028 
2029     return iter->n_off;
2030 }
2031 
compare_regions(const void * r1,const void * r2)2032 static int compare_regions(const void *r1, const void *r2) {
2033     hts_reglist_t *reg1 = (hts_reglist_t *)r1;
2034     hts_reglist_t *reg2 = (hts_reglist_t *)r2;
2035 
2036     if (reg1->tid < 0 && reg2->tid >= 0)
2037         return 1;
2038     else if (reg1->tid >= 0 && reg2->tid < 0)
2039         return -1;
2040     else
2041         return reg1->tid - reg2->tid;
2042 }
2043 
hts_itr_off(const hts_idx_t * idx,int tid)2044 uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
2045 
2046     int i;
2047     bidx_t* bidx;
2048     uint64_t off0 = (uint64_t) -1;
2049     khint_t k;
2050     switch (tid) {
2051     case HTS_IDX_START:
2052         // Find the smallest offset, note that sequence ids may not be ordered sequentially
2053         for (i = 0; i < idx->n; i++) {
2054             bidx = idx->bidx[i];
2055             k = kh_get(bin, bidx, META_BIN(idx));
2056             if (k == kh_end(bidx))
2057                 continue;
2058 
2059             if (off0 > kh_val(bidx, k).list[0].u)
2060                 off0 = kh_val(bidx, k).list[0].u;
2061         }
2062         if (off0 == (uint64_t) -1 && idx->n_no_coor)
2063             off0 = 0;
2064         // only no-coor reads in this bam
2065         break;
2066     case HTS_IDX_NOCOOR:
2067         /* No-coor reads sort after all of the mapped reads.  The position
2068            is not stored in the index itself, so need to find the end
2069            offset for the last mapped read.  A loop is needed here in
2070            case references at the end of the file have no mapped reads,
2071            or sequence ids are not ordered sequentially.
2072            See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
2073         for (i = 0; i < idx->n; i++) {
2074             bidx = idx->bidx[i];
2075             k = kh_get(bin, bidx, META_BIN(idx));
2076             if (k != kh_end(bidx)) {
2077                 if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
2078                     off0 = kh_val(bidx, k).list[0].v;
2079                 }
2080             }
2081         }
2082         if (off0 == (uint64_t) -1 && idx->n_no_coor)
2083             off0 = 0;
2084         // only no-coor reads in this bam
2085         break;
2086     case HTS_IDX_REST:
2087         off0 = 0;
2088         break;
2089     case HTS_IDX_NONE:
2090         off0 = 0;
2091         break;
2092     }
2093 
2094     return off0;
2095 }
2096 
hts_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)2097 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
2098 {
2099     int i, n_off, l, bin;
2100     hts_pair64_t *off;
2101     khint_t k;
2102     bidx_t *bidx;
2103     uint64_t min_off, max_off;
2104     hts_itr_t *iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2105     if (iter) {
2106         if (tid < 0) {
2107             uint64_t off = hts_itr_off(idx, tid);
2108             if (off != (uint64_t) -1) {
2109                 iter->read_rest = 1;
2110                 iter->curr_off = off;
2111                 iter->readrec = readrec;
2112                 if (tid == HTS_IDX_NONE)
2113                     iter->finished = 1;
2114             } else {
2115                 free(iter);
2116                 iter = NULL;
2117             }
2118         } else {
2119             if (beg < 0) beg = 0;
2120             if (end < beg) return 0;
2121             if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
2122 
2123             iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
2124             iter->readrec = readrec;
2125 
2126             if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2127 
2128             // compute min_off
2129             bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2130             do {
2131                 int first;
2132                 k = kh_get(bin, bidx, bin);
2133                 if (k != kh_end(bidx)) break;
2134                 first = (hts_bin_parent(bin)<<3) + 1;
2135                 if (bin > first) --bin;
2136                 else bin = hts_bin_parent(bin);
2137             } while (bin);
2138             if (bin == 0) k = kh_get(bin, bidx, bin);
2139             min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2140 
2141             // compute max_off: a virtual offset from a bin to the right of end
2142             bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2143             if (bin >= idx->n_bins) bin = 0;
2144             while (1) {
2145                 // search for an extant bin by moving right, but moving up to the
2146                 // parent whenever we get to a first child (which also covers falling
2147                 // off the RHS, which wraps around and immediately goes up to bin 0)
2148                 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2149                 if (bin == 0) { max_off = (uint64_t)-1; break; }
2150                 k = kh_get(bin, bidx, bin);
2151                 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2152                 bin++;
2153             }
2154 
2155             // retrieve bins
2156             reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2157 
2158             for (i = n_off = 0; i < iter->bins.n; ++i)
2159                 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2160                     n_off += kh_value(bidx, k).n;
2161             if (n_off == 0) {
2162                 // No overlapping bins means the iterator has already finished.
2163                 iter->finished = 1;
2164                 return iter;
2165             }
2166             off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
2167             for (i = n_off = 0; i < iter->bins.n; ++i) {
2168                 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2169                     int j;
2170                     bins_t *p = &kh_value(bidx, k);
2171                     for (j = 0; j < p->n; ++j)
2172                         if (p->list[j].v > min_off && p->list[j].u < max_off)
2173                             off[n_off++] = p->list[j];
2174                 }
2175             }
2176 
2177             if (n_off == 0) {
2178                 free(off);
2179                 iter->finished = 1;
2180                 return iter;
2181             }
2182             ks_introsort(_off, n_off, off);
2183             // resolve completely contained adjacent blocks
2184             for (i = 1, l = 0; i < n_off; ++i)
2185                 if (off[l].v < off[i].v) off[++l] = off[i];
2186             n_off = l + 1;
2187             // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2188             for (i = 1; i < n_off; ++i)
2189                 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2190             // merge adjacent blocks
2191             for (i = 1, l = 0; i < n_off; ++i) {
2192                 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2193                 else off[++l] = off[i];
2194             }
2195             n_off = l + 1;
2196             iter->n_off = n_off; iter->off = off;
2197         }
2198     }
2199     return iter;
2200 }
2201 
hts_itr_multi_bam(const hts_idx_t * idx,hts_itr_multi_t * iter)2202 hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter)
2203 {
2204     int i, j, l, n_off = 0, bin;
2205     hts_pair64_max_t *off = NULL;
2206     khint_t k;
2207     bidx_t *bidx;
2208     uint64_t min_off, max_off, t_off = (uint64_t)-1;
2209     int tid, beg, end;
2210     hts_reglist_t *curr_reg;
2211 
2212     if (iter) {
2213         iter->i = -1;
2214         for (i=0; i<iter->n_reg; i++) {
2215 
2216             curr_reg = &iter->reg_list[i];
2217             tid = curr_reg->tid;
2218 
2219             if (tid < 0) {
2220                 t_off = hts_itr_off(idx, tid);
2221                 if (t_off != (uint64_t)-1) {
2222                     switch (tid) {
2223                         case HTS_IDX_NONE:
2224                             iter->finished = 1;
2225                         case HTS_IDX_START:
2226                         case HTS_IDX_REST:
2227                             iter->curr_off = t_off;
2228                             iter->n_reg = 0;
2229                             iter->reg_list = NULL;
2230                             iter->read_rest = 1;
2231                             return iter;
2232                         case HTS_IDX_NOCOOR:
2233                             iter->nocoor = 1;
2234                             iter->nocoor_off = t_off;
2235                     }
2236                 }
2237             } else {
2238                 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
2239                     continue;
2240 
2241                 for(j=0; j<curr_reg->count; j++) {
2242                     hts_pair32_t *curr_intv = &curr_reg->intervals[j];
2243                     if (curr_intv->end < curr_intv->beg)
2244                         continue;
2245 
2246                     beg = curr_intv->beg;
2247                     end = curr_intv->end;
2248 
2249                     /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
2250                        If the computed bin is not in the index, try the next bin to the
2251                        left, belonging to the same parent. If it is the first sibling bin,
2252                        try the parent bin. */
2253                     bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2254                     do {
2255                         int first;
2256                         k = kh_get(bin, bidx, bin);
2257                         if (k != kh_end(bidx)) break;
2258                         first = (hts_bin_parent(bin)<<3) + 1;
2259                         if (bin > first) --bin;
2260                         else bin = hts_bin_parent(bin);
2261                     } while (bin);
2262                     if (bin == 0)
2263                         k = kh_get(bin, bidx, bin);
2264                     min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2265 
2266                     // compute max_off: a virtual offset from a bin to the right of end
2267                     bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2268                     if (bin >= idx->n_bins) bin = 0;
2269                     while (1) {
2270                     // search for an extant bin by moving right, but moving up to the
2271                     // parent whenever we get to a first child (which also covers falling
2272                     // off the RHS, which wraps around and immediately goes up to bin 0)
2273                         while (bin % 8 == 1) bin = hts_bin_parent(bin);
2274                         if (bin == 0) { max_off = (uint64_t)-1; break; }
2275                         k = kh_get(bin, bidx, bin);
2276                         if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
2277                             max_off = kh_val(bidx, k).list[0].u;
2278                             break;
2279                         }
2280                         bin++;
2281                     }
2282 
2283                     //convert coordinates to file offsets
2284                     reg2intervals(iter, idx, tid, beg, end, min_off, max_off, idx->min_shift, idx->n_lvls);
2285                 }
2286             }
2287         }
2288 
2289         off = iter->off;
2290         n_off = iter->n_off;
2291 
2292         if (n_off) {
2293             ks_introsort(_off_max, n_off, off);
2294             // resolve completely contained adjacent blocks
2295             for (i = 1, l = 0; i < n_off; ++i) {
2296                 if (off[l].v < off[i].v) {
2297                     off[++l] = off[i];
2298                 } else {
2299                     off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2300                 }
2301             }
2302             n_off = l + 1;
2303             // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2304             for (i = 1; i < n_off; ++i)
2305                 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2306             // merge adjacent blocks
2307             for (i = 1, l = 0; i < n_off; ++i) {
2308                 if (off[l].v>>16 == off[i].u>>16) {
2309                     off[l].v = off[i].v;
2310                     off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2311                 } else off[++l] = off[i];
2312             }
2313             n_off = l + 1;
2314             iter->n_off = n_off; iter->off = off;
2315         }
2316 
2317         if(!n_off && !iter->nocoor)
2318             iter->finished = 1;
2319     }
2320     return iter;
2321 }
2322 
hts_itr_multi_cram(const hts_idx_t * idx,hts_itr_multi_t * iter)2323 hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter)
2324 {
2325     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
2326     int tid, beg, end, i, j, l, n_off = 0;
2327     hts_reglist_t *curr_reg;
2328     hts_pair32_t *curr_intv;
2329     hts_pair64_max_t *off = NULL;
2330     cram_index *e = NULL;
2331 
2332     if (!cidx || !iter)
2333         return NULL;
2334 
2335     iter->is_cram = 1;
2336     iter->read_rest = 0;
2337     iter->off = NULL;
2338     iter->n_off = 0;
2339     iter->curr_off = 0;
2340     iter->i = -1;
2341 
2342     for (i=0; i<iter->n_reg; i++) {
2343 
2344         curr_reg = &iter->reg_list[i];
2345         tid = curr_reg->tid;
2346 
2347         if (tid >= 0) {
2348             off = (hts_pair64_max_t*)realloc(off, (n_off + curr_reg->count) * sizeof(hts_pair64_max_t));
2349             if (!off)
2350                 return NULL;
2351 
2352             for (j=0; j < curr_reg->count; j++) {
2353                 curr_intv = &curr_reg->intervals[j];
2354                 if (curr_intv->end < curr_intv->beg)
2355                     continue;
2356 
2357                 beg = curr_intv->beg;
2358                 end = curr_intv->end;
2359 
2360 /* First, fetch the container overlapping 'beg' and assign its file offset to u, then
2361  * find the container overlapping 'end' and assing the relative end of the slice to v.
2362  * The cram_ptell function will adjust with the container offset, which is not stored
2363  * in the index.
2364  */
2365                 e = cram_index_query(cidx->cram, tid, beg+1, NULL);
2366                 if (e) {
2367                     off[n_off].u = e->offset;
2368 
2369                     if (end == INT_MAX) {
2370                        e = cram_index_last(cidx->cram, tid, NULL);
2371                     } else {
2372                        e = cram_index_query(cidx->cram, tid, end+1, NULL);
2373                     }
2374 
2375                     if (e) {
2376                         off[n_off].v = e->offset + e->slice + e->len;
2377                         off[n_off].max = (uint64_t)tid<<32 | end;
2378                         n_off++;
2379                     } else {
2380                         hts_log_warning("Could not set offset end for region %d(%s):%d-%d. Skipping", tid, curr_reg->reg, beg, end);
2381                     }
2382                 } else {
2383                     hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end);
2384                 }
2385             }
2386         } else {
2387             switch (tid) {
2388                 case HTS_IDX_NOCOOR:
2389                     e = cram_index_query(cidx->cram, tid, 1, NULL);
2390                     if (e) {
2391                         iter->nocoor = 1;
2392                         iter->nocoor_off = e->offset;
2393                     } else {
2394                         hts_log_warning("No index entry for NOCOOR region");
2395                     }
2396                     break;
2397                 case HTS_IDX_START:
2398                     e = cram_index_query(cidx->cram, tid, 1, NULL);
2399                     if (e) {
2400                         iter->read_rest = 1;
2401                         off = (hts_pair64_max_t*)realloc(off, sizeof(hts_pair64_max_t));
2402                         off[0].u = e->offset;
2403                         off[0].v = 0;
2404                         off[0].max = 0;
2405                         n_off=1;
2406                     } else {
2407                         hts_log_warning("No index entries");
2408                     }
2409                     break;
2410                 case HTS_IDX_REST:
2411                     break;
2412                 case HTS_IDX_NONE:
2413                     iter->finished = 1;
2414                     break;
2415                 default:
2416                     hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
2417             }
2418         }
2419     }
2420 
2421     if (n_off) {
2422         ks_introsort(_off_max, n_off, off);
2423         // resolve completely contained adjacent blocks
2424         for (i = 1, l = 0; i < n_off; ++i) {
2425             if (off[l].v < off[i].v) {
2426                 off[++l] = off[i];
2427             } else {
2428                 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2429             }
2430         }
2431         n_off = l + 1;
2432         // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2433         for (i = 1; i < n_off; ++i)
2434             if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2435         // merge adjacent blocks
2436         for (i = 1, l = 0; i < n_off; ++i) {
2437             if (off[l].v>>16 == off[i].u>>16) {
2438                 off[l].v = off[i].v;
2439                 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2440             } else off[++l] = off[i];
2441         }
2442         n_off = l + 1;
2443         iter->n_off = n_off; iter->off = off;
2444     }
2445 
2446     if(!n_off && !iter->nocoor)
2447         iter->finished = 1;
2448 
2449     return iter;
2450 }
2451 
hts_itr_destroy(hts_itr_t * iter)2452 void hts_itr_destroy(hts_itr_t *iter)
2453 {
2454     if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
2455 }
2456 
hts_reglist_free(hts_reglist_t * reglist,int count)2457 void hts_reglist_free(hts_reglist_t *reglist, int count) {
2458 
2459     int i;
2460     if(reglist) {
2461         for (i=0;i<count;i++) {
2462             if (reglist[i].intervals)
2463                 free(reglist[i].intervals);
2464         }
2465         free(reglist);
2466     }
2467 }
2468 
hts_itr_multi_destroy(hts_itr_multi_t * iter)2469 void hts_itr_multi_destroy(hts_itr_multi_t *iter) {
2470 
2471     if (iter) {
2472         if (iter->reg_list && iter->n_reg)
2473             hts_reglist_free(iter->reg_list, iter->n_reg);
2474 
2475         if (iter->off && iter->n_off)
2476             free(iter->off);
2477         free(iter);
2478     }
2479 }
2480 
push_digit(long long i,char c)2481 static inline long long push_digit(long long i, char c)
2482 {
2483     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2484     int digit = c - '0';
2485     return 10 * i + digit;
2486 }
2487 
hts_parse_decimal(const char * str,char ** strend,int flags)2488 long long hts_parse_decimal(const char *str, char **strend, int flags)
2489 {
2490     long long n = 0;
2491     int decimals = 0, e = 0, lost = 0;
2492     char sign = '+', esign = '+';
2493     const char *s;
2494 
2495     while (isspace_c(*str)) str++;
2496     s = str;
2497 
2498     if (*s == '+' || *s == '-') sign = *s++;
2499     while (*s)
2500         if (isdigit_c(*s)) n = push_digit(n, *s++);
2501         else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
2502         else break;
2503 
2504     if (*s == '.') {
2505         s++;
2506         while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
2507     }
2508 
2509     if (*s == 'E' || *s == 'e') {
2510         s++;
2511         if (*s == '+' || *s == '-') esign = *s++;
2512         while (isdigit_c(*s)) e = push_digit(e, *s++);
2513         if (esign == '-') e = -e;
2514     }
2515 
2516     e -= decimals;
2517     while (e > 0) n *= 10, e--;
2518     while (e < 0) lost += n % 10, n /= 10, e++;
2519 
2520     if (lost > 0) {
2521         hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
2522     }
2523 
2524     if (strend) {
2525         *strend = (char *)s;
2526     } else if (*s) {
2527         hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
2528     }
2529 
2530     return (sign == '+')? n : -n;
2531 }
2532 
hts_parse_reg(const char * s,int * beg,int * end)2533 const char *hts_parse_reg(const char *s, int *beg, int *end)
2534 {
2535     char *hyphen;
2536     const char *colon = strrchr(s, ':');
2537     if (colon == NULL) {
2538         *beg = 0; *end = INT_MAX;
2539         return s + strlen(s);
2540     }
2541 
2542     *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2543     if (*beg < 0) *beg = 0;
2544 
2545     if (*hyphen == '\0') *end = INT_MAX;
2546     else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2547     else return NULL;
2548 
2549     if (*beg >= *end) return NULL;
2550     return colon;
2551 }
2552 
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)2553 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
2554 {
2555     int tid, beg, end;
2556     const char *q;
2557 
2558     if (strcmp(reg, ".") == 0)
2559         return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
2560     else if (strcmp(reg, "*") == 0)
2561         return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
2562 
2563     q = hts_parse_reg(reg, &beg, &end);
2564     if (q) {
2565         char tmp_a[1024], *tmp = tmp_a;
2566         if (q - reg + 1 > 1024)
2567             if (!(tmp = malloc(q - reg + 1)))
2568                 return NULL;
2569         strncpy(tmp, reg, q - reg);
2570         tmp[q - reg] = 0;
2571         tid = getid(hdr, tmp);
2572         if (tmp != tmp_a)
2573             free(tmp);
2574     }
2575     else {
2576         // not parsable as a region, but possibly a sequence named "foo:a"
2577         tid = getid(hdr, reg);
2578         beg = 0; end = INT_MAX;
2579     }
2580 
2581     if (tid < 0) return NULL;
2582     return itr_query(idx, tid, beg, end, readrec);
2583 }
2584 
hts_itr_regions(const hts_idx_t * idx,hts_reglist_t * reglist,int count,hts_name2id_f getid,void * hdr,hts_itr_multi_query_func * itr_specific,hts_readrec_func * readrec,hts_seek_func * seek,hts_tell_func * tell)2585 hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
2586 
2587     int i;
2588 
2589     if (!reglist)
2590         return NULL;
2591 
2592     hts_itr_multi_t *itr = (hts_itr_multi_t*)calloc(1, sizeof(hts_itr_multi_t));
2593     if (itr) {
2594         itr->n_reg = count;
2595         itr->readrec = readrec;
2596         itr->seek = seek;
2597         itr->tell = tell;
2598         itr->reg_list = reglist;
2599         itr->finished = 0;
2600         itr->nocoor = 0;
2601 
2602 
2603         for (i = 0; i < itr->n_reg; i++) {
2604             if (!strcmp(itr->reg_list[i].reg, ".")) {
2605                 itr->reg_list[i].tid = HTS_IDX_START;
2606                 continue;
2607             }
2608 
2609             if (!strcmp(itr->reg_list[i].reg, "*")) {
2610                 itr->reg_list[i].tid = HTS_IDX_NOCOOR;
2611                 continue;
2612             }
2613 
2614             itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
2615             if (itr->reg_list[i].tid < 0)
2616                 hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
2617         }
2618 
2619         qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
2620         itr_specific(idx, itr);
2621     }
2622     return itr;
2623 }
2624 
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)2625 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
2626 {
2627     int ret, tid, beg, end;
2628     if (iter == NULL || iter->finished) return -1;
2629     if (iter->read_rest) {
2630         if (iter->curr_off) { // seek to the start
2631             if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
2632             iter->curr_off = 0; // only seek once
2633         }
2634         ret = iter->readrec(fp, data, r, &tid, &beg, &end);
2635         if (ret < 0) iter->finished = 1;
2636         iter->curr_tid = tid;
2637         iter->curr_beg = beg;
2638         iter->curr_end = end;
2639         return ret;
2640     }
2641     // A NULL iter->off should always be accompanied by iter->finished.
2642     assert(iter->off != NULL);
2643     for (;;) {
2644         if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2645             if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
2646             if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2647                 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
2648                 iter->curr_off = bgzf_tell(fp);
2649             }
2650             ++iter->i;
2651         }
2652         if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
2653             iter->curr_off = bgzf_tell(fp);
2654             if (tid != iter->tid || beg >= iter->end) { // no need to proceed
2655                 ret = -1; break;
2656             } else if (end > iter->beg && iter->end > beg) {
2657                 iter->curr_tid = tid;
2658                 iter->curr_beg = beg;
2659                 iter->curr_end = end;
2660                 return ret;
2661             }
2662         } else break; // end of file or error
2663     }
2664     iter->finished = 1;
2665     return ret;
2666 }
2667 
hts_itr_multi_next(htsFile * fd,hts_itr_multi_t * iter,void * r)2668 int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r)
2669 {
2670     void *fp;
2671     int ret, tid, beg, end, i, cr, ci;
2672     hts_reglist_t *found_reg;
2673 
2674     if (iter == NULL || iter->finished) return -1;
2675 
2676     if (iter->is_cram) {
2677         fp = fd->fp.cram;
2678     } else {
2679         fp = fd->fp.bgzf;
2680     }
2681 
2682     if (iter->read_rest) {
2683         if (iter->curr_off) { // seek to the start
2684             if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
2685                 return -1;
2686             }
2687             iter->curr_off = 0; // only seek once
2688         }
2689 
2690         ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
2691         if (ret < 0) {
2692             iter->finished = 1;
2693         }
2694 
2695         iter->curr_tid = tid;
2696         iter->curr_beg = beg;
2697         iter->curr_end = end;
2698 
2699         return ret;
2700     }
2701     // A NULL iter->off should always be accompanied by iter->finished.
2702     assert(iter->off != NULL || iter->nocoor != 0);
2703 
2704     for (;;) {
2705         if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2706             if (iter->i == iter->n_off - 1) { // no more chunks, except NOCOORs
2707                if (iter->nocoor) {
2708                    iter->read_rest = 1;
2709                    iter->curr_off = iter->nocoor_off;
2710 
2711                    return hts_itr_multi_next(fd, iter, r);
2712                } else {
2713                    ret = -1; break;
2714                }
2715             }
2716 
2717             if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2718                 if (iter->seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
2719                     return -1;
2720                 }
2721 
2722                 iter->curr_off = iter->tell(fp);
2723             }
2724             ++iter->i;
2725         }
2726 
2727         ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
2728         if (ret < 0)
2729             break;
2730 
2731         iter->curr_off = iter->tell(fp);
2732         if (tid != iter->curr_tid) {
2733             hts_reglist_t key;
2734             key.tid = tid;
2735 
2736             found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list, iter->n_reg, sizeof(hts_reglist_t), compare_regions);
2737             if (!found_reg)
2738                 continue;
2739 
2740             iter->curr_reg = (found_reg - iter->reg_list);
2741             iter->curr_tid = tid;
2742             iter->curr_intv = 0;
2743         }
2744 
2745         cr = iter->curr_reg;
2746         ci = iter->curr_intv;
2747 
2748         if (beg >  iter->off[iter->i].max) {
2749             iter->curr_off = iter->off[iter->i].v;
2750             continue;
2751         }
2752         if (beg >  iter->reg_list[cr].max_end)
2753             continue;
2754 
2755         for (i = ci; i < iter->reg_list[cr].count; i++) {
2756             if (end > iter->reg_list[cr].intervals[i].beg && iter->reg_list[cr].intervals[i].end > beg) {
2757                 iter->curr_beg = beg;
2758                 iter->curr_end = end;
2759                 iter->curr_intv = i;
2760 
2761                 return ret;
2762             }
2763         }
2764     }
2765     iter->finished = 1;
2766 
2767     return ret;
2768 }
2769 
2770 /**********************
2771  *** Retrieve index ***
2772  **********************/
2773 // Returns -1 if index couldn't be opened.
2774 //         -2 on other errors
test_and_fetch(const char * fn,const char ** local_fn)2775 static int test_and_fetch(const char *fn, const char **local_fn)
2776 {
2777     hFILE *remote_hfp;
2778     FILE *local_fp = NULL;
2779     uint8_t *buf = NULL;
2780     int save_errno;
2781 
2782     if (hisremote(fn)) {
2783         const int buf_size = 1 * 1024 * 1024;
2784         int l;
2785         const char *p;
2786         for (p = fn + strlen(fn) - 1; p >= fn; --p)
2787             if (*p == '/') break;
2788         ++p; // p now points to the local file name
2789         // Attempt to open local file first
2790         if ((local_fp = fopen((char*)p, "rb")) != 0)
2791         {
2792             fclose(local_fp);
2793             *local_fn = p;
2794             return 0;
2795         }
2796         // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2797         if ((remote_hfp = hopen(fn, "r")) == 0) return -1;
2798         if ((local_fp = fopen(p, "w")) == 0) {
2799             hts_log_error("Failed to create file %s in the working directory", p);
2800             goto fail;
2801         }
2802         hts_log_info("Downloading file %s to local directory", fn);
2803         buf = (uint8_t*)calloc(buf_size, 1);
2804         if (!buf) {
2805             hts_log_error("%s", strerror(errno));
2806             goto fail;
2807         }
2808         while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
2809             if (fwrite(buf, 1, l, local_fp) != l) {
2810                 hts_log_error("Failed to write data to %s : %s",
2811                               fn, strerror(errno));
2812                 goto fail;
2813             }
2814         }
2815         free(buf);
2816         if (fclose(local_fp) < 0) {
2817             hts_log_error("Error closing %s : %s", fn, strerror(errno));
2818             local_fp = NULL;
2819             goto fail;
2820         }
2821         if (hclose(remote_hfp) != 0) {
2822             hts_log_error("Failed to close remote file %s", fn);
2823         }
2824         *local_fn = p;
2825         return 0;
2826     } else {
2827         hFILE *local_hfp;
2828         if ((local_hfp = hopen(fn, "r")) == 0) return -1;
2829         hclose_abruptly(local_hfp);
2830         *local_fn = fn;
2831         return 0;
2832     }
2833 
2834  fail:
2835     save_errno = errno;
2836     hclose_abruptly(remote_hfp);
2837     if (local_fp) fclose(local_fp);
2838     free(buf);
2839     errno = save_errno;
2840     return -2;
2841 }
2842 
hts_idx_getfn(const char * fn,const char * ext)2843 char *hts_idx_getfn(const char *fn, const char *ext)
2844 {
2845     int i, l_fn, l_ext, ret;
2846     char *fnidx;
2847     const char *local_fn = NULL;
2848     l_fn = strlen(fn); l_ext = strlen(ext);
2849     fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
2850     if (!fnidx) return NULL;
2851     // First try : append `ext` to `fn`
2852     strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
2853     if ((ret = test_and_fetch(fnidx, &local_fn)) == -1) {
2854         // Second try : replace suffix of `fn` with `ext`
2855         for (i = l_fn - 1; i > 0; --i)
2856             if (fnidx[i] == '.' || fnidx[i] == '/') break;
2857         if (fnidx[i] == '.') {
2858             strcpy(fnidx + i, ext);
2859             ret = test_and_fetch(fnidx, &local_fn);
2860         }
2861     }
2862     if (ret < 0) {
2863         free(fnidx);
2864         return NULL;
2865     }
2866     l_fn = strlen(local_fn);
2867     memmove(fnidx, local_fn, l_fn + 1);
2868     return fnidx;
2869 }
2870 
hts_idx_load(const char * fn,int fmt)2871 hts_idx_t *hts_idx_load(const char *fn, int fmt)
2872 {
2873     char *fnidx;
2874     hts_idx_t *idx;
2875     fnidx = hts_idx_getfn(fn, ".csi");
2876     if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
2877     if (fnidx == 0) return 0;
2878 
2879     idx = hts_idx_load2(fn, fnidx);
2880     free(fnidx);
2881     return idx;
2882 }
2883 
hts_idx_load2(const char * fn,const char * fnidx)2884 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
2885 {
2886     // Check that the index file is up to date, the main file might have changed
2887     struct stat stat_idx,stat_main;
2888     if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
2889     {
2890         if ( stat_idx.st_mtime < stat_main.st_mtime )
2891             hts_log_warning("The index file is older than the data file: %s", fnidx);
2892     }
2893 
2894     return hts_idx_load_local(fnidx);
2895 }
2896 
2897 
2898 
2899 /**********************
2900  ***     Memory     ***
2901  **********************/
2902 
2903 /* For use with hts_expand macros *only* */
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)2904 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
2905                           int clear, void **ptr, const char *func) {
2906     /* If new_m and size are both below this limit, multiplying them
2907        together can't overflow */
2908     const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
2909     void *new_ptr;
2910     size_t bytes, new_m;
2911 
2912     new_m = n;
2913     kroundup_size_t(new_m);
2914 
2915     bytes = size * new_m;
2916 
2917     /* Check for overflow.  Both ensure that new_m will fit in m (we make the
2918        pessimistic assumption that m is signed), and that bytes has not
2919        wrapped around. */
2920     if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
2921         || ((size > safe || new_m > safe)
2922             && bytes / new_m != size)) {
2923         errno = ENOMEM;
2924         goto die;
2925     }
2926 
2927     new_ptr = realloc(*ptr, bytes);
2928     if (new_ptr == NULL) goto die;
2929 
2930     if (clear) {
2931         if (new_m > m) {
2932             memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
2933         }
2934     }
2935 
2936     *ptr = new_ptr;
2937 
2938     return new_m;
2939 
2940  die:
2941     hts_log_error("%s", strerror(errno));
2942     exit(1);
2943 }
2944 
hts_set_log_level(enum htsLogLevel level)2945 void hts_set_log_level(enum htsLogLevel level)
2946 {
2947     hts_verbose = level;
2948 }
2949 
hts_get_log_level()2950 enum htsLogLevel hts_get_log_level()
2951 {
2952     return hts_verbose;
2953 }
2954 
get_severity_tag(enum htsLogLevel severity)2955 static char get_severity_tag(enum htsLogLevel severity)
2956 {
2957     switch (severity) {
2958     case HTS_LOG_ERROR:
2959         return 'E';
2960     case HTS_LOG_WARNING:
2961         return 'W';
2962     case HTS_LOG_INFO:
2963         return 'I';
2964     case HTS_LOG_DEBUG:
2965         return 'D';
2966     case HTS_LOG_TRACE:
2967         return 'T';
2968     default:
2969         break;
2970     }
2971 
2972     return '*';
2973 }
2974 
hts_log(enum htsLogLevel severity,const char * context,const char * format,...)2975 void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
2976 {
2977     int save_errno = errno;
2978     if (severity <= hts_verbose) {
2979         va_list argptr;
2980 
2981         fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
2982 
2983         va_start(argptr, format);
2984         vfprintf(stderr, format, argptr);
2985         va_end(argptr);
2986 
2987         fprintf(stderr, "\n");
2988     }
2989     errno = save_errno;
2990 }
2991