1 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2 
3     Copyright (C) 2008, 2009, 2012-2020 Genome Research Ltd.
4     Copyright (C) 2012, 2013 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28 
29 #include <zlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <stdlib.h>
34 #include <unistd.h>
35 #include <inttypes.h>
36 #include <limits.h>
37 #include <stdint.h>
38 #include <fcntl.h>
39 #include <errno.h>
40 #include <time.h>
41 #include <sys/stat.h>
42 #include <assert.h>
43 
44 #include "htslib/hts.h"
45 #include "htslib/bgzf.h"
46 #include "cram/cram.h"
47 #include "htslib/hfile.h"
48 #include "htslib/hts_endian.h"
49 #include "version.h"
50 #include "hts_internal.h"
51 #include "hfile_internal.h"
52 #include "sam_internal.h"
53 #include "htslib/hts_os.h" // drand48
54 
55 #include "htslib/khash.h"
56 #include "htslib/kseq.h"
57 #include "htslib/ksort.h"
58 #include "htslib/tbx.h"
59 
60 #ifndef EFTYPE
61 #define EFTYPE ENOEXEC
62 #endif
63 
64 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
65 
66 HTSLIB_EXPORT
67 int hts_verbose = HTS_LOG_WARNING;
68 
hts_version()69 const char *hts_version()
70 {
71     return HTS_VERSION_TEXT;
72 }
73 
74 HTSLIB_EXPORT
75 const unsigned char seq_nt16_table[256] = {
76     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
79      1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
80     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
81     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
82     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
83     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
84 
85     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
86     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
87     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
88     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
89     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
90     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
91     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
92     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
93 };
94 
95 HTSLIB_EXPORT
96 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
97 
98 HTSLIB_EXPORT
99 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
100 
101 /**********************
102  *** Basic file I/O ***
103  **********************/
104 
format_category(enum htsExactFormat fmt)105 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
106 {
107     switch (fmt) {
108     case bam:
109     case sam:
110     case cram:
111     case fastq_format:
112         return sequence_data;
113 
114     case vcf:
115     case bcf:
116         return variant_data;
117 
118     case bai:
119     case crai:
120     case csi:
121     case fai_format:
122     case fqi_format:
123     case gzi:
124     case tbi:
125         return index_file;
126 
127     case bed:
128         return region_list;
129 
130     case fasta_format:
131     case htsget:
132     case hts_crypt4gh_format:
133         return unknown_category;
134 
135     case unknown_format:
136     case binary_format:
137     case text_format:
138     case empty_format:
139     case format_maximum:
140         break;
141     }
142 
143     return unknown_category;
144 }
145 
146 // Decompress several hundred bytes by peeking at the file, which must be
147 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)148 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
149 {
150     unsigned char buffer[2048];
151     z_stream zs;
152     ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
153 
154     if (npeek < 0) return 0;
155 
156     zs.zalloc = NULL;
157     zs.zfree = NULL;
158     zs.next_in = buffer;
159     zs.avail_in = npeek;
160     zs.next_out = dest;
161     zs.avail_out = destsize;
162     if (inflateInit2(&zs, 31) != Z_OK) return 0;
163 
164     while (zs.total_out < destsize)
165         if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
166 
167     destsize = zs.total_out;
168     inflateEnd(&zs);
169 
170     return destsize;
171 }
172 
173 // Parse "x.y" text, taking care because the string is not NUL-terminated
174 // and filling in major/minor only when the digits are followed by a delimiter,
175 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
176 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)177 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
178 {
179     const char *s    = (const char *) u;
180     const char *slim = (const char *) ulim;
181     short v;
182 
183     fmt->version.major = fmt->version.minor = -1;
184 
185     for (v = 0; s < slim && isdigit_c(*s); s++)
186         v = 10 * v + *s - '0';
187 
188     if (s < slim) {
189         fmt->version.major = v;
190         if (*s == '.') {
191             s++;
192             for (v = 0; s < slim && isdigit_c(*s); s++)
193                 v = 10 * v + *s - '0';
194             if (s < slim)
195                 fmt->version.minor = v;
196         }
197         else
198             fmt->version.minor = 0;
199     }
200 }
201 
202 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)203 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
204 {
205     const unsigned char *ukey = (const unsigned char *) key;
206 
207     while (*ukey)
208         if (u >= ulim) return +1;
209         else if (isspace_c(*u)) u++;
210         else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
211         else u++, ukey++;
212 
213     return 0;
214 }
215 
is_text_only(const unsigned char * u,const unsigned char * ulim)216 static int is_text_only(const unsigned char *u, const unsigned char *ulim)
217 {
218     for (; u < ulim; u++)
219         if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
220             return 0;
221 
222     return 1;
223 }
224 
225 static int
secondline_is_bases(const unsigned char * u,const unsigned char * ulim)226 secondline_is_bases(const unsigned char *u, const unsigned char *ulim)
227 {
228     // Skip to second line, returning false if there isn't one
229     u = memchr(u, '\n', ulim - u);
230     if (u == NULL || ++u == ulim) return 0;
231 
232     // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
233     while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
234         if (*u == '=') return 0;
235         u++;
236     }
237 
238     return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
239 }
240 
241 // Parse tab-delimited text, filling in a string of column types and returning
242 // the number of columns spotted (within [u,ulim), and up to column_len) or -1
243 // if non-printable characters were seen.  Column types:
244 //     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
245 static int
parse_tabbed_text(char * columns,int column_len,const unsigned char * u,const unsigned char * ulim,int * complete)246 parse_tabbed_text(char *columns, int column_len,
247                   const unsigned char *u, const unsigned char *ulim,
248                   int *complete)
249 {
250     const char *str  = (const char *) u;
251     const char *slim = (const char *) ulim;
252     const char *s;
253     int ncolumns = 0;
254 
255     enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
256     unsigned seen = 0;
257     *complete = 0;
258 
259     for (s = str; s < slim; s++)
260         if (*s >= ' ') {
261             if (isdigit_c(*s))
262                 seen |= digit;
263             else if ((*s == '+' || *s == '-') && s == str)
264                 seen |= leading_sign;
265             else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
266                 seen |= cigar_operator;
267             else
268                 seen |= other;
269         }
270         else if (*s == '\t' || *s == '\r' || *s == '\n') {
271             size_t len = s - str;
272             char type;
273 
274             if (seen == digit || seen == (leading_sign|digit)) type = 'i';
275             else if (seen == (digit|cigar_operator)) type = 'C';
276             else if (len == 1)
277                 switch (str[0]) {
278                 case '*': type = 'C'; break;
279                 case '+': case '-': case '.': type = 's'; break;
280                 default: type = 'Z'; break;
281                 }
282             else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
283             else type = 'Z';
284 
285             columns[ncolumns++] = type;
286             if (*s != '\t' || ncolumns >= column_len - 1) {
287                 *complete = 1; // finished the line or more columns than needed
288                 break;
289             }
290 
291             str = s + 1;
292             seen = 0;
293         }
294         else return -1;
295 
296     columns[ncolumns] = '\0';
297     return ncolumns;
298 }
299 
300 // Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
301 // Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
colmatch(const char * columns,const char * pattern)302 static int colmatch(const char *columns, const char *pattern)
303 {
304     int i;
305     for (i = 0; columns[i] != '\0'; i++) {
306         if (pattern[i] == '+') return i;
307         if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
308     }
309 
310     return i;
311 }
312 
hts_detect_format(hFILE * hfile,htsFormat * fmt)313 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
314 {
315     char columns[24];
316     unsigned char s[1024];
317     int complete = 0;
318     ssize_t len = hpeek(hfile, s, 18);
319     if (len < 0) return -1;
320 
321     fmt->category = unknown_category;
322     fmt->format = unknown_format;
323     fmt->version.major = fmt->version.minor = -1;
324     fmt->compression = no_compression;
325     fmt->compression_level = -1;
326     fmt->specific = NULL;
327 
328     if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
329         // The stream is either gzip-compressed or BGZF-compressed.
330         // Determine which, and decompress the first few records or lines.
331         fmt->compression = (len >= 18 && (s[3] & 4) &&
332                             memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
333         if (len >= 9 && s[2] == 8)
334             fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
335 
336         len = decompress_peek(hfile, s, sizeof s);
337     }
338     else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
339              (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
340               memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
341         fmt->compression = bzip2_compression;
342         fmt->compression_level = s[3] - '0';
343         // Decompressing via libbz2 produces no output until it has a whole
344         // block (of size 100Kb x level), which is too large for peeking.
345         // So unfortunately we can recognise bzip2 but not the contents,
346         // except that \x1772... magic indicates the stream is empty.
347         if (s[4] == '\x31') return 0;
348         else len = 0;
349     }
350     else {
351         len = hpeek(hfile, s, sizeof s);
352     }
353     if (len < 0) return -1;
354 
355     if (len == 0) {
356         fmt->format = empty_format;
357         return 0;
358     }
359 
360     if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
361         fmt->category = sequence_data;
362         fmt->format = cram;
363         fmt->version.major = s[4], fmt->version.minor = s[5];
364         fmt->compression = custom;
365         return 0;
366     }
367     else if (len >= 4 && s[3] <= '\4') {
368         if (memcmp(s, "BAM\1", 4) == 0) {
369             fmt->category = sequence_data;
370             fmt->format = bam;
371             // TODO Decompress enough to pick version from @HD-VN header
372             fmt->version.major = 1, fmt->version.minor = -1;
373             return 0;
374         }
375         else if (memcmp(s, "BAI\1", 4) == 0) {
376             fmt->category = index_file;
377             fmt->format = bai;
378             fmt->version.major = -1, fmt->version.minor = -1;
379             return 0;
380         }
381         else if (memcmp(s, "BCF\4", 4) == 0) {
382             fmt->category = variant_data;
383             fmt->format = bcf;
384             fmt->version.major = 1, fmt->version.minor = -1;
385             return 0;
386         }
387         else if (memcmp(s, "BCF\2", 4) == 0) {
388             fmt->category = variant_data;
389             fmt->format = bcf;
390             fmt->version.major = s[3];
391             fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
392             return 0;
393         }
394         else if (memcmp(s, "CSI\1", 4) == 0) {
395             fmt->category = index_file;
396             fmt->format = csi;
397             fmt->version.major = 1, fmt->version.minor = -1;
398             return 0;
399         }
400         else if (memcmp(s, "TBI\1", 4) == 0) {
401             fmt->category = index_file;
402             fmt->format = tbi;
403             return 0;
404         }
405     }
406     else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
407         fmt->category = variant_data;
408         fmt->format = vcf;
409         if (len >= 21 && s[16] == 'v')
410             parse_version(fmt, &s[17], &s[len]);
411         return 0;
412     }
413     else if (len >= 4 && s[0] == '@' &&
414              (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
415               memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
416               memcmp(s, "@CO\t", 4) == 0)) {
417         fmt->category = sequence_data;
418         fmt->format = sam;
419         // @HD-VN is not guaranteed to be the first tag, but then @HD is
420         // not guaranteed to be present at all...
421         if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
422             parse_version(fmt, &s[7], &s[len]);
423         else
424             fmt->version.major = 1, fmt->version.minor = -1;
425         return 0;
426     }
427     else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
428         fmt->category = unknown_category;
429         fmt->format = htsget;
430         return 0;
431     }
432     else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
433         fmt->category = unknown_category;
434         fmt->format = hts_crypt4gh_format;
435         return 0;
436     }
437     else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) {
438         fmt->format = fasta_format;
439         return 0;
440     }
441     else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) {
442         fmt->category = sequence_data;
443         fmt->format = fastq_format;
444         return 0;
445     }
446     else if (parse_tabbed_text(columns, sizeof columns, s,
447                                &s[len], &complete) > 0) {
448         // A complete SAM line is at least 11 columns.  On unmapped long reads may
449         // be missing two.  (On mapped long reads we must have an @ header so long
450         // CIGAR is irrelevant.)
451         if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
452             >= 9 + 2*complete) {
453             fmt->category = sequence_data;
454             fmt->format = sam;
455             fmt->version.major = 1, fmt->version.minor = -1;
456             return 0;
457         }
458         else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
459             fmt->category = index_file;
460             fmt->format = crai;
461             return 0;
462         }
463         else if (colmatch(columns, "Ziiiii") == 6) {
464             fmt->category = index_file;
465             fmt->format = fqi_format;
466             return 0;
467         }
468         else if (colmatch(columns, "Ziiii") == 5) {
469             fmt->category = index_file;
470             fmt->format = fai_format;
471             return 0;
472         }
473         else if (colmatch(columns, "Zii+") >= 3) {
474             fmt->category = region_list;
475             fmt->format = bed;
476             return 0;
477         }
478     }
479 
480     // Arbitrary text files can be read using hts_getline().
481     if (is_text_only(s, &s[len])) fmt->format = text_format;
482 
483     // Nothing recognised: leave unset fmt-> fields as unknown.
484     return 0;
485 }
486 
hts_format_description(const htsFormat * format)487 char *hts_format_description(const htsFormat *format)
488 {
489     kstring_t str = { 0, 0, NULL };
490 
491     switch (format->format) {
492     case sam:   kputs("SAM", &str); break;
493     case bam:   kputs("BAM", &str); break;
494     case cram:  kputs("CRAM", &str); break;
495     case fasta_format:  kputs("FASTA", &str); break;
496     case fastq_format:  kputs("FASTQ", &str); break;
497     case vcf:   kputs("VCF", &str); break;
498     case bcf:
499         if (format->version.major == 1) kputs("Legacy BCF", &str);
500         else kputs("BCF", &str);
501         break;
502     case bai:   kputs("BAI", &str); break;
503     case crai:  kputs("CRAI", &str); break;
504     case csi:   kputs("CSI", &str); break;
505     case fai_format:    kputs("FASTA-IDX", &str); break;
506     case fqi_format:    kputs("FASTQ-IDX", &str); break;
507     case gzi:   kputs("GZI", &str); break;
508     case tbi:   kputs("Tabix", &str); break;
509     case bed:   kputs("BED", &str); break;
510     case htsget: kputs("htsget", &str); break;
511     case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
512     case empty_format:  kputs("empty", &str); break;
513     default:    kputs("unknown", &str); break;
514     }
515 
516     if (format->version.major >= 0) {
517         kputs(" version ", &str);
518         kputw(format->version.major, &str);
519         if (format->version.minor >= 0) {
520             kputc('.', &str);
521             kputw(format->version.minor, &str);
522         }
523     }
524 
525     switch (format->compression) {
526     case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
527     case custom: kputs(" compressed", &str); break;
528     case gzip:   kputs(" gzip-compressed", &str); break;
529     case bgzf:
530         switch (format->format) {
531         case bam:
532         case bcf:
533         case csi:
534         case tbi:
535             // These are by definition BGZF, so just use the generic term
536             kputs(" compressed", &str);
537             break;
538         default:
539             kputs(" BGZF-compressed", &str);
540             break;
541         }
542         break;
543     default: break;
544     }
545 
546     switch (format->category) {
547     case sequence_data: kputs(" sequence", &str); break;
548     case variant_data:  kputs(" variant calling", &str); break;
549     case index_file:    kputs(" index", &str); break;
550     case region_list:   kputs(" genomic region", &str); break;
551     default: break;
552     }
553 
554     if (format->compression == no_compression)
555         switch (format->format) {
556         case text_format:
557         case sam:
558         case crai:
559         case vcf:
560         case bed:
561         case fai_format:
562         case fqi_format:
563         case fasta_format:
564         case fastq_format:
565         case htsget:
566             kputs(" text", &str);
567             break;
568 
569         case empty_format:
570             break;
571 
572         default:
573             kputs(" data", &str);
574             break;
575         }
576     else
577         kputs(" data", &str);
578 
579     return ks_release(&str);
580 }
581 
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)582 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
583 {
584     char smode[101], *cp, *cp2, *mode_c;
585     htsFile *fp = NULL;
586     hFILE *hfile = NULL;
587     char fmt_code = '\0';
588     const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0";
589 
590     strncpy(smode, mode, 99);
591     smode[99]=0;
592     if ((cp = strchr(smode, ',')))
593         *cp = '\0';
594 
595     // Migrate format code (b or c) to the end of the smode buffer.
596     for (cp2 = cp = smode; *cp; cp++) {
597         if (*cp == 'b')
598             fmt_code = 'b';
599         else if (*cp == 'c')
600             fmt_code = 'c';
601         else
602             *cp2++ = *cp;
603     }
604     mode_c = cp2;
605     *cp2++ = fmt_code;
606     *cp2++ = 0;
607 
608     // Set or reset the format code if opts->format is used
609     if (fmt && fmt->format > unknown_format
610         && fmt->format < sizeof(format_to_mode)) {
611         *mode_c = format_to_mode[fmt->format];
612     }
613 
614     // If we really asked for a compressed text format then mode_c above will
615     // point to nul.  We set to 'z' to enable bgzf.
616     if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
617         if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
618             *mode_c = 'z';
619     }
620 
621     char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
622     if ( fnidx ) {
623         rmme = strdup(fn);
624         if ( !rmme ) goto error;
625         rmme[fnidx-fn] = 0;
626         fn = rmme;
627     }
628 
629     hfile = hopen(fn, smode);
630     if (hfile == NULL) goto error;
631 
632     fp = hts_hopen(hfile, fn, smode);
633     if (fp == NULL) goto error;
634 
635     // Compensate for the loss of exactness in htsExactFormat.
636     // hts_hopen returns generics such as binary or text, but we
637     // have been given something explicit here so use that instead.
638     if (fp->is_write && fmt &&
639         (fmt->format == bam || fmt->format == sam ||
640          fmt->format == vcf || fmt->format == bcf ||
641          fmt->format == bed || fmt->format == fasta_format ||
642          fmt->format == fastq_format))
643         fp->format.format = fmt->format;
644 
645     if (fmt && fmt->specific)
646         if (hts_opt_apply(fp, fmt->specific) != 0)
647             goto error;
648 
649     if ( rmme ) free(rmme);
650     return fp;
651 
652 error:
653     hts_log_error("Failed to open file \"%s\"%s%s", fn,
654                   errno ? " : " : "", errno ? strerror(errno) : "");
655     if ( rmme ) free(rmme);
656 
657     if (hfile)
658         hclose_abruptly(hfile);
659 
660     return NULL;
661 }
662 
hts_open(const char * fn,const char * mode)663 htsFile *hts_open(const char *fn, const char *mode) {
664     return hts_open_format(fn, mode, NULL);
665 }
666 
667 /*
668  * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
669  * the prefix in lowercase into buf and returning a pointer to the suffix.
670  * On return, buf is always NUL-terminated; thus assumes that the "keyword"
671  * prefix should be one of several known values of maximum length buflen-2.
672  * (If delim is not found, returns a pointer to the '\0'.)
673  */
674 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)675 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
676 {
677     size_t i = 0;
678     while (*str && *str != delim) {
679         if (i < buflen-1) buf[i++] = tolower_c(*str);
680         str++;
681     }
682 
683     buf[i] = '\0';
684     return *str? str+1 : str;
685 }
686 
687 /*
688  * Parses arg and appends it to the option list.
689  *
690  * Returns 0 on success;
691  *        -1 on failure.
692  */
hts_opt_add(hts_opt ** opts,const char * c_arg)693 int hts_opt_add(hts_opt **opts, const char *c_arg) {
694     hts_opt *o, *t;
695     char *val;
696 
697     /*
698      * IMPORTANT!!!
699      * If you add another string option here, don't forget to also add
700      * it to the case statement in hts_opt_apply.
701      */
702 
703     if (!c_arg)
704         return -1;
705 
706     if (!(o =  malloc(sizeof(*o))))
707         return -1;
708 
709     if (!(o->arg = strdup(c_arg))) {
710         free(o);
711         return -1;
712     }
713 
714     if (!(val = strchr(o->arg, '=')))
715         val = "1"; // assume boolean
716     else
717         *val++ = '\0';
718 
719     if (strcmp(o->arg, "decode_md") == 0 ||
720         strcmp(o->arg, "DECODE_MD") == 0)
721         o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
722 
723     else if (strcmp(o->arg, "verbosity") == 0 ||
724              strcmp(o->arg, "VERBOSITY") == 0)
725         o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
726 
727     else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
728              strcmp(o->arg, "SEQS_PER_SLICE") == 0)
729         o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
730 
731     else if (strcmp(o->arg, "bases_per_slice") == 0 ||
732              strcmp(o->arg, "BASES_PER_SLICE") == 0)
733         o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
734 
735     else if (strcmp(o->arg, "slices_per_container") == 0 ||
736              strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
737         o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
738 
739     else if (strcmp(o->arg, "embed_ref") == 0 ||
740              strcmp(o->arg, "EMBED_REF") == 0)
741         o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
742 
743     else if (strcmp(o->arg, "no_ref") == 0 ||
744              strcmp(o->arg, "NO_REF") == 0)
745         o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
746 
747     else if (strcmp(o->arg, "ignore_md5") == 0 ||
748              strcmp(o->arg, "IGNORE_MD5") == 0)
749         o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
750 
751     else if (strcmp(o->arg, "use_bzip2") == 0 ||
752              strcmp(o->arg, "USE_BZIP2") == 0)
753         o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
754 
755     else if (strcmp(o->arg, "use_rans") == 0 ||
756              strcmp(o->arg, "USE_RANS") == 0)
757         o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
758 
759     else if (strcmp(o->arg, "use_lzma") == 0 ||
760              strcmp(o->arg, "USE_LZMA") == 0)
761         o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
762 
763     else if (strcmp(o->arg, "reference") == 0 ||
764              strcmp(o->arg, "REFERENCE") == 0)
765         o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
766 
767     else if (strcmp(o->arg, "version") == 0 ||
768              strcmp(o->arg, "VERSION") == 0)
769         o->opt = CRAM_OPT_VERSION, o->val.s =val;
770 
771     else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
772              strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
773         o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
774 
775     else if (strcmp(o->arg, "nthreads") == 0 ||
776              strcmp(o->arg, "NTHREADS") == 0)
777         o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
778 
779     else if (strcmp(o->arg, "cache_size") == 0 ||
780              strcmp(o->arg, "CACHE_SIZE") == 0) {
781         char *endp;
782         o->opt = HTS_OPT_CACHE_SIZE;
783         o->val.i = strtol(val, &endp, 0);
784         // NB: Doesn't support floats, eg 1.5g
785         // TODO: extend hts_parse_decimal? See also samtools sort.
786         switch (*endp) {
787         case 'g': case 'G': o->val.i *= 1024; // fall through
788         case 'm': case 'M': o->val.i *= 1024; // fall through
789         case 'k': case 'K': o->val.i *= 1024; break;
790         case '\0': break;
791         default:
792             hts_log_error("Unrecognised cache size suffix '%c'", *endp);
793             free(o->arg);
794             free(o);
795             return -1;
796         }
797     }
798 
799     else if (strcmp(o->arg, "required_fields") == 0 ||
800              strcmp(o->arg, "REQUIRED_FIELDS") == 0)
801         o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
802 
803     else if (strcmp(o->arg, "lossy_names") == 0 ||
804              strcmp(o->arg, "LOSSY_NAMES") == 0)
805         o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
806 
807     else if (strcmp(o->arg, "name_prefix") == 0 ||
808              strcmp(o->arg, "NAME_PREFIX") == 0)
809         o->opt = CRAM_OPT_PREFIX, o->val.s = val;
810 
811     else if (strcmp(o->arg, "store_md") == 0 ||
812              strcmp(o->arg, "store_md") == 0)
813         o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
814 
815     else if (strcmp(o->arg, "store_nm") == 0 ||
816              strcmp(o->arg, "store_nm") == 0)
817         o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
818 
819     else if (strcmp(o->arg, "block_size") == 0 ||
820              strcmp(o->arg, "BLOCK_SIZE") == 0)
821         o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
822 
823     else if (strcmp(o->arg, "level") == 0 ||
824              strcmp(o->arg, "LEVEL") == 0)
825         o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
826 
827     else {
828         hts_log_error("Unknown option '%s'", o->arg);
829         free(o->arg);
830         free(o);
831         return -1;
832     }
833 
834     o->next = NULL;
835 
836     // Append; assumes small list.
837     if (*opts) {
838         t = *opts;
839         while (t->next)
840             t = t->next;
841         t->next = o;
842     } else {
843         *opts = o;
844     }
845 
846     return 0;
847 }
848 
849 /*
850  * Applies an hts_opt option list to a given htsFile.
851  *
852  * Returns 0 on success
853  *        -1 on failure
854  */
hts_opt_apply(htsFile * fp,hts_opt * opts)855 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
856     hts_opt *last = NULL;
857 
858     for (; opts;  opts = (last=opts)->next) {
859         switch (opts->opt) {
860             case CRAM_OPT_REFERENCE:
861                 if (!(fp->fn_aux = strdup(opts->val.s)))
862                     return -1;
863                 // fall through
864             case CRAM_OPT_VERSION:
865             case CRAM_OPT_PREFIX:
866                 if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
867                     return -1;
868                 break;
869             default:
870                 if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
871                     return -1;
872                 break;
873         }
874     }
875 
876     return 0;
877 }
878 
879 /*
880  * Frees an hts_opt list.
881  */
hts_opt_free(hts_opt * opts)882 void hts_opt_free(hts_opt *opts) {
883     hts_opt *last = NULL;
884     while (opts) {
885         opts = (last=opts)->next;
886         free(last->arg);
887         free(last);
888     }
889 }
890 
891 
892 /*
893  * Tokenise options as (key(=value)?,)*(key(=value)?)?
894  * NB: No provision for ',' appearing in the value!
895  * Add backslashing rules?
896  *
897  * This could be used as part of a general command line option parser or
898  * as a string concatenated onto the file open mode.
899  *
900  * Returns 0 on success
901  *        -1 on failure.
902  */
hts_parse_opt_list(htsFormat * fmt,const char * str)903 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
904     while (str && *str) {
905         const char *str_start;
906         int len;
907         char arg[8001];
908 
909         while (*str && *str == ',')
910             str++;
911 
912         for (str_start = str; *str && *str != ','; str++);
913         len = str - str_start;
914 
915         // Produce a nul terminated copy of the option
916         strncpy(arg, str_start, len < 8000 ? len : 8000);
917         arg[len < 8000 ? len : 8000] = '\0';
918 
919         if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
920             return -1;
921 
922         if (*str)
923             str++;
924     }
925 
926     return 0;
927 }
928 
929 /*
930  * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
931  * followed by a comma separated list of key=value options and splits
932  * these up into the fields of htsFormat struct.
933  *
934  * format is assumed to be already initialised, either to blank
935  * "unknown" values or via previous hts_opt_add calls.
936  *
937  * Returns 0 on success
938  *        -1 on failure.
939  */
hts_parse_format(htsFormat * format,const char * str)940 int hts_parse_format(htsFormat *format, const char *str) {
941     char fmt[8];
942     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
943 
944     format->version.minor = 0; // unknown
945     format->version.major = 0; // unknown
946 
947     if (strcmp(fmt, "sam") == 0) {
948         format->category          = sequence_data;
949         format->format            = sam;
950         format->compression       = no_compression;;
951         format->compression_level = 0;
952     } else if (strcmp(fmt, "sam.gz") == 0) {
953         format->category          = sequence_data;
954         format->format            = sam;
955         format->compression       = bgzf;
956         format->compression_level = -1;
957     } else if (strcmp(fmt, "bam") == 0) {
958         format->category          = sequence_data;
959         format->format            = bam;
960         format->compression       = bgzf;
961         format->compression_level = -1;
962     } else if (strcmp(fmt, "cram") == 0) {
963         format->category          = sequence_data;
964         format->format            = cram;
965         format->compression       = custom;
966         format->compression_level = -1;
967     } else if (strcmp(fmt, "vcf") == 0) {
968         format->category          = variant_data;
969         format->format            = vcf;
970         format->compression       = no_compression;;
971         format->compression_level = 0;
972     } else if (strcmp(fmt, "bcf") == 0) {
973         format->category          = variant_data;
974         format->format            = bcf;
975         format->compression       = bgzf;
976         format->compression_level = -1;
977     } else {
978         return -1;
979     }
980 
981     return hts_parse_opt_list(format, cp);
982 }
983 
984 
985 /*
986  * Tokenise options as (key(=value)?,)*(key(=value)?)?
987  * NB: No provision for ',' appearing in the value!
988  * Add backslashing rules?
989  *
990  * This could be used as part of a general command line option parser or
991  * as a string concatenated onto the file open mode.
992  *
993  * Returns 0 on success
994  *        -1 on failure.
995  */
hts_process_opts(htsFile * fp,const char * opts)996 static int hts_process_opts(htsFile *fp, const char *opts) {
997     htsFormat fmt;
998 
999     fmt.specific = NULL;
1000     if (hts_parse_opt_list(&fmt, opts) != 0)
1001         return -1;
1002 
1003     if (hts_opt_apply(fp, fmt.specific) != 0) {
1004         hts_opt_free(fmt.specific);
1005         return -1;
1006     }
1007 
1008     hts_opt_free(fmt.specific);
1009 
1010     return 0;
1011 }
1012 
hts_crypt4gh_redirect(const char * fn,const char * mode,hFILE ** hfile_ptr,htsFile * fp)1013 static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1014                                  hFILE **hfile_ptr, htsFile *fp) {
1015     hFILE *hfile1 = *hfile_ptr;
1016     hFILE *hfile2 = NULL;
1017     char fn_buf[512], *fn2 = fn_buf;
1018     const char *prefix = "crypt4gh:";
1019     size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1020     int ret = -1;
1021 
1022     if (fn2_len > sizeof(fn_buf)) {
1023         fn2 = malloc(fn2_len);
1024         if (!fn2) return -1;
1025     }
1026 
1027     // Reopen fn using the crypt4gh plug-in (if available)
1028     snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1029     hfile2 = hopen(fn2, mode, "parent", hfile1, NULL);
1030     if (hfile2) {
1031         // Replace original hfile with the new one.  The original is now
1032         // enclosed within hfile2
1033         *hfile_ptr = hfile2;
1034         ret = 0;
1035     }
1036 
1037     if (fn2 != fn_buf)
1038         free(fn2);
1039     return ret;
1040 }
1041 
hts_hopen(hFILE * hfile,const char * fn,const char * mode)1042 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1043 {
1044     hFILE *hfile_orig = hfile;
1045     htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1046     char simple_mode[101], *cp, *opts;
1047     simple_mode[100] = '\0';
1048 
1049     if (fp == NULL) goto error;
1050 
1051     fp->fn = strdup(fn);
1052     fp->is_be = ed_is_big();
1053 
1054     // Split mode into simple_mode,opts strings
1055     if ((cp = strchr(mode, ','))) {
1056         strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1057         simple_mode[cp-mode] = '\0';
1058         opts = cp+1;
1059     } else {
1060         strncpy(simple_mode, mode, 100);
1061         opts = NULL;
1062     }
1063 
1064     if (strchr(simple_mode, 'r')) {
1065         const int max_loops = 5; // Should be plenty
1066         int loops = 0;
1067         if (hts_detect_format(hfile, &fp->format) < 0) goto error;
1068 
1069         // Deal with formats that re-direct an underlying file via a plug-in.
1070         // Loops as we may have crypt4gh served via htsget, or
1071         // crypt4gh-in-crypt4gh.
1072         while (fp->format.format == htsget ||
1073                fp->format.format == hts_crypt4gh_format) {
1074             // Ensure we don't get stuck in an endless redirect loop
1075             if (++loops > max_loops) {
1076                 errno = ELOOP;
1077                 goto error;
1078             }
1079 
1080             if (fp->format.format == htsget) {
1081                 hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1082                 if (hfile2 == NULL) goto error;
1083 
1084                 hfile = hfile2;
1085             }
1086             else if (fp->format.format == hts_crypt4gh_format) {
1087                 if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1088                     goto error;
1089             }
1090 
1091             // Re-detect format against the result of the redirection
1092             if (hts_detect_format(hfile, &fp->format) < 0) goto error;
1093         }
1094     }
1095     else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1096         htsFormat *fmt = &fp->format;
1097         fp->is_write = 1;
1098 
1099         if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1100         else if (strchr(simple_mode, 'c')) fmt->format = cram;
1101         else fmt->format = text_format;
1102 
1103         if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1104         else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1105         else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1106         else {
1107             // No compression mode specified, set to the default for the format
1108             switch (fmt->format) {
1109             case binary_format: fmt->compression = bgzf; break;
1110             case cram: fmt->compression = custom; break;
1111             case text_format: fmt->compression = no_compression; break;
1112             default: abort();
1113             }
1114         }
1115 
1116         // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1117         fmt->category = format_category(fmt->format);
1118 
1119         fmt->version.major = fmt->version.minor = -1;
1120         fmt->compression_level = -1;
1121         fmt->specific = NULL;
1122     }
1123     else { errno = EINVAL; goto error; }
1124 
1125     switch (fp->format.format) {
1126     case binary_format:
1127     case bam:
1128     case bcf:
1129         fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1130         if (fp->fp.bgzf == NULL) goto error;
1131         fp->is_bin = fp->is_bgzf = 1;
1132         break;
1133 
1134     case cram:
1135         fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1136         if (fp->fp.cram == NULL) goto error;
1137         if (!fp->is_write)
1138             cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
1139         fp->is_cram = 1;
1140         break;
1141 
1142     case empty_format:
1143     case text_format:
1144     case bed:
1145     case fasta_format:
1146     case fastq_format:
1147     case sam:
1148     case vcf:
1149         if (fp->format.compression != no_compression) {
1150             fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1151             if (fp->fp.bgzf == NULL) goto error;
1152             fp->is_bgzf = 1;
1153         }
1154         else
1155             fp->fp.hfile = hfile;
1156         break;
1157 
1158     default:
1159         errno = EFTYPE;
1160         goto error;
1161     }
1162 
1163     if (opts)
1164         hts_process_opts(fp, opts);
1165 
1166     // If redirecting, close the original hFILE now (pedantically we would
1167     // instead close it in hts_close(), but this a simplifying optimisation)
1168     if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
1169 
1170     return fp;
1171 
1172 error:
1173     hts_log_error("Failed to open file %s", fn);
1174 
1175     // If redirecting, close the failed redirection hFILE that we have opened
1176     if (hfile != hfile_orig) hclose_abruptly(hfile);
1177 
1178     if (fp) {
1179         free(fp->fn);
1180         free(fp->fn_aux);
1181         free(fp);
1182     }
1183     return NULL;
1184 }
1185 
hts_close(htsFile * fp)1186 int hts_close(htsFile *fp)
1187 {
1188     int ret, save;
1189 
1190     switch (fp->format.format) {
1191     case binary_format:
1192     case bam:
1193     case bcf:
1194         ret = bgzf_close(fp->fp.bgzf);
1195         break;
1196 
1197     case cram:
1198         if (!fp->is_write) {
1199             switch (cram_eof(fp->fp.cram)) {
1200             case 2:
1201                 hts_log_warning("EOF marker is absent. The input is probably truncated");
1202                 break;
1203             case 0:  /* not at EOF, but may not have wanted all seqs */
1204             default: /* case 1, expected EOF */
1205                 break;
1206             }
1207         }
1208         ret = cram_close(fp->fp.cram);
1209         break;
1210 
1211     case empty_format:
1212     case text_format:
1213     case bed:
1214     case fasta_format:
1215     case fastq_format:
1216     case sam:
1217     case vcf:
1218         ret = sam_state_destroy(fp);
1219 
1220         if (fp->format.compression != no_compression)
1221             ret |= bgzf_close(fp->fp.bgzf);
1222         else
1223             ret |= hclose(fp->fp.hfile);
1224         break;
1225 
1226     default:
1227         ret = -1;
1228         break;
1229     }
1230 
1231     save = errno;
1232     sam_hdr_destroy(fp->bam_header);
1233     hts_idx_destroy(fp->idx);
1234     free(fp->fn);
1235     free(fp->fn_aux);
1236     free(fp->line.s);
1237     free(fp);
1238     errno = save;
1239     return ret;
1240 }
1241 
hts_get_format(htsFile * fp)1242 const htsFormat *hts_get_format(htsFile *fp)
1243 {
1244     return fp? &fp->format : NULL;
1245 }
1246 
hts_format_file_extension(const htsFormat * format)1247 const char *hts_format_file_extension(const htsFormat *format) {
1248     if (!format)
1249         return "?";
1250 
1251     switch (format->format) {
1252     case sam:  return "sam";
1253     case bam:  return "bam";
1254     case bai:  return "bai";
1255     case cram: return "cram";
1256     case crai: return "crai";
1257     case vcf:  return "vcf";
1258     case bcf:  return "bcf";
1259     case csi:  return "csi";
1260     case fai_format:   return "fai";
1261     case fqi_format:   return "fqi";
1262     case gzi:  return "gzi";
1263     case tbi:  return "tbi";
1264     case bed:  return "bed";
1265     case fasta_format: return "fa";
1266     case fastq_format: return "fq";
1267     default:   return "?";
1268     }
1269 }
1270 
hts_hfile(htsFile * fp)1271 static hFILE *hts_hfile(htsFile *fp) {
1272     switch (fp->format.format) {
1273     case binary_format:// fall through
1274     case bcf:          // fall through
1275     case bam:          return bgzf_hfile(fp->fp.bgzf);
1276     case cram:         return cram_hfile(fp->fp.cram);
1277     case text_format:  return fp->fp.hfile;
1278     case vcf:          // fall through
1279     case sam:          return fp->format.compression != no_compression
1280                               ? bgzf_hfile(fp->fp.bgzf)
1281                               : fp->fp.hfile;
1282     default:           return NULL;
1283     }
1284 }
1285 
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)1286 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1287     int r;
1288     va_list args;
1289 
1290     switch (opt) {
1291     case HTS_OPT_NTHREADS: {
1292         va_start(args, opt);
1293         int nthreads = va_arg(args, int);
1294         va_end(args);
1295         return hts_set_threads(fp, nthreads);
1296     }
1297 
1298     case HTS_OPT_BLOCK_SIZE: {
1299         hFILE *hf = hts_hfile(fp);
1300 
1301         if (hf) {
1302             va_start(args, opt);
1303             if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1304                 hts_log_warning("Failed to change block size");
1305             va_end(args);
1306         }
1307         else {
1308             // To do - implement for vcf/bcf.
1309             hts_log_warning("Cannot change block size for this format");
1310         }
1311 
1312         return 0;
1313     }
1314 
1315     case HTS_OPT_THREAD_POOL: {
1316         va_start(args, opt);
1317         htsThreadPool *p = va_arg(args, htsThreadPool *);
1318         va_end(args);
1319         return hts_set_thread_pool(fp, p);
1320     }
1321 
1322     case HTS_OPT_CACHE_SIZE: {
1323         va_start(args, opt);
1324         int cache_size = va_arg(args, int);
1325         va_end(args);
1326         hts_set_cache_size(fp, cache_size);
1327         return 0;
1328     }
1329 
1330     case HTS_OPT_COMPRESSION_LEVEL: {
1331         va_start(args, opt);
1332         int level = va_arg(args, int);
1333         va_end(args);
1334         if (fp->is_bgzf)
1335             fp->fp.bgzf->compress_level = level;
1336     }
1337 
1338     default:
1339         break;
1340     }
1341 
1342     if (fp->format.format != cram)
1343         return 0;
1344 
1345     va_start(args, opt);
1346     r = cram_set_voption(fp->fp.cram, opt, args);
1347     va_end(args);
1348 
1349     return r;
1350 }
1351 
1352 BGZF *hts_get_bgzfp(htsFile *fp);
1353 
hts_set_threads(htsFile * fp,int n)1354 int hts_set_threads(htsFile *fp, int n)
1355 {
1356     if (fp->format.format == sam) {
1357         return sam_set_threads(fp, n);
1358     } else if (fp->format.compression == bgzf) {
1359         return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1360     } else if (fp->format.format == cram) {
1361         return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1362     }
1363     else return 0;
1364 }
1365 
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1366 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1367     if (fp->format.format == sam || fp->format.format == text_format) {
1368         return sam_set_thread_pool(fp, p);
1369     } else if (fp->format.compression == bgzf) {
1370         return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1371     } else if (fp->format.format == cram) {
1372         return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1373     }
1374     else return 0;
1375 }
1376 
hts_set_cache_size(htsFile * fp,int n)1377 void hts_set_cache_size(htsFile *fp, int n)
1378 {
1379     if (fp->format.compression == bgzf)
1380         bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1381 }
1382 
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1383 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1384 {
1385     free(fp->fn_aux);
1386     if (fn_aux) {
1387         fp->fn_aux = strdup(fn_aux);
1388         if (fp->fn_aux == NULL) return -1;
1389     }
1390     else fp->fn_aux = NULL;
1391 
1392     if (fp->format.format == cram)
1393         if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1394             return -1;
1395 
1396     return 0;
1397 }
1398 
hts_open_tmpfile(const char * fname,const char * mode,kstring_t * tmpname)1399 hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1400 {
1401     int pid = (int) getpid();
1402     unsigned ptr = (uintptr_t) tmpname;
1403     int n = 0;
1404     hFILE *fp = NULL;
1405 
1406     do {
1407         // Attempt to further uniquify the temporary filename
1408         unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1409         n++;
1410 
1411         ks_clear(tmpname);
1412         if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1413 
1414         fp = hopen(tmpname->s, mode);
1415     } while (fp == NULL && errno == EEXIST && n < 100);
1416 
1417     return fp;
1418 }
1419 
1420 // For VCF/BCF backward sweeper. Not exposing these functions because their
1421 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1422 BGZF *hts_get_bgzfp(htsFile *fp)
1423 {
1424     if (fp->is_bgzf)
1425         return fp->fp.bgzf;
1426     else
1427         return NULL;
1428 }
hts_useek(htsFile * fp,off_t uoffset,int where)1429 int hts_useek(htsFile *fp, off_t uoffset, int where)
1430 {
1431     if (fp->is_bgzf)
1432         return bgzf_useek(fp->fp.bgzf, uoffset, where);
1433     else
1434         return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1435 }
hts_utell(htsFile * fp)1436 off_t hts_utell(htsFile *fp)
1437 {
1438     if (fp->is_bgzf)
1439         return bgzf_utell(fp->fp.bgzf);
1440     else
1441         return htell(fp->fp.hfile);
1442 }
1443 
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1444 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1445 {
1446     int ret;
1447     if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1448         hts_log_error("Unexpected delimiter %d", delimiter);
1449         abort();
1450     }
1451 
1452     switch (fp->format.compression) {
1453     case no_compression:
1454         str->l = 0;
1455         ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
1456         if (ret >= 0) ret = str->l;
1457         else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1458         else ret = -1;
1459         break;
1460 
1461     case gzip:
1462     case bgzf:
1463         ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1464         break;
1465 
1466     default:
1467         abort();
1468     }
1469 
1470     ++fp->lineno;
1471     return ret;
1472 }
1473 
hts_readlist(const char * string,int is_file,int * _n)1474 char **hts_readlist(const char *string, int is_file, int *_n)
1475 {
1476     unsigned int m = 0, n = 0;
1477     char **s = 0, **s_new;
1478     if ( is_file )
1479     {
1480         BGZF *fp = bgzf_open(string, "r");
1481         if ( !fp ) return NULL;
1482 
1483         kstring_t str;
1484         str.s = 0; str.l = str.m = 0;
1485         while (bgzf_getline(fp, '\n', &str) >= 0)
1486         {
1487             if (str.l == 0) continue;
1488             if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
1489                 goto err;
1490             s[n] = strdup(str.s);
1491             if (!s[n])
1492                 goto err;
1493             n++;
1494         }
1495         bgzf_close(fp);
1496         free(str.s);
1497     }
1498     else
1499     {
1500         const char *q = string, *p = string;
1501         while ( 1 )
1502         {
1503             if (*p == ',' || *p == 0)
1504             {
1505                 if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
1506                     goto err;
1507                 s[n] = (char*)calloc(p - q + 1, 1);
1508                 if (!s[n])
1509                     goto err;
1510                 strncpy(s[n++], q, p - q);
1511                 q = p + 1;
1512             }
1513             if ( !*p ) break;
1514             p++;
1515         }
1516     }
1517     // Try to shrink s to the minimum size needed
1518     s_new = (char**)realloc(s, n * sizeof(char*));
1519     if (!s_new)
1520         goto err;
1521 
1522     s = s_new;
1523     assert(n < INT_MAX); // hts_resize() should ensure this
1524     *_n = n;
1525     return s;
1526 
1527  err:
1528     for (m = 0; m < n; m++)
1529         free(s[m]);
1530     free(s);
1531     return NULL;
1532 }
1533 
hts_readlines(const char * fn,int * _n)1534 char **hts_readlines(const char *fn, int *_n)
1535 {
1536     unsigned int m = 0, n = 0;
1537     char **s = 0, **s_new;
1538     BGZF *fp = bgzf_open(fn, "r");
1539     if ( fp ) { // read from file
1540         kstring_t str;
1541         str.s = 0; str.l = str.m = 0;
1542         while (bgzf_getline(fp, '\n', &str) >= 0) {
1543             if (str.l == 0) continue;
1544             if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
1545                 goto err;
1546             s[n] = strdup(str.s);
1547             if (!s[n])
1548                 goto err;
1549             n++;
1550         }
1551         bgzf_close(fp);
1552         free(str.s);
1553     } else if (*fn == ':') { // read from string
1554         const char *q, *p;
1555         for (q = p = fn + 1;; ++p)
1556             if (*p == ',' || *p == 0) {
1557                 if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
1558                     goto err;
1559                 s[n] = (char*)calloc(p - q + 1, 1);
1560                 if (!s[n])
1561                     goto err;
1562                 strncpy(s[n++], q, p - q);
1563                 q = p + 1;
1564                 if (*p == 0) break;
1565             }
1566     } else return 0;
1567     // Try to shrink s to the minimum size needed
1568     s_new = (char**)realloc(s, n * sizeof(char*));
1569     if (!s_new)
1570         goto err;
1571 
1572     s = s_new;
1573     assert(n < INT_MAX); // hts_resize() should ensure this
1574     *_n = n;
1575     return s;
1576 
1577  err:
1578     for (m = 0; m < n; m++)
1579         free(s[m]);
1580     free(s);
1581     return NULL;
1582 }
1583 
1584 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1585 int hts_file_type(const char *fname)
1586 {
1587     int len = strlen(fname);
1588     if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1589     if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1590     if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1591     if ( !strcmp("-",fname) ) return FT_STDIN;
1592 
1593     hFILE *f = hopen(fname, "r");
1594     if (f == NULL) return 0;
1595 
1596     htsFormat fmt;
1597     if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1598     if (hclose(f) < 0) return 0;
1599 
1600     switch (fmt.format) {
1601     case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1602     case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1603     default:  return 0;
1604     }
1605 }
1606 
hts_check_EOF(htsFile * fp)1607 int hts_check_EOF(htsFile *fp)
1608 {
1609     if (fp->format.compression == bgzf)
1610         return bgzf_check_EOF(hts_get_bgzfp(fp));
1611     else if (fp->format.format == cram)
1612         return cram_check_EOF(fp->fp.cram);
1613     else
1614         return 3;
1615 }
1616 
1617 
1618 /****************
1619  *** Indexing ***
1620  ****************/
1621 
1622 #define HTS_MIN_MARKER_DIST 0x10000
1623 
1624 // Finds the special meta bin
1625 //  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1626 #define META_BIN(idx) ((idx)->n_bins + 1)
1627 
1628 #define pair64_lt(a,b) ((a).u < (b).u)
1629 #define pair64max_lt(a,b) ((a).u < (b).u || \
1630                            ((a).u == (b).u && (a).max < (b).max))
1631 
1632 KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
1633 KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
1634 
1635 typedef struct {
1636     int32_t m, n;
1637     uint64_t loff;
1638     hts_pair64_t *list;
1639 } bins_t;
1640 
1641 KHASH_MAP_INIT_INT(bin, bins_t)
1642 typedef khash_t(bin) bidx_t;
1643 
1644 typedef struct {
1645     hts_pos_t n, m;
1646     uint64_t *offset;
1647 } lidx_t;
1648 
1649 struct hts_idx_t {
1650     int fmt, min_shift, n_lvls, n_bins;
1651     uint32_t l_meta;
1652     int32_t n, m;
1653     uint64_t n_no_coor;
1654     bidx_t **bidx;
1655     lidx_t *lidx;
1656     uint8_t *meta; // MUST have a terminating NUL on the end
1657     int tbi_n, last_tbi_tid;
1658     struct {
1659         uint32_t last_bin, save_bin;
1660         hts_pos_t last_coor;
1661         int last_tid, save_tid, finished;
1662         uint64_t last_off, save_off;
1663         uint64_t off_beg, off_end;
1664         uint64_t n_mapped, n_unmapped;
1665     } z; // keep internal states
1666 };
1667 
idx_format_name(int fmt)1668 static char * idx_format_name(int fmt) {
1669     switch (fmt) {
1670         case HTS_FMT_CSI: return "csi";
1671         case HTS_FMT_BAI: return "bai";
1672         case HTS_FMT_TBI: return "tbi";
1673         case HTS_FMT_CRAI: return "crai";
1674         default: return "unknown";
1675     }
1676 }
1677 
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1678 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1679 {
1680     khint_t k;
1681     bins_t *l;
1682     int absent;
1683     k = kh_put(bin, b, bin, &absent);
1684     if (absent < 0) return -1; // Out of memory
1685     l = &kh_value(b, k);
1686     if (absent) {
1687         l->m = 1; l->n = 0;
1688         l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1689         if (!l->list) {
1690             kh_del(bin, b, k);
1691             return -1;
1692         }
1693     } else if (l->n == l->m) {
1694         uint32_t new_m = l->m ? l->m << 1 : 1;
1695         hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1696         if (!new_list) return -1;
1697         l->list = new_list;
1698         l->m = new_m;
1699     }
1700     l->list[l->n].u = beg;
1701     l->list[l->n++].v = end;
1702     return 0;
1703 }
1704 
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1705 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1706 {
1707     int i;
1708     hts_pos_t beg, end;
1709     beg = _beg >> min_shift;
1710     end = (_end - 1) >> min_shift;
1711     if (l->m < end + 1) {
1712         size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1713         uint64_t *new_offset;
1714 
1715         new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1716         if (!new_offset) return -1;
1717 
1718         // fill unused memory with (uint64_t)-1
1719         memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1720         l->m = new_m;
1721         l->offset = new_offset;
1722     }
1723     for (i = beg; i <= end; ++i) {
1724         if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1725     }
1726     if (l->n < end + 1) l->n = end + 1;
1727     return 0;
1728 }
1729 
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1730 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1731 {
1732     hts_idx_t *idx;
1733     idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1734     if (idx == NULL) return NULL;
1735     idx->fmt = fmt;
1736     idx->min_shift = min_shift;
1737     idx->n_lvls = n_lvls;
1738     idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1739     idx->z.save_tid = idx->z.last_tid = -1;
1740     idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
1741     idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1742     idx->z.last_coor = 0xffffffffu;
1743     if (n) {
1744         idx->n = idx->m = n;
1745         idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1746         if (idx->bidx == NULL) { free(idx); return NULL; }
1747         idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1748         if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1749     }
1750     idx->tbi_n = -1;
1751     idx->last_tbi_tid = -1;
1752     return idx;
1753 }
1754 
update_loff(hts_idx_t * idx,int i,int free_lidx)1755 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1756 {
1757     bidx_t *bidx = idx->bidx[i];
1758     lidx_t *lidx = &idx->lidx[i];
1759     khint_t k;
1760     int l;
1761     uint64_t offset0 = 0;
1762     if (bidx) {
1763         k = kh_get(bin, bidx, META_BIN(idx));
1764         if (k != kh_end(bidx))
1765             offset0 = kh_val(bidx, k).list[0].u;
1766         for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1767             lidx->offset[l] = offset0;
1768     } else l = 1;
1769     for (; l < lidx->n; ++l) // fill missing values
1770         if (lidx->offset[l] == (uint64_t)-1)
1771             lidx->offset[l] = lidx->offset[l-1];
1772     if (bidx == 0) return;
1773     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1774         if (kh_exist(bidx, k))
1775         {
1776             if ( kh_key(bidx, k) < idx->n_bins )
1777             {
1778                 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1779                 // disable linear index if bot_bin out of bounds
1780                 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1781             }
1782             else
1783                 kh_val(bidx, k).loff = 0;
1784         }
1785     if (free_lidx) {
1786         free(lidx->offset);
1787         lidx->m = lidx->n = 0;
1788         lidx->offset = 0;
1789     }
1790 }
1791 
compress_binning(hts_idx_t * idx,int i)1792 static int compress_binning(hts_idx_t *idx, int i)
1793 {
1794     bidx_t *bidx = idx->bidx[i];
1795     khint_t k;
1796     int l, m;
1797     if (bidx == 0) return 0;
1798     // merge a bin to its parent if the bin is too small
1799     for (l = idx->n_lvls; l > 0; --l) {
1800         unsigned start = hts_bin_first(l);
1801         for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1802             bins_t *p, *q;
1803             if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1804             p = &kh_value(bidx, k);
1805             if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1806             if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1807                 khint_t kp;
1808                 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1809                 if (kp == kh_end(bidx)) continue;
1810                 q = &kh_val(bidx, kp);
1811                 if (q->n + p->n > q->m) {
1812                     uint32_t new_m = q->n + p->n;
1813                     hts_pair64_t *new_list;
1814                     kroundup32(new_m);
1815                     if (new_m > INT32_MAX) return -1; // Limited by index format
1816                     new_list = realloc(q->list, new_m * sizeof(*new_list));
1817                     if (!new_list) return -1;
1818                     q->m = new_m;
1819                     q->list = new_list;
1820                 }
1821                 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1822                 q->n += p->n;
1823                 free(p->list);
1824                 kh_del(bin, bidx, k);
1825             }
1826         }
1827     }
1828     k = kh_get(bin, bidx, 0);
1829     if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1830     // merge adjacent chunks that start from the same BGZF block
1831     for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1832         bins_t *p;
1833         if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1834         p = &kh_value(bidx, k);
1835         for (l = 1, m = 0; l < p->n; ++l) {
1836             if (p->list[m].v>>16 >= p->list[l].u>>16) {
1837                 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1838             } else p->list[++m] = p->list[l];
1839         }
1840         p->n = m + 1;
1841     }
1842     return 0;
1843 }
1844 
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1845 int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1846 {
1847     int i, ret = 0;
1848     if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
1849     if (idx->z.save_tid >= 0) {
1850         ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1851         ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1852         ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1853     }
1854     for (i = 0; i < idx->n; ++i) {
1855         update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1856         ret |= compress_binning(idx, i);
1857     }
1858     idx->z.finished = 1;
1859 
1860     return ret;
1861 }
1862 
hts_idx_check_range(hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1863 int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1864 {
1865     int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1866     if (tid < 0 || (beg <= maxpos && end <= maxpos))
1867         return 0;
1868 
1869     if (idx->fmt == HTS_FMT_CSI) {
1870         hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
1871                       " cannot be stored in a csi index. "
1872                       "Please check headers match the data",
1873                       beg, end);
1874     } else {
1875         hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
1876                       " cannot be stored in a %s index. Try using a csi index",
1877                       beg, end, idx_format_name(idx->fmt));
1878     }
1879     errno = ERANGE;
1880     return -1;
1881 }
1882 
hts_idx_push(hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,uint64_t offset,int is_mapped)1883 int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
1884 {
1885     int bin;
1886     if (tid<0) beg = -1, end = 0;
1887     if (hts_idx_check_range(idx, tid, beg, end) < 0)
1888         return -1;
1889     if (tid >= idx->m) { // enlarge the index
1890         uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1891         bidx_t **new_bidx;
1892         lidx_t *new_lidx;
1893 
1894         new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1895         if (!new_bidx) return -1;
1896         idx->bidx = new_bidx;
1897 
1898         new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1899         if (!new_lidx) return -1;
1900         idx->lidx = new_lidx;
1901 
1902         memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1903         memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1904         idx->m = new_m;
1905     }
1906     if (idx->n < tid + 1) idx->n = tid + 1;
1907     if (idx->z.finished) return 0;
1908     if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1909         if ( tid>=0 && idx->n_no_coor )
1910         {
1911             hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
1912             return -1;
1913         }
1914         if (tid>=0 && idx->bidx[tid] != 0)
1915         {
1916             hts_log_error("Chromosome blocks not continuous");
1917             return -1;
1918         }
1919         idx->z.last_tid = tid;
1920         idx->z.last_bin = 0xffffffffu;
1921     } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1922         hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
1923         return -1;
1924     }
1925     if (end < beg) {
1926         // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
1927         hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
1928         return -1;
1929     }
1930     if ( tid>=0 )
1931     {
1932         if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1933         if (is_mapped) {
1934             // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1935             if (beg < 0)  beg = 0;
1936             if (end <= 0) end = 1;
1937             // idx->z.last_off points to the start of the current record
1938             if (insert_to_l(&idx->lidx[tid], beg, end,
1939                             idx->z.last_off, idx->min_shift) < 0) return -1;
1940         }
1941     }
1942     else idx->n_no_coor++;
1943     bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1944     if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1945         if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1946             if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1947                             idx->z.save_off, idx->z.last_off) < 0) return -1;
1948         }
1949         if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1950             idx->z.off_end = idx->z.last_off;
1951             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1952                             idx->z.off_beg, idx->z.off_end) < 0) return -1;
1953             if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1954                             idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1955             idx->z.n_mapped = idx->z.n_unmapped = 0;
1956             idx->z.off_beg = idx->z.off_end;
1957         }
1958         idx->z.save_off = idx->z.last_off;
1959         idx->z.save_bin = idx->z.last_bin = bin;
1960         idx->z.save_tid = tid;
1961     }
1962     if (is_mapped) ++idx->z.n_mapped;
1963     else ++idx->z.n_unmapped;
1964     idx->z.last_off = offset;
1965     idx->z.last_coor = beg;
1966     return 0;
1967 }
1968 
1969 // Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
1970 // idx->meta needs to have been initialised first with an appropriate Tabix
1971 // configuration via hts_idx_set_meta.
1972 //
1973 // NB number of references (first 4 bytes of tabix header) aren't in
1974 // idx->meta, but held in idx->n instead.
hts_idx_tbi_name(hts_idx_t * idx,int tid,const char * name)1975 int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
1976     // Horrid - we have to map incoming tid to a tbi alternative tid.
1977     // This is because TBI counts tids by "covered" refs while everything
1978     // else counts by Nth SQ/contig record in header.
1979     if (tid == idx->last_tbi_tid || tid < 0 || !name)
1980         return idx->tbi_n;
1981 
1982     uint32_t len = strlen(name)+1;
1983     uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
1984     if (!tmp)
1985         return -1;
1986 
1987     // Append name
1988     idx->meta = tmp;
1989     strcpy((char *)idx->meta + idx->l_meta, name);
1990     idx->l_meta += len;
1991 
1992     // Update seq length
1993     u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
1994 
1995     idx->last_tbi_tid = tid;
1996     return ++idx->tbi_n;
1997 }
1998 
1999 // When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2000 // loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2001 // the location the current bam record started at as well as where it ends.
2002 //
2003 // When building an index on the fly via a write_bam / hts_idx_push loop,
2004 // this isn't quite identical as we may amend the virtual coord returned
2005 // by bgzf_tell to the start of a new block if the next bam struct doesn't
2006 // fit.  It's essentially the same thing, but for bit-identical indices
2007 // we need to amend the idx->z.last_off when we know we're starting a new
2008 // block.
hts_idx_amend_last(hts_idx_t * idx,uint64_t offset)2009 void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2010 {
2011     idx->z.last_off = offset;
2012 }
2013 
hts_idx_destroy(hts_idx_t * idx)2014 void hts_idx_destroy(hts_idx_t *idx)
2015 {
2016     khint_t k;
2017     int i;
2018     if (idx == 0) return;
2019 
2020     // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2021     if (idx->fmt == HTS_FMT_CRAI) {
2022         hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2023         cram_index_free(cidx->cram);
2024         free(cidx);
2025         return;
2026     }
2027 
2028     for (i = 0; i < idx->m; ++i) {
2029         bidx_t *bidx = idx->bidx[i];
2030         free(idx->lidx[i].offset);
2031         if (bidx == 0) continue;
2032         for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2033             if (kh_exist(bidx, k))
2034                 free(kh_value(bidx, k).list);
2035         kh_destroy(bin, bidx);
2036     }
2037     free(idx->bidx); free(idx->lidx); free(idx->meta);
2038     free(idx);
2039 }
2040 
hts_idx_fmt(hts_idx_t * idx)2041 int hts_idx_fmt(hts_idx_t *idx) {
2042     return idx->fmt;
2043 }
2044 
2045 // The optimizer eliminates these ed_is_big() calls; still it would be good to
2046 // TODO Determine endianness at configure- or compile-time
2047 
idx_write_int32(BGZF * fp,int32_t x)2048 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2049 {
2050     if (ed_is_big()) x = ed_swap_4(x);
2051     return bgzf_write(fp, &x, sizeof x);
2052 }
2053 
idx_write_uint32(BGZF * fp,uint32_t x)2054 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2055 {
2056     if (ed_is_big()) x = ed_swap_4(x);
2057     return bgzf_write(fp, &x, sizeof x);
2058 }
2059 
idx_write_uint64(BGZF * fp,uint64_t x)2060 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2061 {
2062     if (ed_is_big()) x = ed_swap_8(x);
2063     return bgzf_write(fp, &x, sizeof x);
2064 }
2065 
swap_bins(bins_t * p)2066 static inline void swap_bins(bins_t *p)
2067 {
2068     int i;
2069     for (i = 0; i < p->n; ++i) {
2070         ed_swap_8p(&p->list[i].u);
2071         ed_swap_8p(&p->list[i].v);
2072     }
2073 }
2074 
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)2075 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2076 {
2077     int32_t i, j;
2078 
2079     #define check(ret) if ((ret) < 0) return -1
2080 
2081     // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2082     //
2083     // NOTE: CSI meta is undefined in spec, so this code has an assumption
2084     // that we're only using it for Tabix data.
2085     int nids = idx->n;
2086     if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2087         for (i = nids = 0; i < idx->n; ++i) {
2088             if (idx->bidx[i])
2089                 nids++;
2090         }
2091     }
2092     check(idx_write_int32(fp, nids));
2093     if (fmt == HTS_FMT_TBI && idx->l_meta)
2094         check(bgzf_write(fp, idx->meta, idx->l_meta));
2095 
2096     for (i = 0; i < idx->n; ++i) {
2097         khint_t k;
2098         bidx_t *bidx = idx->bidx[i];
2099         lidx_t *lidx = &idx->lidx[i];
2100 
2101         // write binning index
2102         if (nids == idx->n || bidx)
2103             check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2104         if (bidx)
2105             for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2106                 if (kh_exist(bidx, k)) {
2107                     bins_t *p = &kh_value(bidx, k);
2108                     check(idx_write_uint32(fp, kh_key(bidx, k)));
2109                     if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2110                     //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2111                     check(idx_write_int32(fp, p->n));
2112                     for (j = 0; j < p->n; ++j) {
2113                         //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2114                         check(idx_write_uint64(fp, p->list[j].u));
2115                         check(idx_write_uint64(fp, p->list[j].v));
2116                     }
2117                 }
2118 
2119         // write linear index
2120         if (fmt != HTS_FMT_CSI) {
2121             check(idx_write_int32(fp, lidx->n));
2122             for (j = 0; j < lidx->n; ++j)
2123                 check(idx_write_uint64(fp, lidx->offset[j]));
2124         }
2125     }
2126 
2127     check(idx_write_uint64(fp, idx->n_no_coor));
2128     return 0;
2129     #undef check
2130 }
2131 
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)2132 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2133 {
2134     int ret, save;
2135     if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2136     char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2137     if (fnidx == NULL) return -1;
2138 
2139     strcpy(fnidx, fn);
2140     switch (fmt) {
2141     case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2142     case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2143     case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2144     default: abort();
2145     }
2146 
2147     ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2148     save = errno;
2149     free(fnidx);
2150     errno = save;
2151     return ret;
2152 }
2153 
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)2154 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2155 {
2156     BGZF *fp;
2157 
2158     #define check(ret) if ((ret) < 0) goto fail
2159 
2160     if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
2161 
2162     fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2163     if (fp == NULL) return -1;
2164 
2165     if (fmt == HTS_FMT_CSI) {
2166         check(bgzf_write(fp, "CSI\1", 4));
2167         check(idx_write_int32(fp, idx->min_shift));
2168         check(idx_write_int32(fp, idx->n_lvls));
2169         check(idx_write_uint32(fp, idx->l_meta));
2170         if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2171     } else if (fmt == HTS_FMT_TBI) {
2172         check(bgzf_write(fp, "TBI\1", 4));
2173     } else if (fmt == HTS_FMT_BAI) {
2174         check(bgzf_write(fp, "BAI\1", 4));
2175     } else abort();
2176 
2177     check(hts_idx_save_core(idx, fp, fmt));
2178 
2179     return bgzf_close(fp);
2180     #undef check
2181 
2182 fail:
2183     bgzf_close(fp);
2184     return -1;
2185 }
2186 
idx_read_core(hts_idx_t * idx,BGZF * fp,int fmt)2187 static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2188 {
2189     int32_t i, n, is_be;
2190     is_be = ed_is_big();
2191     if (idx == NULL) return -4;
2192     for (i = 0; i < idx->n; ++i) {
2193         bidx_t *h;
2194         lidx_t *l = &idx->lidx[i];
2195         uint32_t key;
2196         int j, absent;
2197         bins_t *p;
2198         h = idx->bidx[i] = kh_init(bin);
2199         if (bgzf_read(fp, &n, 4) != 4) return -1;
2200         if (is_be) ed_swap_4p(&n);
2201         if (n < 0) return -3;
2202         for (j = 0; j < n; ++j) {
2203             khint_t k;
2204             if (bgzf_read(fp, &key, 4) != 4) return -1;
2205             if (is_be) ed_swap_4p(&key);
2206             k = kh_put(bin, h, key, &absent);
2207             if (absent <  0) return -2; // No memory
2208             if (absent == 0) return -3; // Duplicate bin number
2209             p = &kh_val(h, k);
2210             if (fmt == HTS_FMT_CSI) {
2211                 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2212                 if (is_be) ed_swap_8p(&p->loff);
2213             } else p->loff = 0;
2214             if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2215             if (is_be) ed_swap_4p(&p->n);
2216             if (p->n < 0) return -3;
2217             if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2218             p->m = p->n;
2219             p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2220             if (p->list == NULL) return -2;
2221             if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2222             if (is_be) swap_bins(p);
2223         }
2224         if (fmt != HTS_FMT_CSI) { // load linear index
2225             int j;
2226             uint32_t x;
2227             if (bgzf_read(fp, &x, 4) != 4) return -1;
2228             if (is_be) ed_swap_4p(&x);
2229             l->n = x;
2230             if (l->n < 0) return -3;
2231             if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2232             l->m = l->n;
2233             l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2234             if (l->offset == NULL) return -2;
2235             if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2236             if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2237             for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
2238                 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
2239             update_loff(idx, i, 0);
2240         }
2241     }
2242     if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2243     if (is_be) ed_swap_8p(&idx->n_no_coor);
2244     return 0;
2245 }
2246 
idx_read(const char * fn)2247 static hts_idx_t *idx_read(const char *fn)
2248 {
2249     uint8_t magic[4];
2250     int i, is_be;
2251     hts_idx_t *idx = NULL;
2252     uint8_t *meta = NULL;
2253     BGZF *fp = bgzf_open(fn, "r");
2254     if (fp == NULL) return NULL;
2255     is_be = ed_is_big();
2256     if (bgzf_read(fp, magic, 4) != 4) goto fail;
2257 
2258     if (memcmp(magic, "CSI\1", 4) == 0) {
2259         uint32_t x[3], n;
2260         if (bgzf_read(fp, x, 12) != 12) goto fail;
2261         if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2262         if (x[2]) {
2263             if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2264             if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2265             if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2266             // Prevent possible strlen past the end in tbx_index_load2
2267             meta[x[2]] = '\0';
2268         }
2269         if (bgzf_read(fp, &n, 4) != 4) goto fail;
2270         if (is_be) ed_swap_4p(&n);
2271         if (n > INT32_MAX) goto fail;
2272         if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2273         idx->l_meta = x[2];
2274         idx->meta = meta;
2275         meta = NULL;
2276         if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2277     }
2278     else if (memcmp(magic, "TBI\1", 4) == 0) {
2279         uint8_t x[8 * 4];
2280         uint32_t n;
2281         // Read file header
2282         if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2283         n = le_to_u32(&x[0]); // location of n_ref
2284         if (n > INT32_MAX) goto fail;
2285         if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2286         n = le_to_u32(&x[7*4]); // location of l_nm
2287         if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2288         idx->l_meta = 28 + n;
2289         if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2290         // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2291         // N.B. left in little-endian byte order.
2292         memcpy(idx->meta, &x[1*4], 28);
2293         // Read in sequence names.
2294         if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2295         // Prevent possible strlen past the end in tbx_index_load2
2296         idx->meta[idx->l_meta] = '\0';
2297         if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2298     }
2299     else if (memcmp(magic, "BAI\1", 4) == 0) {
2300         uint32_t n;
2301         if (bgzf_read(fp, &n, 4) != 4) goto fail;
2302         if (is_be) ed_swap_4p(&n);
2303         if (n > INT32_MAX) goto fail;
2304         if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2305         if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2306     }
2307     else { errno = EINVAL; goto fail; }
2308 
2309     bgzf_close(fp);
2310     return idx;
2311 
2312 fail:
2313     bgzf_close(fp);
2314     hts_idx_destroy(idx);
2315     free(meta);
2316     return NULL;
2317 }
2318 
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)2319 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
2320                       int is_copy)
2321 {
2322     uint8_t *new_meta = meta;
2323     if (is_copy) {
2324         size_t l = l_meta;
2325         if (l > SIZE_MAX - 1) {
2326             errno = ENOMEM;
2327             return -1;
2328         }
2329         new_meta = malloc(l + 1);
2330         if (!new_meta) return -1;
2331         memcpy(new_meta, meta, l);
2332         // Prevent possible strlen past the end in tbx_index_load2
2333         new_meta[l] = '\0';
2334     }
2335     if (idx->meta) free(idx->meta);
2336     idx->l_meta = l_meta;
2337     idx->meta = new_meta;
2338     return 0;
2339 }
2340 
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)2341 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
2342 {
2343     *l_meta = idx->l_meta;
2344     return idx->meta;
2345 }
2346 
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)2347 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
2348 {
2349     if ( !idx->n )
2350     {
2351         *n = 0;
2352         return NULL;
2353     }
2354 
2355     int tid = 0, i;
2356     const char **names = (const char**) calloc(idx->n,sizeof(const char*));
2357     for (i=0; i<idx->n; i++)
2358     {
2359         bidx_t *bidx = idx->bidx[i];
2360         if ( !bidx ) continue;
2361         names[tid++] = getid(hdr,i);
2362     }
2363     *n = tid;
2364     return names;
2365 }
2366 
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)2367 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
2368 {
2369     if ( idx->fmt == HTS_FMT_CRAI ) {
2370         *mapped = 0; *unmapped = 0;
2371         return -1;
2372     }
2373 
2374     bidx_t *h = idx->bidx[tid];
2375     khint_t k = kh_get(bin, h, META_BIN(idx));
2376     if (k != kh_end(h)) {
2377         *mapped = kh_val(h, k).list[1].u;
2378         *unmapped = kh_val(h, k).list[1].v;
2379         return 0;
2380     } else {
2381         *mapped = 0; *unmapped = 0;
2382         return -1;
2383     }
2384 }
2385 
hts_idx_get_n_no_coor(const hts_idx_t * idx)2386 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
2387 {
2388     return idx->n_no_coor;
2389 }
2390 
2391 /****************
2392  *** Iterator ***
2393  ****************/
2394 
2395 // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)2396 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
2397 {
2398     int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
2399     if (beg >= end) return 0;
2400     if (end >= 1LL<<s) end = 1LL<<s;
2401     for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2402         hts_pos_t b, e;
2403         int n, i;
2404         b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
2405         if (itr->bins.n + n > itr->bins.m) {
2406             itr->bins.m = itr->bins.n + n;
2407             kroundup32(itr->bins.m);
2408             itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
2409         }
2410         for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
2411     }
2412     return itr->bins.n;
2413 }
2414 
reg2intervals(hts_itr_t * iter,const hts_idx_t * idx,int tid,int64_t beg,int64_t end,uint32_t interval,uint64_t min_off,uint64_t max_off,int min_shift,int n_lvls)2415 static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
2416 {
2417     int l, t, s;
2418     int i, j;
2419     hts_pos_t b, e;
2420     hts_pair64_max_t *off;
2421     bidx_t *bidx;
2422     khint_t k;
2423     int start_n_off = iter->n_off;
2424 
2425     if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
2426         return -1;
2427 
2428     s = min_shift + (n_lvls<<1) + n_lvls;
2429     if (end >= 1LL<<s)
2430         end = 1LL<<s;
2431 
2432     for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2433         b = t + (beg>>s); e = t + (end>>s);
2434 
2435         for (i = b; i <= e; ++i) {
2436             if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) {
2437                 bins_t *p = &kh_value(bidx, k);
2438 
2439                 if (p->n) {
2440                     off = realloc(iter->off, (iter->n_off + p->n) * sizeof(*off));
2441                     if (!off)
2442                         return -2;
2443 
2444                     iter->off = off;
2445                     for (j = 0; j < p->n; ++j) {
2446                         if (p->list[j].v > min_off && p->list[j].u < max_off) {
2447                             iter->off[iter->n_off].u = min_off > p->list[j].u
2448                                 ? min_off : p->list[j].u;
2449                             iter->off[iter->n_off].v = max_off < p->list[j].v
2450                                 ? max_off : p->list[j].v;
2451                             // hts_pair64_max_t::max is now used to link
2452                             // file offsets to region list entries.
2453                             // The iterator can use this to decide if it
2454                             // can skip some file regions.
2455                             iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
2456                             iter->n_off++;
2457                         }
2458                     }
2459                 }
2460             }
2461         }
2462     }
2463 
2464     if (iter->n_off - start_n_off > 1) {
2465         ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
2466         for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
2467             if (iter->off[i].v >= iter->off[j].u) {
2468                 if (iter->off[i].v < iter->off[j].v)
2469                     iter->off[i].v = iter->off[j].v;
2470             } else {
2471                 i++;
2472                 if (i < j)
2473                     iter->off[i] = iter->off[j];
2474             }
2475         }
2476         iter->n_off = i + 1;
2477     }
2478 
2479     return iter->n_off;
2480 }
2481 
compare_regions(const void * r1,const void * r2)2482 static int compare_regions(const void *r1, const void *r2) {
2483     hts_reglist_t *reg1 = (hts_reglist_t *)r1;
2484     hts_reglist_t *reg2 = (hts_reglist_t *)r2;
2485 
2486     if (reg1->tid < 0 && reg2->tid >= 0)
2487         return 1;
2488     else if (reg1->tid >= 0 && reg2->tid < 0)
2489         return -1;
2490     else
2491         return reg1->tid - reg2->tid;
2492 }
2493 
hts_itr_off(const hts_idx_t * idx,int tid)2494 uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
2495 
2496     int i;
2497     bidx_t* bidx;
2498     uint64_t off0 = (uint64_t) -1;
2499     khint_t k;
2500     switch (tid) {
2501     case HTS_IDX_START:
2502         // Find the smallest offset, note that sequence ids may not be ordered sequentially
2503         for (i = 0; i < idx->n; i++) {
2504             bidx = idx->bidx[i];
2505             k = kh_get(bin, bidx, META_BIN(idx));
2506             if (k == kh_end(bidx))
2507                 continue;
2508 
2509             if (off0 > kh_val(bidx, k).list[0].u)
2510                 off0 = kh_val(bidx, k).list[0].u;
2511         }
2512         if (off0 == (uint64_t) -1 && idx->n_no_coor)
2513             off0 = 0;
2514         // only no-coor reads in this bam
2515         break;
2516     case HTS_IDX_NOCOOR:
2517         /* No-coor reads sort after all of the mapped reads.  The position
2518            is not stored in the index itself, so need to find the end
2519            offset for the last mapped read.  A loop is needed here in
2520            case references at the end of the file have no mapped reads,
2521            or sequence ids are not ordered sequentially.
2522            See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
2523         for (i = 0; i < idx->n; i++) {
2524             bidx = idx->bidx[i];
2525             k = kh_get(bin, bidx, META_BIN(idx));
2526             if (k != kh_end(bidx)) {
2527                 if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
2528                     off0 = kh_val(bidx, k).list[0].v;
2529                 }
2530             }
2531         }
2532         if (off0 == (uint64_t) -1 && idx->n_no_coor)
2533             off0 = 0;
2534         // only no-coor reads in this bam
2535         break;
2536     case HTS_IDX_REST:
2537         off0 = 0;
2538         break;
2539     case HTS_IDX_NONE:
2540         off0 = 0;
2541         break;
2542     }
2543 
2544     return off0;
2545 }
2546 
hts_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)2547 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
2548 {
2549     int i, n_off, l, bin;
2550     hts_pair64_max_t *off;
2551     khint_t k;
2552     bidx_t *bidx;
2553     uint64_t min_off, max_off;
2554     hts_itr_t *iter;
2555     uint32_t unmapped = 0, rel_off;
2556 
2557     // It's possible to call this function with NULL idx iff
2558     // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
2559     if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
2560         errno = EINVAL;
2561         return NULL;
2562     }
2563 
2564     iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2565     if (iter) {
2566         if (tid < 0) {
2567             uint64_t off = hts_itr_off(idx, tid);
2568             if (off != (uint64_t) -1) {
2569                 iter->read_rest = 1;
2570                 iter->curr_off = off;
2571                 iter->readrec = readrec;
2572                 if (tid == HTS_IDX_NONE)
2573                     iter->finished = 1;
2574             } else {
2575                 free(iter);
2576                 iter = NULL;
2577             }
2578         } else {
2579             if (beg < 0) beg = 0;
2580             if (end < beg) {
2581               free(iter);
2582               return NULL;
2583             }
2584             if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
2585               free(iter);
2586               return NULL;
2587             }
2588 
2589             k = kh_get(bin, bidx, META_BIN(idx));
2590             if (k != kh_end(bidx))
2591                 unmapped = kh_val(bidx, k).list[1].v;
2592             else
2593                 unmapped = 1;
2594 
2595             iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
2596             iter->readrec = readrec;
2597 
2598             if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2599 
2600             rel_off = beg>>idx->min_shift;
2601             // compute min_off
2602             bin = hts_bin_first(idx->n_lvls) + rel_off;
2603             do {
2604                 int first;
2605                 k = kh_get(bin, bidx, bin);
2606                 if (k != kh_end(bidx)) break;
2607                 first = (hts_bin_parent(bin)<<3) + 1;
2608                 if (bin > first) --bin;
2609                 else bin = hts_bin_parent(bin);
2610             } while (bin);
2611             if (bin == 0) k = kh_get(bin, bidx, bin);
2612             min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2613             // min_off can be calculated more accurately if the
2614             // linear index is available
2615             if (idx->lidx[tid].offset
2616                 && rel_off < idx->lidx[tid].n) {
2617                 if (min_off < idx->lidx[tid].offset[rel_off])
2618                     min_off = idx->lidx[tid].offset[rel_off];
2619                 if (unmapped) {
2620                     int tmp_off;
2621                     for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
2622                         if (idx->lidx[tid].offset[tmp_off] < min_off) {
2623                             min_off = idx->lidx[tid].offset[tmp_off];
2624                             break;
2625                         }
2626                     }
2627 
2628                     if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0))
2629                         min_off = kh_val(bidx, k).list[0].u;
2630                 }
2631             } else if (unmapped) { //CSI index
2632                 if (k != kh_end(bidx))
2633                     min_off = kh_val(bidx, k).list[0].u;
2634             }
2635 
2636             // compute max_off: a virtual offset from a bin to the right of end
2637             bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2638             if (bin >= idx->n_bins) bin = 0;
2639             while (1) {
2640                 // search for an extant bin by moving right, but moving up to the
2641                 // parent whenever we get to a first child (which also covers falling
2642                 // off the RHS, which wraps around and immediately goes up to bin 0)
2643                 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2644                 if (bin == 0) { max_off = (uint64_t)-1; break; }
2645                 k = kh_get(bin, bidx, bin);
2646                 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2647                 bin++;
2648             }
2649 
2650             // retrieve bins
2651             reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2652 
2653             for (i = n_off = 0; i < iter->bins.n; ++i)
2654                 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2655                     n_off += kh_value(bidx, k).n;
2656             if (n_off == 0) {
2657                 // No overlapping bins means the iterator has already finished.
2658                 iter->finished = 1;
2659                 return iter;
2660             }
2661             off = calloc(n_off, sizeof(*off));
2662             for (i = n_off = 0; i < iter->bins.n; ++i) {
2663                 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2664                     int j;
2665                     bins_t *p = &kh_value(bidx, k);
2666                     for (j = 0; j < p->n; ++j)
2667                         if (p->list[j].v > min_off && p->list[j].u < max_off) {
2668                             off[n_off].u = min_off > p->list[j].u
2669                                 ? min_off : p->list[j].u;
2670                             off[n_off].v = max_off < p->list[j].v
2671                                 ? max_off : p->list[j].v;
2672                             // hts_pair64_max_t::max is now used to link
2673                             // file offsets to region list entries.
2674                             // The iterator can use this to decide if it
2675                             // can skip some file regions.
2676                             off[n_off].max = ((uint64_t) tid << 32) | j;
2677                             n_off++;
2678                         }
2679                 }
2680             }
2681 
2682             if (n_off == 0) {
2683                 free(off);
2684                 iter->finished = 1;
2685                 return iter;
2686             }
2687             ks_introsort(_off_max, n_off, off);
2688             // resolve completely contained adjacent blocks
2689             for (i = 1, l = 0; i < n_off; ++i)
2690                 if (off[l].v < off[i].v) off[++l] = off[i];
2691             n_off = l + 1;
2692             // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2693             for (i = 1; i < n_off; ++i)
2694                 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2695             // merge adjacent blocks
2696             for (i = 1, l = 0; i < n_off; ++i) {
2697                 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2698                 else off[++l] = off[i];
2699             }
2700             n_off = l + 1;
2701             iter->n_off = n_off; iter->off = off;
2702         }
2703     }
2704 
2705     return iter;
2706 }
2707 
hts_itr_multi_bam(const hts_idx_t * idx,hts_itr_t * iter)2708 int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
2709 {
2710     int i, j, bin;
2711     khint_t k;
2712     bidx_t *bidx;
2713     uint64_t min_off, max_off, t_off = (uint64_t)-1;
2714     int tid;
2715     hts_pos_t beg, end;
2716     hts_reglist_t *curr_reg;
2717     uint32_t unmapped = 0, rel_off;
2718 
2719     if (!idx || !iter || !iter->multi)
2720         return -1;
2721 
2722     iter->i = -1;
2723     for (i=0; i<iter->n_reg; i++) {
2724 
2725         curr_reg = &iter->reg_list[i];
2726         tid = curr_reg->tid;
2727 
2728         if (tid < 0) {
2729             t_off = hts_itr_off(idx, tid);
2730             if (t_off != (uint64_t)-1) {
2731                 switch (tid) {
2732                 case HTS_IDX_NONE:
2733                     iter->finished = 1;
2734                     // fall through
2735                 case HTS_IDX_START:
2736                 case HTS_IDX_REST:
2737                     iter->curr_off = t_off;
2738                     iter->n_reg = 0;
2739                     iter->reg_list = NULL;
2740                     iter->read_rest = 1;
2741                     return 0;
2742                 case HTS_IDX_NOCOOR:
2743                     iter->nocoor = 1;
2744                     iter->nocoor_off = t_off;
2745                 }
2746             }
2747         } else {
2748             if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
2749                 continue;
2750 
2751             k = kh_get(bin, bidx, META_BIN(idx));
2752             if (k != kh_end(bidx))
2753                 unmapped = kh_val(bidx, k).list[1].v;
2754             else
2755                 unmapped = 1;
2756 
2757             for(j=0; j<curr_reg->count; j++) {
2758                 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
2759                 if (curr_intv->end < curr_intv->beg)
2760                     continue;
2761 
2762                 beg = curr_intv->beg;
2763                 end = curr_intv->end;
2764                 rel_off = beg>>idx->min_shift;
2765 
2766                 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
2767                        If the computed bin is not in the index, try the next bin to the
2768                        left, belonging to the same parent. If it is the first sibling bin,
2769                        try the parent bin. */
2770                 bin = hts_bin_first(idx->n_lvls) + rel_off;
2771                 do {
2772                     int first;
2773                     k = kh_get(bin, bidx, bin);
2774                     if (k != kh_end(bidx)) break;
2775                     first = (hts_bin_parent(bin)<<3) + 1;
2776                     if (bin > first) --bin;
2777                     else bin = hts_bin_parent(bin);
2778                 } while (bin);
2779                 if (bin == 0)
2780                     k = kh_get(bin, bidx, bin);
2781                 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2782                 // min_off can be calculated more accurately if the
2783                 // linear index is available
2784                 if (idx->lidx[tid].offset
2785                     && rel_off < idx->lidx[tid].n) {
2786                     if (min_off < idx->lidx[tid].offset[rel_off])
2787                         min_off = idx->lidx[tid].offset[rel_off];
2788                     if (unmapped) {
2789                         int tmp_off;
2790                         for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
2791                             if (idx->lidx[tid].offset[tmp_off] < min_off) {
2792                                 min_off = idx->lidx[tid].offset[tmp_off];
2793                                 break;
2794                             }
2795                         }
2796 
2797                         if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0))
2798                             min_off = kh_val(bidx, k).list[0].u;
2799                     }
2800                 } else if (unmapped) { //CSI index
2801                     if (k != kh_end(bidx))
2802                         min_off = kh_val(bidx, k).list[0].u;
2803                 }
2804 
2805                 // compute max_off: a virtual offset from a bin to the right of end
2806                 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2807                 if (bin >= idx->n_bins) bin = 0;
2808                 while (1) {
2809                     // search for an extant bin by moving right, but moving up to the
2810                     // parent whenever we get to a first child (which also covers falling
2811                     // off the RHS, which wraps around and immediately goes up to bin 0)
2812                     while (bin % 8 == 1) bin = hts_bin_parent(bin);
2813                     if (bin == 0) { max_off = (uint64_t)-1; break; }
2814                     k = kh_get(bin, bidx, bin);
2815                     if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
2816                         max_off = kh_val(bidx, k).list[0].u;
2817                         break;
2818                     }
2819                     bin++;
2820                 }
2821 
2822                 //convert coordinates to file offsets
2823                 if (reg2intervals(iter, idx, tid, beg, end, j,
2824                                   min_off, max_off,
2825                                   idx->min_shift, idx->n_lvls) < 0) {
2826                     return -1;
2827                 }
2828             }
2829         }
2830     }
2831 
2832     if (iter->n_off > 1)
2833         ks_introsort(_off_max, iter->n_off, iter->off);
2834 
2835     if(!iter->n_off && !iter->nocoor)
2836         iter->finished = 1;
2837 
2838     return 0;
2839 }
2840 
hts_itr_multi_cram(const hts_idx_t * idx,hts_itr_t * iter)2841 int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
2842 {
2843     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
2844     int tid, i, n_off = 0;
2845     uint32_t j;
2846     hts_pos_t beg, end;
2847     hts_reglist_t *curr_reg;
2848     hts_pair32_t *curr_intv;
2849     hts_pair64_max_t *off = NULL, *tmp;
2850     cram_index *e = NULL;
2851 
2852     if (!cidx || !iter || !iter->multi)
2853         return -1;
2854 
2855     iter->is_cram = 1;
2856     iter->read_rest = 0;
2857     iter->off = NULL;
2858     iter->n_off = 0;
2859     iter->curr_off = 0;
2860     iter->i = -1;
2861 
2862     for (i=0; i<iter->n_reg; i++) {
2863 
2864         curr_reg = &iter->reg_list[i];
2865         tid = curr_reg->tid;
2866 
2867         if (tid >= 0) {
2868             tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
2869             if (!tmp)
2870                 goto err;
2871             off = tmp;
2872 
2873             for (j=0; j < curr_reg->count; j++) {
2874                 curr_intv = &curr_reg->intervals[j];
2875                 if (curr_intv->end < curr_intv->beg)
2876                     continue;
2877 
2878                 beg = curr_intv->beg;
2879                 end = curr_intv->end;
2880 
2881 /* First, fetch the container overlapping 'beg' and assign its file offset to u, then
2882  * find the container overlapping 'end' and assign the relative end of the slice to v.
2883  * The cram_ptell function will adjust with the container offset, which is not stored
2884  * in the index.
2885  */
2886                 e = cram_index_query(cidx->cram, tid, beg+1, NULL);
2887                 if (e) {
2888                     off[n_off].u = e->offset;
2889                     // hts_pair64_max_t::max is now used to link
2890                     // file offsets to region list entries.
2891                     // The iterator can use this to decide if it
2892                     // can skip some file regions.
2893                     off[n_off].max = ((uint64_t) tid << 32) | j;
2894 
2895                     if (end >= HTS_POS_MAX) {
2896                        e = cram_index_last(cidx->cram, tid, NULL);
2897                     } else {
2898                        e = cram_index_query_last(cidx->cram, tid, end+1);
2899                     }
2900 
2901                     if (e) {
2902                         off[n_off++].v = e->next
2903                             ? e->next
2904                             : e->offset + e->slice + e->len;
2905                     } else {
2906                         hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
2907                     }
2908                 } else {
2909                     hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end);
2910                 }
2911             }
2912         } else {
2913             switch (tid) {
2914                 case HTS_IDX_NOCOOR:
2915                     e = cram_index_query(cidx->cram, tid, 1, NULL);
2916                     if (e) {
2917                         iter->nocoor = 1;
2918                         iter->nocoor_off = e->offset;
2919                     } else {
2920                         hts_log_warning("No index entry for NOCOOR region");
2921                     }
2922                     break;
2923                 case HTS_IDX_START:
2924                     e = cram_index_query(cidx->cram, tid, 1, NULL);
2925                     if (e) {
2926                         iter->read_rest = 1;
2927                         tmp = realloc(off, sizeof(*off));
2928                         if (!tmp)
2929                             goto err;
2930                         off = tmp;
2931                         off[0].u = e->offset;
2932                         off[0].v = 0;
2933                         n_off=1;
2934                     } else {
2935                         hts_log_warning("No index entries");
2936                     }
2937                     break;
2938                 case HTS_IDX_REST:
2939                     break;
2940                 case HTS_IDX_NONE:
2941                     iter->finished = 1;
2942                     break;
2943                 default:
2944                     hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
2945             }
2946         }
2947     }
2948 
2949     if (n_off) {
2950         ks_introsort(_off_max, n_off, off);
2951         iter->n_off = n_off; iter->off = off;
2952     }
2953 
2954     if(!n_off && !iter->nocoor)
2955         iter->finished = 1;
2956 
2957     return 0;
2958 
2959  err:
2960     free(off);
2961     return -1;
2962 }
2963 
hts_itr_destroy(hts_itr_t * iter)2964 void hts_itr_destroy(hts_itr_t *iter)
2965 {
2966     if (iter) {
2967         if (iter->multi) {
2968             hts_reglist_free(iter->reg_list, iter->n_reg);
2969         } else {
2970             free(iter->bins.a);
2971         }
2972 
2973         if (iter->off)
2974             free(iter->off);
2975         free(iter);
2976     }
2977 }
2978 
push_digit(long long i,char c)2979 static inline long long push_digit(long long i, char c)
2980 {
2981     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2982     int digit = c - '0';
2983     return 10 * i + digit;
2984 }
2985 
hts_parse_decimal(const char * str,char ** strend,int flags)2986 long long hts_parse_decimal(const char *str, char **strend, int flags)
2987 {
2988     long long n = 0;
2989     int decimals = 0, e = 0, lost = 0;
2990     char sign = '+', esign = '+';
2991     const char *s;
2992 
2993     while (isspace_c(*str)) str++;
2994     s = str;
2995 
2996     if (*s == '+' || *s == '-') sign = *s++;
2997     while (*s)
2998         if (isdigit_c(*s)) n = push_digit(n, *s++);
2999         else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3000         else break;
3001 
3002     if (*s == '.') {
3003         s++;
3004         while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
3005     }
3006 
3007     if (*s == 'E' || *s == 'e') {
3008         s++;
3009         if (*s == '+' || *s == '-') esign = *s++;
3010         while (isdigit_c(*s)) e = push_digit(e, *s++);
3011         if (esign == '-') e = -e;
3012     }
3013 
3014     switch (*s) {
3015     case 'k': case 'K': e += 3; s++; break;
3016     case 'm': case 'M': e += 6; s++; break;
3017     case 'g': case 'G': e += 9; s++; break;
3018     }
3019 
3020     e -= decimals;
3021     while (e > 0) n *= 10, e--;
3022     while (e < 0) lost += n % 10, n /= 10, e++;
3023 
3024     if (lost > 0) {
3025         hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3026     }
3027 
3028     if (strend) {
3029         *strend = (char *)s;
3030     } else if (*s) {
3031         if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3032             hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3033     }
3034 
3035     return (sign == '+')? n : -n;
3036 }
3037 
hts_memrchr(const void * s,int c,size_t n)3038 static void *hts_memrchr(const void *s, int c, size_t n) {
3039     size_t i;
3040     unsigned char *u = (unsigned char *)s;
3041     for (i = n; i > 0; i--) {
3042         if (u[i-1] == c)
3043             return u+i-1;
3044     }
3045 
3046     return NULL;
3047 }
3048 
3049 /*
3050  * A variant of hts_parse_reg which is reference-id aware.  It uses
3051  * the iterator name2id callbacks to validate the region tokenisation works.
3052  *
3053  * This is necessary due to GRCh38 HLA additions which have reference names
3054  * like "HLA-DRB1*12:17".
3055  *
3056  * All parameters are mandatory.
3057  *
3058  * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3059  * are reference names, we may quote using curly braces.
3060  * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3061  *
3062  * Flags are used to control how parsing works, and can be one of the below.
3063  *
3064  * HTS_PARSE_LIST:
3065  *     If present, the region is assmed to be a comma separated list and
3066  *     position parsing will not contain commas (this implicitly
3067  *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3068  *     On success the return pointer will be the start of the next region, ie
3069  *     the character after the comma.  (If *ret != '\0' then the caller can
3070  *     assume another region is present in the list.)
3071  *
3072  *     If not set then positions may contain commas.  In this case the return
3073  *     value should point to the end of the string, or NULL on failure.
3074  *
3075  * HTS_PARSE_ONE_COORD:
3076  *     If present, X:100 is treated as the single base pair region X:100-100.
3077  *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3078  *     (This is the standard bcftools region convention.)
3079  *
3080  *     When not set X:100 is considered to be X:100-<end> where <end> is
3081  *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3082  *     are invalid.
3083  *     (This is the standard samtools region convention.)
3084  *
3085  * Note the supplied string expects 1 based inclusive coordinates, but the
3086  * returned coordinates start from 0 and are half open, so pos0 is valid
3087  * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3088  *
3089  * On success a pointer to the byte after the end of the entire region
3090  *            specifier is returned (plus any trailing comma), and tid,
3091  *            beg & end will be set.
3092  * On failure NULL is returned.
3093  */
hts_parse_region(const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,hts_name2id_f getid,void * hdr,int flags)3094 const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3095                              hts_pos_t *end, hts_name2id_f getid, void *hdr,
3096                              int flags)
3097 {
3098     if (!s || !tid || !beg || !end || !getid)
3099         return NULL;
3100 
3101     size_t s_len = strlen(s);
3102     kstring_t ks = { 0, 0, NULL };
3103 
3104     const char *colon = NULL, *comma = NULL;
3105     int quoted = 0;
3106 
3107     if (flags & HTS_PARSE_LIST)
3108         flags &= ~HTS_PARSE_THOUSANDS_SEP;
3109     else
3110         flags |= HTS_PARSE_THOUSANDS_SEP;
3111 
3112     const char *s_end = s + s_len;
3113 
3114     // Braced quoting of references is permitted to resolve ambiguities.
3115     if (*s == '{') {
3116         const char *close = memchr(s, '}', s_len);
3117         if (!close) {
3118             hts_log_error("Mismatching braces in \"%s\"", s);
3119             *tid = -1;
3120             return NULL;
3121         }
3122         s++;
3123         s_len--;
3124         if (close[1] == ':')
3125             colon = close+1;
3126         quoted = 1; // number of trailing characters to trim
3127 
3128         // Truncate to this item only, if appropriate.
3129         if (flags & HTS_PARSE_LIST) {
3130             comma = strchr(close, ',');
3131             if (comma) {
3132                 s_len = comma-s;
3133                 s_end = comma+1;
3134             }
3135         }
3136     } else {
3137         // Truncate to this item only, if appropriate.
3138         if (flags & HTS_PARSE_LIST) {
3139             comma = strchr(s, ',');
3140             if (comma) {
3141                 s_len = comma-s;
3142                 s_end = comma+1;
3143             }
3144         }
3145 
3146         colon = hts_memrchr(s, ':', s_len);
3147     }
3148 
3149     // No colon is simplest case; just check and return.
3150     if (colon == NULL) {
3151         *beg = 0; *end = HTS_POS_MAX;
3152         kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
3153         if (!ks.s) {
3154             *tid = -2;
3155             return NULL;
3156         }
3157 
3158         *tid = getid(hdr, ks.s);
3159         free(ks.s);
3160 
3161         return *tid >= 0 ? s_end : NULL;
3162     }
3163 
3164     // Has a colon, but check whole name first.
3165     if (!quoted) {
3166         *beg = 0; *end = HTS_POS_MAX;
3167         kputsn(s, s_len, &ks); // convert to nul terminated string
3168         if (!ks.s) {
3169             *tid = -2;
3170             return NULL;
3171         }
3172         if ((*tid = getid(hdr, ks.s)) >= 0) {
3173             // Entire name matches, but also check this isn't
3174             // ambiguous.  eg we have ref chr1 and ref chr1:100-200
3175             // both present.
3176             ks.l = 0;
3177             kputsn(s, colon-s, &ks); // convert to nul terminated string
3178             if (!ks.s) {
3179                 *tid = -2;
3180                 return NULL;
3181             }
3182             if (getid(hdr, ks.s) >= 0) {
3183                 free(ks.s);
3184                 *tid = -1;
3185                 hts_log_error("Range is ambiguous. "
3186                               "Use {%s} or {%.*s}%s instead",
3187                               s, (int)(colon-s), s, colon);
3188                 return NULL;
3189             }
3190             free(ks.s);
3191 
3192             return s_end;
3193         }
3194         if (*tid < -1) // Failed to parse header
3195             return NULL;
3196     }
3197 
3198     // Quoted, or unquoted and whole string isn't a name.
3199     // Check the pre-colon part is valid.
3200     ks.l = 0;
3201     kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
3202     if (!ks.s) {
3203         *tid = -2;
3204         return NULL;
3205     }
3206     *tid = getid(hdr, ks.s);
3207     free(ks.s);
3208     if (*tid < 0)
3209         return NULL;
3210 
3211     // Finally parse the post-colon coordinates
3212     char *hyphen;
3213     *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
3214     if (*beg < 0) {
3215         if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
3216             // interpret chr:-100 as chr:1-100
3217             *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
3218             *beg = 0;
3219             return s_end;
3220         } else if (*hyphen == '-') {
3221             *beg = 0;
3222         } else {
3223             hts_log_error("Unexpected string \"%s\" after region", hyphen);
3224             return NULL;
3225         }
3226     }
3227 
3228     if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
3229         *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
3230     } else if (*hyphen == '-') {
3231         *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
3232         if (*hyphen != '\0' && *hyphen != ',') {
3233             hts_log_error("Unexpected string \"%s\" after region", hyphen);
3234             return NULL;
3235         }
3236     } else {
3237         hts_log_error("Unexpected string \"%s\" after region", hyphen);
3238         return NULL;
3239     }
3240 
3241     if (*end == 0)
3242         *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
3243 
3244     if (*beg >= *end) return NULL;
3245 
3246     return s_end;
3247 }
3248 
3249 // Next release we should mark this as deprecated?
3250 // Use hts_parse_region above instead.
hts_parse_reg64(const char * s,hts_pos_t * beg,hts_pos_t * end)3251 const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
3252 {
3253     char *hyphen;
3254     const char *colon = strrchr(s, ':');
3255     if (colon == NULL) {
3256         *beg = 0; *end = HTS_POS_MAX;
3257         return s + strlen(s);
3258     }
3259 
3260     *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
3261     if (*beg < 0) *beg = 0;
3262 
3263     if (*hyphen == '\0') *end = HTS_POS_MAX;
3264     else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
3265     else return NULL;
3266 
3267     if (*beg >= *end) return NULL;
3268     return colon;
3269 }
3270 
hts_parse_reg(const char * s,int * beg,int * end)3271 const char *hts_parse_reg(const char *s, int *beg, int *end)
3272 {
3273     hts_pos_t beg64 = 0, end64 = 0;
3274     const char *colon = hts_parse_reg64(s, &beg64, &end64);
3275     if (beg64 > INT_MAX) {
3276         hts_log_error("Position %"PRId64" too large", beg64);
3277         return NULL;
3278     }
3279     if (end64 > INT_MAX) {
3280         if (end64 == HTS_POS_MAX) {
3281             end64 = INT_MAX;
3282         } else {
3283             hts_log_error("Position %"PRId64" too large", end64);
3284             return NULL;
3285         }
3286     }
3287     *beg = beg64;
3288     *end = end64;
3289     return colon;
3290 }
3291 
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)3292 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
3293 {
3294     int tid;
3295     hts_pos_t beg, end;
3296 
3297     if (strcmp(reg, ".") == 0)
3298         return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
3299     else if (strcmp(reg, "*") == 0)
3300         return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
3301 
3302     if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
3303         return NULL;
3304 
3305     return itr_query(idx, tid, beg, end, readrec);
3306 }
3307 
hts_itr_regions(const hts_idx_t * idx,hts_reglist_t * reglist,int count,hts_name2id_f getid,void * hdr,hts_itr_multi_query_func * itr_specific,hts_readrec_func * readrec,hts_seek_func * seek,hts_tell_func * tell)3308 hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
3309 
3310     int i;
3311 
3312     if (!reglist)
3313         return NULL;
3314 
3315     hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3316     if (itr) {
3317         itr->n_reg = count;
3318         itr->readrec = readrec;
3319         itr->seek = seek;
3320         itr->tell = tell;
3321         itr->reg_list = reglist;
3322         itr->finished = 0;
3323         itr->nocoor = 0;
3324         itr->multi = 1;
3325 
3326         for (i = 0; i < itr->n_reg; i++) {
3327             if (itr->reg_list[i].reg) {
3328                 if (!strcmp(itr->reg_list[i].reg, ".")) {
3329                     itr->reg_list[i].tid = HTS_IDX_START;
3330                     continue;
3331                 }
3332 
3333                 if (!strcmp(itr->reg_list[i].reg, "*")) {
3334                     itr->reg_list[i].tid = HTS_IDX_NOCOOR;
3335                     continue;
3336                 }
3337 
3338                 itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
3339                 if (itr->reg_list[i].tid < 0) {
3340                     if (itr->reg_list[i].tid < -1) {
3341                         hts_log_error("Failed to parse header");
3342                         hts_itr_destroy(itr);
3343                         return NULL;
3344                     } else {
3345                         hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
3346                     }
3347                 }
3348             }
3349         }
3350 
3351         qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
3352         if (itr_specific(idx, itr) != 0) {
3353             hts_log_error("Failed to create the multi-region iterator!");
3354             hts_itr_destroy(itr);
3355             itr = NULL;
3356         }
3357     }
3358 
3359     return itr;
3360 }
3361 
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)3362 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
3363 {
3364     int ret, tid;
3365     hts_pos_t beg, end;
3366     if (iter == NULL || iter->finished) return -1;
3367     if (iter->read_rest) {
3368         if (iter->curr_off) { // seek to the start
3369             if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
3370                 hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
3371                               iter->curr_off,
3372                               errno ? ": " : "", strerror(errno));
3373                 return -2;
3374             }
3375             iter->curr_off = 0; // only seek once
3376         }
3377         ret = iter->readrec(fp, data, r, &tid, &beg, &end);
3378         if (ret < 0) iter->finished = 1;
3379         iter->curr_tid = tid;
3380         iter->curr_beg = beg;
3381         iter->curr_end = end;
3382         return ret;
3383     }
3384     // A NULL iter->off should always be accompanied by iter->finished.
3385     assert(iter->off != NULL);
3386     for (;;) {
3387         if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
3388             if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
3389             if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
3390                 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
3391                     hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
3392                                   iter->off[iter->i+1].u,
3393                                   errno ? ": " : "", strerror(errno));
3394                     return -2;
3395                 }
3396                 iter->curr_off = bgzf_tell(fp);
3397             }
3398             ++iter->i;
3399         }
3400         if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
3401             iter->curr_off = bgzf_tell(fp);
3402             if (tid != iter->tid || beg >= iter->end) { // no need to proceed
3403                 ret = -1; break;
3404             } else if (end > iter->beg && iter->end > beg) {
3405                 iter->curr_tid = tid;
3406                 iter->curr_beg = beg;
3407                 iter->curr_end = end;
3408                 return ret;
3409             }
3410         } else break; // end of file or error
3411     }
3412     iter->finished = 1;
3413     return ret;
3414 }
3415 
hts_itr_multi_next(htsFile * fd,hts_itr_t * iter,void * r)3416 int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
3417 {
3418     void *fp;
3419     int ret, tid, i, cr, ci;
3420     hts_pos_t beg, end;
3421     hts_reglist_t *found_reg;
3422 
3423     if (iter == NULL || iter->finished) return -1;
3424 
3425     if (iter->is_cram) {
3426         fp = fd->fp.cram;
3427     } else {
3428         fp = fd->fp.bgzf;
3429     }
3430 
3431     if (iter->read_rest) {
3432         if (iter->curr_off) { // seek to the start
3433             if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3434                 hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
3435                 return -1;
3436             }
3437             iter->curr_off = 0; // only seek once
3438         }
3439 
3440         ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3441         if (ret < 0)
3442             iter->finished = 1;
3443 
3444         iter->curr_tid = tid;
3445         iter->curr_beg = beg;
3446         iter->curr_end = end;
3447 
3448         return ret;
3449     }
3450     // A NULL iter->off should always be accompanied by iter->finished.
3451     assert(iter->off != NULL || iter->nocoor != 0);
3452 
3453     int next_range = 0;
3454     for (;;) {
3455         // Note that due to the way bam indexing works, iter->off may contain
3456         // file chunks that are not actually needed as they contain data
3457         // beyond the end of the requested region.  These are filtered out
3458         // by comparing the tid and index into hts_reglist_t::intervals
3459         // (packed for reasons of convenience into iter->off[iter->i].max)
3460         // associated with the file region with iter->curr_tid and
3461         // iter->curr_intv.
3462 
3463         if (next_range
3464             || iter->curr_off == 0
3465             || iter->i >= iter->n_off
3466             || iter->curr_off >= iter->off[iter->i].v
3467             || (iter->off[iter->i].max >> 32 == iter->curr_tid
3468                 && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
3469 
3470             // Jump to the next chunk.  It may be necessary to skip more
3471             // than one as the iter->off list can include overlapping entries.
3472             do {
3473                 iter->i++;
3474             } while (iter->i < iter->n_off
3475                      && (iter->curr_off >= iter->off[iter->i].v
3476                          || (iter->off[iter->i].max >> 32 == iter->curr_tid
3477                              && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
3478 
3479             if (iter->is_cram && iter->i < iter->n_off) {
3480                 // Ensure iter->curr_reg is correct.
3481                 //
3482                 // We need this for CRAM as we shortcut some of the later
3483                 // logic by getting an end-of-range and continuing to the
3484                 // next offset.
3485                 //
3486                 // We cannot do this for BAM (and fortunately do not need to
3487                 // either) because in BAM world a query to genomic positions
3488                 // GX and GY leading to a seek offsets PX and PY may have
3489                 // GX > GY and PX < PY.  (This is due to the R-tree and falling
3490                 // between intervals, bumping up to a higher bin.)
3491                 // CRAM strictly follows PX >= PY if GX >= GY, so this logic
3492                 // works.
3493                 int want_tid = iter->off[iter->i].max >> 32;
3494                 if (!(iter->curr_reg < iter->n_reg &&
3495                       iter->reg_list[iter->curr_reg].tid == want_tid)) {
3496                     int j;
3497                     for (j = 0; j < iter->n_reg; j++)
3498                         if (iter->reg_list[j].tid == want_tid)
3499                             break;
3500                     if (j == iter->n_reg)
3501                         return -1;
3502                     iter->curr_reg = j;
3503                     iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
3504                 };
3505                 iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
3506             }
3507 
3508             if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
3509                 if (iter->nocoor) {
3510                     next_range = 0;
3511                     if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
3512                         hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
3513                         return -1;
3514                     }
3515                     if (iter->is_cram) {
3516                         cram_range r = { HTS_IDX_NOCOOR };
3517                         cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
3518                     }
3519 
3520                     // The first slice covering the unmapped reads might
3521                     // contain a few mapped reads, so scroll
3522                     // forward until finding the first unmapped read.
3523                     do {
3524                         ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3525                     } while (tid >= 0 && ret >=0);
3526 
3527                     if (ret < 0)
3528                         iter->finished = 1;
3529                     else
3530                         iter->read_rest = 1;
3531 
3532                     iter->curr_off = 0; // don't seek any more
3533                     iter->curr_tid = tid;
3534                     iter->curr_beg = beg;
3535                     iter->curr_end = end;
3536 
3537                     return ret;
3538                 } else {
3539                     ret = -1; break;
3540                 }
3541             } else if (iter->i < iter->n_off) {
3542                 // New chunk may overlap the last one, so ensure we
3543                 // only seek forwards.
3544                 if (iter->curr_off < iter->off[iter->i].u || next_range) {
3545                     iter->curr_off = iter->off[iter->i].u;
3546 
3547                     // CRAM has the capability of setting an end location.
3548                     // This means multi-threaded decodes can stop once they
3549                     // reach that point, rather than pointlessly decoding
3550                     // more slices than we'll be using.
3551                     //
3552                     // We have to be careful here.  Whenever we set the cram
3553                     // range we need a corresponding seek in order to ensure
3554                     // we can safely decode at that offset.  We use next_range
3555                     // var to ensure this is always true; this is set on
3556                     // end-of-range condition. It's never modified for BAM.
3557                     if (iter->is_cram) {
3558                         // Next offset.[uv] tuple, but it's already been
3559                         // included in our cram range, so don't seek and don't
3560                         // reset range so we can efficiently multi-thread.
3561                         if (next_range || iter->curr_off >= iter->end) {
3562                             if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3563                                 hts_log_error("Seek at offset %" PRIu64
3564                                         " failed.", iter->curr_off);
3565                                 return -1;
3566                             }
3567 
3568                             // Find the genomic range matching this interval.
3569                             int j;
3570                             hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
3571                             cram_range r = {
3572                                     rl->tid,
3573                                     rl->intervals[iter->curr_intv].beg,
3574                                     rl->intervals[iter->curr_intv].end
3575                             };
3576 
3577                             // Expand it up to cover neighbouring intervals.
3578                             // Note we can only have a single chromosome in a
3579                             // range, so if we detect our blocks span chromosomes
3580                             // or we have a multi-ref mode slice, we just use
3581                             // HTS_IDX_START refid instead.  This doesn't actually
3582                             // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
3583                             // and indicator of decoding with no end limit.
3584                             //
3585                             // That isn't as efficient as it could be, but it's
3586                             // no poorer than before and it works.
3587                             int tid = r.refid;
3588                             int64_t end = r.end;
3589                             int64_t v = iter->off[iter->i].v;
3590                             j = iter->i+1;
3591                             while (j < iter->n_off) {
3592                                 if (iter->off[j].u > v)
3593                                     break;
3594 
3595                                 uint64_t max = iter->off[j].max;
3596                                 if ((max>>32) != tid)
3597                                     tid = HTS_IDX_START; // => no range limit
3598 
3599                                 if (end < rl->intervals[max & 0xffffffff].end)
3600                                     end = rl->intervals[max & 0xffffffff].end;
3601                                 if (v < iter->off[j].v)
3602                                     v = iter->off[j].v;
3603                                 j++;
3604                             }
3605                             r.refid = tid;
3606                             r.end = end;
3607 
3608                             // Remember maximum 'v' here so we don't do
3609                             // unnecessary subsequent seeks for the next
3610                             // regions.  We can't change curr_off, but
3611                             // beg/end are used only by single region iterator so
3612                             // we cache it there to avoid changing the struct.
3613                             iter->end = v;
3614 
3615                             cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
3616                             next_range = 0;
3617                         }
3618                     } else { // Not CRAM
3619                         if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3620                             hts_log_error("Seek at offset %" PRIu64 " failed.",
3621                                           iter->curr_off);
3622                             return -1;
3623                         }
3624                     }
3625                 }
3626             }
3627         }
3628 
3629         ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3630         if (ret < 0) {
3631             if (iter->is_cram && cram_eof(fp)) {
3632                 // Skip to end of range
3633                 //
3634                 // We should never be adjusting curr_off manually unless
3635                 // we also can guarantee we'll be doing a seek after to
3636                 // a new location.  Otherwise we'll be reading wrong offset
3637                 // for the next container.
3638                 //
3639                 // We ensure this by adjusting our CRAM_OPT_RANGE
3640                 // accordingly above, but to double check we also
3641                 // set the skipped_block flag to enforce a seek also.
3642                 iter->curr_off = iter->off[iter->i].v;
3643                 next_range = 1;
3644 
3645                 // Next region
3646                 if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
3647                     if (++iter->curr_reg >= iter->n_reg)
3648                         break;
3649                     iter->curr_intv = 0;
3650                     iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
3651                 }
3652                 continue;
3653             } else {
3654                 break;
3655             }
3656         }
3657 
3658         iter->curr_off = iter->tell(fp);
3659 
3660         if (tid != iter->curr_tid) {
3661             hts_reglist_t key;
3662             key.tid = tid;
3663 
3664             found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
3665                                                  iter->n_reg,
3666                                                  sizeof(hts_reglist_t),
3667                                                  compare_regions);
3668             if (!found_reg)
3669                 continue;
3670 
3671             iter->curr_reg = (found_reg - iter->reg_list);
3672             iter->curr_tid = tid;
3673             iter->curr_intv = 0;
3674         }
3675 
3676         cr = iter->curr_reg;
3677         ci = iter->curr_intv;
3678 
3679         for (i = ci; i < iter->reg_list[cr].count; i++) {
3680             if (end > iter->reg_list[cr].intervals[i].beg &&
3681                 iter->reg_list[cr].intervals[i].end > beg) {
3682                 iter->curr_beg = beg;
3683                 iter->curr_end = end;
3684                 iter->curr_intv = i;
3685 
3686                 return ret;
3687             }
3688 
3689             // Check if the read starts beyond intervals[i].end
3690             // If so, the interval is finished so move on to the next.
3691             if (beg > iter->reg_list[cr].intervals[i].end)
3692                 iter->curr_intv = i + 1;
3693 
3694             // No need to keep searching if the read ends before intervals[i].beg
3695             if (end < iter->reg_list[cr].intervals[i].beg)
3696                 break;
3697         }
3698     }
3699     iter->finished = 1;
3700 
3701     return ret;
3702 }
3703 
3704 /**********************
3705  *** Retrieve index ***
3706  **********************/
3707 // Local_fn and local_len will return a sub-region of 'fn'.
3708 // Eg http://elsewhere/dir/foo.bam.bai?a=b may return
3709 // foo.bam.bai via local_fn and local_len.
3710 //
3711 // Returns -1 if index couldn't be opened.
3712 //         -2 on other errors
idx_test_and_fetch(const char * fn,const char ** local_fn,int * local_len,int download)3713 static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
3714 {
3715     hFILE *remote_hfp = NULL;
3716     hFILE *local_fp = NULL;
3717     int save_errno;
3718     htsFormat fmt;
3719     kstring_t s = KS_INITIALIZE;
3720     kstring_t tmps = KS_INITIALIZE;
3721 
3722     if (hisremote(fn)) {
3723         const int buf_size = 1 * 1024 * 1024;
3724         int l;
3725         const char *p, *e;
3726         // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
3727         e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
3728         // Find the previous slash from there.
3729         p = e;
3730         while (p > fn && *p != '/') p--;
3731         if (*p == '/') p++;
3732 
3733         // Attempt to open local file first
3734         kputsn(p, e-p, &s);
3735         if (access(s.s, R_OK) == 0)
3736         {
3737             free(s.s);
3738             *local_fn = p;
3739             *local_len = e-p;
3740             return 0;
3741         }
3742 
3743         // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
3744         if ((remote_hfp = hopen(fn, "r")) == 0) {
3745             hts_log_info("Failed to open index file '%s'", fn);
3746             free(s.s);
3747             return -1;
3748         }
3749         if (hts_detect_format(remote_hfp, &fmt)) {
3750             hts_log_error("Failed to detect format of index file '%s'", fn);
3751             goto fail;
3752         }
3753         if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
3754                 && fmt.format != crai && fmt.format != fai_format)) {
3755             hts_log_error("Format of index file '%s' is not supported", fn);
3756             goto fail;
3757         }
3758 
3759         if (download) {
3760             if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
3761                 hts_log_error("Failed to create file %s in the working directory", p);
3762                 goto fail;
3763             }
3764             hts_log_info("Downloading file %s to local directory", fn);
3765             uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
3766             if (!buf) {
3767                 hts_log_error("%s", strerror(errno));
3768                 goto fail;
3769             }
3770             while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
3771                 if (hwrite(local_fp, buf, l) != l) {
3772                     hts_log_error("Failed to write data to %s : %s",
3773                             fn, strerror(errno));
3774                     free(buf);
3775                     goto fail;
3776                 }
3777             }
3778             free(buf);
3779             if (l < 0) {
3780                 hts_log_error("Error reading \"%s\"", fn);
3781                 goto fail;
3782             }
3783             if (hclose(local_fp) < 0) {
3784                 hts_log_error("Error closing %s : %s", fn, strerror(errno));
3785                 local_fp = NULL;
3786                 goto fail;
3787             }
3788             local_fp = NULL;
3789             if (rename(tmps.s, s.s) < 0) {
3790                 hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
3791                 goto fail;
3792             }
3793             ks_clear(&tmps);
3794 
3795             *local_fn = p;
3796             *local_len = e-p;
3797         } else {
3798             *local_fn = fn;
3799             *local_len = e-fn;
3800         }
3801 
3802         if (hclose(remote_hfp) != 0) {
3803             hts_log_error("Failed to close remote file %s", fn);
3804         }
3805 
3806         free(tmps.s);
3807         free(s.s);
3808         return 0;
3809     } else {
3810         hFILE *local_hfp;
3811         if ((local_hfp = hopen(fn, "r")) == 0) return -1;
3812         hclose_abruptly(local_hfp);
3813         *local_fn = fn;
3814         *local_len = strlen(fn);
3815         return 0;
3816     }
3817 
3818  fail:
3819     save_errno = errno;
3820     if (remote_hfp) hclose_abruptly(remote_hfp);
3821     if (local_fp) hclose_abruptly(local_fp);
3822     if (tmps.l > 0) unlink(tmps.s);
3823     free(tmps.s);
3824     free(s.s);
3825     errno = save_errno;
3826     return -2;
3827 }
3828 
3829 /*
3830  * Check the existence of a local index file using part of the alignment file name.
3831  * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai
3832  * @param fn    - pointer to the file name
3833  * @param fnidx - pointer to the index file name placeholder
3834  * @return        1 for success, 0 for failure
3835  */
hts_idx_check_local(const char * fn,int fmt,char ** fnidx)3836 int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
3837     int i, l_fn, l_ext;
3838     const char *fn_tmp = NULL;
3839     char *fnidx_tmp;
3840     char *csi_ext = ".csi";
3841     char *bai_ext = ".bai";
3842     char *tbi_ext = ".tbi";
3843     char *crai_ext = ".crai";
3844     char *fai_ext = ".fai";
3845 
3846     if (!fn)
3847         return 0;
3848 
3849     if (hisremote(fn)) {
3850         for (i = strlen(fn) - 1; i >= 0; --i)
3851             if (fn[i] == '/') {
3852                 fn_tmp = (char *)&fn[i+1];
3853                 break;
3854             }
3855     } else {
3856         // Borrowed from hopen_fd_fileuri()
3857         if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
3858         else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
3859         else fn_tmp = fn;
3860 #if defined(_WIN32) || defined(__MSYS__)
3861         // For cases like C:/foo
3862         if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
3863             fn_tmp++;
3864 #endif
3865     }
3866 
3867     if (!fn_tmp) return 0;
3868     hts_log_info("Using alignment file '%s'", fn_tmp);
3869     l_fn = strlen(fn_tmp); l_ext = 5;
3870     fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
3871     if (!fnidx_tmp) return 0;
3872 
3873     struct stat sbuf;
3874 
3875     // Try alignment.bam.csi first
3876     strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
3877     if(stat(fnidx_tmp, &sbuf) == 0) {
3878         *fnidx = fnidx_tmp;
3879         return 1;
3880     } else { // Then try alignment.csi
3881         for (i = l_fn - 1; i > 0; --i)
3882             if (fnidx_tmp[i] == '.') {
3883                 strcpy(fnidx_tmp + i, csi_ext);
3884                 if(stat(fnidx_tmp, &sbuf) == 0) {
3885                     *fnidx = fnidx_tmp;
3886                     return 1;
3887                 }
3888                 break;
3889             }
3890     }
3891     if (fmt == HTS_FMT_BAI) {
3892         // Next, try alignment.bam.bai
3893         strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
3894         if(stat(fnidx_tmp, &sbuf) == 0) {
3895             *fnidx = fnidx_tmp;
3896             return 1;
3897         } else { // And finally, try alignment.bai
3898             for (i = l_fn - 1; i > 0; --i)
3899                 if (fnidx_tmp[i] == '.') {
3900                     strcpy(fnidx_tmp + i, bai_ext);
3901                     if(stat(fnidx_tmp, &sbuf) == 0) {
3902                         *fnidx = fnidx_tmp;
3903                         return 1;
3904                     }
3905                     break;
3906                 }
3907         }
3908     } else if (fmt == HTS_FMT_TBI) { // Or .tbi
3909         strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
3910         if(stat(fnidx_tmp, &sbuf) == 0) {
3911             *fnidx = fnidx_tmp;
3912             return 1;
3913         } else {
3914             for (i = l_fn - 1; i > 0; --i)
3915                 if (fnidx_tmp[i] == '.') {
3916                     strcpy(fnidx_tmp + i, tbi_ext);
3917                     if(stat(fnidx_tmp, &sbuf) == 0) {
3918                         *fnidx = fnidx_tmp;
3919                         return 1;
3920                     }
3921                     break;
3922                 }
3923         }
3924     } else if (fmt == HTS_FMT_CRAI) { // Or .crai
3925         strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
3926         if(stat(fnidx_tmp, &sbuf) == 0) {
3927             *fnidx = fnidx_tmp;
3928             return 1;
3929         } else {
3930             for (i = l_fn - 1; i > 0; --i)
3931                 if (fnidx_tmp[i] == '.') {
3932                     strcpy(fnidx_tmp + i, crai_ext);
3933                     if(stat(fnidx_tmp, &sbuf) == 0) {
3934                         *fnidx = fnidx_tmp;
3935                         return 1;
3936                     }
3937                     break;
3938                 }
3939         }
3940     } else if (fmt == HTS_FMT_FAI) { // Or .fai
3941         strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, fai_ext);
3942         *fnidx = fnidx_tmp;
3943         if(stat(fnidx_tmp, &sbuf) == 0)
3944             return 1;
3945         else
3946             return 0;
3947     }
3948 
3949     free(fnidx_tmp);
3950     return 0;
3951 }
3952 
idx_filename(const char * fn,const char * ext,int download)3953 static char *idx_filename(const char *fn, const char *ext, int download) {
3954     int ret, local_len;
3955     char *fnidx;
3956     const char *local_fn = NULL;
3957     kstring_t buffer = KS_INITIALIZE;
3958 
3959     // First try : append `ext` to `fn`
3960     if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
3961         free(buffer.s);
3962         return NULL;
3963     }
3964     if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
3965         // Second try : replace suffix of `fn` with `ext`
3966         if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
3967             free(buffer.s);
3968             return NULL;
3969         }
3970         ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
3971     }
3972 
3973     if (ret < 0) {
3974         free(buffer.s);
3975         return NULL;
3976     }
3977 
3978     memmove(fnidx, local_fn, local_len);
3979     fnidx[local_len] = 0;
3980     return fnidx;
3981 }
3982 
hts_idx_getfn(const char * fn,const char * ext)3983 char *hts_idx_getfn(const char *fn, const char *ext)
3984 {
3985     return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
3986 }
3987 
hts_idx_locatefn(const char * fn,const char * ext)3988 char *hts_idx_locatefn(const char *fn, const char *ext)
3989 {
3990     return idx_filename(fn, ext, 0);
3991 }
3992 
idx_find_and_load(const char * fn,int fmt,int flags)3993 static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
3994 {
3995     char *fnidx = strstr(fn, HTS_IDX_DELIM);
3996     hts_idx_t *idx;
3997 
3998     if ( fnidx ) {
3999         char *fn2 = strdup(fn);
4000         if (!fn2) {
4001             hts_log_error("%s", strerror(errno));
4002             return NULL;
4003         }
4004         fn2[fnidx - fn] = '\0';
4005         fnidx += strlen(HTS_IDX_DELIM);
4006         idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4007         free(fn2);
4008         return idx;
4009     }
4010 
4011     if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4012         if (flags & HTS_IDX_SAVE_REMOTE) {
4013             fnidx = hts_idx_getfn(fn, ".csi");
4014             if (!fnidx) {
4015                 switch (fmt) {
4016                 case HTS_FMT_BAI: fnidx = hts_idx_getfn(fn, ".bai"); break;
4017                 case HTS_FMT_TBI: fnidx = hts_idx_getfn(fn, ".tbi"); break;
4018                 default: break;
4019                 }
4020             }
4021         } else {
4022             fnidx = idx_filename(fn, ".csi", 0);
4023             if (!fnidx) {
4024                 switch (fmt) {
4025                 case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4026                 case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4027                 default: break;
4028                 }
4029             }
4030         }
4031     }
4032     if (!fnidx) {
4033         if (!(flags & HTS_IDX_SILENT_FAIL))
4034             hts_log_error("Could not retrieve index file for '%s'", fn);
4035         return 0;
4036     }
4037 
4038     if (flags & HTS_IDX_SAVE_REMOTE)
4039         idx = hts_idx_load3(fn, fnidx, fmt, flags);
4040     else
4041         idx = idx_read(fnidx);
4042     free(fnidx);
4043     return idx;
4044 }
4045 
hts_idx_load(const char * fn,int fmt)4046 hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4047     return idx_find_and_load(fn, fmt, 1);
4048 }
4049 
hts_idx_load2(const char * fn,const char * fnidx)4050 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4051 {
4052     return hts_idx_load3(fn, fnidx, 0, 0);
4053 }
4054 
hts_idx_load3(const char * fn,const char * fnidx,int fmt,int flags)4055 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4056 {
4057     const char *local_fn = NULL;
4058     char *local_fnidx = NULL;
4059     int local_len;
4060     if (!fnidx)
4061         return idx_find_and_load(fn, fmt, flags);
4062 
4063     // Check that the index file is up to date, the main file might have changed
4064     struct stat stat_idx,stat_main;
4065     int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4066     if ( !remote_fn && !remote_fnidx
4067          && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4068     {
4069         if ( stat_idx.st_mtime < stat_main.st_mtime )
4070             hts_log_warning("The index file is older than the data file: %s", fnidx);
4071     }
4072 
4073     if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
4074     {
4075         int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
4076         if (ret == 0) {
4077             local_fnidx = strdup(local_fn);
4078             if (local_fnidx) {
4079                 local_fnidx[local_len] = '\0';
4080                 fnidx = local_fnidx;
4081             }
4082         }
4083     }
4084 
4085     hts_idx_t *idx = idx_read(fnidx);
4086     if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
4087         hts_log_error("Could not load local index file '%s'", fnidx);
4088 
4089     free(local_fnidx);
4090 
4091     return idx;
4092 }
4093 
4094 
4095 
4096 /**********************
4097  ***     Memory     ***
4098  **********************/
4099 
4100 /* For use with hts_expand macros *only* */
4101 HTSLIB_EXPORT
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)4102 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
4103                           int clear, void **ptr, const char *func) {
4104     /* If new_m and size are both below this limit, multiplying them
4105        together can't overflow */
4106     const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4107     void *new_ptr;
4108     size_t bytes, new_m;
4109 
4110     new_m = n;
4111     kroundup_size_t(new_m);
4112 
4113     bytes = size * new_m;
4114 
4115     /* Check for overflow.  Both ensure that new_m will fit in m (we make the
4116        pessimistic assumption that m is signed), and that bytes has not
4117        wrapped around. */
4118     if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
4119         || ((size > safe || new_m > safe)
4120             && bytes / new_m != size)) {
4121         errno = ENOMEM;
4122         goto die;
4123     }
4124 
4125     new_ptr = realloc(*ptr, bytes);
4126     if (new_ptr == NULL) goto die;
4127 
4128     if (clear) {
4129         if (new_m > m) {
4130             memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
4131         }
4132     }
4133 
4134     *ptr = new_ptr;
4135 
4136     return new_m;
4137 
4138  die:
4139     hts_log_error("%s", strerror(errno));
4140     exit(1);
4141 }
4142 
4143 /*
4144  * Companion to hts_resize() macro that does the actual allocation.
4145  *
4146  * Somewhat complicated as hts_resize() needs to write the new allocated
4147  * size back into *size_in_out, and the value pointed to may either be
4148  * int32_t, uint32_t or size_t depending on which array is being resized.
4149  * This is solved by making `size_in_out` a void pointer, getting the macro
4150  * to pass in the size of the item pointed to (in `size_sz`) and then using
4151  * an appropriate cast (based on the value of size_sz).  The function
4152  * ensures that the maximum size will be storable in a signed type of
4153  * the given size so storing to an int32_t should work correctly.
4154  *
4155  * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
4156  * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
4157  * either 4 or 8.  It also assumes casting from unsigned to signed will
4158  * work as long as the top bit isn't set.
4159  */
4160 
hts_resize_array_(size_t item_size,size_t num,size_t size_sz,void * size_in_out,void ** ptr_in_out,int flags,const char * func)4161 int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
4162                       void *size_in_out, void **ptr_in_out, int flags,
4163                       const char *func) {
4164     /* If new_size and item_size are both below this limit, multiplying them
4165        together can't overflow */
4166     const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4167     void *new_ptr;
4168     size_t bytes, new_size;
4169 
4170     new_size = num;
4171     kroundup_size_t(new_size);
4172     bytes = item_size * new_size;
4173 
4174     /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
4175        make the pessimistic assumption that *alloc_in_out is signed), and that
4176        bytes has not wrapped around. */
4177 
4178     if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
4179         || (((item_size > safe) || (new_size > safe))
4180             && bytes / new_size != item_size)) {
4181         hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
4182         errno = ENOMEM;
4183         return -1;
4184     }
4185 
4186     new_ptr = realloc(*ptr_in_out, bytes);
4187     if (new_ptr == NULL) {
4188         int save_errno = errno;
4189         hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
4190         errno = save_errno;
4191         return -1;
4192     }
4193 
4194     if (flags & HTS_RESIZE_CLEAR) {
4195         size_t old_size;
4196         switch (size_sz) {
4197         case 4: old_size = *((uint32_t *) size_in_out); break;
4198         case 8: old_size = *((uint64_t *) size_in_out); break;
4199         default: abort();
4200         }
4201         if (new_size > old_size) {
4202             memset((char *) new_ptr + old_size * item_size, 0,
4203                    (new_size - old_size) * item_size);
4204         }
4205     }
4206 
4207     switch (size_sz) {
4208     case 4: *((uint32_t *) size_in_out) = new_size; break;
4209     case 8: *((uint64_t *) size_in_out) = new_size; break;
4210     default: abort();
4211     }
4212 
4213     *ptr_in_out = new_ptr;
4214     return 0;
4215 }
4216 
hts_lib_shutdown()4217 void hts_lib_shutdown()
4218 {
4219     hfile_shutdown(1);
4220 }
4221 
hts_free(void * ptr)4222 void hts_free(void *ptr) {
4223     free(ptr);
4224 }
4225 
hts_set_log_level(enum htsLogLevel level)4226 void hts_set_log_level(enum htsLogLevel level)
4227 {
4228     hts_verbose = level;
4229 }
4230 
hts_get_log_level()4231 enum htsLogLevel hts_get_log_level()
4232 {
4233     return hts_verbose;
4234 }
4235 
get_severity_tag(enum htsLogLevel severity)4236 static char get_severity_tag(enum htsLogLevel severity)
4237 {
4238     switch (severity) {
4239     case HTS_LOG_ERROR:
4240         return 'E';
4241     case HTS_LOG_WARNING:
4242         return 'W';
4243     case HTS_LOG_INFO:
4244         return 'I';
4245     case HTS_LOG_DEBUG:
4246         return 'D';
4247     case HTS_LOG_TRACE:
4248         return 'T';
4249     default:
4250         break;
4251     }
4252 
4253     return '*';
4254 }
4255 
hts_log(enum htsLogLevel severity,const char * context,const char * format,...)4256 void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
4257 {
4258     int save_errno = errno;
4259     if (severity <= hts_verbose) {
4260         va_list argptr;
4261 
4262         fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
4263 
4264         va_start(argptr, format);
4265         vfprintf(stderr, format, argptr);
4266         va_end(argptr);
4267 
4268         fprintf(stderr, "\n");
4269     }
4270     errno = save_errno;
4271 }
4272