1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3 Copyright (C) 2008, 2009, 2012-2020 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27 #include <config.h>
28
29 #include <zlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <stdlib.h>
34 #include <unistd.h>
35 #include <inttypes.h>
36 #include <limits.h>
37 #include <stdint.h>
38 #include <fcntl.h>
39 #include <errno.h>
40 #include <time.h>
41 #include <sys/stat.h>
42 #include <assert.h>
43
44 #include "htslib/hts.h"
45 #include "htslib/bgzf.h"
46 #include "cram/cram.h"
47 #include "htslib/hfile.h"
48 #include "htslib/hts_endian.h"
49 #include "version.h"
50 #include "hts_internal.h"
51 #include "hfile_internal.h"
52 #include "sam_internal.h"
53 #include "htslib/hts_os.h" // drand48
54
55 #include "htslib/khash.h"
56 #include "htslib/kseq.h"
57 #include "htslib/ksort.h"
58 #include "htslib/tbx.h"
59
60 #ifndef EFTYPE
61 #define EFTYPE ENOEXEC
62 #endif
63
64 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
65
66 HTSLIB_EXPORT
67 int hts_verbose = HTS_LOG_WARNING;
68
hts_version()69 const char *hts_version()
70 {
71 return HTS_VERSION_TEXT;
72 }
73
74 HTSLIB_EXPORT
75 const unsigned char seq_nt16_table[256] = {
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
79 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
80 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
81 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
82 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
83 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
84
85 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
86 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
87 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
88 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
89 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
90 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
91 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
92 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
93 };
94
95 HTSLIB_EXPORT
96 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
97
98 HTSLIB_EXPORT
99 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
100
101 /**********************
102 *** Basic file I/O ***
103 **********************/
104
format_category(enum htsExactFormat fmt)105 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
106 {
107 switch (fmt) {
108 case bam:
109 case sam:
110 case cram:
111 case fastq_format:
112 return sequence_data;
113
114 case vcf:
115 case bcf:
116 return variant_data;
117
118 case bai:
119 case crai:
120 case csi:
121 case fai_format:
122 case fqi_format:
123 case gzi:
124 case tbi:
125 return index_file;
126
127 case bed:
128 return region_list;
129
130 case fasta_format:
131 case htsget:
132 case hts_crypt4gh_format:
133 return unknown_category;
134
135 case unknown_format:
136 case binary_format:
137 case text_format:
138 case empty_format:
139 case format_maximum:
140 break;
141 }
142
143 return unknown_category;
144 }
145
146 // Decompress several hundred bytes by peeking at the file, which must be
147 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)148 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
149 {
150 unsigned char buffer[2048];
151 z_stream zs;
152 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
153
154 if (npeek < 0) return 0;
155
156 zs.zalloc = NULL;
157 zs.zfree = NULL;
158 zs.next_in = buffer;
159 zs.avail_in = npeek;
160 zs.next_out = dest;
161 zs.avail_out = destsize;
162 if (inflateInit2(&zs, 31) != Z_OK) return 0;
163
164 while (zs.total_out < destsize)
165 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
166
167 destsize = zs.total_out;
168 inflateEnd(&zs);
169
170 return destsize;
171 }
172
173 // Parse "x.y" text, taking care because the string is not NUL-terminated
174 // and filling in major/minor only when the digits are followed by a delimiter,
175 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
176 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)177 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
178 {
179 const char *s = (const char *) u;
180 const char *slim = (const char *) ulim;
181 short v;
182
183 fmt->version.major = fmt->version.minor = -1;
184
185 for (v = 0; s < slim && isdigit_c(*s); s++)
186 v = 10 * v + *s - '0';
187
188 if (s < slim) {
189 fmt->version.major = v;
190 if (*s == '.') {
191 s++;
192 for (v = 0; s < slim && isdigit_c(*s); s++)
193 v = 10 * v + *s - '0';
194 if (s < slim)
195 fmt->version.minor = v;
196 }
197 else
198 fmt->version.minor = 0;
199 }
200 }
201
202 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)203 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
204 {
205 const unsigned char *ukey = (const unsigned char *) key;
206
207 while (*ukey)
208 if (u >= ulim) return +1;
209 else if (isspace_c(*u)) u++;
210 else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
211 else u++, ukey++;
212
213 return 0;
214 }
215
is_text_only(const unsigned char * u,const unsigned char * ulim)216 static int is_text_only(const unsigned char *u, const unsigned char *ulim)
217 {
218 for (; u < ulim; u++)
219 if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
220 return 0;
221
222 return 1;
223 }
224
225 static int
secondline_is_bases(const unsigned char * u,const unsigned char * ulim)226 secondline_is_bases(const unsigned char *u, const unsigned char *ulim)
227 {
228 // Skip to second line, returning false if there isn't one
229 u = memchr(u, '\n', ulim - u);
230 if (u == NULL || ++u == ulim) return 0;
231
232 // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
233 while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
234 if (*u == '=') return 0;
235 u++;
236 }
237
238 return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
239 }
240
241 // Parse tab-delimited text, filling in a string of column types and returning
242 // the number of columns spotted (within [u,ulim), and up to column_len) or -1
243 // if non-printable characters were seen. Column types:
244 // i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
245 static int
parse_tabbed_text(char * columns,int column_len,const unsigned char * u,const unsigned char * ulim,int * complete)246 parse_tabbed_text(char *columns, int column_len,
247 const unsigned char *u, const unsigned char *ulim,
248 int *complete)
249 {
250 const char *str = (const char *) u;
251 const char *slim = (const char *) ulim;
252 const char *s;
253 int ncolumns = 0;
254
255 enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
256 unsigned seen = 0;
257 *complete = 0;
258
259 for (s = str; s < slim; s++)
260 if (*s >= ' ') {
261 if (isdigit_c(*s))
262 seen |= digit;
263 else if ((*s == '+' || *s == '-') && s == str)
264 seen |= leading_sign;
265 else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
266 seen |= cigar_operator;
267 else
268 seen |= other;
269 }
270 else if (*s == '\t' || *s == '\r' || *s == '\n') {
271 size_t len = s - str;
272 char type;
273
274 if (seen == digit || seen == (leading_sign|digit)) type = 'i';
275 else if (seen == (digit|cigar_operator)) type = 'C';
276 else if (len == 1)
277 switch (str[0]) {
278 case '*': type = 'C'; break;
279 case '+': case '-': case '.': type = 's'; break;
280 default: type = 'Z'; break;
281 }
282 else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
283 else type = 'Z';
284
285 columns[ncolumns++] = type;
286 if (*s != '\t' || ncolumns >= column_len - 1) {
287 *complete = 1; // finished the line or more columns than needed
288 break;
289 }
290
291 str = s + 1;
292 seen = 0;
293 }
294 else return -1;
295
296 columns[ncolumns] = '\0';
297 return ncolumns;
298 }
299
300 // Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
301 // Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
colmatch(const char * columns,const char * pattern)302 static int colmatch(const char *columns, const char *pattern)
303 {
304 int i;
305 for (i = 0; columns[i] != '\0'; i++) {
306 if (pattern[i] == '+') return i;
307 if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
308 }
309
310 return i;
311 }
312
hts_detect_format(hFILE * hfile,htsFormat * fmt)313 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
314 {
315 char columns[24];
316 unsigned char s[1024];
317 int complete = 0;
318 ssize_t len = hpeek(hfile, s, 18);
319 if (len < 0) return -1;
320
321 fmt->category = unknown_category;
322 fmt->format = unknown_format;
323 fmt->version.major = fmt->version.minor = -1;
324 fmt->compression = no_compression;
325 fmt->compression_level = -1;
326 fmt->specific = NULL;
327
328 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
329 // The stream is either gzip-compressed or BGZF-compressed.
330 // Determine which, and decompress the first few records or lines.
331 fmt->compression = (len >= 18 && (s[3] & 4) &&
332 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
333 if (len >= 9 && s[2] == 8)
334 fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
335
336 len = decompress_peek(hfile, s, sizeof s);
337 }
338 else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
339 (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
340 memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
341 fmt->compression = bzip2_compression;
342 fmt->compression_level = s[3] - '0';
343 // Decompressing via libbz2 produces no output until it has a whole
344 // block (of size 100Kb x level), which is too large for peeking.
345 // So unfortunately we can recognise bzip2 but not the contents,
346 // except that \x1772... magic indicates the stream is empty.
347 if (s[4] == '\x31') return 0;
348 else len = 0;
349 }
350 else {
351 len = hpeek(hfile, s, sizeof s);
352 }
353 if (len < 0) return -1;
354
355 if (len == 0) {
356 fmt->format = empty_format;
357 return 0;
358 }
359
360 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
361 fmt->category = sequence_data;
362 fmt->format = cram;
363 fmt->version.major = s[4], fmt->version.minor = s[5];
364 fmt->compression = custom;
365 return 0;
366 }
367 else if (len >= 4 && s[3] <= '\4') {
368 if (memcmp(s, "BAM\1", 4) == 0) {
369 fmt->category = sequence_data;
370 fmt->format = bam;
371 // TODO Decompress enough to pick version from @HD-VN header
372 fmt->version.major = 1, fmt->version.minor = -1;
373 return 0;
374 }
375 else if (memcmp(s, "BAI\1", 4) == 0) {
376 fmt->category = index_file;
377 fmt->format = bai;
378 fmt->version.major = -1, fmt->version.minor = -1;
379 return 0;
380 }
381 else if (memcmp(s, "BCF\4", 4) == 0) {
382 fmt->category = variant_data;
383 fmt->format = bcf;
384 fmt->version.major = 1, fmt->version.minor = -1;
385 return 0;
386 }
387 else if (memcmp(s, "BCF\2", 4) == 0) {
388 fmt->category = variant_data;
389 fmt->format = bcf;
390 fmt->version.major = s[3];
391 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
392 return 0;
393 }
394 else if (memcmp(s, "CSI\1", 4) == 0) {
395 fmt->category = index_file;
396 fmt->format = csi;
397 fmt->version.major = 1, fmt->version.minor = -1;
398 return 0;
399 }
400 else if (memcmp(s, "TBI\1", 4) == 0) {
401 fmt->category = index_file;
402 fmt->format = tbi;
403 return 0;
404 }
405 }
406 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
407 fmt->category = variant_data;
408 fmt->format = vcf;
409 if (len >= 21 && s[16] == 'v')
410 parse_version(fmt, &s[17], &s[len]);
411 return 0;
412 }
413 else if (len >= 4 && s[0] == '@' &&
414 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
415 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
416 memcmp(s, "@CO\t", 4) == 0)) {
417 fmt->category = sequence_data;
418 fmt->format = sam;
419 // @HD-VN is not guaranteed to be the first tag, but then @HD is
420 // not guaranteed to be present at all...
421 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
422 parse_version(fmt, &s[7], &s[len]);
423 else
424 fmt->version.major = 1, fmt->version.minor = -1;
425 return 0;
426 }
427 else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
428 fmt->category = unknown_category;
429 fmt->format = htsget;
430 return 0;
431 }
432 else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
433 fmt->category = unknown_category;
434 fmt->format = hts_crypt4gh_format;
435 return 0;
436 }
437 else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) {
438 fmt->format = fasta_format;
439 return 0;
440 }
441 else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) {
442 fmt->category = sequence_data;
443 fmt->format = fastq_format;
444 return 0;
445 }
446 else if (parse_tabbed_text(columns, sizeof columns, s,
447 &s[len], &complete) > 0) {
448 // A complete SAM line is at least 11 columns. On unmapped long reads may
449 // be missing two. (On mapped long reads we must have an @ header so long
450 // CIGAR is irrelevant.)
451 if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
452 >= 9 + 2*complete) {
453 fmt->category = sequence_data;
454 fmt->format = sam;
455 fmt->version.major = 1, fmt->version.minor = -1;
456 return 0;
457 }
458 else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
459 fmt->category = index_file;
460 fmt->format = crai;
461 return 0;
462 }
463 else if (colmatch(columns, "Ziiiii") == 6) {
464 fmt->category = index_file;
465 fmt->format = fqi_format;
466 return 0;
467 }
468 else if (colmatch(columns, "Ziiii") == 5) {
469 fmt->category = index_file;
470 fmt->format = fai_format;
471 return 0;
472 }
473 else if (colmatch(columns, "Zii+") >= 3) {
474 fmt->category = region_list;
475 fmt->format = bed;
476 return 0;
477 }
478 }
479
480 // Arbitrary text files can be read using hts_getline().
481 if (is_text_only(s, &s[len])) fmt->format = text_format;
482
483 // Nothing recognised: leave unset fmt-> fields as unknown.
484 return 0;
485 }
486
hts_format_description(const htsFormat * format)487 char *hts_format_description(const htsFormat *format)
488 {
489 kstring_t str = { 0, 0, NULL };
490
491 switch (format->format) {
492 case sam: kputs("SAM", &str); break;
493 case bam: kputs("BAM", &str); break;
494 case cram: kputs("CRAM", &str); break;
495 case fasta_format: kputs("FASTA", &str); break;
496 case fastq_format: kputs("FASTQ", &str); break;
497 case vcf: kputs("VCF", &str); break;
498 case bcf:
499 if (format->version.major == 1) kputs("Legacy BCF", &str);
500 else kputs("BCF", &str);
501 break;
502 case bai: kputs("BAI", &str); break;
503 case crai: kputs("CRAI", &str); break;
504 case csi: kputs("CSI", &str); break;
505 case fai_format: kputs("FASTA-IDX", &str); break;
506 case fqi_format: kputs("FASTQ-IDX", &str); break;
507 case gzi: kputs("GZI", &str); break;
508 case tbi: kputs("Tabix", &str); break;
509 case bed: kputs("BED", &str); break;
510 case htsget: kputs("htsget", &str); break;
511 case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
512 case empty_format: kputs("empty", &str); break;
513 default: kputs("unknown", &str); break;
514 }
515
516 if (format->version.major >= 0) {
517 kputs(" version ", &str);
518 kputw(format->version.major, &str);
519 if (format->version.minor >= 0) {
520 kputc('.', &str);
521 kputw(format->version.minor, &str);
522 }
523 }
524
525 switch (format->compression) {
526 case bzip2_compression: kputs(" bzip2-compressed", &str); break;
527 case custom: kputs(" compressed", &str); break;
528 case gzip: kputs(" gzip-compressed", &str); break;
529 case bgzf:
530 switch (format->format) {
531 case bam:
532 case bcf:
533 case csi:
534 case tbi:
535 // These are by definition BGZF, so just use the generic term
536 kputs(" compressed", &str);
537 break;
538 default:
539 kputs(" BGZF-compressed", &str);
540 break;
541 }
542 break;
543 default: break;
544 }
545
546 switch (format->category) {
547 case sequence_data: kputs(" sequence", &str); break;
548 case variant_data: kputs(" variant calling", &str); break;
549 case index_file: kputs(" index", &str); break;
550 case region_list: kputs(" genomic region", &str); break;
551 default: break;
552 }
553
554 if (format->compression == no_compression)
555 switch (format->format) {
556 case text_format:
557 case sam:
558 case crai:
559 case vcf:
560 case bed:
561 case fai_format:
562 case fqi_format:
563 case fasta_format:
564 case fastq_format:
565 case htsget:
566 kputs(" text", &str);
567 break;
568
569 case empty_format:
570 break;
571
572 default:
573 kputs(" data", &str);
574 break;
575 }
576 else
577 kputs(" data", &str);
578
579 return ks_release(&str);
580 }
581
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)582 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
583 {
584 char smode[101], *cp, *cp2, *mode_c;
585 htsFile *fp = NULL;
586 hFILE *hfile = NULL;
587 char fmt_code = '\0';
588 const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0";
589
590 strncpy(smode, mode, 99);
591 smode[99]=0;
592 if ((cp = strchr(smode, ',')))
593 *cp = '\0';
594
595 // Migrate format code (b or c) to the end of the smode buffer.
596 for (cp2 = cp = smode; *cp; cp++) {
597 if (*cp == 'b')
598 fmt_code = 'b';
599 else if (*cp == 'c')
600 fmt_code = 'c';
601 else
602 *cp2++ = *cp;
603 }
604 mode_c = cp2;
605 *cp2++ = fmt_code;
606 *cp2++ = 0;
607
608 // Set or reset the format code if opts->format is used
609 if (fmt && fmt->format > unknown_format
610 && fmt->format < sizeof(format_to_mode)) {
611 *mode_c = format_to_mode[fmt->format];
612 }
613
614 // If we really asked for a compressed text format then mode_c above will
615 // point to nul. We set to 'z' to enable bgzf.
616 if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
617 if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
618 *mode_c = 'z';
619 }
620
621 char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
622 if ( fnidx ) {
623 rmme = strdup(fn);
624 if ( !rmme ) goto error;
625 rmme[fnidx-fn] = 0;
626 fn = rmme;
627 }
628
629 hfile = hopen(fn, smode);
630 if (hfile == NULL) goto error;
631
632 fp = hts_hopen(hfile, fn, smode);
633 if (fp == NULL) goto error;
634
635 // Compensate for the loss of exactness in htsExactFormat.
636 // hts_hopen returns generics such as binary or text, but we
637 // have been given something explicit here so use that instead.
638 if (fp->is_write && fmt &&
639 (fmt->format == bam || fmt->format == sam ||
640 fmt->format == vcf || fmt->format == bcf ||
641 fmt->format == bed || fmt->format == fasta_format ||
642 fmt->format == fastq_format))
643 fp->format.format = fmt->format;
644
645 if (fmt && fmt->specific)
646 if (hts_opt_apply(fp, fmt->specific) != 0)
647 goto error;
648
649 if ( rmme ) free(rmme);
650 return fp;
651
652 error:
653 hts_log_error("Failed to open file \"%s\"%s%s", fn,
654 errno ? " : " : "", errno ? strerror(errno) : "");
655 if ( rmme ) free(rmme);
656
657 if (hfile)
658 hclose_abruptly(hfile);
659
660 return NULL;
661 }
662
hts_open(const char * fn,const char * mode)663 htsFile *hts_open(const char *fn, const char *mode) {
664 return hts_open_format(fn, mode, NULL);
665 }
666
667 /*
668 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
669 * the prefix in lowercase into buf and returning a pointer to the suffix.
670 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
671 * prefix should be one of several known values of maximum length buflen-2.
672 * (If delim is not found, returns a pointer to the '\0'.)
673 */
674 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)675 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
676 {
677 size_t i = 0;
678 while (*str && *str != delim) {
679 if (i < buflen-1) buf[i++] = tolower_c(*str);
680 str++;
681 }
682
683 buf[i] = '\0';
684 return *str? str+1 : str;
685 }
686
687 /*
688 * Parses arg and appends it to the option list.
689 *
690 * Returns 0 on success;
691 * -1 on failure.
692 */
hts_opt_add(hts_opt ** opts,const char * c_arg)693 int hts_opt_add(hts_opt **opts, const char *c_arg) {
694 hts_opt *o, *t;
695 char *val;
696
697 /*
698 * IMPORTANT!!!
699 * If you add another string option here, don't forget to also add
700 * it to the case statement in hts_opt_apply.
701 */
702
703 if (!c_arg)
704 return -1;
705
706 if (!(o = malloc(sizeof(*o))))
707 return -1;
708
709 if (!(o->arg = strdup(c_arg))) {
710 free(o);
711 return -1;
712 }
713
714 if (!(val = strchr(o->arg, '=')))
715 val = "1"; // assume boolean
716 else
717 *val++ = '\0';
718
719 if (strcmp(o->arg, "decode_md") == 0 ||
720 strcmp(o->arg, "DECODE_MD") == 0)
721 o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
722
723 else if (strcmp(o->arg, "verbosity") == 0 ||
724 strcmp(o->arg, "VERBOSITY") == 0)
725 o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
726
727 else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
728 strcmp(o->arg, "SEQS_PER_SLICE") == 0)
729 o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
730
731 else if (strcmp(o->arg, "bases_per_slice") == 0 ||
732 strcmp(o->arg, "BASES_PER_SLICE") == 0)
733 o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
734
735 else if (strcmp(o->arg, "slices_per_container") == 0 ||
736 strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
737 o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
738
739 else if (strcmp(o->arg, "embed_ref") == 0 ||
740 strcmp(o->arg, "EMBED_REF") == 0)
741 o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
742
743 else if (strcmp(o->arg, "no_ref") == 0 ||
744 strcmp(o->arg, "NO_REF") == 0)
745 o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
746
747 else if (strcmp(o->arg, "ignore_md5") == 0 ||
748 strcmp(o->arg, "IGNORE_MD5") == 0)
749 o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
750
751 else if (strcmp(o->arg, "use_bzip2") == 0 ||
752 strcmp(o->arg, "USE_BZIP2") == 0)
753 o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
754
755 else if (strcmp(o->arg, "use_rans") == 0 ||
756 strcmp(o->arg, "USE_RANS") == 0)
757 o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
758
759 else if (strcmp(o->arg, "use_lzma") == 0 ||
760 strcmp(o->arg, "USE_LZMA") == 0)
761 o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
762
763 else if (strcmp(o->arg, "reference") == 0 ||
764 strcmp(o->arg, "REFERENCE") == 0)
765 o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
766
767 else if (strcmp(o->arg, "version") == 0 ||
768 strcmp(o->arg, "VERSION") == 0)
769 o->opt = CRAM_OPT_VERSION, o->val.s =val;
770
771 else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
772 strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
773 o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
774
775 else if (strcmp(o->arg, "nthreads") == 0 ||
776 strcmp(o->arg, "NTHREADS") == 0)
777 o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
778
779 else if (strcmp(o->arg, "cache_size") == 0 ||
780 strcmp(o->arg, "CACHE_SIZE") == 0) {
781 char *endp;
782 o->opt = HTS_OPT_CACHE_SIZE;
783 o->val.i = strtol(val, &endp, 0);
784 // NB: Doesn't support floats, eg 1.5g
785 // TODO: extend hts_parse_decimal? See also samtools sort.
786 switch (*endp) {
787 case 'g': case 'G': o->val.i *= 1024; // fall through
788 case 'm': case 'M': o->val.i *= 1024; // fall through
789 case 'k': case 'K': o->val.i *= 1024; break;
790 case '\0': break;
791 default:
792 hts_log_error("Unrecognised cache size suffix '%c'", *endp);
793 free(o->arg);
794 free(o);
795 return -1;
796 }
797 }
798
799 else if (strcmp(o->arg, "required_fields") == 0 ||
800 strcmp(o->arg, "REQUIRED_FIELDS") == 0)
801 o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
802
803 else if (strcmp(o->arg, "lossy_names") == 0 ||
804 strcmp(o->arg, "LOSSY_NAMES") == 0)
805 o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
806
807 else if (strcmp(o->arg, "name_prefix") == 0 ||
808 strcmp(o->arg, "NAME_PREFIX") == 0)
809 o->opt = CRAM_OPT_PREFIX, o->val.s = val;
810
811 else if (strcmp(o->arg, "store_md") == 0 ||
812 strcmp(o->arg, "store_md") == 0)
813 o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
814
815 else if (strcmp(o->arg, "store_nm") == 0 ||
816 strcmp(o->arg, "store_nm") == 0)
817 o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
818
819 else if (strcmp(o->arg, "block_size") == 0 ||
820 strcmp(o->arg, "BLOCK_SIZE") == 0)
821 o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
822
823 else if (strcmp(o->arg, "level") == 0 ||
824 strcmp(o->arg, "LEVEL") == 0)
825 o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
826
827 else {
828 hts_log_error("Unknown option '%s'", o->arg);
829 free(o->arg);
830 free(o);
831 return -1;
832 }
833
834 o->next = NULL;
835
836 // Append; assumes small list.
837 if (*opts) {
838 t = *opts;
839 while (t->next)
840 t = t->next;
841 t->next = o;
842 } else {
843 *opts = o;
844 }
845
846 return 0;
847 }
848
849 /*
850 * Applies an hts_opt option list to a given htsFile.
851 *
852 * Returns 0 on success
853 * -1 on failure
854 */
hts_opt_apply(htsFile * fp,hts_opt * opts)855 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
856 hts_opt *last = NULL;
857
858 for (; opts; opts = (last=opts)->next) {
859 switch (opts->opt) {
860 case CRAM_OPT_REFERENCE:
861 if (!(fp->fn_aux = strdup(opts->val.s)))
862 return -1;
863 // fall through
864 case CRAM_OPT_VERSION:
865 case CRAM_OPT_PREFIX:
866 if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
867 return -1;
868 break;
869 default:
870 if (hts_set_opt(fp, opts->opt, opts->val.i) != 0)
871 return -1;
872 break;
873 }
874 }
875
876 return 0;
877 }
878
879 /*
880 * Frees an hts_opt list.
881 */
hts_opt_free(hts_opt * opts)882 void hts_opt_free(hts_opt *opts) {
883 hts_opt *last = NULL;
884 while (opts) {
885 opts = (last=opts)->next;
886 free(last->arg);
887 free(last);
888 }
889 }
890
891
892 /*
893 * Tokenise options as (key(=value)?,)*(key(=value)?)?
894 * NB: No provision for ',' appearing in the value!
895 * Add backslashing rules?
896 *
897 * This could be used as part of a general command line option parser or
898 * as a string concatenated onto the file open mode.
899 *
900 * Returns 0 on success
901 * -1 on failure.
902 */
hts_parse_opt_list(htsFormat * fmt,const char * str)903 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
904 while (str && *str) {
905 const char *str_start;
906 int len;
907 char arg[8001];
908
909 while (*str && *str == ',')
910 str++;
911
912 for (str_start = str; *str && *str != ','; str++);
913 len = str - str_start;
914
915 // Produce a nul terminated copy of the option
916 strncpy(arg, str_start, len < 8000 ? len : 8000);
917 arg[len < 8000 ? len : 8000] = '\0';
918
919 if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
920 return -1;
921
922 if (*str)
923 str++;
924 }
925
926 return 0;
927 }
928
929 /*
930 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
931 * followed by a comma separated list of key=value options and splits
932 * these up into the fields of htsFormat struct.
933 *
934 * format is assumed to be already initialised, either to blank
935 * "unknown" values or via previous hts_opt_add calls.
936 *
937 * Returns 0 on success
938 * -1 on failure.
939 */
hts_parse_format(htsFormat * format,const char * str)940 int hts_parse_format(htsFormat *format, const char *str) {
941 char fmt[8];
942 const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
943
944 format->version.minor = 0; // unknown
945 format->version.major = 0; // unknown
946
947 if (strcmp(fmt, "sam") == 0) {
948 format->category = sequence_data;
949 format->format = sam;
950 format->compression = no_compression;;
951 format->compression_level = 0;
952 } else if (strcmp(fmt, "sam.gz") == 0) {
953 format->category = sequence_data;
954 format->format = sam;
955 format->compression = bgzf;
956 format->compression_level = -1;
957 } else if (strcmp(fmt, "bam") == 0) {
958 format->category = sequence_data;
959 format->format = bam;
960 format->compression = bgzf;
961 format->compression_level = -1;
962 } else if (strcmp(fmt, "cram") == 0) {
963 format->category = sequence_data;
964 format->format = cram;
965 format->compression = custom;
966 format->compression_level = -1;
967 } else if (strcmp(fmt, "vcf") == 0) {
968 format->category = variant_data;
969 format->format = vcf;
970 format->compression = no_compression;;
971 format->compression_level = 0;
972 } else if (strcmp(fmt, "bcf") == 0) {
973 format->category = variant_data;
974 format->format = bcf;
975 format->compression = bgzf;
976 format->compression_level = -1;
977 } else {
978 return -1;
979 }
980
981 return hts_parse_opt_list(format, cp);
982 }
983
984
985 /*
986 * Tokenise options as (key(=value)?,)*(key(=value)?)?
987 * NB: No provision for ',' appearing in the value!
988 * Add backslashing rules?
989 *
990 * This could be used as part of a general command line option parser or
991 * as a string concatenated onto the file open mode.
992 *
993 * Returns 0 on success
994 * -1 on failure.
995 */
hts_process_opts(htsFile * fp,const char * opts)996 static int hts_process_opts(htsFile *fp, const char *opts) {
997 htsFormat fmt;
998
999 fmt.specific = NULL;
1000 if (hts_parse_opt_list(&fmt, opts) != 0)
1001 return -1;
1002
1003 if (hts_opt_apply(fp, fmt.specific) != 0) {
1004 hts_opt_free(fmt.specific);
1005 return -1;
1006 }
1007
1008 hts_opt_free(fmt.specific);
1009
1010 return 0;
1011 }
1012
hts_crypt4gh_redirect(const char * fn,const char * mode,hFILE ** hfile_ptr,htsFile * fp)1013 static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1014 hFILE **hfile_ptr, htsFile *fp) {
1015 hFILE *hfile1 = *hfile_ptr;
1016 hFILE *hfile2 = NULL;
1017 char fn_buf[512], *fn2 = fn_buf;
1018 const char *prefix = "crypt4gh:";
1019 size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1020 int ret = -1;
1021
1022 if (fn2_len > sizeof(fn_buf)) {
1023 fn2 = malloc(fn2_len);
1024 if (!fn2) return -1;
1025 }
1026
1027 // Reopen fn using the crypt4gh plug-in (if available)
1028 snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1029 hfile2 = hopen(fn2, mode, "parent", hfile1, NULL);
1030 if (hfile2) {
1031 // Replace original hfile with the new one. The original is now
1032 // enclosed within hfile2
1033 *hfile_ptr = hfile2;
1034 ret = 0;
1035 }
1036
1037 if (fn2 != fn_buf)
1038 free(fn2);
1039 return ret;
1040 }
1041
hts_hopen(hFILE * hfile,const char * fn,const char * mode)1042 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1043 {
1044 hFILE *hfile_orig = hfile;
1045 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1046 char simple_mode[101], *cp, *opts;
1047 simple_mode[100] = '\0';
1048
1049 if (fp == NULL) goto error;
1050
1051 fp->fn = strdup(fn);
1052 fp->is_be = ed_is_big();
1053
1054 // Split mode into simple_mode,opts strings
1055 if ((cp = strchr(mode, ','))) {
1056 strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1057 simple_mode[cp-mode] = '\0';
1058 opts = cp+1;
1059 } else {
1060 strncpy(simple_mode, mode, 100);
1061 opts = NULL;
1062 }
1063
1064 if (strchr(simple_mode, 'r')) {
1065 const int max_loops = 5; // Should be plenty
1066 int loops = 0;
1067 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
1068
1069 // Deal with formats that re-direct an underlying file via a plug-in.
1070 // Loops as we may have crypt4gh served via htsget, or
1071 // crypt4gh-in-crypt4gh.
1072 while (fp->format.format == htsget ||
1073 fp->format.format == hts_crypt4gh_format) {
1074 // Ensure we don't get stuck in an endless redirect loop
1075 if (++loops > max_loops) {
1076 errno = ELOOP;
1077 goto error;
1078 }
1079
1080 if (fp->format.format == htsget) {
1081 hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1082 if (hfile2 == NULL) goto error;
1083
1084 hfile = hfile2;
1085 }
1086 else if (fp->format.format == hts_crypt4gh_format) {
1087 if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1088 goto error;
1089 }
1090
1091 // Re-detect format against the result of the redirection
1092 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
1093 }
1094 }
1095 else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1096 htsFormat *fmt = &fp->format;
1097 fp->is_write = 1;
1098
1099 if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1100 else if (strchr(simple_mode, 'c')) fmt->format = cram;
1101 else fmt->format = text_format;
1102
1103 if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1104 else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1105 else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1106 else {
1107 // No compression mode specified, set to the default for the format
1108 switch (fmt->format) {
1109 case binary_format: fmt->compression = bgzf; break;
1110 case cram: fmt->compression = custom; break;
1111 case text_format: fmt->compression = no_compression; break;
1112 default: abort();
1113 }
1114 }
1115
1116 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1117 fmt->category = format_category(fmt->format);
1118
1119 fmt->version.major = fmt->version.minor = -1;
1120 fmt->compression_level = -1;
1121 fmt->specific = NULL;
1122 }
1123 else { errno = EINVAL; goto error; }
1124
1125 switch (fp->format.format) {
1126 case binary_format:
1127 case bam:
1128 case bcf:
1129 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1130 if (fp->fp.bgzf == NULL) goto error;
1131 fp->is_bin = fp->is_bgzf = 1;
1132 break;
1133
1134 case cram:
1135 fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1136 if (fp->fp.cram == NULL) goto error;
1137 if (!fp->is_write)
1138 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
1139 fp->is_cram = 1;
1140 break;
1141
1142 case empty_format:
1143 case text_format:
1144 case bed:
1145 case fasta_format:
1146 case fastq_format:
1147 case sam:
1148 case vcf:
1149 if (fp->format.compression != no_compression) {
1150 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1151 if (fp->fp.bgzf == NULL) goto error;
1152 fp->is_bgzf = 1;
1153 }
1154 else
1155 fp->fp.hfile = hfile;
1156 break;
1157
1158 default:
1159 errno = EFTYPE;
1160 goto error;
1161 }
1162
1163 if (opts)
1164 hts_process_opts(fp, opts);
1165
1166 // If redirecting, close the original hFILE now (pedantically we would
1167 // instead close it in hts_close(), but this a simplifying optimisation)
1168 if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
1169
1170 return fp;
1171
1172 error:
1173 hts_log_error("Failed to open file %s", fn);
1174
1175 // If redirecting, close the failed redirection hFILE that we have opened
1176 if (hfile != hfile_orig) hclose_abruptly(hfile);
1177
1178 if (fp) {
1179 free(fp->fn);
1180 free(fp->fn_aux);
1181 free(fp);
1182 }
1183 return NULL;
1184 }
1185
hts_close(htsFile * fp)1186 int hts_close(htsFile *fp)
1187 {
1188 int ret, save;
1189
1190 switch (fp->format.format) {
1191 case binary_format:
1192 case bam:
1193 case bcf:
1194 ret = bgzf_close(fp->fp.bgzf);
1195 break;
1196
1197 case cram:
1198 if (!fp->is_write) {
1199 switch (cram_eof(fp->fp.cram)) {
1200 case 2:
1201 hts_log_warning("EOF marker is absent. The input is probably truncated");
1202 break;
1203 case 0: /* not at EOF, but may not have wanted all seqs */
1204 default: /* case 1, expected EOF */
1205 break;
1206 }
1207 }
1208 ret = cram_close(fp->fp.cram);
1209 break;
1210
1211 case empty_format:
1212 case text_format:
1213 case bed:
1214 case fasta_format:
1215 case fastq_format:
1216 case sam:
1217 case vcf:
1218 ret = sam_state_destroy(fp);
1219
1220 if (fp->format.compression != no_compression)
1221 ret |= bgzf_close(fp->fp.bgzf);
1222 else
1223 ret |= hclose(fp->fp.hfile);
1224 break;
1225
1226 default:
1227 ret = -1;
1228 break;
1229 }
1230
1231 save = errno;
1232 sam_hdr_destroy(fp->bam_header);
1233 hts_idx_destroy(fp->idx);
1234 free(fp->fn);
1235 free(fp->fn_aux);
1236 free(fp->line.s);
1237 free(fp);
1238 errno = save;
1239 return ret;
1240 }
1241
hts_get_format(htsFile * fp)1242 const htsFormat *hts_get_format(htsFile *fp)
1243 {
1244 return fp? &fp->format : NULL;
1245 }
1246
hts_format_file_extension(const htsFormat * format)1247 const char *hts_format_file_extension(const htsFormat *format) {
1248 if (!format)
1249 return "?";
1250
1251 switch (format->format) {
1252 case sam: return "sam";
1253 case bam: return "bam";
1254 case bai: return "bai";
1255 case cram: return "cram";
1256 case crai: return "crai";
1257 case vcf: return "vcf";
1258 case bcf: return "bcf";
1259 case csi: return "csi";
1260 case fai_format: return "fai";
1261 case fqi_format: return "fqi";
1262 case gzi: return "gzi";
1263 case tbi: return "tbi";
1264 case bed: return "bed";
1265 case fasta_format: return "fa";
1266 case fastq_format: return "fq";
1267 default: return "?";
1268 }
1269 }
1270
hts_hfile(htsFile * fp)1271 static hFILE *hts_hfile(htsFile *fp) {
1272 switch (fp->format.format) {
1273 case binary_format:// fall through
1274 case bcf: // fall through
1275 case bam: return bgzf_hfile(fp->fp.bgzf);
1276 case cram: return cram_hfile(fp->fp.cram);
1277 case text_format: return fp->fp.hfile;
1278 case vcf: // fall through
1279 case sam: return fp->format.compression != no_compression
1280 ? bgzf_hfile(fp->fp.bgzf)
1281 : fp->fp.hfile;
1282 default: return NULL;
1283 }
1284 }
1285
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)1286 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1287 int r;
1288 va_list args;
1289
1290 switch (opt) {
1291 case HTS_OPT_NTHREADS: {
1292 va_start(args, opt);
1293 int nthreads = va_arg(args, int);
1294 va_end(args);
1295 return hts_set_threads(fp, nthreads);
1296 }
1297
1298 case HTS_OPT_BLOCK_SIZE: {
1299 hFILE *hf = hts_hfile(fp);
1300
1301 if (hf) {
1302 va_start(args, opt);
1303 if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1304 hts_log_warning("Failed to change block size");
1305 va_end(args);
1306 }
1307 else {
1308 // To do - implement for vcf/bcf.
1309 hts_log_warning("Cannot change block size for this format");
1310 }
1311
1312 return 0;
1313 }
1314
1315 case HTS_OPT_THREAD_POOL: {
1316 va_start(args, opt);
1317 htsThreadPool *p = va_arg(args, htsThreadPool *);
1318 va_end(args);
1319 return hts_set_thread_pool(fp, p);
1320 }
1321
1322 case HTS_OPT_CACHE_SIZE: {
1323 va_start(args, opt);
1324 int cache_size = va_arg(args, int);
1325 va_end(args);
1326 hts_set_cache_size(fp, cache_size);
1327 return 0;
1328 }
1329
1330 case HTS_OPT_COMPRESSION_LEVEL: {
1331 va_start(args, opt);
1332 int level = va_arg(args, int);
1333 va_end(args);
1334 if (fp->is_bgzf)
1335 fp->fp.bgzf->compress_level = level;
1336 }
1337
1338 default:
1339 break;
1340 }
1341
1342 if (fp->format.format != cram)
1343 return 0;
1344
1345 va_start(args, opt);
1346 r = cram_set_voption(fp->fp.cram, opt, args);
1347 va_end(args);
1348
1349 return r;
1350 }
1351
1352 BGZF *hts_get_bgzfp(htsFile *fp);
1353
hts_set_threads(htsFile * fp,int n)1354 int hts_set_threads(htsFile *fp, int n)
1355 {
1356 if (fp->format.format == sam) {
1357 return sam_set_threads(fp, n);
1358 } else if (fp->format.compression == bgzf) {
1359 return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1360 } else if (fp->format.format == cram) {
1361 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1362 }
1363 else return 0;
1364 }
1365
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1366 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1367 if (fp->format.format == sam || fp->format.format == text_format) {
1368 return sam_set_thread_pool(fp, p);
1369 } else if (fp->format.compression == bgzf) {
1370 return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1371 } else if (fp->format.format == cram) {
1372 return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1373 }
1374 else return 0;
1375 }
1376
hts_set_cache_size(htsFile * fp,int n)1377 void hts_set_cache_size(htsFile *fp, int n)
1378 {
1379 if (fp->format.compression == bgzf)
1380 bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1381 }
1382
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1383 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1384 {
1385 free(fp->fn_aux);
1386 if (fn_aux) {
1387 fp->fn_aux = strdup(fn_aux);
1388 if (fp->fn_aux == NULL) return -1;
1389 }
1390 else fp->fn_aux = NULL;
1391
1392 if (fp->format.format == cram)
1393 if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1394 return -1;
1395
1396 return 0;
1397 }
1398
hts_open_tmpfile(const char * fname,const char * mode,kstring_t * tmpname)1399 hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1400 {
1401 int pid = (int) getpid();
1402 unsigned ptr = (uintptr_t) tmpname;
1403 int n = 0;
1404 hFILE *fp = NULL;
1405
1406 do {
1407 // Attempt to further uniquify the temporary filename
1408 unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1409 n++;
1410
1411 ks_clear(tmpname);
1412 if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1413
1414 fp = hopen(tmpname->s, mode);
1415 } while (fp == NULL && errno == EEXIST && n < 100);
1416
1417 return fp;
1418 }
1419
1420 // For VCF/BCF backward sweeper. Not exposing these functions because their
1421 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1422 BGZF *hts_get_bgzfp(htsFile *fp)
1423 {
1424 if (fp->is_bgzf)
1425 return fp->fp.bgzf;
1426 else
1427 return NULL;
1428 }
hts_useek(htsFile * fp,off_t uoffset,int where)1429 int hts_useek(htsFile *fp, off_t uoffset, int where)
1430 {
1431 if (fp->is_bgzf)
1432 return bgzf_useek(fp->fp.bgzf, uoffset, where);
1433 else
1434 return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1435 }
hts_utell(htsFile * fp)1436 off_t hts_utell(htsFile *fp)
1437 {
1438 if (fp->is_bgzf)
1439 return bgzf_utell(fp->fp.bgzf);
1440 else
1441 return htell(fp->fp.hfile);
1442 }
1443
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1444 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1445 {
1446 int ret;
1447 if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1448 hts_log_error("Unexpected delimiter %d", delimiter);
1449 abort();
1450 }
1451
1452 switch (fp->format.compression) {
1453 case no_compression:
1454 str->l = 0;
1455 ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
1456 if (ret >= 0) ret = str->l;
1457 else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1458 else ret = -1;
1459 break;
1460
1461 case gzip:
1462 case bgzf:
1463 ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1464 break;
1465
1466 default:
1467 abort();
1468 }
1469
1470 ++fp->lineno;
1471 return ret;
1472 }
1473
hts_readlist(const char * string,int is_file,int * _n)1474 char **hts_readlist(const char *string, int is_file, int *_n)
1475 {
1476 unsigned int m = 0, n = 0;
1477 char **s = 0, **s_new;
1478 if ( is_file )
1479 {
1480 BGZF *fp = bgzf_open(string, "r");
1481 if ( !fp ) return NULL;
1482
1483 kstring_t str;
1484 str.s = 0; str.l = str.m = 0;
1485 while (bgzf_getline(fp, '\n', &str) >= 0)
1486 {
1487 if (str.l == 0) continue;
1488 if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
1489 goto err;
1490 s[n] = strdup(str.s);
1491 if (!s[n])
1492 goto err;
1493 n++;
1494 }
1495 bgzf_close(fp);
1496 free(str.s);
1497 }
1498 else
1499 {
1500 const char *q = string, *p = string;
1501 while ( 1 )
1502 {
1503 if (*p == ',' || *p == 0)
1504 {
1505 if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
1506 goto err;
1507 s[n] = (char*)calloc(p - q + 1, 1);
1508 if (!s[n])
1509 goto err;
1510 strncpy(s[n++], q, p - q);
1511 q = p + 1;
1512 }
1513 if ( !*p ) break;
1514 p++;
1515 }
1516 }
1517 // Try to shrink s to the minimum size needed
1518 s_new = (char**)realloc(s, n * sizeof(char*));
1519 if (!s_new)
1520 goto err;
1521
1522 s = s_new;
1523 assert(n < INT_MAX); // hts_resize() should ensure this
1524 *_n = n;
1525 return s;
1526
1527 err:
1528 for (m = 0; m < n; m++)
1529 free(s[m]);
1530 free(s);
1531 return NULL;
1532 }
1533
hts_readlines(const char * fn,int * _n)1534 char **hts_readlines(const char *fn, int *_n)
1535 {
1536 unsigned int m = 0, n = 0;
1537 char **s = 0, **s_new;
1538 BGZF *fp = bgzf_open(fn, "r");
1539 if ( fp ) { // read from file
1540 kstring_t str;
1541 str.s = 0; str.l = str.m = 0;
1542 while (bgzf_getline(fp, '\n', &str) >= 0) {
1543 if (str.l == 0) continue;
1544 if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
1545 goto err;
1546 s[n] = strdup(str.s);
1547 if (!s[n])
1548 goto err;
1549 n++;
1550 }
1551 bgzf_close(fp);
1552 free(str.s);
1553 } else if (*fn == ':') { // read from string
1554 const char *q, *p;
1555 for (q = p = fn + 1;; ++p)
1556 if (*p == ',' || *p == 0) {
1557 if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
1558 goto err;
1559 s[n] = (char*)calloc(p - q + 1, 1);
1560 if (!s[n])
1561 goto err;
1562 strncpy(s[n++], q, p - q);
1563 q = p + 1;
1564 if (*p == 0) break;
1565 }
1566 } else return 0;
1567 // Try to shrink s to the minimum size needed
1568 s_new = (char**)realloc(s, n * sizeof(char*));
1569 if (!s_new)
1570 goto err;
1571
1572 s = s_new;
1573 assert(n < INT_MAX); // hts_resize() should ensure this
1574 *_n = n;
1575 return s;
1576
1577 err:
1578 for (m = 0; m < n; m++)
1579 free(s[m]);
1580 free(s);
1581 return NULL;
1582 }
1583
1584 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1585 int hts_file_type(const char *fname)
1586 {
1587 int len = strlen(fname);
1588 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1589 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1590 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1591 if ( !strcmp("-",fname) ) return FT_STDIN;
1592
1593 hFILE *f = hopen(fname, "r");
1594 if (f == NULL) return 0;
1595
1596 htsFormat fmt;
1597 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1598 if (hclose(f) < 0) return 0;
1599
1600 switch (fmt.format) {
1601 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1602 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1603 default: return 0;
1604 }
1605 }
1606
hts_check_EOF(htsFile * fp)1607 int hts_check_EOF(htsFile *fp)
1608 {
1609 if (fp->format.compression == bgzf)
1610 return bgzf_check_EOF(hts_get_bgzfp(fp));
1611 else if (fp->format.format == cram)
1612 return cram_check_EOF(fp->fp.cram);
1613 else
1614 return 3;
1615 }
1616
1617
1618 /****************
1619 *** Indexing ***
1620 ****************/
1621
1622 #define HTS_MIN_MARKER_DIST 0x10000
1623
1624 // Finds the special meta bin
1625 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1626 #define META_BIN(idx) ((idx)->n_bins + 1)
1627
1628 #define pair64_lt(a,b) ((a).u < (b).u)
1629 #define pair64max_lt(a,b) ((a).u < (b).u || \
1630 ((a).u == (b).u && (a).max < (b).max))
1631
1632 KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
1633 KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
1634
1635 typedef struct {
1636 int32_t m, n;
1637 uint64_t loff;
1638 hts_pair64_t *list;
1639 } bins_t;
1640
1641 KHASH_MAP_INIT_INT(bin, bins_t)
1642 typedef khash_t(bin) bidx_t;
1643
1644 typedef struct {
1645 hts_pos_t n, m;
1646 uint64_t *offset;
1647 } lidx_t;
1648
1649 struct hts_idx_t {
1650 int fmt, min_shift, n_lvls, n_bins;
1651 uint32_t l_meta;
1652 int32_t n, m;
1653 uint64_t n_no_coor;
1654 bidx_t **bidx;
1655 lidx_t *lidx;
1656 uint8_t *meta; // MUST have a terminating NUL on the end
1657 int tbi_n, last_tbi_tid;
1658 struct {
1659 uint32_t last_bin, save_bin;
1660 hts_pos_t last_coor;
1661 int last_tid, save_tid, finished;
1662 uint64_t last_off, save_off;
1663 uint64_t off_beg, off_end;
1664 uint64_t n_mapped, n_unmapped;
1665 } z; // keep internal states
1666 };
1667
idx_format_name(int fmt)1668 static char * idx_format_name(int fmt) {
1669 switch (fmt) {
1670 case HTS_FMT_CSI: return "csi";
1671 case HTS_FMT_BAI: return "bai";
1672 case HTS_FMT_TBI: return "tbi";
1673 case HTS_FMT_CRAI: return "crai";
1674 default: return "unknown";
1675 }
1676 }
1677
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1678 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1679 {
1680 khint_t k;
1681 bins_t *l;
1682 int absent;
1683 k = kh_put(bin, b, bin, &absent);
1684 if (absent < 0) return -1; // Out of memory
1685 l = &kh_value(b, k);
1686 if (absent) {
1687 l->m = 1; l->n = 0;
1688 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1689 if (!l->list) {
1690 kh_del(bin, b, k);
1691 return -1;
1692 }
1693 } else if (l->n == l->m) {
1694 uint32_t new_m = l->m ? l->m << 1 : 1;
1695 hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1696 if (!new_list) return -1;
1697 l->list = new_list;
1698 l->m = new_m;
1699 }
1700 l->list[l->n].u = beg;
1701 l->list[l->n++].v = end;
1702 return 0;
1703 }
1704
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1705 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1706 {
1707 int i;
1708 hts_pos_t beg, end;
1709 beg = _beg >> min_shift;
1710 end = (_end - 1) >> min_shift;
1711 if (l->m < end + 1) {
1712 size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1713 uint64_t *new_offset;
1714
1715 new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1716 if (!new_offset) return -1;
1717
1718 // fill unused memory with (uint64_t)-1
1719 memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1720 l->m = new_m;
1721 l->offset = new_offset;
1722 }
1723 for (i = beg; i <= end; ++i) {
1724 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1725 }
1726 if (l->n < end + 1) l->n = end + 1;
1727 return 0;
1728 }
1729
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1730 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1731 {
1732 hts_idx_t *idx;
1733 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1734 if (idx == NULL) return NULL;
1735 idx->fmt = fmt;
1736 idx->min_shift = min_shift;
1737 idx->n_lvls = n_lvls;
1738 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1739 idx->z.save_tid = idx->z.last_tid = -1;
1740 idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
1741 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1742 idx->z.last_coor = 0xffffffffu;
1743 if (n) {
1744 idx->n = idx->m = n;
1745 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1746 if (idx->bidx == NULL) { free(idx); return NULL; }
1747 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1748 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1749 }
1750 idx->tbi_n = -1;
1751 idx->last_tbi_tid = -1;
1752 return idx;
1753 }
1754
update_loff(hts_idx_t * idx,int i,int free_lidx)1755 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1756 {
1757 bidx_t *bidx = idx->bidx[i];
1758 lidx_t *lidx = &idx->lidx[i];
1759 khint_t k;
1760 int l;
1761 uint64_t offset0 = 0;
1762 if (bidx) {
1763 k = kh_get(bin, bidx, META_BIN(idx));
1764 if (k != kh_end(bidx))
1765 offset0 = kh_val(bidx, k).list[0].u;
1766 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1767 lidx->offset[l] = offset0;
1768 } else l = 1;
1769 for (; l < lidx->n; ++l) // fill missing values
1770 if (lidx->offset[l] == (uint64_t)-1)
1771 lidx->offset[l] = lidx->offset[l-1];
1772 if (bidx == 0) return;
1773 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1774 if (kh_exist(bidx, k))
1775 {
1776 if ( kh_key(bidx, k) < idx->n_bins )
1777 {
1778 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1779 // disable linear index if bot_bin out of bounds
1780 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1781 }
1782 else
1783 kh_val(bidx, k).loff = 0;
1784 }
1785 if (free_lidx) {
1786 free(lidx->offset);
1787 lidx->m = lidx->n = 0;
1788 lidx->offset = 0;
1789 }
1790 }
1791
compress_binning(hts_idx_t * idx,int i)1792 static int compress_binning(hts_idx_t *idx, int i)
1793 {
1794 bidx_t *bidx = idx->bidx[i];
1795 khint_t k;
1796 int l, m;
1797 if (bidx == 0) return 0;
1798 // merge a bin to its parent if the bin is too small
1799 for (l = idx->n_lvls; l > 0; --l) {
1800 unsigned start = hts_bin_first(l);
1801 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1802 bins_t *p, *q;
1803 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1804 p = &kh_value(bidx, k);
1805 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1806 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1807 khint_t kp;
1808 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1809 if (kp == kh_end(bidx)) continue;
1810 q = &kh_val(bidx, kp);
1811 if (q->n + p->n > q->m) {
1812 uint32_t new_m = q->n + p->n;
1813 hts_pair64_t *new_list;
1814 kroundup32(new_m);
1815 if (new_m > INT32_MAX) return -1; // Limited by index format
1816 new_list = realloc(q->list, new_m * sizeof(*new_list));
1817 if (!new_list) return -1;
1818 q->m = new_m;
1819 q->list = new_list;
1820 }
1821 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1822 q->n += p->n;
1823 free(p->list);
1824 kh_del(bin, bidx, k);
1825 }
1826 }
1827 }
1828 k = kh_get(bin, bidx, 0);
1829 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1830 // merge adjacent chunks that start from the same BGZF block
1831 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1832 bins_t *p;
1833 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1834 p = &kh_value(bidx, k);
1835 for (l = 1, m = 0; l < p->n; ++l) {
1836 if (p->list[m].v>>16 >= p->list[l].u>>16) {
1837 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1838 } else p->list[++m] = p->list[l];
1839 }
1840 p->n = m + 1;
1841 }
1842 return 0;
1843 }
1844
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1845 int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1846 {
1847 int i, ret = 0;
1848 if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
1849 if (idx->z.save_tid >= 0) {
1850 ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1851 ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1852 ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1853 }
1854 for (i = 0; i < idx->n; ++i) {
1855 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1856 ret |= compress_binning(idx, i);
1857 }
1858 idx->z.finished = 1;
1859
1860 return ret;
1861 }
1862
hts_idx_check_range(hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end)1863 int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1864 {
1865 int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1866 if (tid < 0 || (beg <= maxpos && end <= maxpos))
1867 return 0;
1868
1869 if (idx->fmt == HTS_FMT_CSI) {
1870 hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
1871 " cannot be stored in a csi index. "
1872 "Please check headers match the data",
1873 beg, end);
1874 } else {
1875 hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
1876 " cannot be stored in a %s index. Try using a csi index",
1877 beg, end, idx_format_name(idx->fmt));
1878 }
1879 errno = ERANGE;
1880 return -1;
1881 }
1882
hts_idx_push(hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,uint64_t offset,int is_mapped)1883 int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
1884 {
1885 int bin;
1886 if (tid<0) beg = -1, end = 0;
1887 if (hts_idx_check_range(idx, tid, beg, end) < 0)
1888 return -1;
1889 if (tid >= idx->m) { // enlarge the index
1890 uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1891 bidx_t **new_bidx;
1892 lidx_t *new_lidx;
1893
1894 new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1895 if (!new_bidx) return -1;
1896 idx->bidx = new_bidx;
1897
1898 new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1899 if (!new_lidx) return -1;
1900 idx->lidx = new_lidx;
1901
1902 memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1903 memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1904 idx->m = new_m;
1905 }
1906 if (idx->n < tid + 1) idx->n = tid + 1;
1907 if (idx->z.finished) return 0;
1908 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1909 if ( tid>=0 && idx->n_no_coor )
1910 {
1911 hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
1912 return -1;
1913 }
1914 if (tid>=0 && idx->bidx[tid] != 0)
1915 {
1916 hts_log_error("Chromosome blocks not continuous");
1917 return -1;
1918 }
1919 idx->z.last_tid = tid;
1920 idx->z.last_bin = 0xffffffffu;
1921 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1922 hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
1923 return -1;
1924 }
1925 if (end < beg) {
1926 // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
1927 hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
1928 return -1;
1929 }
1930 if ( tid>=0 )
1931 {
1932 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1933 if (is_mapped) {
1934 // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1935 if (beg < 0) beg = 0;
1936 if (end <= 0) end = 1;
1937 // idx->z.last_off points to the start of the current record
1938 if (insert_to_l(&idx->lidx[tid], beg, end,
1939 idx->z.last_off, idx->min_shift) < 0) return -1;
1940 }
1941 }
1942 else idx->n_no_coor++;
1943 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1944 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1945 if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1946 if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1947 idx->z.save_off, idx->z.last_off) < 0) return -1;
1948 }
1949 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1950 idx->z.off_end = idx->z.last_off;
1951 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1952 idx->z.off_beg, idx->z.off_end) < 0) return -1;
1953 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1954 idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1955 idx->z.n_mapped = idx->z.n_unmapped = 0;
1956 idx->z.off_beg = idx->z.off_end;
1957 }
1958 idx->z.save_off = idx->z.last_off;
1959 idx->z.save_bin = idx->z.last_bin = bin;
1960 idx->z.save_tid = tid;
1961 }
1962 if (is_mapped) ++idx->z.n_mapped;
1963 else ++idx->z.n_unmapped;
1964 idx->z.last_off = offset;
1965 idx->z.last_coor = beg;
1966 return 0;
1967 }
1968
1969 // Needed for TBI only. Ensure 'tid' with 'name' is in the index meta data.
1970 // idx->meta needs to have been initialised first with an appropriate Tabix
1971 // configuration via hts_idx_set_meta.
1972 //
1973 // NB number of references (first 4 bytes of tabix header) aren't in
1974 // idx->meta, but held in idx->n instead.
hts_idx_tbi_name(hts_idx_t * idx,int tid,const char * name)1975 int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
1976 // Horrid - we have to map incoming tid to a tbi alternative tid.
1977 // This is because TBI counts tids by "covered" refs while everything
1978 // else counts by Nth SQ/contig record in header.
1979 if (tid == idx->last_tbi_tid || tid < 0 || !name)
1980 return idx->tbi_n;
1981
1982 uint32_t len = strlen(name)+1;
1983 uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
1984 if (!tmp)
1985 return -1;
1986
1987 // Append name
1988 idx->meta = tmp;
1989 strcpy((char *)idx->meta + idx->l_meta, name);
1990 idx->l_meta += len;
1991
1992 // Update seq length
1993 u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
1994
1995 idx->last_tbi_tid = tid;
1996 return ++idx->tbi_n;
1997 }
1998
1999 // When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2000 // loop. idx->z.last_off is the previous bzgf_tell location, so we know
2001 // the location the current bam record started at as well as where it ends.
2002 //
2003 // When building an index on the fly via a write_bam / hts_idx_push loop,
2004 // this isn't quite identical as we may amend the virtual coord returned
2005 // by bgzf_tell to the start of a new block if the next bam struct doesn't
2006 // fit. It's essentially the same thing, but for bit-identical indices
2007 // we need to amend the idx->z.last_off when we know we're starting a new
2008 // block.
hts_idx_amend_last(hts_idx_t * idx,uint64_t offset)2009 void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2010 {
2011 idx->z.last_off = offset;
2012 }
2013
hts_idx_destroy(hts_idx_t * idx)2014 void hts_idx_destroy(hts_idx_t *idx)
2015 {
2016 khint_t k;
2017 int i;
2018 if (idx == 0) return;
2019
2020 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2021 if (idx->fmt == HTS_FMT_CRAI) {
2022 hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2023 cram_index_free(cidx->cram);
2024 free(cidx);
2025 return;
2026 }
2027
2028 for (i = 0; i < idx->m; ++i) {
2029 bidx_t *bidx = idx->bidx[i];
2030 free(idx->lidx[i].offset);
2031 if (bidx == 0) continue;
2032 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2033 if (kh_exist(bidx, k))
2034 free(kh_value(bidx, k).list);
2035 kh_destroy(bin, bidx);
2036 }
2037 free(idx->bidx); free(idx->lidx); free(idx->meta);
2038 free(idx);
2039 }
2040
hts_idx_fmt(hts_idx_t * idx)2041 int hts_idx_fmt(hts_idx_t *idx) {
2042 return idx->fmt;
2043 }
2044
2045 // The optimizer eliminates these ed_is_big() calls; still it would be good to
2046 // TODO Determine endianness at configure- or compile-time
2047
idx_write_int32(BGZF * fp,int32_t x)2048 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2049 {
2050 if (ed_is_big()) x = ed_swap_4(x);
2051 return bgzf_write(fp, &x, sizeof x);
2052 }
2053
idx_write_uint32(BGZF * fp,uint32_t x)2054 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2055 {
2056 if (ed_is_big()) x = ed_swap_4(x);
2057 return bgzf_write(fp, &x, sizeof x);
2058 }
2059
idx_write_uint64(BGZF * fp,uint64_t x)2060 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2061 {
2062 if (ed_is_big()) x = ed_swap_8(x);
2063 return bgzf_write(fp, &x, sizeof x);
2064 }
2065
swap_bins(bins_t * p)2066 static inline void swap_bins(bins_t *p)
2067 {
2068 int i;
2069 for (i = 0; i < p->n; ++i) {
2070 ed_swap_8p(&p->list[i].u);
2071 ed_swap_8p(&p->list[i].v);
2072 }
2073 }
2074
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)2075 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2076 {
2077 int32_t i, j;
2078
2079 #define check(ret) if ((ret) < 0) return -1
2080
2081 // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2082 //
2083 // NOTE: CSI meta is undefined in spec, so this code has an assumption
2084 // that we're only using it for Tabix data.
2085 int nids = idx->n;
2086 if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2087 for (i = nids = 0; i < idx->n; ++i) {
2088 if (idx->bidx[i])
2089 nids++;
2090 }
2091 }
2092 check(idx_write_int32(fp, nids));
2093 if (fmt == HTS_FMT_TBI && idx->l_meta)
2094 check(bgzf_write(fp, idx->meta, idx->l_meta));
2095
2096 for (i = 0; i < idx->n; ++i) {
2097 khint_t k;
2098 bidx_t *bidx = idx->bidx[i];
2099 lidx_t *lidx = &idx->lidx[i];
2100
2101 // write binning index
2102 if (nids == idx->n || bidx)
2103 check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2104 if (bidx)
2105 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2106 if (kh_exist(bidx, k)) {
2107 bins_t *p = &kh_value(bidx, k);
2108 check(idx_write_uint32(fp, kh_key(bidx, k)));
2109 if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2110 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2111 check(idx_write_int32(fp, p->n));
2112 for (j = 0; j < p->n; ++j) {
2113 //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2114 check(idx_write_uint64(fp, p->list[j].u));
2115 check(idx_write_uint64(fp, p->list[j].v));
2116 }
2117 }
2118
2119 // write linear index
2120 if (fmt != HTS_FMT_CSI) {
2121 check(idx_write_int32(fp, lidx->n));
2122 for (j = 0; j < lidx->n; ++j)
2123 check(idx_write_uint64(fp, lidx->offset[j]));
2124 }
2125 }
2126
2127 check(idx_write_uint64(fp, idx->n_no_coor));
2128 return 0;
2129 #undef check
2130 }
2131
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)2132 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2133 {
2134 int ret, save;
2135 if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2136 char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2137 if (fnidx == NULL) return -1;
2138
2139 strcpy(fnidx, fn);
2140 switch (fmt) {
2141 case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2142 case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2143 case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2144 default: abort();
2145 }
2146
2147 ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2148 save = errno;
2149 free(fnidx);
2150 errno = save;
2151 return ret;
2152 }
2153
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)2154 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2155 {
2156 BGZF *fp;
2157
2158 #define check(ret) if ((ret) < 0) goto fail
2159
2160 if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
2161
2162 fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2163 if (fp == NULL) return -1;
2164
2165 if (fmt == HTS_FMT_CSI) {
2166 check(bgzf_write(fp, "CSI\1", 4));
2167 check(idx_write_int32(fp, idx->min_shift));
2168 check(idx_write_int32(fp, idx->n_lvls));
2169 check(idx_write_uint32(fp, idx->l_meta));
2170 if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2171 } else if (fmt == HTS_FMT_TBI) {
2172 check(bgzf_write(fp, "TBI\1", 4));
2173 } else if (fmt == HTS_FMT_BAI) {
2174 check(bgzf_write(fp, "BAI\1", 4));
2175 } else abort();
2176
2177 check(hts_idx_save_core(idx, fp, fmt));
2178
2179 return bgzf_close(fp);
2180 #undef check
2181
2182 fail:
2183 bgzf_close(fp);
2184 return -1;
2185 }
2186
idx_read_core(hts_idx_t * idx,BGZF * fp,int fmt)2187 static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2188 {
2189 int32_t i, n, is_be;
2190 is_be = ed_is_big();
2191 if (idx == NULL) return -4;
2192 for (i = 0; i < idx->n; ++i) {
2193 bidx_t *h;
2194 lidx_t *l = &idx->lidx[i];
2195 uint32_t key;
2196 int j, absent;
2197 bins_t *p;
2198 h = idx->bidx[i] = kh_init(bin);
2199 if (bgzf_read(fp, &n, 4) != 4) return -1;
2200 if (is_be) ed_swap_4p(&n);
2201 if (n < 0) return -3;
2202 for (j = 0; j < n; ++j) {
2203 khint_t k;
2204 if (bgzf_read(fp, &key, 4) != 4) return -1;
2205 if (is_be) ed_swap_4p(&key);
2206 k = kh_put(bin, h, key, &absent);
2207 if (absent < 0) return -2; // No memory
2208 if (absent == 0) return -3; // Duplicate bin number
2209 p = &kh_val(h, k);
2210 if (fmt == HTS_FMT_CSI) {
2211 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2212 if (is_be) ed_swap_8p(&p->loff);
2213 } else p->loff = 0;
2214 if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2215 if (is_be) ed_swap_4p(&p->n);
2216 if (p->n < 0) return -3;
2217 if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2218 p->m = p->n;
2219 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2220 if (p->list == NULL) return -2;
2221 if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2222 if (is_be) swap_bins(p);
2223 }
2224 if (fmt != HTS_FMT_CSI) { // load linear index
2225 int j;
2226 uint32_t x;
2227 if (bgzf_read(fp, &x, 4) != 4) return -1;
2228 if (is_be) ed_swap_4p(&x);
2229 l->n = x;
2230 if (l->n < 0) return -3;
2231 if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2232 l->m = l->n;
2233 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2234 if (l->offset == NULL) return -2;
2235 if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2236 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2237 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
2238 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
2239 update_loff(idx, i, 0);
2240 }
2241 }
2242 if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2243 if (is_be) ed_swap_8p(&idx->n_no_coor);
2244 return 0;
2245 }
2246
idx_read(const char * fn)2247 static hts_idx_t *idx_read(const char *fn)
2248 {
2249 uint8_t magic[4];
2250 int i, is_be;
2251 hts_idx_t *idx = NULL;
2252 uint8_t *meta = NULL;
2253 BGZF *fp = bgzf_open(fn, "r");
2254 if (fp == NULL) return NULL;
2255 is_be = ed_is_big();
2256 if (bgzf_read(fp, magic, 4) != 4) goto fail;
2257
2258 if (memcmp(magic, "CSI\1", 4) == 0) {
2259 uint32_t x[3], n;
2260 if (bgzf_read(fp, x, 12) != 12) goto fail;
2261 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2262 if (x[2]) {
2263 if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2264 if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2265 if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2266 // Prevent possible strlen past the end in tbx_index_load2
2267 meta[x[2]] = '\0';
2268 }
2269 if (bgzf_read(fp, &n, 4) != 4) goto fail;
2270 if (is_be) ed_swap_4p(&n);
2271 if (n > INT32_MAX) goto fail;
2272 if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2273 idx->l_meta = x[2];
2274 idx->meta = meta;
2275 meta = NULL;
2276 if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2277 }
2278 else if (memcmp(magic, "TBI\1", 4) == 0) {
2279 uint8_t x[8 * 4];
2280 uint32_t n;
2281 // Read file header
2282 if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2283 n = le_to_u32(&x[0]); // location of n_ref
2284 if (n > INT32_MAX) goto fail;
2285 if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2286 n = le_to_u32(&x[7*4]); // location of l_nm
2287 if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2288 idx->l_meta = 28 + n;
2289 if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2290 // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2291 // N.B. left in little-endian byte order.
2292 memcpy(idx->meta, &x[1*4], 28);
2293 // Read in sequence names.
2294 if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2295 // Prevent possible strlen past the end in tbx_index_load2
2296 idx->meta[idx->l_meta] = '\0';
2297 if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2298 }
2299 else if (memcmp(magic, "BAI\1", 4) == 0) {
2300 uint32_t n;
2301 if (bgzf_read(fp, &n, 4) != 4) goto fail;
2302 if (is_be) ed_swap_4p(&n);
2303 if (n > INT32_MAX) goto fail;
2304 if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2305 if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2306 }
2307 else { errno = EINVAL; goto fail; }
2308
2309 bgzf_close(fp);
2310 return idx;
2311
2312 fail:
2313 bgzf_close(fp);
2314 hts_idx_destroy(idx);
2315 free(meta);
2316 return NULL;
2317 }
2318
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)2319 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
2320 int is_copy)
2321 {
2322 uint8_t *new_meta = meta;
2323 if (is_copy) {
2324 size_t l = l_meta;
2325 if (l > SIZE_MAX - 1) {
2326 errno = ENOMEM;
2327 return -1;
2328 }
2329 new_meta = malloc(l + 1);
2330 if (!new_meta) return -1;
2331 memcpy(new_meta, meta, l);
2332 // Prevent possible strlen past the end in tbx_index_load2
2333 new_meta[l] = '\0';
2334 }
2335 if (idx->meta) free(idx->meta);
2336 idx->l_meta = l_meta;
2337 idx->meta = new_meta;
2338 return 0;
2339 }
2340
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)2341 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
2342 {
2343 *l_meta = idx->l_meta;
2344 return idx->meta;
2345 }
2346
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)2347 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
2348 {
2349 if ( !idx->n )
2350 {
2351 *n = 0;
2352 return NULL;
2353 }
2354
2355 int tid = 0, i;
2356 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
2357 for (i=0; i<idx->n; i++)
2358 {
2359 bidx_t *bidx = idx->bidx[i];
2360 if ( !bidx ) continue;
2361 names[tid++] = getid(hdr,i);
2362 }
2363 *n = tid;
2364 return names;
2365 }
2366
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)2367 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
2368 {
2369 if ( idx->fmt == HTS_FMT_CRAI ) {
2370 *mapped = 0; *unmapped = 0;
2371 return -1;
2372 }
2373
2374 bidx_t *h = idx->bidx[tid];
2375 khint_t k = kh_get(bin, h, META_BIN(idx));
2376 if (k != kh_end(h)) {
2377 *mapped = kh_val(h, k).list[1].u;
2378 *unmapped = kh_val(h, k).list[1].v;
2379 return 0;
2380 } else {
2381 *mapped = 0; *unmapped = 0;
2382 return -1;
2383 }
2384 }
2385
hts_idx_get_n_no_coor(const hts_idx_t * idx)2386 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
2387 {
2388 return idx->n_no_coor;
2389 }
2390
2391 /****************
2392 *** Iterator ***
2393 ****************/
2394
2395 // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)2396 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
2397 {
2398 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
2399 if (beg >= end) return 0;
2400 if (end >= 1LL<<s) end = 1LL<<s;
2401 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2402 hts_pos_t b, e;
2403 int n, i;
2404 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
2405 if (itr->bins.n + n > itr->bins.m) {
2406 itr->bins.m = itr->bins.n + n;
2407 kroundup32(itr->bins.m);
2408 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
2409 }
2410 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
2411 }
2412 return itr->bins.n;
2413 }
2414
reg2intervals(hts_itr_t * iter,const hts_idx_t * idx,int tid,int64_t beg,int64_t end,uint32_t interval,uint64_t min_off,uint64_t max_off,int min_shift,int n_lvls)2415 static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
2416 {
2417 int l, t, s;
2418 int i, j;
2419 hts_pos_t b, e;
2420 hts_pair64_max_t *off;
2421 bidx_t *bidx;
2422 khint_t k;
2423 int start_n_off = iter->n_off;
2424
2425 if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
2426 return -1;
2427
2428 s = min_shift + (n_lvls<<1) + n_lvls;
2429 if (end >= 1LL<<s)
2430 end = 1LL<<s;
2431
2432 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2433 b = t + (beg>>s); e = t + (end>>s);
2434
2435 for (i = b; i <= e; ++i) {
2436 if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) {
2437 bins_t *p = &kh_value(bidx, k);
2438
2439 if (p->n) {
2440 off = realloc(iter->off, (iter->n_off + p->n) * sizeof(*off));
2441 if (!off)
2442 return -2;
2443
2444 iter->off = off;
2445 for (j = 0; j < p->n; ++j) {
2446 if (p->list[j].v > min_off && p->list[j].u < max_off) {
2447 iter->off[iter->n_off].u = min_off > p->list[j].u
2448 ? min_off : p->list[j].u;
2449 iter->off[iter->n_off].v = max_off < p->list[j].v
2450 ? max_off : p->list[j].v;
2451 // hts_pair64_max_t::max is now used to link
2452 // file offsets to region list entries.
2453 // The iterator can use this to decide if it
2454 // can skip some file regions.
2455 iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
2456 iter->n_off++;
2457 }
2458 }
2459 }
2460 }
2461 }
2462 }
2463
2464 if (iter->n_off - start_n_off > 1) {
2465 ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
2466 for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
2467 if (iter->off[i].v >= iter->off[j].u) {
2468 if (iter->off[i].v < iter->off[j].v)
2469 iter->off[i].v = iter->off[j].v;
2470 } else {
2471 i++;
2472 if (i < j)
2473 iter->off[i] = iter->off[j];
2474 }
2475 }
2476 iter->n_off = i + 1;
2477 }
2478
2479 return iter->n_off;
2480 }
2481
compare_regions(const void * r1,const void * r2)2482 static int compare_regions(const void *r1, const void *r2) {
2483 hts_reglist_t *reg1 = (hts_reglist_t *)r1;
2484 hts_reglist_t *reg2 = (hts_reglist_t *)r2;
2485
2486 if (reg1->tid < 0 && reg2->tid >= 0)
2487 return 1;
2488 else if (reg1->tid >= 0 && reg2->tid < 0)
2489 return -1;
2490 else
2491 return reg1->tid - reg2->tid;
2492 }
2493
hts_itr_off(const hts_idx_t * idx,int tid)2494 uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
2495
2496 int i;
2497 bidx_t* bidx;
2498 uint64_t off0 = (uint64_t) -1;
2499 khint_t k;
2500 switch (tid) {
2501 case HTS_IDX_START:
2502 // Find the smallest offset, note that sequence ids may not be ordered sequentially
2503 for (i = 0; i < idx->n; i++) {
2504 bidx = idx->bidx[i];
2505 k = kh_get(bin, bidx, META_BIN(idx));
2506 if (k == kh_end(bidx))
2507 continue;
2508
2509 if (off0 > kh_val(bidx, k).list[0].u)
2510 off0 = kh_val(bidx, k).list[0].u;
2511 }
2512 if (off0 == (uint64_t) -1 && idx->n_no_coor)
2513 off0 = 0;
2514 // only no-coor reads in this bam
2515 break;
2516 case HTS_IDX_NOCOOR:
2517 /* No-coor reads sort after all of the mapped reads. The position
2518 is not stored in the index itself, so need to find the end
2519 offset for the last mapped read. A loop is needed here in
2520 case references at the end of the file have no mapped reads,
2521 or sequence ids are not ordered sequentially.
2522 See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
2523 for (i = 0; i < idx->n; i++) {
2524 bidx = idx->bidx[i];
2525 k = kh_get(bin, bidx, META_BIN(idx));
2526 if (k != kh_end(bidx)) {
2527 if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
2528 off0 = kh_val(bidx, k).list[0].v;
2529 }
2530 }
2531 }
2532 if (off0 == (uint64_t) -1 && idx->n_no_coor)
2533 off0 = 0;
2534 // only no-coor reads in this bam
2535 break;
2536 case HTS_IDX_REST:
2537 off0 = 0;
2538 break;
2539 case HTS_IDX_NONE:
2540 off0 = 0;
2541 break;
2542 }
2543
2544 return off0;
2545 }
2546
hts_itr_query(const hts_idx_t * idx,int tid,hts_pos_t beg,hts_pos_t end,hts_readrec_func * readrec)2547 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
2548 {
2549 int i, n_off, l, bin;
2550 hts_pair64_max_t *off;
2551 khint_t k;
2552 bidx_t *bidx;
2553 uint64_t min_off, max_off;
2554 hts_itr_t *iter;
2555 uint32_t unmapped = 0, rel_off;
2556
2557 // It's possible to call this function with NULL idx iff
2558 // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
2559 if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
2560 errno = EINVAL;
2561 return NULL;
2562 }
2563
2564 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2565 if (iter) {
2566 if (tid < 0) {
2567 uint64_t off = hts_itr_off(idx, tid);
2568 if (off != (uint64_t) -1) {
2569 iter->read_rest = 1;
2570 iter->curr_off = off;
2571 iter->readrec = readrec;
2572 if (tid == HTS_IDX_NONE)
2573 iter->finished = 1;
2574 } else {
2575 free(iter);
2576 iter = NULL;
2577 }
2578 } else {
2579 if (beg < 0) beg = 0;
2580 if (end < beg) {
2581 free(iter);
2582 return NULL;
2583 }
2584 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
2585 free(iter);
2586 return NULL;
2587 }
2588
2589 k = kh_get(bin, bidx, META_BIN(idx));
2590 if (k != kh_end(bidx))
2591 unmapped = kh_val(bidx, k).list[1].v;
2592 else
2593 unmapped = 1;
2594
2595 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
2596 iter->readrec = readrec;
2597
2598 if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2599
2600 rel_off = beg>>idx->min_shift;
2601 // compute min_off
2602 bin = hts_bin_first(idx->n_lvls) + rel_off;
2603 do {
2604 int first;
2605 k = kh_get(bin, bidx, bin);
2606 if (k != kh_end(bidx)) break;
2607 first = (hts_bin_parent(bin)<<3) + 1;
2608 if (bin > first) --bin;
2609 else bin = hts_bin_parent(bin);
2610 } while (bin);
2611 if (bin == 0) k = kh_get(bin, bidx, bin);
2612 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2613 // min_off can be calculated more accurately if the
2614 // linear index is available
2615 if (idx->lidx[tid].offset
2616 && rel_off < idx->lidx[tid].n) {
2617 if (min_off < idx->lidx[tid].offset[rel_off])
2618 min_off = idx->lidx[tid].offset[rel_off];
2619 if (unmapped) {
2620 int tmp_off;
2621 for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
2622 if (idx->lidx[tid].offset[tmp_off] < min_off) {
2623 min_off = idx->lidx[tid].offset[tmp_off];
2624 break;
2625 }
2626 }
2627
2628 if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0))
2629 min_off = kh_val(bidx, k).list[0].u;
2630 }
2631 } else if (unmapped) { //CSI index
2632 if (k != kh_end(bidx))
2633 min_off = kh_val(bidx, k).list[0].u;
2634 }
2635
2636 // compute max_off: a virtual offset from a bin to the right of end
2637 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2638 if (bin >= idx->n_bins) bin = 0;
2639 while (1) {
2640 // search for an extant bin by moving right, but moving up to the
2641 // parent whenever we get to a first child (which also covers falling
2642 // off the RHS, which wraps around and immediately goes up to bin 0)
2643 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2644 if (bin == 0) { max_off = (uint64_t)-1; break; }
2645 k = kh_get(bin, bidx, bin);
2646 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2647 bin++;
2648 }
2649
2650 // retrieve bins
2651 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2652
2653 for (i = n_off = 0; i < iter->bins.n; ++i)
2654 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2655 n_off += kh_value(bidx, k).n;
2656 if (n_off == 0) {
2657 // No overlapping bins means the iterator has already finished.
2658 iter->finished = 1;
2659 return iter;
2660 }
2661 off = calloc(n_off, sizeof(*off));
2662 for (i = n_off = 0; i < iter->bins.n; ++i) {
2663 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2664 int j;
2665 bins_t *p = &kh_value(bidx, k);
2666 for (j = 0; j < p->n; ++j)
2667 if (p->list[j].v > min_off && p->list[j].u < max_off) {
2668 off[n_off].u = min_off > p->list[j].u
2669 ? min_off : p->list[j].u;
2670 off[n_off].v = max_off < p->list[j].v
2671 ? max_off : p->list[j].v;
2672 // hts_pair64_max_t::max is now used to link
2673 // file offsets to region list entries.
2674 // The iterator can use this to decide if it
2675 // can skip some file regions.
2676 off[n_off].max = ((uint64_t) tid << 32) | j;
2677 n_off++;
2678 }
2679 }
2680 }
2681
2682 if (n_off == 0) {
2683 free(off);
2684 iter->finished = 1;
2685 return iter;
2686 }
2687 ks_introsort(_off_max, n_off, off);
2688 // resolve completely contained adjacent blocks
2689 for (i = 1, l = 0; i < n_off; ++i)
2690 if (off[l].v < off[i].v) off[++l] = off[i];
2691 n_off = l + 1;
2692 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2693 for (i = 1; i < n_off; ++i)
2694 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2695 // merge adjacent blocks
2696 for (i = 1, l = 0; i < n_off; ++i) {
2697 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2698 else off[++l] = off[i];
2699 }
2700 n_off = l + 1;
2701 iter->n_off = n_off; iter->off = off;
2702 }
2703 }
2704
2705 return iter;
2706 }
2707
hts_itr_multi_bam(const hts_idx_t * idx,hts_itr_t * iter)2708 int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
2709 {
2710 int i, j, bin;
2711 khint_t k;
2712 bidx_t *bidx;
2713 uint64_t min_off, max_off, t_off = (uint64_t)-1;
2714 int tid;
2715 hts_pos_t beg, end;
2716 hts_reglist_t *curr_reg;
2717 uint32_t unmapped = 0, rel_off;
2718
2719 if (!idx || !iter || !iter->multi)
2720 return -1;
2721
2722 iter->i = -1;
2723 for (i=0; i<iter->n_reg; i++) {
2724
2725 curr_reg = &iter->reg_list[i];
2726 tid = curr_reg->tid;
2727
2728 if (tid < 0) {
2729 t_off = hts_itr_off(idx, tid);
2730 if (t_off != (uint64_t)-1) {
2731 switch (tid) {
2732 case HTS_IDX_NONE:
2733 iter->finished = 1;
2734 // fall through
2735 case HTS_IDX_START:
2736 case HTS_IDX_REST:
2737 iter->curr_off = t_off;
2738 iter->n_reg = 0;
2739 iter->reg_list = NULL;
2740 iter->read_rest = 1;
2741 return 0;
2742 case HTS_IDX_NOCOOR:
2743 iter->nocoor = 1;
2744 iter->nocoor_off = t_off;
2745 }
2746 }
2747 } else {
2748 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
2749 continue;
2750
2751 k = kh_get(bin, bidx, META_BIN(idx));
2752 if (k != kh_end(bidx))
2753 unmapped = kh_val(bidx, k).list[1].v;
2754 else
2755 unmapped = 1;
2756
2757 for(j=0; j<curr_reg->count; j++) {
2758 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
2759 if (curr_intv->end < curr_intv->beg)
2760 continue;
2761
2762 beg = curr_intv->beg;
2763 end = curr_intv->end;
2764 rel_off = beg>>idx->min_shift;
2765
2766 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
2767 If the computed bin is not in the index, try the next bin to the
2768 left, belonging to the same parent. If it is the first sibling bin,
2769 try the parent bin. */
2770 bin = hts_bin_first(idx->n_lvls) + rel_off;
2771 do {
2772 int first;
2773 k = kh_get(bin, bidx, bin);
2774 if (k != kh_end(bidx)) break;
2775 first = (hts_bin_parent(bin)<<3) + 1;
2776 if (bin > first) --bin;
2777 else bin = hts_bin_parent(bin);
2778 } while (bin);
2779 if (bin == 0)
2780 k = kh_get(bin, bidx, bin);
2781 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2782 // min_off can be calculated more accurately if the
2783 // linear index is available
2784 if (idx->lidx[tid].offset
2785 && rel_off < idx->lidx[tid].n) {
2786 if (min_off < idx->lidx[tid].offset[rel_off])
2787 min_off = idx->lidx[tid].offset[rel_off];
2788 if (unmapped) {
2789 int tmp_off;
2790 for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
2791 if (idx->lidx[tid].offset[tmp_off] < min_off) {
2792 min_off = idx->lidx[tid].offset[tmp_off];
2793 break;
2794 }
2795 }
2796
2797 if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0))
2798 min_off = kh_val(bidx, k).list[0].u;
2799 }
2800 } else if (unmapped) { //CSI index
2801 if (k != kh_end(bidx))
2802 min_off = kh_val(bidx, k).list[0].u;
2803 }
2804
2805 // compute max_off: a virtual offset from a bin to the right of end
2806 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2807 if (bin >= idx->n_bins) bin = 0;
2808 while (1) {
2809 // search for an extant bin by moving right, but moving up to the
2810 // parent whenever we get to a first child (which also covers falling
2811 // off the RHS, which wraps around and immediately goes up to bin 0)
2812 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2813 if (bin == 0) { max_off = (uint64_t)-1; break; }
2814 k = kh_get(bin, bidx, bin);
2815 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
2816 max_off = kh_val(bidx, k).list[0].u;
2817 break;
2818 }
2819 bin++;
2820 }
2821
2822 //convert coordinates to file offsets
2823 if (reg2intervals(iter, idx, tid, beg, end, j,
2824 min_off, max_off,
2825 idx->min_shift, idx->n_lvls) < 0) {
2826 return -1;
2827 }
2828 }
2829 }
2830 }
2831
2832 if (iter->n_off > 1)
2833 ks_introsort(_off_max, iter->n_off, iter->off);
2834
2835 if(!iter->n_off && !iter->nocoor)
2836 iter->finished = 1;
2837
2838 return 0;
2839 }
2840
hts_itr_multi_cram(const hts_idx_t * idx,hts_itr_t * iter)2841 int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
2842 {
2843 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
2844 int tid, i, n_off = 0;
2845 uint32_t j;
2846 hts_pos_t beg, end;
2847 hts_reglist_t *curr_reg;
2848 hts_pair32_t *curr_intv;
2849 hts_pair64_max_t *off = NULL, *tmp;
2850 cram_index *e = NULL;
2851
2852 if (!cidx || !iter || !iter->multi)
2853 return -1;
2854
2855 iter->is_cram = 1;
2856 iter->read_rest = 0;
2857 iter->off = NULL;
2858 iter->n_off = 0;
2859 iter->curr_off = 0;
2860 iter->i = -1;
2861
2862 for (i=0; i<iter->n_reg; i++) {
2863
2864 curr_reg = &iter->reg_list[i];
2865 tid = curr_reg->tid;
2866
2867 if (tid >= 0) {
2868 tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
2869 if (!tmp)
2870 goto err;
2871 off = tmp;
2872
2873 for (j=0; j < curr_reg->count; j++) {
2874 curr_intv = &curr_reg->intervals[j];
2875 if (curr_intv->end < curr_intv->beg)
2876 continue;
2877
2878 beg = curr_intv->beg;
2879 end = curr_intv->end;
2880
2881 /* First, fetch the container overlapping 'beg' and assign its file offset to u, then
2882 * find the container overlapping 'end' and assign the relative end of the slice to v.
2883 * The cram_ptell function will adjust with the container offset, which is not stored
2884 * in the index.
2885 */
2886 e = cram_index_query(cidx->cram, tid, beg+1, NULL);
2887 if (e) {
2888 off[n_off].u = e->offset;
2889 // hts_pair64_max_t::max is now used to link
2890 // file offsets to region list entries.
2891 // The iterator can use this to decide if it
2892 // can skip some file regions.
2893 off[n_off].max = ((uint64_t) tid << 32) | j;
2894
2895 if (end >= HTS_POS_MAX) {
2896 e = cram_index_last(cidx->cram, tid, NULL);
2897 } else {
2898 e = cram_index_query_last(cidx->cram, tid, end+1);
2899 }
2900
2901 if (e) {
2902 off[n_off++].v = e->next
2903 ? e->next
2904 : e->offset + e->slice + e->len;
2905 } else {
2906 hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
2907 }
2908 } else {
2909 hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end);
2910 }
2911 }
2912 } else {
2913 switch (tid) {
2914 case HTS_IDX_NOCOOR:
2915 e = cram_index_query(cidx->cram, tid, 1, NULL);
2916 if (e) {
2917 iter->nocoor = 1;
2918 iter->nocoor_off = e->offset;
2919 } else {
2920 hts_log_warning("No index entry for NOCOOR region");
2921 }
2922 break;
2923 case HTS_IDX_START:
2924 e = cram_index_query(cidx->cram, tid, 1, NULL);
2925 if (e) {
2926 iter->read_rest = 1;
2927 tmp = realloc(off, sizeof(*off));
2928 if (!tmp)
2929 goto err;
2930 off = tmp;
2931 off[0].u = e->offset;
2932 off[0].v = 0;
2933 n_off=1;
2934 } else {
2935 hts_log_warning("No index entries");
2936 }
2937 break;
2938 case HTS_IDX_REST:
2939 break;
2940 case HTS_IDX_NONE:
2941 iter->finished = 1;
2942 break;
2943 default:
2944 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
2945 }
2946 }
2947 }
2948
2949 if (n_off) {
2950 ks_introsort(_off_max, n_off, off);
2951 iter->n_off = n_off; iter->off = off;
2952 }
2953
2954 if(!n_off && !iter->nocoor)
2955 iter->finished = 1;
2956
2957 return 0;
2958
2959 err:
2960 free(off);
2961 return -1;
2962 }
2963
hts_itr_destroy(hts_itr_t * iter)2964 void hts_itr_destroy(hts_itr_t *iter)
2965 {
2966 if (iter) {
2967 if (iter->multi) {
2968 hts_reglist_free(iter->reg_list, iter->n_reg);
2969 } else {
2970 free(iter->bins.a);
2971 }
2972
2973 if (iter->off)
2974 free(iter->off);
2975 free(iter);
2976 }
2977 }
2978
push_digit(long long i,char c)2979 static inline long long push_digit(long long i, char c)
2980 {
2981 // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2982 int digit = c - '0';
2983 return 10 * i + digit;
2984 }
2985
hts_parse_decimal(const char * str,char ** strend,int flags)2986 long long hts_parse_decimal(const char *str, char **strend, int flags)
2987 {
2988 long long n = 0;
2989 int decimals = 0, e = 0, lost = 0;
2990 char sign = '+', esign = '+';
2991 const char *s;
2992
2993 while (isspace_c(*str)) str++;
2994 s = str;
2995
2996 if (*s == '+' || *s == '-') sign = *s++;
2997 while (*s)
2998 if (isdigit_c(*s)) n = push_digit(n, *s++);
2999 else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3000 else break;
3001
3002 if (*s == '.') {
3003 s++;
3004 while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
3005 }
3006
3007 if (*s == 'E' || *s == 'e') {
3008 s++;
3009 if (*s == '+' || *s == '-') esign = *s++;
3010 while (isdigit_c(*s)) e = push_digit(e, *s++);
3011 if (esign == '-') e = -e;
3012 }
3013
3014 switch (*s) {
3015 case 'k': case 'K': e += 3; s++; break;
3016 case 'm': case 'M': e += 6; s++; break;
3017 case 'g': case 'G': e += 9; s++; break;
3018 }
3019
3020 e -= decimals;
3021 while (e > 0) n *= 10, e--;
3022 while (e < 0) lost += n % 10, n /= 10, e++;
3023
3024 if (lost > 0) {
3025 hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3026 }
3027
3028 if (strend) {
3029 *strend = (char *)s;
3030 } else if (*s) {
3031 if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3032 hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3033 }
3034
3035 return (sign == '+')? n : -n;
3036 }
3037
hts_memrchr(const void * s,int c,size_t n)3038 static void *hts_memrchr(const void *s, int c, size_t n) {
3039 size_t i;
3040 unsigned char *u = (unsigned char *)s;
3041 for (i = n; i > 0; i--) {
3042 if (u[i-1] == c)
3043 return u+i-1;
3044 }
3045
3046 return NULL;
3047 }
3048
3049 /*
3050 * A variant of hts_parse_reg which is reference-id aware. It uses
3051 * the iterator name2id callbacks to validate the region tokenisation works.
3052 *
3053 * This is necessary due to GRCh38 HLA additions which have reference names
3054 * like "HLA-DRB1*12:17".
3055 *
3056 * All parameters are mandatory.
3057 *
3058 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3059 * are reference names, we may quote using curly braces.
3060 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3061 *
3062 * Flags are used to control how parsing works, and can be one of the below.
3063 *
3064 * HTS_PARSE_LIST:
3065 * If present, the region is assmed to be a comma separated list and
3066 * position parsing will not contain commas (this implicitly
3067 * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3068 * On success the return pointer will be the start of the next region, ie
3069 * the character after the comma. (If *ret != '\0' then the caller can
3070 * assume another region is present in the list.)
3071 *
3072 * If not set then positions may contain commas. In this case the return
3073 * value should point to the end of the string, or NULL on failure.
3074 *
3075 * HTS_PARSE_ONE_COORD:
3076 * If present, X:100 is treated as the single base pair region X:100-100.
3077 * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3078 * (This is the standard bcftools region convention.)
3079 *
3080 * When not set X:100 is considered to be X:100-<end> where <end> is
3081 * the end of chromosome X (set to HTS_POS_MAX here). X:100- and X:-100
3082 * are invalid.
3083 * (This is the standard samtools region convention.)
3084 *
3085 * Note the supplied string expects 1 based inclusive coordinates, but the
3086 * returned coordinates start from 0 and are half open, so pos0 is valid
3087 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3088 *
3089 * On success a pointer to the byte after the end of the entire region
3090 * specifier is returned (plus any trailing comma), and tid,
3091 * beg & end will be set.
3092 * On failure NULL is returned.
3093 */
hts_parse_region(const char * s,int * tid,hts_pos_t * beg,hts_pos_t * end,hts_name2id_f getid,void * hdr,int flags)3094 const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3095 hts_pos_t *end, hts_name2id_f getid, void *hdr,
3096 int flags)
3097 {
3098 if (!s || !tid || !beg || !end || !getid)
3099 return NULL;
3100
3101 size_t s_len = strlen(s);
3102 kstring_t ks = { 0, 0, NULL };
3103
3104 const char *colon = NULL, *comma = NULL;
3105 int quoted = 0;
3106
3107 if (flags & HTS_PARSE_LIST)
3108 flags &= ~HTS_PARSE_THOUSANDS_SEP;
3109 else
3110 flags |= HTS_PARSE_THOUSANDS_SEP;
3111
3112 const char *s_end = s + s_len;
3113
3114 // Braced quoting of references is permitted to resolve ambiguities.
3115 if (*s == '{') {
3116 const char *close = memchr(s, '}', s_len);
3117 if (!close) {
3118 hts_log_error("Mismatching braces in \"%s\"", s);
3119 *tid = -1;
3120 return NULL;
3121 }
3122 s++;
3123 s_len--;
3124 if (close[1] == ':')
3125 colon = close+1;
3126 quoted = 1; // number of trailing characters to trim
3127
3128 // Truncate to this item only, if appropriate.
3129 if (flags & HTS_PARSE_LIST) {
3130 comma = strchr(close, ',');
3131 if (comma) {
3132 s_len = comma-s;
3133 s_end = comma+1;
3134 }
3135 }
3136 } else {
3137 // Truncate to this item only, if appropriate.
3138 if (flags & HTS_PARSE_LIST) {
3139 comma = strchr(s, ',');
3140 if (comma) {
3141 s_len = comma-s;
3142 s_end = comma+1;
3143 }
3144 }
3145
3146 colon = hts_memrchr(s, ':', s_len);
3147 }
3148
3149 // No colon is simplest case; just check and return.
3150 if (colon == NULL) {
3151 *beg = 0; *end = HTS_POS_MAX;
3152 kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
3153 if (!ks.s) {
3154 *tid = -2;
3155 return NULL;
3156 }
3157
3158 *tid = getid(hdr, ks.s);
3159 free(ks.s);
3160
3161 return *tid >= 0 ? s_end : NULL;
3162 }
3163
3164 // Has a colon, but check whole name first.
3165 if (!quoted) {
3166 *beg = 0; *end = HTS_POS_MAX;
3167 kputsn(s, s_len, &ks); // convert to nul terminated string
3168 if (!ks.s) {
3169 *tid = -2;
3170 return NULL;
3171 }
3172 if ((*tid = getid(hdr, ks.s)) >= 0) {
3173 // Entire name matches, but also check this isn't
3174 // ambiguous. eg we have ref chr1 and ref chr1:100-200
3175 // both present.
3176 ks.l = 0;
3177 kputsn(s, colon-s, &ks); // convert to nul terminated string
3178 if (!ks.s) {
3179 *tid = -2;
3180 return NULL;
3181 }
3182 if (getid(hdr, ks.s) >= 0) {
3183 free(ks.s);
3184 *tid = -1;
3185 hts_log_error("Range is ambiguous. "
3186 "Use {%s} or {%.*s}%s instead",
3187 s, (int)(colon-s), s, colon);
3188 return NULL;
3189 }
3190 free(ks.s);
3191
3192 return s_end;
3193 }
3194 if (*tid < -1) // Failed to parse header
3195 return NULL;
3196 }
3197
3198 // Quoted, or unquoted and whole string isn't a name.
3199 // Check the pre-colon part is valid.
3200 ks.l = 0;
3201 kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
3202 if (!ks.s) {
3203 *tid = -2;
3204 return NULL;
3205 }
3206 *tid = getid(hdr, ks.s);
3207 free(ks.s);
3208 if (*tid < 0)
3209 return NULL;
3210
3211 // Finally parse the post-colon coordinates
3212 char *hyphen;
3213 *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
3214 if (*beg < 0) {
3215 if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
3216 // interpret chr:-100 as chr:1-100
3217 *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
3218 *beg = 0;
3219 return s_end;
3220 } else if (*hyphen == '-') {
3221 *beg = 0;
3222 } else {
3223 hts_log_error("Unexpected string \"%s\" after region", hyphen);
3224 return NULL;
3225 }
3226 }
3227
3228 if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
3229 *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
3230 } else if (*hyphen == '-') {
3231 *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
3232 if (*hyphen != '\0' && *hyphen != ',') {
3233 hts_log_error("Unexpected string \"%s\" after region", hyphen);
3234 return NULL;
3235 }
3236 } else {
3237 hts_log_error("Unexpected string \"%s\" after region", hyphen);
3238 return NULL;
3239 }
3240
3241 if (*end == 0)
3242 *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
3243
3244 if (*beg >= *end) return NULL;
3245
3246 return s_end;
3247 }
3248
3249 // Next release we should mark this as deprecated?
3250 // Use hts_parse_region above instead.
hts_parse_reg64(const char * s,hts_pos_t * beg,hts_pos_t * end)3251 const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
3252 {
3253 char *hyphen;
3254 const char *colon = strrchr(s, ':');
3255 if (colon == NULL) {
3256 *beg = 0; *end = HTS_POS_MAX;
3257 return s + strlen(s);
3258 }
3259
3260 *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
3261 if (*beg < 0) *beg = 0;
3262
3263 if (*hyphen == '\0') *end = HTS_POS_MAX;
3264 else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
3265 else return NULL;
3266
3267 if (*beg >= *end) return NULL;
3268 return colon;
3269 }
3270
hts_parse_reg(const char * s,int * beg,int * end)3271 const char *hts_parse_reg(const char *s, int *beg, int *end)
3272 {
3273 hts_pos_t beg64 = 0, end64 = 0;
3274 const char *colon = hts_parse_reg64(s, &beg64, &end64);
3275 if (beg64 > INT_MAX) {
3276 hts_log_error("Position %"PRId64" too large", beg64);
3277 return NULL;
3278 }
3279 if (end64 > INT_MAX) {
3280 if (end64 == HTS_POS_MAX) {
3281 end64 = INT_MAX;
3282 } else {
3283 hts_log_error("Position %"PRId64" too large", end64);
3284 return NULL;
3285 }
3286 }
3287 *beg = beg64;
3288 *end = end64;
3289 return colon;
3290 }
3291
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)3292 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
3293 {
3294 int tid;
3295 hts_pos_t beg, end;
3296
3297 if (strcmp(reg, ".") == 0)
3298 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
3299 else if (strcmp(reg, "*") == 0)
3300 return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
3301
3302 if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
3303 return NULL;
3304
3305 return itr_query(idx, tid, beg, end, readrec);
3306 }
3307
hts_itr_regions(const hts_idx_t * idx,hts_reglist_t * reglist,int count,hts_name2id_f getid,void * hdr,hts_itr_multi_query_func * itr_specific,hts_readrec_func * readrec,hts_seek_func * seek,hts_tell_func * tell)3308 hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
3309
3310 int i;
3311
3312 if (!reglist)
3313 return NULL;
3314
3315 hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3316 if (itr) {
3317 itr->n_reg = count;
3318 itr->readrec = readrec;
3319 itr->seek = seek;
3320 itr->tell = tell;
3321 itr->reg_list = reglist;
3322 itr->finished = 0;
3323 itr->nocoor = 0;
3324 itr->multi = 1;
3325
3326 for (i = 0; i < itr->n_reg; i++) {
3327 if (itr->reg_list[i].reg) {
3328 if (!strcmp(itr->reg_list[i].reg, ".")) {
3329 itr->reg_list[i].tid = HTS_IDX_START;
3330 continue;
3331 }
3332
3333 if (!strcmp(itr->reg_list[i].reg, "*")) {
3334 itr->reg_list[i].tid = HTS_IDX_NOCOOR;
3335 continue;
3336 }
3337
3338 itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
3339 if (itr->reg_list[i].tid < 0) {
3340 if (itr->reg_list[i].tid < -1) {
3341 hts_log_error("Failed to parse header");
3342 hts_itr_destroy(itr);
3343 return NULL;
3344 } else {
3345 hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
3346 }
3347 }
3348 }
3349 }
3350
3351 qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
3352 if (itr_specific(idx, itr) != 0) {
3353 hts_log_error("Failed to create the multi-region iterator!");
3354 hts_itr_destroy(itr);
3355 itr = NULL;
3356 }
3357 }
3358
3359 return itr;
3360 }
3361
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)3362 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
3363 {
3364 int ret, tid;
3365 hts_pos_t beg, end;
3366 if (iter == NULL || iter->finished) return -1;
3367 if (iter->read_rest) {
3368 if (iter->curr_off) { // seek to the start
3369 if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
3370 hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
3371 iter->curr_off,
3372 errno ? ": " : "", strerror(errno));
3373 return -2;
3374 }
3375 iter->curr_off = 0; // only seek once
3376 }
3377 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
3378 if (ret < 0) iter->finished = 1;
3379 iter->curr_tid = tid;
3380 iter->curr_beg = beg;
3381 iter->curr_end = end;
3382 return ret;
3383 }
3384 // A NULL iter->off should always be accompanied by iter->finished.
3385 assert(iter->off != NULL);
3386 for (;;) {
3387 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
3388 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
3389 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
3390 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
3391 hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
3392 iter->off[iter->i+1].u,
3393 errno ? ": " : "", strerror(errno));
3394 return -2;
3395 }
3396 iter->curr_off = bgzf_tell(fp);
3397 }
3398 ++iter->i;
3399 }
3400 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
3401 iter->curr_off = bgzf_tell(fp);
3402 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
3403 ret = -1; break;
3404 } else if (end > iter->beg && iter->end > beg) {
3405 iter->curr_tid = tid;
3406 iter->curr_beg = beg;
3407 iter->curr_end = end;
3408 return ret;
3409 }
3410 } else break; // end of file or error
3411 }
3412 iter->finished = 1;
3413 return ret;
3414 }
3415
hts_itr_multi_next(htsFile * fd,hts_itr_t * iter,void * r)3416 int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
3417 {
3418 void *fp;
3419 int ret, tid, i, cr, ci;
3420 hts_pos_t beg, end;
3421 hts_reglist_t *found_reg;
3422
3423 if (iter == NULL || iter->finished) return -1;
3424
3425 if (iter->is_cram) {
3426 fp = fd->fp.cram;
3427 } else {
3428 fp = fd->fp.bgzf;
3429 }
3430
3431 if (iter->read_rest) {
3432 if (iter->curr_off) { // seek to the start
3433 if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3434 hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
3435 return -1;
3436 }
3437 iter->curr_off = 0; // only seek once
3438 }
3439
3440 ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3441 if (ret < 0)
3442 iter->finished = 1;
3443
3444 iter->curr_tid = tid;
3445 iter->curr_beg = beg;
3446 iter->curr_end = end;
3447
3448 return ret;
3449 }
3450 // A NULL iter->off should always be accompanied by iter->finished.
3451 assert(iter->off != NULL || iter->nocoor != 0);
3452
3453 int next_range = 0;
3454 for (;;) {
3455 // Note that due to the way bam indexing works, iter->off may contain
3456 // file chunks that are not actually needed as they contain data
3457 // beyond the end of the requested region. These are filtered out
3458 // by comparing the tid and index into hts_reglist_t::intervals
3459 // (packed for reasons of convenience into iter->off[iter->i].max)
3460 // associated with the file region with iter->curr_tid and
3461 // iter->curr_intv.
3462
3463 if (next_range
3464 || iter->curr_off == 0
3465 || iter->i >= iter->n_off
3466 || iter->curr_off >= iter->off[iter->i].v
3467 || (iter->off[iter->i].max >> 32 == iter->curr_tid
3468 && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
3469
3470 // Jump to the next chunk. It may be necessary to skip more
3471 // than one as the iter->off list can include overlapping entries.
3472 do {
3473 iter->i++;
3474 } while (iter->i < iter->n_off
3475 && (iter->curr_off >= iter->off[iter->i].v
3476 || (iter->off[iter->i].max >> 32 == iter->curr_tid
3477 && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
3478
3479 if (iter->is_cram && iter->i < iter->n_off) {
3480 // Ensure iter->curr_reg is correct.
3481 //
3482 // We need this for CRAM as we shortcut some of the later
3483 // logic by getting an end-of-range and continuing to the
3484 // next offset.
3485 //
3486 // We cannot do this for BAM (and fortunately do not need to
3487 // either) because in BAM world a query to genomic positions
3488 // GX and GY leading to a seek offsets PX and PY may have
3489 // GX > GY and PX < PY. (This is due to the R-tree and falling
3490 // between intervals, bumping up to a higher bin.)
3491 // CRAM strictly follows PX >= PY if GX >= GY, so this logic
3492 // works.
3493 int want_tid = iter->off[iter->i].max >> 32;
3494 if (!(iter->curr_reg < iter->n_reg &&
3495 iter->reg_list[iter->curr_reg].tid == want_tid)) {
3496 int j;
3497 for (j = 0; j < iter->n_reg; j++)
3498 if (iter->reg_list[j].tid == want_tid)
3499 break;
3500 if (j == iter->n_reg)
3501 return -1;
3502 iter->curr_reg = j;
3503 iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
3504 };
3505 iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
3506 }
3507
3508 if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
3509 if (iter->nocoor) {
3510 next_range = 0;
3511 if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
3512 hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
3513 return -1;
3514 }
3515 if (iter->is_cram) {
3516 cram_range r = { HTS_IDX_NOCOOR };
3517 cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
3518 }
3519
3520 // The first slice covering the unmapped reads might
3521 // contain a few mapped reads, so scroll
3522 // forward until finding the first unmapped read.
3523 do {
3524 ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3525 } while (tid >= 0 && ret >=0);
3526
3527 if (ret < 0)
3528 iter->finished = 1;
3529 else
3530 iter->read_rest = 1;
3531
3532 iter->curr_off = 0; // don't seek any more
3533 iter->curr_tid = tid;
3534 iter->curr_beg = beg;
3535 iter->curr_end = end;
3536
3537 return ret;
3538 } else {
3539 ret = -1; break;
3540 }
3541 } else if (iter->i < iter->n_off) {
3542 // New chunk may overlap the last one, so ensure we
3543 // only seek forwards.
3544 if (iter->curr_off < iter->off[iter->i].u || next_range) {
3545 iter->curr_off = iter->off[iter->i].u;
3546
3547 // CRAM has the capability of setting an end location.
3548 // This means multi-threaded decodes can stop once they
3549 // reach that point, rather than pointlessly decoding
3550 // more slices than we'll be using.
3551 //
3552 // We have to be careful here. Whenever we set the cram
3553 // range we need a corresponding seek in order to ensure
3554 // we can safely decode at that offset. We use next_range
3555 // var to ensure this is always true; this is set on
3556 // end-of-range condition. It's never modified for BAM.
3557 if (iter->is_cram) {
3558 // Next offset.[uv] tuple, but it's already been
3559 // included in our cram range, so don't seek and don't
3560 // reset range so we can efficiently multi-thread.
3561 if (next_range || iter->curr_off >= iter->end) {
3562 if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3563 hts_log_error("Seek at offset %" PRIu64
3564 " failed.", iter->curr_off);
3565 return -1;
3566 }
3567
3568 // Find the genomic range matching this interval.
3569 int j;
3570 hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
3571 cram_range r = {
3572 rl->tid,
3573 rl->intervals[iter->curr_intv].beg,
3574 rl->intervals[iter->curr_intv].end
3575 };
3576
3577 // Expand it up to cover neighbouring intervals.
3578 // Note we can only have a single chromosome in a
3579 // range, so if we detect our blocks span chromosomes
3580 // or we have a multi-ref mode slice, we just use
3581 // HTS_IDX_START refid instead. This doesn't actually
3582 // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
3583 // and indicator of decoding with no end limit.
3584 //
3585 // That isn't as efficient as it could be, but it's
3586 // no poorer than before and it works.
3587 int tid = r.refid;
3588 int64_t end = r.end;
3589 int64_t v = iter->off[iter->i].v;
3590 j = iter->i+1;
3591 while (j < iter->n_off) {
3592 if (iter->off[j].u > v)
3593 break;
3594
3595 uint64_t max = iter->off[j].max;
3596 if ((max>>32) != tid)
3597 tid = HTS_IDX_START; // => no range limit
3598
3599 if (end < rl->intervals[max & 0xffffffff].end)
3600 end = rl->intervals[max & 0xffffffff].end;
3601 if (v < iter->off[j].v)
3602 v = iter->off[j].v;
3603 j++;
3604 }
3605 r.refid = tid;
3606 r.end = end;
3607
3608 // Remember maximum 'v' here so we don't do
3609 // unnecessary subsequent seeks for the next
3610 // regions. We can't change curr_off, but
3611 // beg/end are used only by single region iterator so
3612 // we cache it there to avoid changing the struct.
3613 iter->end = v;
3614
3615 cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
3616 next_range = 0;
3617 }
3618 } else { // Not CRAM
3619 if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
3620 hts_log_error("Seek at offset %" PRIu64 " failed.",
3621 iter->curr_off);
3622 return -1;
3623 }
3624 }
3625 }
3626 }
3627 }
3628
3629 ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
3630 if (ret < 0) {
3631 if (iter->is_cram && cram_eof(fp)) {
3632 // Skip to end of range
3633 //
3634 // We should never be adjusting curr_off manually unless
3635 // we also can guarantee we'll be doing a seek after to
3636 // a new location. Otherwise we'll be reading wrong offset
3637 // for the next container.
3638 //
3639 // We ensure this by adjusting our CRAM_OPT_RANGE
3640 // accordingly above, but to double check we also
3641 // set the skipped_block flag to enforce a seek also.
3642 iter->curr_off = iter->off[iter->i].v;
3643 next_range = 1;
3644
3645 // Next region
3646 if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
3647 if (++iter->curr_reg >= iter->n_reg)
3648 break;
3649 iter->curr_intv = 0;
3650 iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
3651 }
3652 continue;
3653 } else {
3654 break;
3655 }
3656 }
3657
3658 iter->curr_off = iter->tell(fp);
3659
3660 if (tid != iter->curr_tid) {
3661 hts_reglist_t key;
3662 key.tid = tid;
3663
3664 found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
3665 iter->n_reg,
3666 sizeof(hts_reglist_t),
3667 compare_regions);
3668 if (!found_reg)
3669 continue;
3670
3671 iter->curr_reg = (found_reg - iter->reg_list);
3672 iter->curr_tid = tid;
3673 iter->curr_intv = 0;
3674 }
3675
3676 cr = iter->curr_reg;
3677 ci = iter->curr_intv;
3678
3679 for (i = ci; i < iter->reg_list[cr].count; i++) {
3680 if (end > iter->reg_list[cr].intervals[i].beg &&
3681 iter->reg_list[cr].intervals[i].end > beg) {
3682 iter->curr_beg = beg;
3683 iter->curr_end = end;
3684 iter->curr_intv = i;
3685
3686 return ret;
3687 }
3688
3689 // Check if the read starts beyond intervals[i].end
3690 // If so, the interval is finished so move on to the next.
3691 if (beg > iter->reg_list[cr].intervals[i].end)
3692 iter->curr_intv = i + 1;
3693
3694 // No need to keep searching if the read ends before intervals[i].beg
3695 if (end < iter->reg_list[cr].intervals[i].beg)
3696 break;
3697 }
3698 }
3699 iter->finished = 1;
3700
3701 return ret;
3702 }
3703
3704 /**********************
3705 *** Retrieve index ***
3706 **********************/
3707 // Local_fn and local_len will return a sub-region of 'fn'.
3708 // Eg http://elsewhere/dir/foo.bam.bai?a=b may return
3709 // foo.bam.bai via local_fn and local_len.
3710 //
3711 // Returns -1 if index couldn't be opened.
3712 // -2 on other errors
idx_test_and_fetch(const char * fn,const char ** local_fn,int * local_len,int download)3713 static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
3714 {
3715 hFILE *remote_hfp = NULL;
3716 hFILE *local_fp = NULL;
3717 int save_errno;
3718 htsFormat fmt;
3719 kstring_t s = KS_INITIALIZE;
3720 kstring_t tmps = KS_INITIALIZE;
3721
3722 if (hisremote(fn)) {
3723 const int buf_size = 1 * 1024 * 1024;
3724 int l;
3725 const char *p, *e;
3726 // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
3727 e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
3728 // Find the previous slash from there.
3729 p = e;
3730 while (p > fn && *p != '/') p--;
3731 if (*p == '/') p++;
3732
3733 // Attempt to open local file first
3734 kputsn(p, e-p, &s);
3735 if (access(s.s, R_OK) == 0)
3736 {
3737 free(s.s);
3738 *local_fn = p;
3739 *local_len = e-p;
3740 return 0;
3741 }
3742
3743 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
3744 if ((remote_hfp = hopen(fn, "r")) == 0) {
3745 hts_log_info("Failed to open index file '%s'", fn);
3746 free(s.s);
3747 return -1;
3748 }
3749 if (hts_detect_format(remote_hfp, &fmt)) {
3750 hts_log_error("Failed to detect format of index file '%s'", fn);
3751 goto fail;
3752 }
3753 if (fmt.category != index_file || (fmt.format != bai && fmt.format != csi && fmt.format != tbi
3754 && fmt.format != crai && fmt.format != fai_format)) {
3755 hts_log_error("Format of index file '%s' is not supported", fn);
3756 goto fail;
3757 }
3758
3759 if (download) {
3760 if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
3761 hts_log_error("Failed to create file %s in the working directory", p);
3762 goto fail;
3763 }
3764 hts_log_info("Downloading file %s to local directory", fn);
3765 uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
3766 if (!buf) {
3767 hts_log_error("%s", strerror(errno));
3768 goto fail;
3769 }
3770 while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
3771 if (hwrite(local_fp, buf, l) != l) {
3772 hts_log_error("Failed to write data to %s : %s",
3773 fn, strerror(errno));
3774 free(buf);
3775 goto fail;
3776 }
3777 }
3778 free(buf);
3779 if (l < 0) {
3780 hts_log_error("Error reading \"%s\"", fn);
3781 goto fail;
3782 }
3783 if (hclose(local_fp) < 0) {
3784 hts_log_error("Error closing %s : %s", fn, strerror(errno));
3785 local_fp = NULL;
3786 goto fail;
3787 }
3788 local_fp = NULL;
3789 if (rename(tmps.s, s.s) < 0) {
3790 hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
3791 goto fail;
3792 }
3793 ks_clear(&tmps);
3794
3795 *local_fn = p;
3796 *local_len = e-p;
3797 } else {
3798 *local_fn = fn;
3799 *local_len = e-fn;
3800 }
3801
3802 if (hclose(remote_hfp) != 0) {
3803 hts_log_error("Failed to close remote file %s", fn);
3804 }
3805
3806 free(tmps.s);
3807 free(s.s);
3808 return 0;
3809 } else {
3810 hFILE *local_hfp;
3811 if ((local_hfp = hopen(fn, "r")) == 0) return -1;
3812 hclose_abruptly(local_hfp);
3813 *local_fn = fn;
3814 *local_len = strlen(fn);
3815 return 0;
3816 }
3817
3818 fail:
3819 save_errno = errno;
3820 if (remote_hfp) hclose_abruptly(remote_hfp);
3821 if (local_fp) hclose_abruptly(local_fp);
3822 if (tmps.l > 0) unlink(tmps.s);
3823 free(tmps.s);
3824 free(s.s);
3825 errno = save_errno;
3826 return -2;
3827 }
3828
3829 /*
3830 * Check the existence of a local index file using part of the alignment file name.
3831 * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai
3832 * @param fn - pointer to the file name
3833 * @param fnidx - pointer to the index file name placeholder
3834 * @return 1 for success, 0 for failure
3835 */
hts_idx_check_local(const char * fn,int fmt,char ** fnidx)3836 int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
3837 int i, l_fn, l_ext;
3838 const char *fn_tmp = NULL;
3839 char *fnidx_tmp;
3840 char *csi_ext = ".csi";
3841 char *bai_ext = ".bai";
3842 char *tbi_ext = ".tbi";
3843 char *crai_ext = ".crai";
3844 char *fai_ext = ".fai";
3845
3846 if (!fn)
3847 return 0;
3848
3849 if (hisremote(fn)) {
3850 for (i = strlen(fn) - 1; i >= 0; --i)
3851 if (fn[i] == '/') {
3852 fn_tmp = (char *)&fn[i+1];
3853 break;
3854 }
3855 } else {
3856 // Borrowed from hopen_fd_fileuri()
3857 if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
3858 else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
3859 else fn_tmp = fn;
3860 #if defined(_WIN32) || defined(__MSYS__)
3861 // For cases like C:/foo
3862 if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
3863 fn_tmp++;
3864 #endif
3865 }
3866
3867 if (!fn_tmp) return 0;
3868 hts_log_info("Using alignment file '%s'", fn_tmp);
3869 l_fn = strlen(fn_tmp); l_ext = 5;
3870 fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
3871 if (!fnidx_tmp) return 0;
3872
3873 struct stat sbuf;
3874
3875 // Try alignment.bam.csi first
3876 strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
3877 if(stat(fnidx_tmp, &sbuf) == 0) {
3878 *fnidx = fnidx_tmp;
3879 return 1;
3880 } else { // Then try alignment.csi
3881 for (i = l_fn - 1; i > 0; --i)
3882 if (fnidx_tmp[i] == '.') {
3883 strcpy(fnidx_tmp + i, csi_ext);
3884 if(stat(fnidx_tmp, &sbuf) == 0) {
3885 *fnidx = fnidx_tmp;
3886 return 1;
3887 }
3888 break;
3889 }
3890 }
3891 if (fmt == HTS_FMT_BAI) {
3892 // Next, try alignment.bam.bai
3893 strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
3894 if(stat(fnidx_tmp, &sbuf) == 0) {
3895 *fnidx = fnidx_tmp;
3896 return 1;
3897 } else { // And finally, try alignment.bai
3898 for (i = l_fn - 1; i > 0; --i)
3899 if (fnidx_tmp[i] == '.') {
3900 strcpy(fnidx_tmp + i, bai_ext);
3901 if(stat(fnidx_tmp, &sbuf) == 0) {
3902 *fnidx = fnidx_tmp;
3903 return 1;
3904 }
3905 break;
3906 }
3907 }
3908 } else if (fmt == HTS_FMT_TBI) { // Or .tbi
3909 strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
3910 if(stat(fnidx_tmp, &sbuf) == 0) {
3911 *fnidx = fnidx_tmp;
3912 return 1;
3913 } else {
3914 for (i = l_fn - 1; i > 0; --i)
3915 if (fnidx_tmp[i] == '.') {
3916 strcpy(fnidx_tmp + i, tbi_ext);
3917 if(stat(fnidx_tmp, &sbuf) == 0) {
3918 *fnidx = fnidx_tmp;
3919 return 1;
3920 }
3921 break;
3922 }
3923 }
3924 } else if (fmt == HTS_FMT_CRAI) { // Or .crai
3925 strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
3926 if(stat(fnidx_tmp, &sbuf) == 0) {
3927 *fnidx = fnidx_tmp;
3928 return 1;
3929 } else {
3930 for (i = l_fn - 1; i > 0; --i)
3931 if (fnidx_tmp[i] == '.') {
3932 strcpy(fnidx_tmp + i, crai_ext);
3933 if(stat(fnidx_tmp, &sbuf) == 0) {
3934 *fnidx = fnidx_tmp;
3935 return 1;
3936 }
3937 break;
3938 }
3939 }
3940 } else if (fmt == HTS_FMT_FAI) { // Or .fai
3941 strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, fai_ext);
3942 *fnidx = fnidx_tmp;
3943 if(stat(fnidx_tmp, &sbuf) == 0)
3944 return 1;
3945 else
3946 return 0;
3947 }
3948
3949 free(fnidx_tmp);
3950 return 0;
3951 }
3952
idx_filename(const char * fn,const char * ext,int download)3953 static char *idx_filename(const char *fn, const char *ext, int download) {
3954 int ret, local_len;
3955 char *fnidx;
3956 const char *local_fn = NULL;
3957 kstring_t buffer = KS_INITIALIZE;
3958
3959 // First try : append `ext` to `fn`
3960 if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
3961 free(buffer.s);
3962 return NULL;
3963 }
3964 if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
3965 // Second try : replace suffix of `fn` with `ext`
3966 if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
3967 free(buffer.s);
3968 return NULL;
3969 }
3970 ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
3971 }
3972
3973 if (ret < 0) {
3974 free(buffer.s);
3975 return NULL;
3976 }
3977
3978 memmove(fnidx, local_fn, local_len);
3979 fnidx[local_len] = 0;
3980 return fnidx;
3981 }
3982
hts_idx_getfn(const char * fn,const char * ext)3983 char *hts_idx_getfn(const char *fn, const char *ext)
3984 {
3985 return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
3986 }
3987
hts_idx_locatefn(const char * fn,const char * ext)3988 char *hts_idx_locatefn(const char *fn, const char *ext)
3989 {
3990 return idx_filename(fn, ext, 0);
3991 }
3992
idx_find_and_load(const char * fn,int fmt,int flags)3993 static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
3994 {
3995 char *fnidx = strstr(fn, HTS_IDX_DELIM);
3996 hts_idx_t *idx;
3997
3998 if ( fnidx ) {
3999 char *fn2 = strdup(fn);
4000 if (!fn2) {
4001 hts_log_error("%s", strerror(errno));
4002 return NULL;
4003 }
4004 fn2[fnidx - fn] = '\0';
4005 fnidx += strlen(HTS_IDX_DELIM);
4006 idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4007 free(fn2);
4008 return idx;
4009 }
4010
4011 if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4012 if (flags & HTS_IDX_SAVE_REMOTE) {
4013 fnidx = hts_idx_getfn(fn, ".csi");
4014 if (!fnidx) {
4015 switch (fmt) {
4016 case HTS_FMT_BAI: fnidx = hts_idx_getfn(fn, ".bai"); break;
4017 case HTS_FMT_TBI: fnidx = hts_idx_getfn(fn, ".tbi"); break;
4018 default: break;
4019 }
4020 }
4021 } else {
4022 fnidx = idx_filename(fn, ".csi", 0);
4023 if (!fnidx) {
4024 switch (fmt) {
4025 case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4026 case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4027 default: break;
4028 }
4029 }
4030 }
4031 }
4032 if (!fnidx) {
4033 if (!(flags & HTS_IDX_SILENT_FAIL))
4034 hts_log_error("Could not retrieve index file for '%s'", fn);
4035 return 0;
4036 }
4037
4038 if (flags & HTS_IDX_SAVE_REMOTE)
4039 idx = hts_idx_load3(fn, fnidx, fmt, flags);
4040 else
4041 idx = idx_read(fnidx);
4042 free(fnidx);
4043 return idx;
4044 }
4045
hts_idx_load(const char * fn,int fmt)4046 hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4047 return idx_find_and_load(fn, fmt, 1);
4048 }
4049
hts_idx_load2(const char * fn,const char * fnidx)4050 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4051 {
4052 return hts_idx_load3(fn, fnidx, 0, 0);
4053 }
4054
hts_idx_load3(const char * fn,const char * fnidx,int fmt,int flags)4055 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4056 {
4057 const char *local_fn = NULL;
4058 char *local_fnidx = NULL;
4059 int local_len;
4060 if (!fnidx)
4061 return idx_find_and_load(fn, fmt, flags);
4062
4063 // Check that the index file is up to date, the main file might have changed
4064 struct stat stat_idx,stat_main;
4065 int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4066 if ( !remote_fn && !remote_fnidx
4067 && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4068 {
4069 if ( stat_idx.st_mtime < stat_main.st_mtime )
4070 hts_log_warning("The index file is older than the data file: %s", fnidx);
4071 }
4072
4073 if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
4074 {
4075 int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
4076 if (ret == 0) {
4077 local_fnidx = strdup(local_fn);
4078 if (local_fnidx) {
4079 local_fnidx[local_len] = '\0';
4080 fnidx = local_fnidx;
4081 }
4082 }
4083 }
4084
4085 hts_idx_t *idx = idx_read(fnidx);
4086 if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
4087 hts_log_error("Could not load local index file '%s'", fnidx);
4088
4089 free(local_fnidx);
4090
4091 return idx;
4092 }
4093
4094
4095
4096 /**********************
4097 *** Memory ***
4098 **********************/
4099
4100 /* For use with hts_expand macros *only* */
4101 HTSLIB_EXPORT
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)4102 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
4103 int clear, void **ptr, const char *func) {
4104 /* If new_m and size are both below this limit, multiplying them
4105 together can't overflow */
4106 const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4107 void *new_ptr;
4108 size_t bytes, new_m;
4109
4110 new_m = n;
4111 kroundup_size_t(new_m);
4112
4113 bytes = size * new_m;
4114
4115 /* Check for overflow. Both ensure that new_m will fit in m (we make the
4116 pessimistic assumption that m is signed), and that bytes has not
4117 wrapped around. */
4118 if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
4119 || ((size > safe || new_m > safe)
4120 && bytes / new_m != size)) {
4121 errno = ENOMEM;
4122 goto die;
4123 }
4124
4125 new_ptr = realloc(*ptr, bytes);
4126 if (new_ptr == NULL) goto die;
4127
4128 if (clear) {
4129 if (new_m > m) {
4130 memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
4131 }
4132 }
4133
4134 *ptr = new_ptr;
4135
4136 return new_m;
4137
4138 die:
4139 hts_log_error("%s", strerror(errno));
4140 exit(1);
4141 }
4142
4143 /*
4144 * Companion to hts_resize() macro that does the actual allocation.
4145 *
4146 * Somewhat complicated as hts_resize() needs to write the new allocated
4147 * size back into *size_in_out, and the value pointed to may either be
4148 * int32_t, uint32_t or size_t depending on which array is being resized.
4149 * This is solved by making `size_in_out` a void pointer, getting the macro
4150 * to pass in the size of the item pointed to (in `size_sz`) and then using
4151 * an appropriate cast (based on the value of size_sz). The function
4152 * ensures that the maximum size will be storable in a signed type of
4153 * the given size so storing to an int32_t should work correctly.
4154 *
4155 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
4156 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
4157 * either 4 or 8. It also assumes casting from unsigned to signed will
4158 * work as long as the top bit isn't set.
4159 */
4160
hts_resize_array_(size_t item_size,size_t num,size_t size_sz,void * size_in_out,void ** ptr_in_out,int flags,const char * func)4161 int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
4162 void *size_in_out, void **ptr_in_out, int flags,
4163 const char *func) {
4164 /* If new_size and item_size are both below this limit, multiplying them
4165 together can't overflow */
4166 const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4167 void *new_ptr;
4168 size_t bytes, new_size;
4169
4170 new_size = num;
4171 kroundup_size_t(new_size);
4172 bytes = item_size * new_size;
4173
4174 /* Check for overflow. Both ensure that alloc will fit in alloc_in_out (we
4175 make the pessimistic assumption that *alloc_in_out is signed), and that
4176 bytes has not wrapped around. */
4177
4178 if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
4179 || (((item_size > safe) || (new_size > safe))
4180 && bytes / new_size != item_size)) {
4181 hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
4182 errno = ENOMEM;
4183 return -1;
4184 }
4185
4186 new_ptr = realloc(*ptr_in_out, bytes);
4187 if (new_ptr == NULL) {
4188 int save_errno = errno;
4189 hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
4190 errno = save_errno;
4191 return -1;
4192 }
4193
4194 if (flags & HTS_RESIZE_CLEAR) {
4195 size_t old_size;
4196 switch (size_sz) {
4197 case 4: old_size = *((uint32_t *) size_in_out); break;
4198 case 8: old_size = *((uint64_t *) size_in_out); break;
4199 default: abort();
4200 }
4201 if (new_size > old_size) {
4202 memset((char *) new_ptr + old_size * item_size, 0,
4203 (new_size - old_size) * item_size);
4204 }
4205 }
4206
4207 switch (size_sz) {
4208 case 4: *((uint32_t *) size_in_out) = new_size; break;
4209 case 8: *((uint64_t *) size_in_out) = new_size; break;
4210 default: abort();
4211 }
4212
4213 *ptr_in_out = new_ptr;
4214 return 0;
4215 }
4216
hts_lib_shutdown()4217 void hts_lib_shutdown()
4218 {
4219 hfile_shutdown(1);
4220 }
4221
hts_free(void * ptr)4222 void hts_free(void *ptr) {
4223 free(ptr);
4224 }
4225
hts_set_log_level(enum htsLogLevel level)4226 void hts_set_log_level(enum htsLogLevel level)
4227 {
4228 hts_verbose = level;
4229 }
4230
hts_get_log_level()4231 enum htsLogLevel hts_get_log_level()
4232 {
4233 return hts_verbose;
4234 }
4235
get_severity_tag(enum htsLogLevel severity)4236 static char get_severity_tag(enum htsLogLevel severity)
4237 {
4238 switch (severity) {
4239 case HTS_LOG_ERROR:
4240 return 'E';
4241 case HTS_LOG_WARNING:
4242 return 'W';
4243 case HTS_LOG_INFO:
4244 return 'I';
4245 case HTS_LOG_DEBUG:
4246 return 'D';
4247 case HTS_LOG_TRACE:
4248 return 'T';
4249 default:
4250 break;
4251 }
4252
4253 return '*';
4254 }
4255
hts_log(enum htsLogLevel severity,const char * context,const char * format,...)4256 void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
4257 {
4258 int save_errno = errno;
4259 if (severity <= hts_verbose) {
4260 va_list argptr;
4261
4262 fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
4263
4264 va_start(argptr, format);
4265 vfprintf(stderr, format, argptr);
4266 va_end(argptr);
4267
4268 fprintf(stderr, "\n");
4269 }
4270 errno = save_errno;
4271 }
4272