1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3 Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include <fcntl.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <assert.h>
38
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
44 #include "version.h"
45 #include "hts_internal.h"
46 #include "hfile_internal.h"
47 #include "htslib/hts_os.h" // drand48
48
49 #include "htslib/khash.h"
50 #include "htslib/kseq.h"
51 #include "htslib/ksort.h"
52
53 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
54
55 int hts_verbose = HTS_LOG_WARNING;
56
hts_version()57 const char *hts_version()
58 {
59 return HTS_VERSION;
60 }
61
62 const unsigned char seq_nt16_table[256] = {
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
65 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
66 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
69 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
70 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
71
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
79 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
80 };
81
82 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
83
84 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
85
86 /**********************
87 *** Basic file I/O ***
88 **********************/
89
format_category(enum htsExactFormat fmt)90 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
91 {
92 switch (fmt) {
93 case bam:
94 case sam:
95 case cram:
96 return sequence_data;
97
98 case vcf:
99 case bcf:
100 return variant_data;
101
102 case bai:
103 case crai:
104 case csi:
105 case gzi:
106 case tbi:
107 return index_file;
108
109 case bed:
110 return region_list;
111
112 case htsget:
113 return unknown_category;
114
115 case unknown_format:
116 case binary_format:
117 case text_format:
118 case format_maximum:
119 break;
120 }
121
122 return unknown_category;
123 }
124
125 // Decompress up to ten or so bytes by peeking at the file, which must be
126 // positioned at the start of a GZIP block.
decompress_peek(hFILE * fp,unsigned char * dest,size_t destsize)127 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
128 {
129 // Typically at most a couple of hundred bytes of input are required
130 // to get a few bytes of output from inflate(), so hopefully this buffer
131 // size suffices in general.
132 unsigned char buffer[512];
133 z_stream zs;
134 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
135
136 if (npeek < 0) return 0;
137
138 zs.zalloc = NULL;
139 zs.zfree = NULL;
140 zs.next_in = buffer;
141 zs.avail_in = npeek;
142 zs.next_out = dest;
143 zs.avail_out = destsize;
144 if (inflateInit2(&zs, 31) != Z_OK) return 0;
145
146 while (zs.total_out < destsize)
147 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
148
149 destsize = zs.total_out;
150 inflateEnd(&zs);
151
152 return destsize;
153 }
154
155 // Parse "x.y" text, taking care because the string is not NUL-terminated
156 // and filling in major/minor only when the digits are followed by a delimiter,
157 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
158 static void
parse_version(htsFormat * fmt,const unsigned char * u,const unsigned char * ulim)159 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
160 {
161 const char *s = (const char *) u;
162 const char *slim = (const char *) ulim;
163 short v;
164
165 fmt->version.major = fmt->version.minor = -1;
166
167 for (v = 0; s < slim && isdigit_c(*s); s++)
168 v = 10 * v + *s - '0';
169
170 if (s < slim) {
171 fmt->version.major = v;
172 if (*s == '.') {
173 s++;
174 for (v = 0; s < slim && isdigit_c(*s); s++)
175 v = 10 * v + *s - '0';
176 if (s < slim)
177 fmt->version.minor = v;
178 }
179 else
180 fmt->version.minor = 0;
181 }
182 }
183
184 static int
cmp_nonblank(const char * key,const unsigned char * u,const unsigned char * ulim)185 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
186 {
187 const unsigned char *ukey = (const unsigned char *) key;
188
189 while (*ukey)
190 if (u >= ulim) return +1;
191 else if (isspace_c(*u)) u++;
192 else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
193 else u++, ukey++;
194
195 return 0;
196 }
197
hts_detect_format(hFILE * hfile,htsFormat * fmt)198 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
199 {
200 unsigned char s[32];
201 ssize_t len = hpeek(hfile, s, 18);
202 if (len < 0) return -1;
203
204 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
205 // The stream is either gzip-compressed or BGZF-compressed.
206 // Determine which, and decompress the first few bytes.
207 fmt->compression = (len >= 18 && (s[3] & 4) &&
208 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
209 len = decompress_peek(hfile, s, sizeof s);
210 }
211 else {
212 fmt->compression = no_compression;
213 len = hpeek(hfile, s, sizeof s);
214 }
215 if (len < 0) return -1;
216
217 fmt->compression_level = -1;
218 fmt->specific = NULL;
219
220 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
221 fmt->category = sequence_data;
222 fmt->format = cram;
223 fmt->version.major = s[4], fmt->version.minor = s[5];
224 fmt->compression = custom;
225 return 0;
226 }
227 else if (len >= 4 && s[3] <= '\4') {
228 if (memcmp(s, "BAM\1", 4) == 0) {
229 fmt->category = sequence_data;
230 fmt->format = bam;
231 // TODO Decompress enough to pick version from @HD-VN header
232 fmt->version.major = 1, fmt->version.minor = -1;
233 return 0;
234 }
235 else if (memcmp(s, "BAI\1", 4) == 0) {
236 fmt->category = index_file;
237 fmt->format = bai;
238 fmt->version.major = -1, fmt->version.minor = -1;
239 return 0;
240 }
241 else if (memcmp(s, "BCF\4", 4) == 0) {
242 fmt->category = variant_data;
243 fmt->format = bcf;
244 fmt->version.major = 1, fmt->version.minor = -1;
245 return 0;
246 }
247 else if (memcmp(s, "BCF\2", 4) == 0) {
248 fmt->category = variant_data;
249 fmt->format = bcf;
250 fmt->version.major = s[3];
251 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
252 return 0;
253 }
254 else if (memcmp(s, "CSI\1", 4) == 0) {
255 fmt->category = index_file;
256 fmt->format = csi;
257 fmt->version.major = 1, fmt->version.minor = -1;
258 return 0;
259 }
260 else if (memcmp(s, "TBI\1", 4) == 0) {
261 fmt->category = index_file;
262 fmt->format = tbi;
263 fmt->version.major = -1, fmt->version.minor = -1;
264 return 0;
265 }
266 }
267 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
268 fmt->category = variant_data;
269 fmt->format = vcf;
270 if (len >= 21 && s[16] == 'v')
271 parse_version(fmt, &s[17], &s[len]);
272 else
273 fmt->version.major = fmt->version.minor = -1;
274 return 0;
275 }
276 else if (len >= 4 && s[0] == '@' &&
277 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
278 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
279 fmt->category = sequence_data;
280 fmt->format = sam;
281 // @HD-VN is not guaranteed to be the first tag, but then @HD is
282 // not guaranteed to be present at all...
283 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
284 parse_version(fmt, &s[7], &s[len]);
285 else
286 fmt->version.major = 1, fmt->version.minor = -1;
287 return 0;
288 }
289 else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
290 fmt->category = unknown_category;
291 fmt->format = htsget;
292 fmt->version.major = fmt->version.minor = -1;
293 return 0;
294 }
295 else {
296 // Various possibilities for tab-delimited text:
297 // .crai (gzipped tab-delimited six columns: seqid 5*number)
298 // .bed ([3..12] tab-delimited columns)
299 // .bedpe (>= 10 tab-delimited columns)
300 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
301 // FIXME For now, assume it's SAM
302 fmt->category = sequence_data;
303 fmt->format = sam;
304 fmt->version.major = 1, fmt->version.minor = -1;
305 return 0;
306 }
307
308 fmt->category = unknown_category;
309 fmt->format = unknown_format;
310 fmt->version.major = fmt->version.minor = -1;
311 fmt->compression = no_compression;
312 return 0;
313 }
314
hts_format_description(const htsFormat * format)315 char *hts_format_description(const htsFormat *format)
316 {
317 kstring_t str = { 0, 0, NULL };
318
319 switch (format->format) {
320 case sam: kputs("SAM", &str); break;
321 case bam: kputs("BAM", &str); break;
322 case cram: kputs("CRAM", &str); break;
323 case vcf: kputs("VCF", &str); break;
324 case bcf:
325 if (format->version.major == 1) kputs("Legacy BCF", &str);
326 else kputs("BCF", &str);
327 break;
328 case bai: kputs("BAI", &str); break;
329 case crai: kputs("CRAI", &str); break;
330 case csi: kputs("CSI", &str); break;
331 case tbi: kputs("Tabix", &str); break;
332 case htsget: kputs("htsget", &str); break;
333 default: kputs("unknown", &str); break;
334 }
335
336 if (format->version.major >= 0) {
337 kputs(" version ", &str);
338 kputw(format->version.major, &str);
339 if (format->version.minor >= 0) {
340 kputc('.', &str);
341 kputw(format->version.minor, &str);
342 }
343 }
344
345 switch (format->compression) {
346 case custom: kputs(" compressed", &str); break;
347 case gzip: kputs(" gzip-compressed", &str); break;
348 case bgzf:
349 switch (format->format) {
350 case bam:
351 case bcf:
352 case csi:
353 case tbi:
354 // These are by definition BGZF, so just use the generic term
355 kputs(" compressed", &str);
356 break;
357 default:
358 kputs(" BGZF-compressed", &str);
359 break;
360 }
361 break;
362 default: break;
363 }
364
365 switch (format->category) {
366 case sequence_data: kputs(" sequence", &str); break;
367 case variant_data: kputs(" variant calling", &str); break;
368 case index_file: kputs(" index", &str); break;
369 case region_list: kputs(" genomic region", &str); break;
370 default: break;
371 }
372
373 if (format->compression == no_compression)
374 switch (format->format) {
375 case sam:
376 case crai:
377 case vcf:
378 case bed:
379 case htsget:
380 kputs(" text", &str);
381 break;
382
383 default:
384 kputs(" data", &str);
385 break;
386 }
387 else
388 kputs(" data", &str);
389
390 return ks_release(&str);
391 }
392
hts_open_format_impl(const char * fn,const char * mode,const htsFormat * fmt,hFILE * hf)393 static htsFile *hts_open_format_impl(const char *fn, const char *mode, const htsFormat *fmt, hFILE* hf)
394 {
395 char smode[102], *cp, *cp2, *mode_c;
396 htsFile *fp = NULL;
397 hFILE *hfile;
398 char fmt_code = '\0';
399
400 strncpy(smode, mode, 100);
401 smode[100]=0;
402 if ((cp = strchr(smode, ',')))
403 *cp = '\0';
404
405 // Migrate format code (b or c) to the end of the smode buffer.
406 for (cp2 = cp = smode; *cp; cp++) {
407 if (*cp == 'b')
408 fmt_code = 'b';
409 else if (*cp == 'c')
410 fmt_code = 'c';
411 else
412 *cp2++ = *cp;
413 }
414 mode_c = cp2;
415 *cp2++ = fmt_code;
416 *cp2++ = 0;
417 *cp2++ = 0;
418
419 // Set or reset the format code if opts->format is used
420 if (fmt && fmt->format != unknown_format)
421 *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
422
423 hfile = hf ? hf : hopen(fn, smode);
424 if (hfile == NULL) goto error;
425
426 fp = hts_hopen(hfile, fn, smode);
427 if (fp == NULL) goto error;
428
429 if (fmt && fmt->specific)
430 if (hts_opt_apply(fp, fmt->specific) != 0)
431 goto error;
432
433 return fp;
434
435 error:
436 hts_log_error("Failed to open file %s", fn);
437
438 if (hfile)
439 hclose_abruptly(hfile);
440
441 return NULL;
442 }
hts_open_format(const char * fn,const char * mode,const htsFormat * fmt)443 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
444 {
445 return hts_open_format_impl(fn, mode, fmt, NULL);
446 }
447
hts_open_callback(const char * fn,hFILE_callback_ops * ops,const char * mode)448 htsFile *hts_open_callback(const char* fn, hFILE_callback_ops* ops, const char* mode)
449 {
450 if(NULL == ops) return NULL;
451 hFILE* fp = hopen_callback(*ops, mode);
452 return hts_open_format_impl(fn ? fn : "-", mode, NULL, fp);
453 }
454
hts_open(const char * fn,const char * mode)455 htsFile *hts_open(const char *fn, const char *mode) {
456 return hts_open_format(fn, mode, NULL);
457 }
458
459 /*
460 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
461 * the prefix in lowercase into buf and returning a pointer to the suffix.
462 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
463 * prefix should be one of several known values of maximum length buflen-2.
464 * (If delim is not found, returns a pointer to the '\0'.)
465 */
466 static const char *
scan_keyword(const char * str,char delim,char * buf,size_t buflen)467 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
468 {
469 size_t i = 0;
470 while (*str && *str != delim) {
471 if (i < buflen-1) buf[i++] = tolower_c(*str);
472 str++;
473 }
474
475 buf[i] = '\0';
476 return *str? str+1 : str;
477 }
478
479 /*
480 * Parses arg and appends it to the option list.
481 *
482 * Returns 0 on success;
483 * -1 on failure.
484 */
hts_opt_add(hts_opt ** opts,const char * c_arg)485 int hts_opt_add(hts_opt **opts, const char *c_arg) {
486 hts_opt *o, *t;
487 char *val;
488
489 /*
490 * IMPORTANT!!!
491 * If you add another string option here, don't forget to also add
492 * it to the case statement in hts_opt_apply.
493 */
494
495 if (!c_arg)
496 return -1;
497
498 if (!(o = malloc(sizeof(*o))))
499 return -1;
500
501 if (!(o->arg = strdup(c_arg))) {
502 free(o);
503 return -1;
504 }
505
506 if (!(val = strchr(o->arg, '=')))
507 val = "1"; // assume boolean
508 else
509 *val++ = '\0';
510
511 if (strcmp(o->arg, "decode_md") == 0 ||
512 strcmp(o->arg, "DECODE_MD") == 0)
513 o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
514
515 else if (strcmp(o->arg, "verbosity") == 0 ||
516 strcmp(o->arg, "VERBOSITY") == 0)
517 o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
518
519 else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
520 strcmp(o->arg, "SEQS_PER_SLICE") == 0)
521 o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
522
523 else if (strcmp(o->arg, "bases_per_slice") == 0 ||
524 strcmp(o->arg, "BASES_PER_SLICE") == 0)
525 o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
526
527 else if (strcmp(o->arg, "slices_per_container") == 0 ||
528 strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
529 o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
530
531 else if (strcmp(o->arg, "embed_ref") == 0 ||
532 strcmp(o->arg, "EMBED_REF") == 0)
533 o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
534
535 else if (strcmp(o->arg, "no_ref") == 0 ||
536 strcmp(o->arg, "NO_REF") == 0)
537 o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
538
539 else if (strcmp(o->arg, "ignore_md5") == 0 ||
540 strcmp(o->arg, "IGNORE_MD5") == 0)
541 o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
542
543 else if (strcmp(o->arg, "use_bzip2") == 0 ||
544 strcmp(o->arg, "USE_BZIP2") == 0)
545 o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
546
547 else if (strcmp(o->arg, "use_rans") == 0 ||
548 strcmp(o->arg, "USE_RANS") == 0)
549 o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
550
551 else if (strcmp(o->arg, "use_lzma") == 0 ||
552 strcmp(o->arg, "USE_LZMA") == 0)
553 o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
554
555 else if (strcmp(o->arg, "reference") == 0 ||
556 strcmp(o->arg, "REFERENCE") == 0)
557 o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
558
559 else if (strcmp(o->arg, "version") == 0 ||
560 strcmp(o->arg, "VERSION") == 0)
561 o->opt = CRAM_OPT_VERSION, o->val.s =val;
562
563 else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
564 strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
565 o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
566
567 else if (strcmp(o->arg, "nthreads") == 0 ||
568 strcmp(o->arg, "NTHREADS") == 0)
569 o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
570
571 else if (strcmp(o->arg, "cache_size") == 0 ||
572 strcmp(o->arg, "CACHE_SIZE") == 0) {
573 char *endp;
574 o->opt = HTS_OPT_CACHE_SIZE;
575 o->val.i = strtol(val, &endp, 0);
576 // NB: Doesn't support floats, eg 1.5g
577 // TODO: extend hts_parse_decimal? See also samtools sort.
578 switch (*endp) {
579 case 'g': case 'G': o->val.i *= 1024;
580 case 'm': case 'M': o->val.i *= 1024;
581 case 'k': case 'K': o->val.i *= 1024; break;
582 case '\0': break;
583 default:
584 hts_log_error("Unrecognised cache size suffix '%c'", *endp);
585 free(o->arg);
586 free(o);
587 return -1;
588 }
589 }
590
591 else if (strcmp(o->arg, "required_fields") == 0 ||
592 strcmp(o->arg, "REQUIRED_FIELDS") == 0)
593 o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
594
595 else if (strcmp(o->arg, "lossy_names") == 0 ||
596 strcmp(o->arg, "LOSSY_NAMES") == 0)
597 o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
598
599 else if (strcmp(o->arg, "name_prefix") == 0 ||
600 strcmp(o->arg, "NAME_PREFIX") == 0)
601 o->opt = CRAM_OPT_PREFIX, o->val.s = val;
602
603 else if (strcmp(o->arg, "store_md") == 0 ||
604 strcmp(o->arg, "store_md") == 0)
605 o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
606
607 else if (strcmp(o->arg, "store_nm") == 0 ||
608 strcmp(o->arg, "store_nm") == 0)
609 o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
610
611 else if (strcmp(o->arg, "block_size") == 0 ||
612 strcmp(o->arg, "BLOCK_SIZE") == 0)
613 o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
614
615 else if (strcmp(o->arg, "level") == 0 ||
616 strcmp(o->arg, "LEVEL") == 0)
617 o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
618
619 else {
620 hts_log_error("Unknown option '%s'", o->arg);
621 free(o->arg);
622 free(o);
623 return -1;
624 }
625
626 o->next = NULL;
627
628 // Append; assumes small list.
629 if (*opts) {
630 t = *opts;
631 while (t->next)
632 t = t->next;
633 t->next = o;
634 } else {
635 *opts = o;
636 }
637
638 return 0;
639 }
640
641 /*
642 * Applies an hts_opt option list to a given htsFile.
643 *
644 * Returns 0 on success
645 * -1 on failure
646 */
hts_opt_apply(htsFile * fp,hts_opt * opts)647 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
648 hts_opt *last = NULL;
649
650 for (; opts; opts = (last=opts)->next) {
651 switch (opts->opt) {
652 case CRAM_OPT_REFERENCE:
653 case CRAM_OPT_VERSION:
654 case CRAM_OPT_PREFIX:
655 if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
656 return -1;
657 break;
658 default:
659 if (hts_set_opt(fp, opts->opt, opts->val.i) != 0)
660 return -1;
661 break;
662 }
663 }
664
665 return 0;
666 }
667
668 /*
669 * Frees an hts_opt list.
670 */
hts_opt_free(hts_opt * opts)671 void hts_opt_free(hts_opt *opts) {
672 hts_opt *last = NULL;
673 while (opts) {
674 opts = (last=opts)->next;
675 free(last->arg);
676 free(last);
677 }
678 }
679
680
681 /*
682 * Tokenise options as (key(=value)?,)*(key(=value)?)?
683 * NB: No provision for ',' appearing in the value!
684 * Add backslashing rules?
685 *
686 * This could be used as part of a general command line option parser or
687 * as a string concatenated onto the file open mode.
688 *
689 * Returns 0 on success
690 * -1 on failure.
691 */
hts_parse_opt_list(htsFormat * fmt,const char * str)692 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
693 while (str && *str) {
694 const char *str_start;
695 int len;
696 char arg[8001];
697
698 while (*str && *str == ',')
699 str++;
700
701 for (str_start = str; *str && *str != ','; str++);
702 len = str - str_start;
703
704 // Produce a nul terminated copy of the option
705 strncpy(arg, str_start, len < 8000 ? len : 8000);
706 arg[len < 8000 ? len : 8000] = '\0';
707
708 if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
709 return -1;
710
711 if (*str)
712 str++;
713 }
714
715 return 0;
716 }
717
718 /*
719 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
720 * followed by a comma separated list of key=value options and splits
721 * these up into the fields of htsFormat struct.
722 *
723 * format is assumed to be already initialised, either to blank
724 * "unknown" values or via previous hts_opt_add calls.
725 *
726 * Returns 0 on success
727 * -1 on failure.
728 */
hts_parse_format(htsFormat * format,const char * str)729 int hts_parse_format(htsFormat *format, const char *str) {
730 char fmt[8];
731 const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
732
733 format->version.minor = 0; // unknown
734 format->version.major = 0; // unknown
735
736 if (strcmp(fmt, "sam") == 0) {
737 format->category = sequence_data;
738 format->format = sam;
739 format->compression = no_compression;;
740 format->compression_level = 0;
741 } else if (strcmp(fmt, "bam") == 0) {
742 format->category = sequence_data;
743 format->format = bam;
744 format->compression = bgzf;
745 format->compression_level = -1;
746 } else if (strcmp(fmt, "cram") == 0) {
747 format->category = sequence_data;
748 format->format = cram;
749 format->compression = custom;
750 format->compression_level = -1;
751 } else if (strcmp(fmt, "vcf") == 0) {
752 format->category = variant_data;
753 format->format = vcf;
754 format->compression = no_compression;;
755 format->compression_level = 0;
756 } else if (strcmp(fmt, "bcf") == 0) {
757 format->category = variant_data;
758 format->format = bcf;
759 format->compression = bgzf;
760 format->compression_level = -1;
761 } else {
762 return -1;
763 }
764
765 return hts_parse_opt_list(format, cp);
766 }
767
768
769 /*
770 * Tokenise options as (key(=value)?,)*(key(=value)?)?
771 * NB: No provision for ',' appearing in the value!
772 * Add backslashing rules?
773 *
774 * This could be used as part of a general command line option parser or
775 * as a string concatenated onto the file open mode.
776 *
777 * Returns 0 on success
778 * -1 on failure.
779 */
hts_process_opts(htsFile * fp,const char * opts)780 static int hts_process_opts(htsFile *fp, const char *opts) {
781 htsFormat fmt;
782
783 fmt.specific = NULL;
784 if (hts_parse_opt_list(&fmt, opts) != 0)
785 return -1;
786
787 if (hts_opt_apply(fp, fmt.specific) != 0) {
788 hts_opt_free(fmt.specific);
789 return -1;
790 }
791
792 hts_opt_free(fmt.specific);
793
794 return 0;
795 }
796
797
hts_hopen(hFILE * hfile,const char * fn,const char * mode)798 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
799 {
800 hFILE *hfile_orig = hfile;
801 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
802 char simple_mode[101], *cp, *opts;
803 simple_mode[100] = '\0';
804
805 if (fp == NULL) goto error;
806
807 fp->fn = strdup(fn);
808 fp->is_be = ed_is_big();
809
810 // Split mode into simple_mode,opts strings
811 if ((cp = strchr(mode, ','))) {
812 strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
813 simple_mode[cp-mode] = '\0';
814 opts = cp+1;
815 } else {
816 strncpy(simple_mode, mode, 100);
817 opts = NULL;
818 }
819
820 if (strchr(simple_mode, 'r')) {
821 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
822
823 if (fp->format.format == htsget) {
824 hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
825 if (hfile2 == NULL) goto error;
826
827 // Build fp against the result of the redirection
828 hfile = hfile2;
829 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
830 }
831 }
832 else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
833 htsFormat *fmt = &fp->format;
834 fp->is_write = 1;
835
836 if (strchr(simple_mode, 'b')) fmt->format = binary_format;
837 else if (strchr(simple_mode, 'c')) fmt->format = cram;
838 else fmt->format = text_format;
839
840 if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
841 else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
842 else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
843 else {
844 // No compression mode specified, set to the default for the format
845 switch (fmt->format) {
846 case binary_format: fmt->compression = bgzf; break;
847 case cram: fmt->compression = custom; break;
848 case text_format: fmt->compression = no_compression; break;
849 default: abort();
850 }
851 }
852
853 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
854 fmt->category = format_category(fmt->format);
855
856 fmt->version.major = fmt->version.minor = -1;
857 fmt->compression_level = -1;
858 fmt->specific = NULL;
859 }
860 else { errno = EINVAL; goto error; }
861
862 switch (fp->format.format) {
863 case binary_format:
864 case bam:
865 case bcf:
866 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
867 if (fp->fp.bgzf == NULL) goto error;
868 fp->is_bin = fp->is_bgzf = 1;
869 break;
870
871 case cram:
872 fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
873 if (fp->fp.cram == NULL) goto error;
874 if (!fp->is_write)
875 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
876 fp->is_cram = 1;
877 break;
878
879 case text_format:
880 case sam:
881 case vcf:
882 if (fp->format.compression != no_compression) {
883 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
884 if (fp->fp.bgzf == NULL) goto error;
885 fp->is_bgzf = 1;
886 }
887 else
888 fp->fp.hfile = hfile;
889 break;
890
891 default:
892 errno = ENOEXEC;
893 goto error;
894 }
895
896 if (opts)
897 hts_process_opts(fp, opts);
898
899 // If redirecting, close the original hFILE now (pedantically we would
900 // instead close it in hts_close(), but this a simplifying optimisation)
901 if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
902
903 return fp;
904
905 error:
906 hts_log_error("Failed to open file %s", fn);
907
908 // If redirecting, close the failed redirection hFILE that we have opened
909 if (hfile != hfile_orig) hclose_abruptly(hfile);
910
911 if (fp) {
912 free(fp->fn);
913 free(fp->fn_aux);
914 free(fp);
915 }
916 return NULL;
917 }
918
hts_close(htsFile * fp)919 int hts_close(htsFile *fp)
920 {
921 int ret, save;
922
923 switch (fp->format.format) {
924 case binary_format:
925 case bam:
926 case bcf:
927 ret = bgzf_close(fp->fp.bgzf);
928 break;
929
930 case cram:
931 if (!fp->is_write) {
932 switch (cram_eof(fp->fp.cram)) {
933 case 2:
934 hts_log_warning("EOF marker is absent. The input is probably truncated");
935 break;
936 case 0: /* not at EOF, but may not have wanted all seqs */
937 default: /* case 1, expected EOF */
938 break;
939 }
940 }
941 ret = cram_close(fp->fp.cram);
942 break;
943
944 case text_format:
945 case sam:
946 case vcf:
947 if (fp->format.compression != no_compression)
948 ret = bgzf_close(fp->fp.bgzf);
949 else
950 ret = hclose(fp->fp.hfile);
951 break;
952
953 default:
954 ret = -1;
955 break;
956 }
957
958 save = errno;
959 free(fp->fn);
960 free(fp->fn_aux);
961 free(fp->line.s);
962 free(fp);
963 errno = save;
964 return ret;
965 }
966
hts_get_format(htsFile * fp)967 const htsFormat *hts_get_format(htsFile *fp)
968 {
969 return fp? &fp->format : NULL;
970 }
971
hts_format_file_extension(const htsFormat * format)972 const char *hts_format_file_extension(const htsFormat *format) {
973 if (!format)
974 return "?";
975
976 switch (format->format) {
977 case sam: return "sam";
978 case bam: return "bam";
979 case bai: return "bai";
980 case cram: return "cram";
981 case crai: return "crai";
982 case vcf: return "vcf";
983 case bcf: return "bcf";
984 case csi: return "csi";
985 case gzi: return "gzi";
986 case tbi: return "tbi";
987 case bed: return "bed";
988 default: return "?";
989 }
990 }
991
hts_hfile(htsFile * fp)992 static hFILE *hts_hfile(htsFile *fp) {
993 switch (fp->format.format) {
994 case binary_format: // fall through; still valid if bcf?
995 case bam: return bgzf_hfile(fp->fp.bgzf);
996 case cram: return cram_hfile(fp->fp.cram);
997 case text_format: return fp->fp.hfile;
998 case sam: return fp->fp.hfile;
999 default: return NULL;
1000 }
1001 }
1002
hts_set_opt(htsFile * fp,enum hts_fmt_option opt,...)1003 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1004 int r;
1005 va_list args;
1006
1007 switch (opt) {
1008 case HTS_OPT_NTHREADS: {
1009 va_start(args, opt);
1010 int nthreads = va_arg(args, int);
1011 va_end(args);
1012 return hts_set_threads(fp, nthreads);
1013 }
1014
1015 case HTS_OPT_BLOCK_SIZE: {
1016 hFILE *hf = hts_hfile(fp);
1017
1018 if (hf) {
1019 va_start(args, opt);
1020 if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1021 hts_log_warning("Failed to change block size");
1022 va_end(args);
1023 }
1024 else {
1025 // To do - implement for vcf/bcf.
1026 hts_log_warning("Cannot change block size for this format");
1027 }
1028
1029 return 0;
1030 }
1031
1032 case HTS_OPT_THREAD_POOL: {
1033 va_start(args, opt);
1034 htsThreadPool *p = va_arg(args, htsThreadPool *);
1035 va_end(args);
1036 return hts_set_thread_pool(fp, p);
1037 }
1038
1039 case HTS_OPT_CACHE_SIZE: {
1040 va_start(args, opt);
1041 int cache_size = va_arg(args, int);
1042 va_end(args);
1043 hts_set_cache_size(fp, cache_size);
1044 return 0;
1045 }
1046
1047 case HTS_OPT_COMPRESSION_LEVEL: {
1048 va_start(args, opt);
1049 int level = va_arg(args, int);
1050 va_end(args);
1051 if (fp->is_bgzf)
1052 fp->fp.bgzf->compress_level = level;
1053 }
1054
1055 default:
1056 break;
1057 }
1058
1059 if (fp->format.format != cram)
1060 return 0;
1061
1062 va_start(args, opt);
1063 r = cram_set_voption(fp->fp.cram, opt, args);
1064 va_end(args);
1065
1066 return r;
1067 }
1068
1069 BGZF *hts_get_bgzfp(htsFile *fp);
1070
hts_set_threads(htsFile * fp,int n)1071 int hts_set_threads(htsFile *fp, int n)
1072 {
1073 if (fp->format.compression == bgzf) {
1074 return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1075 } else if (fp->format.format == cram) {
1076 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1077 }
1078 else return 0;
1079 }
1080
hts_set_thread_pool(htsFile * fp,htsThreadPool * p)1081 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1082 if (fp->format.compression == bgzf) {
1083 return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1084 } else if (fp->format.format == cram) {
1085 return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1086 }
1087 else return 0;
1088 }
1089
hts_set_cache_size(htsFile * fp,int n)1090 void hts_set_cache_size(htsFile *fp, int n)
1091 {
1092 if (fp->format.compression == bgzf)
1093 bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1094 }
1095
hts_set_fai_filename(htsFile * fp,const char * fn_aux)1096 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1097 {
1098 free(fp->fn_aux);
1099 if (fn_aux) {
1100 fp->fn_aux = strdup(fn_aux);
1101 if (fp->fn_aux == NULL) return -1;
1102 }
1103 else fp->fn_aux = NULL;
1104
1105 if (fp->format.format == cram)
1106 if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1107 return -1;
1108
1109 return 0;
1110 }
1111
1112 // For VCF/BCF backward sweeper. Not exposing these functions because their
1113 // future is uncertain. Things will probably have to change with hFILE...
hts_get_bgzfp(htsFile * fp)1114 BGZF *hts_get_bgzfp(htsFile *fp)
1115 {
1116 if (fp->is_bgzf)
1117 return fp->fp.bgzf;
1118 else
1119 return NULL;
1120 }
hts_useek(htsFile * fp,long uoffset,int where)1121 int hts_useek(htsFile *fp, long uoffset, int where)
1122 {
1123 if (fp->is_bgzf)
1124 return bgzf_useek(fp->fp.bgzf, uoffset, where);
1125 else
1126 return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1127 }
hts_utell(htsFile * fp)1128 long hts_utell(htsFile *fp)
1129 {
1130 if (fp->is_bgzf)
1131 return bgzf_utell(fp->fp.bgzf);
1132 else
1133 return htell(fp->fp.hfile);
1134 }
1135
hts_getline(htsFile * fp,int delimiter,kstring_t * str)1136 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1137 {
1138 int ret;
1139 if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1140 hts_log_error("Unexpected delimiter %d", delimiter);
1141 abort();
1142 }
1143
1144 switch (fp->format.compression) {
1145 case no_compression:
1146 str->l = 0;
1147 ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile);
1148 if (ret >= 0) ret = str->l;
1149 else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1150 else ret = -1;
1151 break;
1152
1153 case gzip:
1154 case bgzf:
1155 ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1156 break;
1157
1158 default:
1159 abort();
1160 }
1161
1162 ++fp->lineno;
1163 return ret;
1164 }
1165
hts_readlist(const char * string,int is_file,int * _n)1166 char **hts_readlist(const char *string, int is_file, int *_n)
1167 {
1168 int m = 0, n = 0;
1169 char **s = 0;
1170 if ( is_file )
1171 {
1172 BGZF *fp = bgzf_open(string, "r");
1173 if ( !fp ) return NULL;
1174
1175 kstring_t str;
1176 str.s = 0; str.l = str.m = 0;
1177 while (bgzf_getline(fp, '\n', &str) >= 0)
1178 {
1179 if (str.l == 0) continue;
1180 n++;
1181 hts_expand(char*,n,m,s);
1182 s[n-1] = strdup(str.s);
1183 }
1184 bgzf_close(fp);
1185 free(str.s);
1186 }
1187 else
1188 {
1189 const char *q = string, *p = string;
1190 while ( 1 )
1191 {
1192 if (*p == ',' || *p == 0)
1193 {
1194 n++;
1195 hts_expand(char*,n,m,s);
1196 s[n-1] = (char*)calloc(p - q + 1, 1);
1197 strncpy(s[n-1], q, p - q);
1198 q = p + 1;
1199 }
1200 if ( !*p ) break;
1201 p++;
1202 }
1203 }
1204 s = (char**)realloc(s, n * sizeof(char*));
1205 *_n = n;
1206 return s;
1207 }
1208
hts_readlines(const char * fn,int * _n)1209 char **hts_readlines(const char *fn, int *_n)
1210 {
1211 int m = 0, n = 0;
1212 char **s = 0;
1213 BGZF *fp = bgzf_open(fn, "r");
1214 if ( fp ) { // read from file
1215 kstring_t str;
1216 str.s = 0; str.l = str.m = 0;
1217 while (bgzf_getline(fp, '\n', &str) >= 0) {
1218 if (str.l == 0) continue;
1219 if (m == n) {
1220 m = m? m<<1 : 16;
1221 s = (char**)realloc(s, m * sizeof(char*));
1222 }
1223 s[n++] = strdup(str.s);
1224 }
1225 bgzf_close(fp);
1226 s = (char**)realloc(s, n * sizeof(char*));
1227 free(str.s);
1228 } else if (*fn == ':') { // read from string
1229 const char *q, *p;
1230 for (q = p = fn + 1;; ++p)
1231 if (*p == ',' || *p == 0) {
1232 if (m == n) {
1233 m = m? m<<1 : 16;
1234 s = (char**)realloc(s, m * sizeof(char*));
1235 }
1236 s[n] = (char*)calloc(p - q + 1, 1);
1237 strncpy(s[n++], q, p - q);
1238 q = p + 1;
1239 if (*p == 0) break;
1240 }
1241 } else return 0;
1242 s = (char**)realloc(s, n * sizeof(char*));
1243 *_n = n;
1244 return s;
1245 }
1246
1247 // DEPRECATED: To be removed in a future HTSlib release
hts_file_type(const char * fname)1248 int hts_file_type(const char *fname)
1249 {
1250 int len = strlen(fname);
1251 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1252 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1253 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1254 if ( !strcmp("-",fname) ) return FT_STDIN;
1255
1256 hFILE *f = hopen(fname, "r");
1257 if (f == NULL) return 0;
1258
1259 htsFormat fmt;
1260 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1261 if (hclose(f) < 0) return 0;
1262
1263 switch (fmt.format) {
1264 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1265 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1266 default: return 0;
1267 }
1268 }
1269
hts_check_EOF(htsFile * fp)1270 int hts_check_EOF(htsFile *fp)
1271 {
1272 if (fp->format.compression == bgzf)
1273 return bgzf_check_EOF(hts_get_bgzfp(fp));
1274 else if (fp->format.format == cram)
1275 return cram_check_EOF(fp->fp.cram);
1276 else
1277 return 3;
1278 }
1279
1280
1281 /****************
1282 *** Indexing ***
1283 ****************/
1284
1285 #define HTS_MIN_MARKER_DIST 0x10000
1286
1287 // Finds the special meta bin
1288 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1289 #define META_BIN(idx) ((idx)->n_bins + 1)
1290
1291 #define pair64_lt(a,b) ((a).u < (b).u)
1292
1293 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
1294 KSORT_INIT(_off_max, hts_pair64_max_t, pair64_lt)
1295
1296 typedef struct {
1297 int32_t m, n;
1298 uint64_t loff;
1299 hts_pair64_t *list;
1300 } bins_t;
1301
1302 KHASH_MAP_INIT_INT(bin, bins_t)
1303 typedef khash_t(bin) bidx_t;
1304
1305 typedef struct {
1306 int32_t n, m;
1307 uint64_t *offset;
1308 } lidx_t;
1309
1310 struct __hts_idx_t {
1311 int fmt, min_shift, n_lvls, n_bins;
1312 uint32_t l_meta;
1313 int32_t n, m;
1314 uint64_t n_no_coor;
1315 bidx_t **bidx;
1316 lidx_t *lidx;
1317 uint8_t *meta; // MUST have a terminating NUL on the end
1318 struct {
1319 uint32_t last_bin, save_bin;
1320 int last_coor, last_tid, save_tid, finished;
1321 uint64_t last_off, save_off;
1322 uint64_t off_beg, off_end;
1323 uint64_t n_mapped, n_unmapped;
1324 } z; // keep internal states
1325 };
1326
idx_format_name(int fmt)1327 static char * idx_format_name(int fmt) {
1328 switch (fmt) {
1329 case HTS_FMT_CSI: return "csi";
1330 case HTS_FMT_BAI: return "bai";
1331 case HTS_FMT_TBI: return "tbi";
1332 case HTS_FMT_CRAI: return "crai";
1333 default: return "unknown";
1334 }
1335 }
1336
insert_to_b(bidx_t * b,int bin,uint64_t beg,uint64_t end)1337 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1338 {
1339 khint_t k;
1340 bins_t *l;
1341 int absent;
1342 k = kh_put(bin, b, bin, &absent);
1343 if (absent < 0) return -1; // Out of memory
1344 l = &kh_value(b, k);
1345 if (absent) {
1346 l->m = 1; l->n = 0;
1347 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1348 if (!l->list) {
1349 kh_del(bin, b, k);
1350 return -1;
1351 }
1352 } else if (l->n == l->m) {
1353 uint32_t new_m = l->m ? l->m << 1 : 1;
1354 hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1355 if (!new_list) return -1;
1356 l->list = new_list;
1357 l->m = new_m;
1358 }
1359 l->list[l->n].u = beg;
1360 l->list[l->n++].v = end;
1361 return 0;
1362 }
1363
insert_to_l(lidx_t * l,int64_t _beg,int64_t _end,uint64_t offset,int min_shift)1364 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1365 {
1366 int i, beg, end;
1367 beg = _beg >> min_shift;
1368 end = (_end - 1) >> min_shift;
1369 if (l->m < end + 1) {
1370 size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1371 uint64_t *new_offset;
1372
1373 new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1374 if (!new_offset) return -1;
1375
1376 // fill unused memory with (uint64_t)-1
1377 memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1378 l->m = new_m;
1379 l->offset = new_offset;
1380 }
1381 for (i = beg; i <= end; ++i) {
1382 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1383 }
1384 if (l->n < end + 1) l->n = end + 1;
1385 return 0;
1386 }
1387
hts_idx_init(int n,int fmt,uint64_t offset0,int min_shift,int n_lvls)1388 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1389 {
1390 hts_idx_t *idx;
1391 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1392 if (idx == NULL) return NULL;
1393 idx->fmt = fmt;
1394 idx->min_shift = min_shift;
1395 idx->n_lvls = n_lvls;
1396 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1397 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
1398 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1399 idx->z.last_coor = 0xffffffffu;
1400 if (n) {
1401 idx->n = idx->m = n;
1402 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1403 if (idx->bidx == NULL) { free(idx); return NULL; }
1404 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1405 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1406 }
1407 return idx;
1408 }
1409
update_loff(hts_idx_t * idx,int i,int free_lidx)1410 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1411 {
1412 bidx_t *bidx = idx->bidx[i];
1413 lidx_t *lidx = &idx->lidx[i];
1414 khint_t k;
1415 int l;
1416 uint64_t offset0 = 0;
1417 if (bidx) {
1418 k = kh_get(bin, bidx, META_BIN(idx));
1419 if (k != kh_end(bidx))
1420 offset0 = kh_val(bidx, k).list[0].u;
1421 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1422 lidx->offset[l] = offset0;
1423 } else l = 1;
1424 for (; l < lidx->n; ++l) // fill missing values
1425 if (lidx->offset[l] == (uint64_t)-1)
1426 lidx->offset[l] = lidx->offset[l-1];
1427 if (bidx == 0) return;
1428 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1429 if (kh_exist(bidx, k))
1430 {
1431 if ( kh_key(bidx, k) < idx->n_bins )
1432 {
1433 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1434 // disable linear index if bot_bin out of bounds
1435 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1436 }
1437 else
1438 kh_val(bidx, k).loff = 0;
1439 }
1440 if (free_lidx) {
1441 free(lidx->offset);
1442 lidx->m = lidx->n = 0;
1443 lidx->offset = 0;
1444 }
1445 }
1446
compress_binning(hts_idx_t * idx,int i)1447 static void compress_binning(hts_idx_t *idx, int i)
1448 {
1449 bidx_t *bidx = idx->bidx[i];
1450 khint_t k;
1451 int l, m;
1452 if (bidx == 0) return;
1453 // merge a bin to its parent if the bin is too small
1454 for (l = idx->n_lvls; l > 0; --l) {
1455 unsigned start = hts_bin_first(l);
1456 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1457 bins_t *p, *q;
1458 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1459 p = &kh_value(bidx, k);
1460 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1461 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1462 khint_t kp;
1463 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1464 if (kp == kh_end(bidx)) continue;
1465 q = &kh_val(bidx, kp);
1466 if (q->n + p->n > q->m) {
1467 q->m = q->n + p->n;
1468 kroundup32(q->m);
1469 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
1470 }
1471 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1472 q->n += p->n;
1473 free(p->list);
1474 kh_del(bin, bidx, k);
1475 }
1476 }
1477 }
1478 k = kh_get(bin, bidx, 0);
1479 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1480 // merge adjacent chunks that start from the same BGZF block
1481 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1482 bins_t *p;
1483 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1484 p = &kh_value(bidx, k);
1485 for (l = 1, m = 0; l < p->n; ++l) {
1486 if (p->list[m].v>>16 >= p->list[l].u>>16) {
1487 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1488 } else p->list[++m] = p->list[l];
1489 }
1490 p->n = m + 1;
1491 }
1492 }
1493
hts_idx_finish(hts_idx_t * idx,uint64_t final_offset)1494 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1495 {
1496 int i;
1497 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
1498 if (idx->z.save_tid >= 0) {
1499 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1500 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1501 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1502 }
1503 for (i = 0; i < idx->n; ++i) {
1504 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1505 compress_binning(idx, i);
1506 }
1507 idx->z.finished = 1;
1508 }
1509
hts_idx_push(hts_idx_t * idx,int tid,int beg,int end,uint64_t offset,int is_mapped)1510 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
1511 {
1512 int bin;
1513 int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1514 if (tid<0) beg = -1, end = 0;
1515 if (tid >= 0 && (beg > maxpos || end > maxpos)) {
1516 goto pos_too_big;
1517 }
1518 if (tid >= idx->m) { // enlarge the index
1519 uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1520 bidx_t **new_bidx;
1521 lidx_t *new_lidx;
1522
1523 new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1524 if (!new_bidx) return -1;
1525 idx->bidx = new_bidx;
1526
1527 new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1528 if (!new_lidx) return -1;
1529 idx->lidx = new_lidx;
1530
1531 memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1532 memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1533 idx->m = new_m;
1534 }
1535 if (idx->n < tid + 1) idx->n = tid + 1;
1536 if (idx->z.finished) return 0;
1537 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1538 if ( tid>=0 && idx->n_no_coor )
1539 {
1540 hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
1541 return -1;
1542 }
1543 if (tid>=0 && idx->bidx[tid] != 0)
1544 {
1545 hts_log_error("Chromosome blocks not continuous");
1546 return -1;
1547 }
1548 idx->z.last_tid = tid;
1549 idx->z.last_bin = 0xffffffffu;
1550 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1551 hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1);
1552 return -1;
1553 }
1554 if ( tid>=0 )
1555 {
1556 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1557 if (is_mapped) {
1558 // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1559 if (beg < 0) beg = 0;
1560 if (end <= 0) end = 1;
1561 // idx->z.last_off points to the start of the current record
1562 if (insert_to_l(&idx->lidx[tid], beg, end,
1563 idx->z.last_off, idx->min_shift) < 0) return -1;
1564 }
1565 }
1566 else idx->n_no_coor++;
1567 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1568 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1569 if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1570 if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1571 idx->z.save_off, idx->z.last_off) < 0) return -1;
1572 }
1573 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1574 idx->z.off_end = idx->z.last_off;
1575 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1576 idx->z.off_beg, idx->z.off_end) < 0) return -1;
1577 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1578 idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1579 idx->z.n_mapped = idx->z.n_unmapped = 0;
1580 idx->z.off_beg = idx->z.off_end;
1581 }
1582 idx->z.save_off = idx->z.last_off;
1583 idx->z.save_bin = idx->z.last_bin = bin;
1584 idx->z.save_tid = tid;
1585 }
1586 if (is_mapped) ++idx->z.n_mapped;
1587 else ++idx->z.n_unmapped;
1588 idx->z.last_off = offset;
1589 idx->z.last_coor = beg;
1590 return 0;
1591
1592 pos_too_big: {
1593 int64_t max = end > beg ? end : beg, s = 1 << 14;
1594 int n_lvls = 0;
1595 while (max > s) {
1596 n_lvls++;
1597 s <<= 3;
1598 }
1599
1600 if (idx->fmt == HTS_FMT_CSI) {
1601 hts_log_error("Region %d..%d cannot be stored in a csi index "
1602 "with min_shift = %d, n_lvls = %d. Try using "
1603 "min_shift = 14, n_lvls >= %d",
1604 beg, end,
1605 idx->min_shift, idx->n_lvls,
1606 n_lvls);
1607 } else {
1608 hts_log_error("Region %d..%d cannot be stored in a %s index. "
1609 "Try using a csi index with min_shift = 14, "
1610 "n_lvls >= %d",
1611 beg, end, idx_format_name(idx->fmt),
1612 n_lvls);
1613 }
1614 errno = ERANGE;
1615 return -1;
1616 }
1617 }
1618
hts_idx_destroy(hts_idx_t * idx)1619 void hts_idx_destroy(hts_idx_t *idx)
1620 {
1621 khint_t k;
1622 int i;
1623 if (idx == 0) return;
1624
1625 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1626 if (idx->fmt == HTS_FMT_CRAI) {
1627 hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
1628 cram_index_free(cidx->cram);
1629 free(cidx);
1630 return;
1631 }
1632
1633 for (i = 0; i < idx->m; ++i) {
1634 bidx_t *bidx = idx->bidx[i];
1635 free(idx->lidx[i].offset);
1636 if (bidx == 0) continue;
1637 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1638 if (kh_exist(bidx, k))
1639 free(kh_value(bidx, k).list);
1640 kh_destroy(bin, bidx);
1641 }
1642 free(idx->bidx); free(idx->lidx); free(idx->meta);
1643 free(idx);
1644 }
1645
1646 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1647 // TODO Determine endianness at configure- or compile-time
1648
idx_write_int32(BGZF * fp,int32_t x)1649 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
1650 {
1651 if (ed_is_big()) x = ed_swap_4(x);
1652 return bgzf_write(fp, &x, sizeof x);
1653 }
1654
idx_write_uint32(BGZF * fp,uint32_t x)1655 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
1656 {
1657 if (ed_is_big()) x = ed_swap_4(x);
1658 return bgzf_write(fp, &x, sizeof x);
1659 }
1660
idx_write_uint64(BGZF * fp,uint64_t x)1661 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
1662 {
1663 if (ed_is_big()) x = ed_swap_8(x);
1664 return bgzf_write(fp, &x, sizeof x);
1665 }
1666
swap_bins(bins_t * p)1667 static inline void swap_bins(bins_t *p)
1668 {
1669 int i;
1670 for (i = 0; i < p->n; ++i) {
1671 ed_swap_8p(&p->list[i].u);
1672 ed_swap_8p(&p->list[i].v);
1673 }
1674 }
1675
hts_idx_save_core(const hts_idx_t * idx,BGZF * fp,int fmt)1676 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
1677 {
1678 int32_t i, j;
1679
1680 #define check(ret) if ((ret) < 0) return -1
1681
1682 check(idx_write_int32(fp, idx->n));
1683 if (fmt == HTS_FMT_TBI && idx->l_meta)
1684 check(bgzf_write(fp, idx->meta, idx->l_meta));
1685
1686 for (i = 0; i < idx->n; ++i) {
1687 khint_t k;
1688 bidx_t *bidx = idx->bidx[i];
1689 lidx_t *lidx = &idx->lidx[i];
1690 // write binning index
1691 check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
1692 if (bidx)
1693 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1694 if (kh_exist(bidx, k)) {
1695 bins_t *p = &kh_value(bidx, k);
1696 check(idx_write_uint32(fp, kh_key(bidx, k)));
1697 if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
1698 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1699 check(idx_write_int32(fp, p->n));
1700 for (j = 0; j < p->n; ++j) {
1701 check(idx_write_uint64(fp, p->list[j].u));
1702 check(idx_write_uint64(fp, p->list[j].v));
1703 }
1704 }
1705
1706 // write linear index
1707 if (fmt != HTS_FMT_CSI) {
1708 check(idx_write_int32(fp, lidx->n));
1709 for (j = 0; j < lidx->n; ++j)
1710 check(idx_write_uint64(fp, lidx->offset[j]));
1711 }
1712 }
1713
1714 check(idx_write_uint64(fp, idx->n_no_coor));
1715 return 0;
1716 #undef check
1717 }
1718
hts_idx_save(const hts_idx_t * idx,const char * fn,int fmt)1719 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1720 {
1721 int ret, save;
1722 char *fnidx = (char*)calloc(1, strlen(fn) + 5);
1723 if (fnidx == NULL) return -1;
1724
1725 strcpy(fnidx, fn);
1726 switch (fmt) {
1727 case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
1728 case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
1729 case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
1730 default: abort();
1731 }
1732
1733 ret = hts_idx_save_as(idx, fn, fnidx, fmt);
1734 save = errno;
1735 free(fnidx);
1736 errno = save;
1737 return ret;
1738 }
1739
hts_idx_save_as(const hts_idx_t * idx,const char * fn,const char * fnidx,int fmt)1740 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
1741 {
1742 BGZF *fp;
1743
1744 #define check(ret) if ((ret) < 0) goto fail
1745
1746 if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
1747
1748 fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
1749 if (fp == NULL) return -1;
1750
1751 if (fmt == HTS_FMT_CSI) {
1752 check(bgzf_write(fp, "CSI\1", 4));
1753 check(idx_write_int32(fp, idx->min_shift));
1754 check(idx_write_int32(fp, idx->n_lvls));
1755 check(idx_write_uint32(fp, idx->l_meta));
1756 if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
1757 } else if (fmt == HTS_FMT_TBI) {
1758 check(bgzf_write(fp, "TBI\1", 4));
1759 } else if (fmt == HTS_FMT_BAI) {
1760 check(bgzf_write(fp, "BAI\1", 4));
1761 } else abort();
1762
1763 check(hts_idx_save_core(idx, fp, fmt));
1764
1765 return bgzf_close(fp);
1766 #undef check
1767
1768 fail:
1769 bgzf_close(fp);
1770 return -1;
1771 }
1772
hts_idx_load_core(hts_idx_t * idx,BGZF * fp,int fmt)1773 static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
1774 {
1775 int32_t i, n, is_be;
1776 is_be = ed_is_big();
1777 if (idx == NULL) return -4;
1778 for (i = 0; i < idx->n; ++i) {
1779 bidx_t *h;
1780 lidx_t *l = &idx->lidx[i];
1781 uint32_t key;
1782 int j, absent;
1783 bins_t *p;
1784 h = idx->bidx[i] = kh_init(bin);
1785 if (bgzf_read(fp, &n, 4) != 4) return -1;
1786 if (is_be) ed_swap_4p(&n);
1787 for (j = 0; j < n; ++j) {
1788 khint_t k;
1789 if (bgzf_read(fp, &key, 4) != 4) return -1;
1790 if (is_be) ed_swap_4p(&key);
1791 k = kh_put(bin, h, key, &absent);
1792 if (absent <= 0) return -3; // Duplicate bin number
1793 p = &kh_val(h, k);
1794 if (fmt == HTS_FMT_CSI) {
1795 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
1796 if (is_be) ed_swap_8p(&p->loff);
1797 } else p->loff = 0;
1798 if (bgzf_read(fp, &p->n, 4) != 4) return -1;
1799 if (is_be) ed_swap_4p(&p->n);
1800 p->m = p->n;
1801 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1802 if (p->list == NULL) return -2;
1803 if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1;
1804 if (is_be) swap_bins(p);
1805 }
1806 if (fmt != HTS_FMT_CSI) { // load linear index
1807 int j;
1808 if (bgzf_read(fp, &l->n, 4) != 4) return -1;
1809 if (is_be) ed_swap_4p(&l->n);
1810 l->m = l->n;
1811 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1812 if (l->offset == NULL) return -2;
1813 if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
1814 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1815 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1816 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1817 update_loff(idx, i, 1);
1818 }
1819 }
1820 if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1821 if (is_be) ed_swap_8p(&idx->n_no_coor);
1822 return 0;
1823 }
1824
hts_idx_load_local(const char * fn)1825 static hts_idx_t *hts_idx_load_local(const char *fn)
1826 {
1827 uint8_t magic[4];
1828 int i, is_be;
1829 hts_idx_t *idx = NULL;
1830 uint8_t *meta = NULL;
1831 BGZF *fp = bgzf_open(fn, "r");
1832 if (fp == NULL) return NULL;
1833 is_be = ed_is_big();
1834 if (bgzf_read(fp, magic, 4) != 4) goto fail;
1835
1836 if (memcmp(magic, "CSI\1", 4) == 0) {
1837 uint32_t x[3], n;
1838 if (bgzf_read(fp, x, 12) != 12) goto fail;
1839 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1840 if (x[2]) {
1841 if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
1842 if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
1843 if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
1844 // Prevent possible strlen past the end in tbx_index_load2
1845 meta[x[2]] = '\0';
1846 }
1847 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1848 if (is_be) ed_swap_4p(&n);
1849 if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
1850 idx->l_meta = x[2];
1851 idx->meta = meta;
1852 meta = NULL;
1853 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
1854 }
1855 else if (memcmp(magic, "TBI\1", 4) == 0) {
1856 uint8_t x[8 * 4];
1857 uint32_t n;
1858 // Read file header
1859 if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
1860 n = le_to_u32(&x[0]); // location of n_ref
1861 if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
1862 n = le_to_u32(&x[7*4]); // location of l_nm
1863 if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
1864 idx->l_meta = 28 + n;
1865 if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
1866 // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1867 // N.B. left in little-endian byte order.
1868 memcpy(idx->meta, &x[1*4], 28);
1869 // Read in sequence names.
1870 if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
1871 // Prevent possible strlen past the end in tbx_index_load2
1872 idx->meta[idx->l_meta] = '\0';
1873 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
1874 }
1875 else if (memcmp(magic, "BAI\1", 4) == 0) {
1876 uint32_t n;
1877 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1878 if (is_be) ed_swap_4p(&n);
1879 idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5);
1880 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
1881 }
1882 else { errno = EINVAL; goto fail; }
1883
1884 bgzf_close(fp);
1885 return idx;
1886
1887 fail:
1888 bgzf_close(fp);
1889 hts_idx_destroy(idx);
1890 free(meta);
1891 return NULL;
1892 }
1893
hts_idx_set_meta(hts_idx_t * idx,uint32_t l_meta,uint8_t * meta,int is_copy)1894 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
1895 int is_copy)
1896 {
1897 uint8_t *new_meta = meta;
1898 if (is_copy) {
1899 size_t l = l_meta;
1900 if (l > SIZE_MAX - 1) {
1901 errno = ENOMEM;
1902 return -1;
1903 }
1904 new_meta = malloc(l + 1);
1905 if (!new_meta) return -1;
1906 memcpy(new_meta, meta, l);
1907 // Prevent possible strlen past the end in tbx_index_load2
1908 meta[l + 1] = '\0';
1909 }
1910 if (idx->meta) free(idx->meta);
1911 idx->l_meta = l_meta;
1912 idx->meta = new_meta;
1913 return 0;
1914 }
1915
hts_idx_get_meta(hts_idx_t * idx,uint32_t * l_meta)1916 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
1917 {
1918 *l_meta = idx->l_meta;
1919 return idx->meta;
1920 }
1921
hts_idx_seqnames(const hts_idx_t * idx,int * n,hts_id2name_f getid,void * hdr)1922 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1923 {
1924 if ( !idx->n )
1925 {
1926 *n = 0;
1927 return NULL;
1928 }
1929
1930 int tid = 0, i;
1931 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1932 for (i=0; i<idx->n; i++)
1933 {
1934 bidx_t *bidx = idx->bidx[i];
1935 if ( !bidx ) continue;
1936 names[tid++] = getid(hdr,i);
1937 }
1938 *n = tid;
1939 return names;
1940 }
1941
hts_idx_get_stat(const hts_idx_t * idx,int tid,uint64_t * mapped,uint64_t * unmapped)1942 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1943 {
1944 if ( idx->fmt == HTS_FMT_CRAI ) {
1945 *mapped = 0; *unmapped = 0;
1946 return -1;
1947 }
1948
1949 bidx_t *h = idx->bidx[tid];
1950 khint_t k = kh_get(bin, h, META_BIN(idx));
1951 if (k != kh_end(h)) {
1952 *mapped = kh_val(h, k).list[1].u;
1953 *unmapped = kh_val(h, k).list[1].v;
1954 return 0;
1955 } else {
1956 *mapped = 0; *unmapped = 0;
1957 return -1;
1958 }
1959 }
1960
hts_idx_get_n_no_coor(const hts_idx_t * idx)1961 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1962 {
1963 return idx->n_no_coor;
1964 }
1965
1966 /****************
1967 *** Iterator ***
1968 ****************/
1969
reg2bins(int64_t beg,int64_t end,hts_itr_t * itr,int min_shift,int n_lvls)1970 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1971 {
1972 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1973 if (beg >= end) return 0;
1974 if (end >= 1LL<<s) end = 1LL<<s;
1975 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1976 int b, e, n, i;
1977 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1978 if (itr->bins.n + n > itr->bins.m) {
1979 itr->bins.m = itr->bins.n + n;
1980 kroundup32(itr->bins.m);
1981 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1982 }
1983 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1984 }
1985 return itr->bins.n;
1986 }
1987
reg2intervals(hts_itr_multi_t * iter,const hts_idx_t * idx,int tid,int64_t beg,int64_t end,uint64_t min_off,uint64_t max_off,int min_shift,int n_lvls)1988 static inline int reg2intervals(hts_itr_multi_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
1989 {
1990 int l, t, s;
1991 int b, e, i, j;
1992 hts_pair64_max_t *off;
1993 bidx_t *bidx;
1994 khint_t k;
1995
1996 if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
1997 return -1;
1998
1999 s = min_shift + (n_lvls<<1) + n_lvls;
2000 if (end >= 1LL<<s)
2001 end = 1LL<<s;
2002
2003 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2004 b = t + (beg>>s); e = t + (end>>s);
2005
2006 for (i = b; i <= e; ++i) {
2007 if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) {
2008 bins_t *p = &kh_value(bidx, k);
2009
2010 if (p->n) {
2011 off = (hts_pair64_max_t*)realloc(iter->off, (iter->n_off + p->n) * sizeof(hts_pair64_max_t));
2012 if (!off)
2013 return -2;
2014
2015 iter->off = off;
2016 for (j = 0; j < p->n; ++j) {
2017 if (p->list[j].v > min_off && p->list[j].u < max_off) {
2018 iter->off[iter->n_off].u = p->list[j].u;
2019 iter->off[iter->n_off].v = p->list[j].v;
2020 iter->off[iter->n_off].max = ((uint64_t)tid<<32) | (end+1);
2021 iter->n_off++;
2022 }
2023 }
2024 }
2025 }
2026 }
2027 }
2028
2029 return iter->n_off;
2030 }
2031
compare_regions(const void * r1,const void * r2)2032 static int compare_regions(const void *r1, const void *r2) {
2033 hts_reglist_t *reg1 = (hts_reglist_t *)r1;
2034 hts_reglist_t *reg2 = (hts_reglist_t *)r2;
2035
2036 if (reg1->tid < 0 && reg2->tid >= 0)
2037 return 1;
2038 else if (reg1->tid >= 0 && reg2->tid < 0)
2039 return -1;
2040 else
2041 return reg1->tid - reg2->tid;
2042 }
2043
hts_itr_off(const hts_idx_t * idx,int tid)2044 uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
2045
2046 int i;
2047 bidx_t* bidx;
2048 uint64_t off0 = (uint64_t) -1;
2049 khint_t k;
2050 switch (tid) {
2051 case HTS_IDX_START:
2052 // Find the smallest offset, note that sequence ids may not be ordered sequentially
2053 for (i = 0; i < idx->n; i++) {
2054 bidx = idx->bidx[i];
2055 k = kh_get(bin, bidx, META_BIN(idx));
2056 if (k == kh_end(bidx))
2057 continue;
2058
2059 if (off0 > kh_val(bidx, k).list[0].u)
2060 off0 = kh_val(bidx, k).list[0].u;
2061 }
2062 if (off0 == (uint64_t) -1 && idx->n_no_coor)
2063 off0 = 0;
2064 // only no-coor reads in this bam
2065 break;
2066 case HTS_IDX_NOCOOR:
2067 /* No-coor reads sort after all of the mapped reads. The position
2068 is not stored in the index itself, so need to find the end
2069 offset for the last mapped read. A loop is needed here in
2070 case references at the end of the file have no mapped reads,
2071 or sequence ids are not ordered sequentially.
2072 See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
2073 for (i = 0; i < idx->n; i++) {
2074 bidx = idx->bidx[i];
2075 k = kh_get(bin, bidx, META_BIN(idx));
2076 if (k != kh_end(bidx)) {
2077 if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
2078 off0 = kh_val(bidx, k).list[0].v;
2079 }
2080 }
2081 }
2082 if (off0 == (uint64_t) -1 && idx->n_no_coor)
2083 off0 = 0;
2084 // only no-coor reads in this bam
2085 break;
2086 case HTS_IDX_REST:
2087 off0 = 0;
2088 break;
2089 case HTS_IDX_NONE:
2090 off0 = 0;
2091 break;
2092 }
2093
2094 return off0;
2095 }
2096
hts_itr_query(const hts_idx_t * idx,int tid,int beg,int end,hts_readrec_func * readrec)2097 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
2098 {
2099 int i, n_off, l, bin;
2100 hts_pair64_t *off;
2101 khint_t k;
2102 bidx_t *bidx;
2103 uint64_t min_off, max_off;
2104 hts_itr_t *iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2105 if (iter) {
2106 if (tid < 0) {
2107 uint64_t off = hts_itr_off(idx, tid);
2108 if (off != (uint64_t) -1) {
2109 iter->read_rest = 1;
2110 iter->curr_off = off;
2111 iter->readrec = readrec;
2112 if (tid == HTS_IDX_NONE)
2113 iter->finished = 1;
2114 } else {
2115 free(iter);
2116 iter = NULL;
2117 }
2118 } else {
2119 if (beg < 0) beg = 0;
2120 if (end < beg) return 0;
2121 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
2122
2123 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
2124 iter->readrec = readrec;
2125
2126 if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2127
2128 // compute min_off
2129 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2130 do {
2131 int first;
2132 k = kh_get(bin, bidx, bin);
2133 if (k != kh_end(bidx)) break;
2134 first = (hts_bin_parent(bin)<<3) + 1;
2135 if (bin > first) --bin;
2136 else bin = hts_bin_parent(bin);
2137 } while (bin);
2138 if (bin == 0) k = kh_get(bin, bidx, bin);
2139 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2140
2141 // compute max_off: a virtual offset from a bin to the right of end
2142 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2143 if (bin >= idx->n_bins) bin = 0;
2144 while (1) {
2145 // search for an extant bin by moving right, but moving up to the
2146 // parent whenever we get to a first child (which also covers falling
2147 // off the RHS, which wraps around and immediately goes up to bin 0)
2148 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2149 if (bin == 0) { max_off = (uint64_t)-1; break; }
2150 k = kh_get(bin, bidx, bin);
2151 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2152 bin++;
2153 }
2154
2155 // retrieve bins
2156 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2157
2158 for (i = n_off = 0; i < iter->bins.n; ++i)
2159 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2160 n_off += kh_value(bidx, k).n;
2161 if (n_off == 0) {
2162 // No overlapping bins means the iterator has already finished.
2163 iter->finished = 1;
2164 return iter;
2165 }
2166 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
2167 for (i = n_off = 0; i < iter->bins.n; ++i) {
2168 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2169 int j;
2170 bins_t *p = &kh_value(bidx, k);
2171 for (j = 0; j < p->n; ++j)
2172 if (p->list[j].v > min_off && p->list[j].u < max_off)
2173 off[n_off++] = p->list[j];
2174 }
2175 }
2176
2177 if (n_off == 0) {
2178 free(off);
2179 iter->finished = 1;
2180 return iter;
2181 }
2182 ks_introsort(_off, n_off, off);
2183 // resolve completely contained adjacent blocks
2184 for (i = 1, l = 0; i < n_off; ++i)
2185 if (off[l].v < off[i].v) off[++l] = off[i];
2186 n_off = l + 1;
2187 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2188 for (i = 1; i < n_off; ++i)
2189 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2190 // merge adjacent blocks
2191 for (i = 1, l = 0; i < n_off; ++i) {
2192 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2193 else off[++l] = off[i];
2194 }
2195 n_off = l + 1;
2196 iter->n_off = n_off; iter->off = off;
2197 }
2198 }
2199 return iter;
2200 }
2201
hts_itr_multi_bam(const hts_idx_t * idx,hts_itr_multi_t * iter)2202 hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter)
2203 {
2204 int i, j, l, n_off = 0, bin;
2205 hts_pair64_max_t *off = NULL;
2206 khint_t k;
2207 bidx_t *bidx;
2208 uint64_t min_off, max_off, t_off = (uint64_t)-1;
2209 int tid, beg, end;
2210 hts_reglist_t *curr_reg;
2211
2212 if (iter) {
2213 iter->i = -1;
2214 for (i=0; i<iter->n_reg; i++) {
2215
2216 curr_reg = &iter->reg_list[i];
2217 tid = curr_reg->tid;
2218
2219 if (tid < 0) {
2220 t_off = hts_itr_off(idx, tid);
2221 if (t_off != (uint64_t)-1) {
2222 switch (tid) {
2223 case HTS_IDX_NONE:
2224 iter->finished = 1;
2225 case HTS_IDX_START:
2226 case HTS_IDX_REST:
2227 iter->curr_off = t_off;
2228 iter->n_reg = 0;
2229 iter->reg_list = NULL;
2230 iter->read_rest = 1;
2231 return iter;
2232 case HTS_IDX_NOCOOR:
2233 iter->nocoor = 1;
2234 iter->nocoor_off = t_off;
2235 }
2236 }
2237 } else {
2238 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
2239 continue;
2240
2241 for(j=0; j<curr_reg->count; j++) {
2242 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
2243 if (curr_intv->end < curr_intv->beg)
2244 continue;
2245
2246 beg = curr_intv->beg;
2247 end = curr_intv->end;
2248
2249 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
2250 If the computed bin is not in the index, try the next bin to the
2251 left, belonging to the same parent. If it is the first sibling bin,
2252 try the parent bin. */
2253 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2254 do {
2255 int first;
2256 k = kh_get(bin, bidx, bin);
2257 if (k != kh_end(bidx)) break;
2258 first = (hts_bin_parent(bin)<<3) + 1;
2259 if (bin > first) --bin;
2260 else bin = hts_bin_parent(bin);
2261 } while (bin);
2262 if (bin == 0)
2263 k = kh_get(bin, bidx, bin);
2264 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2265
2266 // compute max_off: a virtual offset from a bin to the right of end
2267 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2268 if (bin >= idx->n_bins) bin = 0;
2269 while (1) {
2270 // search for an extant bin by moving right, but moving up to the
2271 // parent whenever we get to a first child (which also covers falling
2272 // off the RHS, which wraps around and immediately goes up to bin 0)
2273 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2274 if (bin == 0) { max_off = (uint64_t)-1; break; }
2275 k = kh_get(bin, bidx, bin);
2276 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
2277 max_off = kh_val(bidx, k).list[0].u;
2278 break;
2279 }
2280 bin++;
2281 }
2282
2283 //convert coordinates to file offsets
2284 reg2intervals(iter, idx, tid, beg, end, min_off, max_off, idx->min_shift, idx->n_lvls);
2285 }
2286 }
2287 }
2288
2289 off = iter->off;
2290 n_off = iter->n_off;
2291
2292 if (n_off) {
2293 ks_introsort(_off_max, n_off, off);
2294 // resolve completely contained adjacent blocks
2295 for (i = 1, l = 0; i < n_off; ++i) {
2296 if (off[l].v < off[i].v) {
2297 off[++l] = off[i];
2298 } else {
2299 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2300 }
2301 }
2302 n_off = l + 1;
2303 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2304 for (i = 1; i < n_off; ++i)
2305 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2306 // merge adjacent blocks
2307 for (i = 1, l = 0; i < n_off; ++i) {
2308 if (off[l].v>>16 == off[i].u>>16) {
2309 off[l].v = off[i].v;
2310 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2311 } else off[++l] = off[i];
2312 }
2313 n_off = l + 1;
2314 iter->n_off = n_off; iter->off = off;
2315 }
2316
2317 if(!n_off && !iter->nocoor)
2318 iter->finished = 1;
2319 }
2320 return iter;
2321 }
2322
hts_itr_multi_cram(const hts_idx_t * idx,hts_itr_multi_t * iter)2323 hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter)
2324 {
2325 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
2326 int tid, beg, end, i, j, l, n_off = 0;
2327 hts_reglist_t *curr_reg;
2328 hts_pair32_t *curr_intv;
2329 hts_pair64_max_t *off = NULL;
2330 cram_index *e = NULL;
2331
2332 if (!cidx || !iter)
2333 return NULL;
2334
2335 iter->is_cram = 1;
2336 iter->read_rest = 0;
2337 iter->off = NULL;
2338 iter->n_off = 0;
2339 iter->curr_off = 0;
2340 iter->i = -1;
2341
2342 for (i=0; i<iter->n_reg; i++) {
2343
2344 curr_reg = &iter->reg_list[i];
2345 tid = curr_reg->tid;
2346
2347 if (tid >= 0) {
2348 off = (hts_pair64_max_t*)realloc(off, (n_off + curr_reg->count) * sizeof(hts_pair64_max_t));
2349 if (!off)
2350 return NULL;
2351
2352 for (j=0; j < curr_reg->count; j++) {
2353 curr_intv = &curr_reg->intervals[j];
2354 if (curr_intv->end < curr_intv->beg)
2355 continue;
2356
2357 beg = curr_intv->beg;
2358 end = curr_intv->end;
2359
2360 /* First, fetch the container overlapping 'beg' and assign its file offset to u, then
2361 * find the container overlapping 'end' and assing the relative end of the slice to v.
2362 * The cram_ptell function will adjust with the container offset, which is not stored
2363 * in the index.
2364 */
2365 e = cram_index_query(cidx->cram, tid, beg+1, NULL);
2366 if (e) {
2367 off[n_off].u = e->offset;
2368
2369 if (end == INT_MAX) {
2370 e = cram_index_last(cidx->cram, tid, NULL);
2371 } else {
2372 e = cram_index_query(cidx->cram, tid, end+1, NULL);
2373 }
2374
2375 if (e) {
2376 off[n_off].v = e->offset + e->slice + e->len;
2377 off[n_off].max = (uint64_t)tid<<32 | end;
2378 n_off++;
2379 } else {
2380 hts_log_warning("Could not set offset end for region %d(%s):%d-%d. Skipping", tid, curr_reg->reg, beg, end);
2381 }
2382 } else {
2383 hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end);
2384 }
2385 }
2386 } else {
2387 switch (tid) {
2388 case HTS_IDX_NOCOOR:
2389 e = cram_index_query(cidx->cram, tid, 1, NULL);
2390 if (e) {
2391 iter->nocoor = 1;
2392 iter->nocoor_off = e->offset;
2393 } else {
2394 hts_log_warning("No index entry for NOCOOR region");
2395 }
2396 break;
2397 case HTS_IDX_START:
2398 e = cram_index_query(cidx->cram, tid, 1, NULL);
2399 if (e) {
2400 iter->read_rest = 1;
2401 off = (hts_pair64_max_t*)realloc(off, sizeof(hts_pair64_max_t));
2402 off[0].u = e->offset;
2403 off[0].v = 0;
2404 off[0].max = 0;
2405 n_off=1;
2406 } else {
2407 hts_log_warning("No index entries");
2408 }
2409 break;
2410 case HTS_IDX_REST:
2411 break;
2412 case HTS_IDX_NONE:
2413 iter->finished = 1;
2414 break;
2415 default:
2416 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
2417 }
2418 }
2419 }
2420
2421 if (n_off) {
2422 ks_introsort(_off_max, n_off, off);
2423 // resolve completely contained adjacent blocks
2424 for (i = 1, l = 0; i < n_off; ++i) {
2425 if (off[l].v < off[i].v) {
2426 off[++l] = off[i];
2427 } else {
2428 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2429 }
2430 }
2431 n_off = l + 1;
2432 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2433 for (i = 1; i < n_off; ++i)
2434 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2435 // merge adjacent blocks
2436 for (i = 1, l = 0; i < n_off; ++i) {
2437 if (off[l].v>>16 == off[i].u>>16) {
2438 off[l].v = off[i].v;
2439 off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max);
2440 } else off[++l] = off[i];
2441 }
2442 n_off = l + 1;
2443 iter->n_off = n_off; iter->off = off;
2444 }
2445
2446 if(!n_off && !iter->nocoor)
2447 iter->finished = 1;
2448
2449 return iter;
2450 }
2451
hts_itr_destroy(hts_itr_t * iter)2452 void hts_itr_destroy(hts_itr_t *iter)
2453 {
2454 if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
2455 }
2456
hts_reglist_free(hts_reglist_t * reglist,int count)2457 void hts_reglist_free(hts_reglist_t *reglist, int count) {
2458
2459 int i;
2460 if(reglist) {
2461 for (i=0;i<count;i++) {
2462 if (reglist[i].intervals)
2463 free(reglist[i].intervals);
2464 }
2465 free(reglist);
2466 }
2467 }
2468
hts_itr_multi_destroy(hts_itr_multi_t * iter)2469 void hts_itr_multi_destroy(hts_itr_multi_t *iter) {
2470
2471 if (iter) {
2472 if (iter->reg_list && iter->n_reg)
2473 hts_reglist_free(iter->reg_list, iter->n_reg);
2474
2475 if (iter->off && iter->n_off)
2476 free(iter->off);
2477 free(iter);
2478 }
2479 }
2480
push_digit(long long i,char c)2481 static inline long long push_digit(long long i, char c)
2482 {
2483 // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2484 int digit = c - '0';
2485 return 10 * i + digit;
2486 }
2487
hts_parse_decimal(const char * str,char ** strend,int flags)2488 long long hts_parse_decimal(const char *str, char **strend, int flags)
2489 {
2490 long long n = 0;
2491 int decimals = 0, e = 0, lost = 0;
2492 char sign = '+', esign = '+';
2493 const char *s;
2494
2495 while (isspace_c(*str)) str++;
2496 s = str;
2497
2498 if (*s == '+' || *s == '-') sign = *s++;
2499 while (*s)
2500 if (isdigit_c(*s)) n = push_digit(n, *s++);
2501 else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
2502 else break;
2503
2504 if (*s == '.') {
2505 s++;
2506 while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
2507 }
2508
2509 if (*s == 'E' || *s == 'e') {
2510 s++;
2511 if (*s == '+' || *s == '-') esign = *s++;
2512 while (isdigit_c(*s)) e = push_digit(e, *s++);
2513 if (esign == '-') e = -e;
2514 }
2515
2516 e -= decimals;
2517 while (e > 0) n *= 10, e--;
2518 while (e < 0) lost += n % 10, n /= 10, e++;
2519
2520 if (lost > 0) {
2521 hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
2522 }
2523
2524 if (strend) {
2525 *strend = (char *)s;
2526 } else if (*s) {
2527 hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
2528 }
2529
2530 return (sign == '+')? n : -n;
2531 }
2532
hts_parse_reg(const char * s,int * beg,int * end)2533 const char *hts_parse_reg(const char *s, int *beg, int *end)
2534 {
2535 char *hyphen;
2536 const char *colon = strrchr(s, ':');
2537 if (colon == NULL) {
2538 *beg = 0; *end = INT_MAX;
2539 return s + strlen(s);
2540 }
2541
2542 *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2543 if (*beg < 0) *beg = 0;
2544
2545 if (*hyphen == '\0') *end = INT_MAX;
2546 else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2547 else return NULL;
2548
2549 if (*beg >= *end) return NULL;
2550 return colon;
2551 }
2552
hts_itr_querys(const hts_idx_t * idx,const char * reg,hts_name2id_f getid,void * hdr,hts_itr_query_func * itr_query,hts_readrec_func * readrec)2553 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
2554 {
2555 int tid, beg, end;
2556 const char *q;
2557
2558 if (strcmp(reg, ".") == 0)
2559 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
2560 else if (strcmp(reg, "*") == 0)
2561 return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
2562
2563 q = hts_parse_reg(reg, &beg, &end);
2564 if (q) {
2565 char tmp_a[1024], *tmp = tmp_a;
2566 if (q - reg + 1 > 1024)
2567 if (!(tmp = malloc(q - reg + 1)))
2568 return NULL;
2569 strncpy(tmp, reg, q - reg);
2570 tmp[q - reg] = 0;
2571 tid = getid(hdr, tmp);
2572 if (tmp != tmp_a)
2573 free(tmp);
2574 }
2575 else {
2576 // not parsable as a region, but possibly a sequence named "foo:a"
2577 tid = getid(hdr, reg);
2578 beg = 0; end = INT_MAX;
2579 }
2580
2581 if (tid < 0) return NULL;
2582 return itr_query(idx, tid, beg, end, readrec);
2583 }
2584
hts_itr_regions(const hts_idx_t * idx,hts_reglist_t * reglist,int count,hts_name2id_f getid,void * hdr,hts_itr_multi_query_func * itr_specific,hts_readrec_func * readrec,hts_seek_func * seek,hts_tell_func * tell)2585 hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
2586
2587 int i;
2588
2589 if (!reglist)
2590 return NULL;
2591
2592 hts_itr_multi_t *itr = (hts_itr_multi_t*)calloc(1, sizeof(hts_itr_multi_t));
2593 if (itr) {
2594 itr->n_reg = count;
2595 itr->readrec = readrec;
2596 itr->seek = seek;
2597 itr->tell = tell;
2598 itr->reg_list = reglist;
2599 itr->finished = 0;
2600 itr->nocoor = 0;
2601
2602
2603 for (i = 0; i < itr->n_reg; i++) {
2604 if (!strcmp(itr->reg_list[i].reg, ".")) {
2605 itr->reg_list[i].tid = HTS_IDX_START;
2606 continue;
2607 }
2608
2609 if (!strcmp(itr->reg_list[i].reg, "*")) {
2610 itr->reg_list[i].tid = HTS_IDX_NOCOOR;
2611 continue;
2612 }
2613
2614 itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
2615 if (itr->reg_list[i].tid < 0)
2616 hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
2617 }
2618
2619 qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
2620 itr_specific(idx, itr);
2621 }
2622 return itr;
2623 }
2624
hts_itr_next(BGZF * fp,hts_itr_t * iter,void * r,void * data)2625 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
2626 {
2627 int ret, tid, beg, end;
2628 if (iter == NULL || iter->finished) return -1;
2629 if (iter->read_rest) {
2630 if (iter->curr_off) { // seek to the start
2631 if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
2632 iter->curr_off = 0; // only seek once
2633 }
2634 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
2635 if (ret < 0) iter->finished = 1;
2636 iter->curr_tid = tid;
2637 iter->curr_beg = beg;
2638 iter->curr_end = end;
2639 return ret;
2640 }
2641 // A NULL iter->off should always be accompanied by iter->finished.
2642 assert(iter->off != NULL);
2643 for (;;) {
2644 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2645 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
2646 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2647 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
2648 iter->curr_off = bgzf_tell(fp);
2649 }
2650 ++iter->i;
2651 }
2652 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
2653 iter->curr_off = bgzf_tell(fp);
2654 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
2655 ret = -1; break;
2656 } else if (end > iter->beg && iter->end > beg) {
2657 iter->curr_tid = tid;
2658 iter->curr_beg = beg;
2659 iter->curr_end = end;
2660 return ret;
2661 }
2662 } else break; // end of file or error
2663 }
2664 iter->finished = 1;
2665 return ret;
2666 }
2667
hts_itr_multi_next(htsFile * fd,hts_itr_multi_t * iter,void * r)2668 int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r)
2669 {
2670 void *fp;
2671 int ret, tid, beg, end, i, cr, ci;
2672 hts_reglist_t *found_reg;
2673
2674 if (iter == NULL || iter->finished) return -1;
2675
2676 if (iter->is_cram) {
2677 fp = fd->fp.cram;
2678 } else {
2679 fp = fd->fp.bgzf;
2680 }
2681
2682 if (iter->read_rest) {
2683 if (iter->curr_off) { // seek to the start
2684 if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
2685 return -1;
2686 }
2687 iter->curr_off = 0; // only seek once
2688 }
2689
2690 ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
2691 if (ret < 0) {
2692 iter->finished = 1;
2693 }
2694
2695 iter->curr_tid = tid;
2696 iter->curr_beg = beg;
2697 iter->curr_end = end;
2698
2699 return ret;
2700 }
2701 // A NULL iter->off should always be accompanied by iter->finished.
2702 assert(iter->off != NULL || iter->nocoor != 0);
2703
2704 for (;;) {
2705 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2706 if (iter->i == iter->n_off - 1) { // no more chunks, except NOCOORs
2707 if (iter->nocoor) {
2708 iter->read_rest = 1;
2709 iter->curr_off = iter->nocoor_off;
2710
2711 return hts_itr_multi_next(fd, iter, r);
2712 } else {
2713 ret = -1; break;
2714 }
2715 }
2716
2717 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2718 if (iter->seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
2719 return -1;
2720 }
2721
2722 iter->curr_off = iter->tell(fp);
2723 }
2724 ++iter->i;
2725 }
2726
2727 ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
2728 if (ret < 0)
2729 break;
2730
2731 iter->curr_off = iter->tell(fp);
2732 if (tid != iter->curr_tid) {
2733 hts_reglist_t key;
2734 key.tid = tid;
2735
2736 found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list, iter->n_reg, sizeof(hts_reglist_t), compare_regions);
2737 if (!found_reg)
2738 continue;
2739
2740 iter->curr_reg = (found_reg - iter->reg_list);
2741 iter->curr_tid = tid;
2742 iter->curr_intv = 0;
2743 }
2744
2745 cr = iter->curr_reg;
2746 ci = iter->curr_intv;
2747
2748 if (beg > iter->off[iter->i].max) {
2749 iter->curr_off = iter->off[iter->i].v;
2750 continue;
2751 }
2752 if (beg > iter->reg_list[cr].max_end)
2753 continue;
2754
2755 for (i = ci; i < iter->reg_list[cr].count; i++) {
2756 if (end > iter->reg_list[cr].intervals[i].beg && iter->reg_list[cr].intervals[i].end > beg) {
2757 iter->curr_beg = beg;
2758 iter->curr_end = end;
2759 iter->curr_intv = i;
2760
2761 return ret;
2762 }
2763 }
2764 }
2765 iter->finished = 1;
2766
2767 return ret;
2768 }
2769
2770 /**********************
2771 *** Retrieve index ***
2772 **********************/
2773 // Returns -1 if index couldn't be opened.
2774 // -2 on other errors
test_and_fetch(const char * fn,const char ** local_fn)2775 static int test_and_fetch(const char *fn, const char **local_fn)
2776 {
2777 hFILE *remote_hfp;
2778 FILE *local_fp = NULL;
2779 uint8_t *buf = NULL;
2780 int save_errno;
2781
2782 if (hisremote(fn)) {
2783 const int buf_size = 1 * 1024 * 1024;
2784 int l;
2785 const char *p;
2786 for (p = fn + strlen(fn) - 1; p >= fn; --p)
2787 if (*p == '/') break;
2788 ++p; // p now points to the local file name
2789 // Attempt to open local file first
2790 if ((local_fp = fopen((char*)p, "rb")) != 0)
2791 {
2792 fclose(local_fp);
2793 *local_fn = p;
2794 return 0;
2795 }
2796 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2797 if ((remote_hfp = hopen(fn, "r")) == 0) return -1;
2798 if ((local_fp = fopen(p, "w")) == 0) {
2799 hts_log_error("Failed to create file %s in the working directory", p);
2800 goto fail;
2801 }
2802 hts_log_info("Downloading file %s to local directory", fn);
2803 buf = (uint8_t*)calloc(buf_size, 1);
2804 if (!buf) {
2805 hts_log_error("%s", strerror(errno));
2806 goto fail;
2807 }
2808 while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
2809 if (fwrite(buf, 1, l, local_fp) != l) {
2810 hts_log_error("Failed to write data to %s : %s",
2811 fn, strerror(errno));
2812 goto fail;
2813 }
2814 }
2815 free(buf);
2816 if (fclose(local_fp) < 0) {
2817 hts_log_error("Error closing %s : %s", fn, strerror(errno));
2818 local_fp = NULL;
2819 goto fail;
2820 }
2821 if (hclose(remote_hfp) != 0) {
2822 hts_log_error("Failed to close remote file %s", fn);
2823 }
2824 *local_fn = p;
2825 return 0;
2826 } else {
2827 hFILE *local_hfp;
2828 if ((local_hfp = hopen(fn, "r")) == 0) return -1;
2829 hclose_abruptly(local_hfp);
2830 *local_fn = fn;
2831 return 0;
2832 }
2833
2834 fail:
2835 save_errno = errno;
2836 hclose_abruptly(remote_hfp);
2837 if (local_fp) fclose(local_fp);
2838 free(buf);
2839 errno = save_errno;
2840 return -2;
2841 }
2842
hts_idx_getfn(const char * fn,const char * ext)2843 char *hts_idx_getfn(const char *fn, const char *ext)
2844 {
2845 int i, l_fn, l_ext, ret;
2846 char *fnidx;
2847 const char *local_fn = NULL;
2848 l_fn = strlen(fn); l_ext = strlen(ext);
2849 fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
2850 if (!fnidx) return NULL;
2851 // First try : append `ext` to `fn`
2852 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
2853 if ((ret = test_and_fetch(fnidx, &local_fn)) == -1) {
2854 // Second try : replace suffix of `fn` with `ext`
2855 for (i = l_fn - 1; i > 0; --i)
2856 if (fnidx[i] == '.' || fnidx[i] == '/') break;
2857 if (fnidx[i] == '.') {
2858 strcpy(fnidx + i, ext);
2859 ret = test_and_fetch(fnidx, &local_fn);
2860 }
2861 }
2862 if (ret < 0) {
2863 free(fnidx);
2864 return NULL;
2865 }
2866 l_fn = strlen(local_fn);
2867 memmove(fnidx, local_fn, l_fn + 1);
2868 return fnidx;
2869 }
2870
hts_idx_load(const char * fn,int fmt)2871 hts_idx_t *hts_idx_load(const char *fn, int fmt)
2872 {
2873 char *fnidx;
2874 hts_idx_t *idx;
2875 fnidx = hts_idx_getfn(fn, ".csi");
2876 if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
2877 if (fnidx == 0) return 0;
2878
2879 idx = hts_idx_load2(fn, fnidx);
2880 free(fnidx);
2881 return idx;
2882 }
2883
hts_idx_load2(const char * fn,const char * fnidx)2884 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
2885 {
2886 // Check that the index file is up to date, the main file might have changed
2887 struct stat stat_idx,stat_main;
2888 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
2889 {
2890 if ( stat_idx.st_mtime < stat_main.st_mtime )
2891 hts_log_warning("The index file is older than the data file: %s", fnidx);
2892 }
2893
2894 return hts_idx_load_local(fnidx);
2895 }
2896
2897
2898
2899 /**********************
2900 *** Memory ***
2901 **********************/
2902
2903 /* For use with hts_expand macros *only* */
hts_realloc_or_die(size_t n,size_t m,size_t m_sz,size_t size,int clear,void ** ptr,const char * func)2904 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
2905 int clear, void **ptr, const char *func) {
2906 /* If new_m and size are both below this limit, multiplying them
2907 together can't overflow */
2908 const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
2909 void *new_ptr;
2910 size_t bytes, new_m;
2911
2912 new_m = n;
2913 kroundup_size_t(new_m);
2914
2915 bytes = size * new_m;
2916
2917 /* Check for overflow. Both ensure that new_m will fit in m (we make the
2918 pessimistic assumption that m is signed), and that bytes has not
2919 wrapped around. */
2920 if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
2921 || ((size > safe || new_m > safe)
2922 && bytes / new_m != size)) {
2923 errno = ENOMEM;
2924 goto die;
2925 }
2926
2927 new_ptr = realloc(*ptr, bytes);
2928 if (new_ptr == NULL) goto die;
2929
2930 if (clear) {
2931 if (new_m > m) {
2932 memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
2933 }
2934 }
2935
2936 *ptr = new_ptr;
2937
2938 return new_m;
2939
2940 die:
2941 hts_log_error("%s", strerror(errno));
2942 exit(1);
2943 }
2944
hts_set_log_level(enum htsLogLevel level)2945 void hts_set_log_level(enum htsLogLevel level)
2946 {
2947 hts_verbose = level;
2948 }
2949
hts_get_log_level()2950 enum htsLogLevel hts_get_log_level()
2951 {
2952 return hts_verbose;
2953 }
2954
get_severity_tag(enum htsLogLevel severity)2955 static char get_severity_tag(enum htsLogLevel severity)
2956 {
2957 switch (severity) {
2958 case HTS_LOG_ERROR:
2959 return 'E';
2960 case HTS_LOG_WARNING:
2961 return 'W';
2962 case HTS_LOG_INFO:
2963 return 'I';
2964 case HTS_LOG_DEBUG:
2965 return 'D';
2966 case HTS_LOG_TRACE:
2967 return 'T';
2968 default:
2969 break;
2970 }
2971
2972 return '*';
2973 }
2974
hts_log(enum htsLogLevel severity,const char * context,const char * format,...)2975 void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
2976 {
2977 int save_errno = errno;
2978 if (severity <= hts_verbose) {
2979 va_list argptr;
2980
2981 fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
2982
2983 va_start(argptr, format);
2984 vfprintf(stderr, format, argptr);
2985 va_end(argptr);
2986
2987 fprintf(stderr, "\n");
2988 }
2989 errno = save_errno;
2990 }
2991