1 /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
2
3 Copyright (C) 2014-2021 Genome Research Ltd.
4
5 Author: Shane McCarthy <sm15@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <unistd.h>
29 #include <getopt.h>
30 #include <htslib/vcf.h>
31 #include <htslib/tbx.h>
32 #include <sys/stat.h>
33 #define __STDC_FORMAT_MACROS
34 #include <inttypes.h>
35 #include <htslib/kstring.h>
36 #include <htslib/bgzf.h>
37 #include "bcftools.h"
38
39 #define BCF_LIDX_SHIFT 14
40
41 enum {
42 per_contig = 1,
43 total = 2
44 };
45
usage(void)46 static void usage(void)
47 {
48 fprintf(stderr, "\n");
49 fprintf(stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n");
50 fprintf(stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
51 fprintf(stderr, "\n");
52 fprintf(stderr, "Indexing options:\n");
53 fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
54 fprintf(stderr, " -f, --force overwrite index if it already exists\n");
55 fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
56 fprintf(stderr, " -o, --output FILE optional output index file name\n");
57 fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
58 fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n");
59 fprintf(stderr, "\n");
60 fprintf(stderr, "Stats options:\n");
61 fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n");
62 fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
63 fprintf(stderr, "\n");
64 exit(1);
65 }
66
vcf_index_stats(char * fname,int stats)67 int vcf_index_stats(char *fname, int stats)
68 {
69 const char **seq = NULL;
70 int tid, nseq = 0, ret = 0;
71 tbx_t *tbx = NULL;
72 bcf_hdr_t *hdr = NULL;
73 hts_idx_t *idx = NULL;
74 htsFile *fp = NULL;
75 uint64_t sum = 0;
76 char *fntemp = NULL, *fnidx = NULL;
77
78 /*
79 * First, has the user provided an index file? If per contig stats
80 * are requested, open the variant file (together with the index file,
81 * if provided), since the contig names can only be retrieved from its
82 * header. Otherwise, use just the corresponding index file to count
83 * the total number of records.
84 */
85 int len = strlen(fname);
86 int idx_only = 0;
87 if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
88 fntemp = strdup(fname);
89 if ( !fntemp ) return 1;
90 fntemp[fnidx-fname] = 0;
91 fname = fntemp;
92 fnidx += strlen(HTS_IDX_DELIM);
93 }
94 else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
95 {
96 fnidx = fname;
97 fntemp = strdup(fname);
98 fname = fntemp;
99 fname[len-4] = 0;
100 idx_only = 1;
101 }
102
103 if ( stats&per_contig )
104 {
105 if ( idx_only )
106 {
107 struct stat buf;
108 if ( stat(fname, &buf)==0 ) idx_only = 0;
109 }
110
111 enum htsExactFormat fmt;
112 if ( !idx_only )
113 {
114 fp = hts_open(fname,"r");
115 if ( !fp ) {
116 fprintf(stderr,"Could not read %s\n", fname);
117 ret = 1; goto cleanup;
118 }
119 hdr = bcf_hdr_read(fp);
120 if ( !hdr ) {
121 fprintf(stderr,"Could not read the header: %s\n", fname);
122 ret = 1; goto cleanup;
123 }
124 fmt = hts_get_format(fp)->format;
125 }
126 else
127 {
128 int len = strlen(fnidx);
129 if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf;
130 else fmt = bcf;
131 }
132
133 if ( fmt==vcf )
134 {
135 tbx = tbx_index_load2(fname, fnidx);
136 if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
137 }
138 else if ( fmt==bcf )
139 {
140 idx = bcf_index_load2(fname, fnidx);
141 if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
142 }
143 else
144 {
145 fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
146 return 1;
147 }
148 }
149 else if ( fnidx )
150 {
151 char *ext = strrchr(fnidx, '.');
152 if ( ext && strcmp(ext, ".tbi") == 0 ) {
153 tbx = tbx_index_load2(fname, fnidx);
154 } else if ( ext && strcmp(ext, ".csi") == 0 ) {
155 idx = bcf_index_load2(fname, fnidx);
156 }
157 if ( !tbx && !idx ) {
158 fprintf(stderr,"Could not load index file '%s'\n", fnidx);
159 ret = 1; goto cleanup;
160 }
161 } else {
162 char *ext = strrchr(fname, '.');
163 if ( ext && strcmp(ext, ".bcf") == 0 ) {
164 idx = bcf_index_load(fname);
165 } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
166 tbx = tbx_index_load(fname);
167 }
168 }
169
170 if ( !tbx && !idx ) {
171 fprintf(stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
172 ret = 1; goto cleanup;
173 }
174
175 if ( tbx ) {
176 seq = tbx_seqnames(tbx, &nseq);
177 } else {
178 nseq = hts_idx_nseq(idx);
179 }
180 if ( !tbx && !hdr ) fprintf(stderr,"Warning: cannot determine contig names given the .csi index alone\n");
181 for (tid=0; tid<nseq; tid++)
182 {
183 uint64_t records, v;
184 hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
185 sum += records;
186 if ( (stats&total) || !records ) continue;
187 const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
188 bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
189 int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
190 printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
191 }
192 if ( !sum )
193 {
194 // No counts found.
195 // Is this because index version has no stored count data, or no records?
196 bcf1_t *rec = bcf_init1();
197 if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
198 fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
199 ret = 1;
200 }
201 bcf_destroy1(rec);
202 }
203 if ( (stats&total) && !ret ) {
204 printf("%" PRIu64 "\n", sum);
205 }
206
207 cleanup:
208 free(seq);
209 free(fntemp);
210 if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
211 bcf_hdr_destroy(hdr);
212 if (tbx)
213 tbx_destroy(tbx);
214 if (idx)
215 hts_idx_destroy(idx);
216 return ret;
217 }
218
main_vcfindex(int argc,char * argv[])219 int main_vcfindex(int argc, char *argv[])
220 {
221 int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
222 int min_shift = BCF_LIDX_SHIFT;
223 char *outfn = NULL;
224
225 static struct option loptions[] =
226 {
227 {"csi",no_argument,NULL,'c'},
228 {"tbi",no_argument,NULL,'t'},
229 {"force",no_argument,NULL,'f'},
230 {"min-shift",required_argument,NULL,'m'},
231 {"stats",no_argument,NULL,'s'},
232 {"nrecords",no_argument,NULL,'n'},
233 {"threads",required_argument,NULL,9},
234 {"output-file",required_argument,NULL,'o'},
235 {"output",required_argument,NULL,'o'},
236 {NULL, 0, NULL, 0}
237 };
238
239 char *tmp;
240 while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
241 {
242 switch (c)
243 {
244 case 'c': tbi = 0; break;
245 case 't': tbi = 1; min_shift = 0; break;
246 case 'f': force = 1; break;
247 case 'm':
248 min_shift = strtol(optarg,&tmp,10);
249 if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
250 break;
251 case 's': stats |= per_contig; break;
252 case 'n': stats |= total; break;
253 case 9:
254 n_threads = strtol(optarg,&tmp,10);
255 if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
256 break;
257 case 'o': outfn = optarg; break;
258 default: usage();
259 }
260 }
261 if (stats > total)
262 {
263 fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
264 return 1;
265 }
266 if (tbi && min_shift>0)
267 {
268 fprintf(stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
269 return 1;
270 }
271 if (min_shift < 0 || min_shift > 30)
272 {
273 fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
274 return 1;
275 }
276
277 char *fname = NULL;
278 if ( optind>=argc )
279 {
280 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
281 else usage();
282 }
283 else fname = argv[optind];
284 if (stats) return vcf_index_stats(fname, stats);
285
286 kstring_t idx_fname = {0,0,0};
287 if (outfn)
288 kputs(outfn,&idx_fname);
289 else
290 {
291 if (!strcmp(fname, "-")) { fprintf(stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
292 ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
293 }
294 if (!force)
295 {
296 // Before complaining about existing index, check if the VCF file isn't newer.
297 struct stat stat_tbi, stat_file;
298 if ( stat(idx_fname.s, &stat_tbi)==0 )
299 {
300 stat(fname, &stat_file);
301 if ( stat_file.st_mtime <= stat_tbi.st_mtime )
302 {
303 fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
304 free(idx_fname.s);
305 return 1;
306 }
307 }
308
309 // check for truncated files, allow only with -f
310 BGZF *fp = bgzf_open(fname, "r");
311 if ( !fp ) error("index: failed to open %s\n", fname);
312 if ( bgzf_compression(fp)!=2 ) error("index: the file is not BGZF compressed, cannot index: %s\n", fname);
313 if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
314 if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
315 }
316
317 int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
318 free(idx_fname.s);
319 if (ret != 0) {
320 if (ret == -2)
321 error("index: failed to open \"%s\"\n", fname);
322 else if (ret == -3)
323 error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
324 else
325 error("index: failed to create index for \"%s\"\n", fname);
326 }
327 return 0;
328 }
329