1 /*  vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
2 
3     Copyright (C) 2014-2021 Genome Research Ltd.
4 
5     Author: Shane McCarthy <sm15@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <unistd.h>
29 #include <getopt.h>
30 #include <htslib/vcf.h>
31 #include <htslib/tbx.h>
32 #include <sys/stat.h>
33 #define __STDC_FORMAT_MACROS
34 #include <inttypes.h>
35 #include <htslib/kstring.h>
36 #include <htslib/bgzf.h>
37 #include "bcftools.h"
38 
39 #define BCF_LIDX_SHIFT    14
40 
41 enum {
42     per_contig = 1,
43     total = 2
44 };
45 
usage(void)46 static void usage(void)
47 {
48     fprintf(stderr, "\n");
49     fprintf(stderr, "About:   Index bgzip compressed VCF/BCF files for random access.\n");
50     fprintf(stderr, "Usage:   bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
51     fprintf(stderr, "\n");
52     fprintf(stderr, "Indexing options:\n");
53     fprintf(stderr, "    -c, --csi                generate CSI-format index for VCF/BCF files [default]\n");
54     fprintf(stderr, "    -f, --force              overwrite index if it already exists\n");
55     fprintf(stderr, "    -m, --min-shift INT      set minimal interval size for CSI indices to 2^INT [14]\n");
56     fprintf(stderr, "    -o, --output FILE        optional output index file name\n");
57     fprintf(stderr, "    -t, --tbi                generate TBI-format index for VCF files\n");
58     fprintf(stderr, "        --threads INT        use multithreading with INT worker threads [0]\n");
59     fprintf(stderr, "\n");
60     fprintf(stderr, "Stats options:\n");
61     fprintf(stderr, "    -n, --nrecords       print number of records based on existing index file\n");
62     fprintf(stderr, "    -s, --stats          print per contig stats based on existing index file\n");
63     fprintf(stderr, "\n");
64     exit(1);
65 }
66 
vcf_index_stats(char * fname,int stats)67 int vcf_index_stats(char *fname, int stats)
68 {
69     const char **seq = NULL;
70     int tid, nseq = 0, ret = 0;
71     tbx_t *tbx = NULL;
72     bcf_hdr_t *hdr = NULL;
73     hts_idx_t *idx = NULL;
74     htsFile *fp = NULL;
75     uint64_t sum = 0;
76     char *fntemp = NULL, *fnidx = NULL;
77 
78     /*
79      * First, has the user provided an index file? If per contig stats
80      * are requested, open the variant file (together with the index file,
81      * if provided), since the contig names can only be retrieved from its
82      * header. Otherwise, use just the corresponding index file to count
83      * the total number of records.
84      */
85     int len = strlen(fname);
86     int idx_only = 0;
87     if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
88         fntemp = strdup(fname);
89         if ( !fntemp ) return 1;
90         fntemp[fnidx-fname] = 0;
91         fname = fntemp;
92         fnidx += strlen(HTS_IDX_DELIM);
93     }
94     else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
95     {
96         fnidx  = fname;
97         fntemp = strdup(fname);
98         fname  = fntemp;
99         fname[len-4] = 0;
100         idx_only = 1;
101     }
102 
103     if ( stats&per_contig )
104     {
105         if ( idx_only )
106         {
107             struct stat buf;
108             if ( stat(fname, &buf)==0 ) idx_only = 0;
109         }
110 
111         enum htsExactFormat fmt;
112         if ( !idx_only )
113         {
114             fp = hts_open(fname,"r");
115             if ( !fp ) {
116                 fprintf(stderr,"Could not read %s\n", fname);
117                 ret = 1; goto cleanup;
118             }
119             hdr = bcf_hdr_read(fp);
120             if ( !hdr ) {
121                 fprintf(stderr,"Could not read the header: %s\n", fname);
122                 ret = 1; goto cleanup;
123             }
124             fmt = hts_get_format(fp)->format;
125         }
126         else
127         {
128             int len = strlen(fnidx);
129             if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf;
130             else fmt = bcf;
131         }
132 
133         if ( fmt==vcf )
134         {
135             tbx = tbx_index_load2(fname, fnidx);
136             if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
137         }
138         else if ( fmt==bcf )
139         {
140             idx = bcf_index_load2(fname, fnidx);
141             if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
142         }
143         else
144         {
145             fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
146             return 1;
147         }
148     }
149     else if ( fnidx )
150     {
151         char *ext = strrchr(fnidx, '.');
152         if ( ext && strcmp(ext, ".tbi") == 0 ) {
153             tbx = tbx_index_load2(fname, fnidx);
154         } else if ( ext && strcmp(ext, ".csi") == 0 ) {
155             idx = bcf_index_load2(fname, fnidx);
156         }
157         if ( !tbx && !idx ) {
158             fprintf(stderr,"Could not load index file '%s'\n", fnidx);
159             ret = 1; goto cleanup;
160         }
161     } else {
162         char *ext = strrchr(fname, '.');
163         if ( ext && strcmp(ext, ".bcf") == 0 ) {
164             idx = bcf_index_load(fname);
165         } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
166             tbx = tbx_index_load(fname);
167         }
168     }
169 
170     if ( !tbx && !idx ) {
171         fprintf(stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
172         ret = 1; goto cleanup;
173     }
174 
175     if ( tbx ) {
176         seq = tbx_seqnames(tbx, &nseq);
177     } else {
178         nseq = hts_idx_nseq(idx);
179     }
180     if ( !tbx && !hdr ) fprintf(stderr,"Warning: cannot determine contig names given the .csi index alone\n");
181     for (tid=0; tid<nseq; tid++)
182     {
183         uint64_t records, v;
184         hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
185         sum += records;
186         if ( (stats&total) || !records ) continue;
187         const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
188         bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
189         int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
190         printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
191     }
192     if ( !sum )
193     {
194         // No counts found.
195         // Is this because index version has no stored count data, or no records?
196         bcf1_t *rec = bcf_init1();
197         if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
198             fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
199             ret = 1;
200         }
201         bcf_destroy1(rec);
202     }
203     if ( (stats&total) && !ret ) {
204         printf("%" PRIu64 "\n", sum);
205     }
206 
207 cleanup:
208     free(seq);
209     free(fntemp);
210     if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
211     bcf_hdr_destroy(hdr);
212     if (tbx)
213         tbx_destroy(tbx);
214     if (idx)
215         hts_idx_destroy(idx);
216     return ret;
217 }
218 
main_vcfindex(int argc,char * argv[])219 int main_vcfindex(int argc, char *argv[])
220 {
221     int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
222     int min_shift = BCF_LIDX_SHIFT;
223     char *outfn = NULL;
224 
225     static struct option loptions[] =
226     {
227         {"csi",no_argument,NULL,'c'},
228         {"tbi",no_argument,NULL,'t'},
229         {"force",no_argument,NULL,'f'},
230         {"min-shift",required_argument,NULL,'m'},
231         {"stats",no_argument,NULL,'s'},
232         {"nrecords",no_argument,NULL,'n'},
233         {"threads",required_argument,NULL,9},
234         {"output-file",required_argument,NULL,'o'},
235         {"output",required_argument,NULL,'o'},
236         {NULL, 0, NULL, 0}
237     };
238 
239     char *tmp;
240     while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
241     {
242         switch (c)
243         {
244             case 'c': tbi = 0; break;
245             case 't': tbi = 1; min_shift = 0; break;
246             case 'f': force = 1; break;
247             case 'm':
248                 min_shift = strtol(optarg,&tmp,10);
249                 if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
250                 break;
251             case 's': stats |= per_contig; break;
252             case 'n': stats |= total; break;
253             case 9:
254                 n_threads = strtol(optarg,&tmp,10);
255                 if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
256                 break;
257             case 'o': outfn = optarg; break;
258             default: usage();
259         }
260     }
261     if (stats > total)
262     {
263         fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
264         return 1;
265     }
266     if (tbi && min_shift>0)
267     {
268         fprintf(stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
269         return 1;
270     }
271     if (min_shift < 0 || min_shift > 30)
272     {
273         fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
274         return 1;
275     }
276 
277     char *fname = NULL;
278     if ( optind>=argc )
279     {
280         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
281         else usage();
282     }
283     else fname = argv[optind];
284     if (stats) return vcf_index_stats(fname, stats);
285 
286     kstring_t idx_fname = {0,0,0};
287     if (outfn)
288         kputs(outfn,&idx_fname);
289     else
290     {
291         if (!strcmp(fname, "-")) { fprintf(stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
292         ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
293     }
294     if (!force)
295     {
296         // Before complaining about existing index, check if the VCF file isn't newer.
297         struct stat stat_tbi, stat_file;
298         if ( stat(idx_fname.s, &stat_tbi)==0 )
299         {
300             stat(fname, &stat_file);
301             if ( stat_file.st_mtime <= stat_tbi.st_mtime )
302             {
303                 fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
304                 free(idx_fname.s);
305                 return 1;
306             }
307         }
308 
309         // check for truncated files, allow only with -f
310         BGZF *fp = bgzf_open(fname, "r");
311         if ( !fp ) error("index: failed to open %s\n", fname);
312         if ( bgzf_compression(fp)!=2 ) error("index: the file is not BGZF compressed, cannot index: %s\n", fname);
313         if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
314         if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
315     }
316 
317     int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
318     free(idx_fname.s);
319     if (ret != 0) {
320         if (ret == -2)
321             error("index: failed to open \"%s\"\n", fname);
322         else if (ret == -3)
323             error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
324         else
325             error("index: failed to create index for \"%s\"\n", fname);
326     }
327     return 0;
328 }
329