1 /*  vcfquery.c -- Extracts fields from VCF/BCF file.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <getopt.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <errno.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 #include <htslib/vcf.h>
34 #include <htslib/synced_bcf_reader.h>
35 #include <htslib/khash_str2int.h>
36 #include <htslib/vcfutils.h>
37 #include "bcftools.h"
38 #include "filter.h"
39 #include "convert.h"
40 
41 
42 // Logic of the filters: include or exclude sites which match the filters?
43 #define FLT_INCLUDE 1
44 #define FLT_EXCLUDE 2
45 
46 typedef struct
47 {
48     filter_t *filter;
49     char *filter_str;
50     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
51     uint8_t *smpl_pass;
52     convert_t *convert;
53     bcf_srs_t *files;
54     bcf_hdr_t *header;
55     int nsamples, *samples, sample_is_file;
56     char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
57     int argc, list_columns, print_header, allow_undef_tags;
58     FILE *out;
59 }
60 args_t;
61 
destroy_list(char ** list,int n)62 static void destroy_list(char **list, int n)
63 {
64     int i;
65     for (i=0; i<n; i++)
66         free(list[i]);
67     free(list);
68 }
69 
init_data(args_t * args)70 static void init_data(args_t *args)
71 {
72     args->header = args->files->readers[0].header;
73 
74     int i, nsamples = 0, *samples = NULL;
75     if ( args->sample_list && strcmp("-",args->sample_list) )
76     {
77         for (i=0; i<args->files->nreaders; i++)
78         {
79             int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
80             if ( ret<0 ) error("Error parsing the sample list\n");
81             else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
82         }
83 
84         if ( args->sample_list[0]!='^' )
85         {
86             // the sample ordering may be different if not negated
87             int n;
88             char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
89             if ( !smpls ) error("Could not parse %s\n", args->sample_list);
90             if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
91                 error("The number of samples does not match, perhaps some are present multiple times?\n");
92             nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
93             samples = (int*) malloc(sizeof(int)*nsamples);
94             for (i=0; i<n; i++)
95             {
96                 samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
97                 free(smpls[i]);
98             }
99             free(smpls);
100         }
101     }
102     args->convert = convert_init(args->header, samples, nsamples, args->format_str);
103     convert_set_option(args->convert, subset_samples, &args->smpl_pass);
104     if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
105     free(samples);
106 
107     int max_unpack = convert_max_unpack(args->convert);
108     if ( args->filter_str )
109     {
110         args->filter = filter_init(args->header, args->filter_str);
111         max_unpack |= filter_max_unpack(args->filter);
112     }
113     args->files->max_unpack = max_unpack;
114 }
115 
destroy_data(args_t * args)116 static void destroy_data(args_t *args)
117 {
118     convert_destroy(args->convert);
119     if ( args->filter )
120         filter_destroy(args->filter);
121     free(args->samples);
122 }
123 
query_vcf(args_t * args)124 static void query_vcf(args_t *args)
125 {
126     kstring_t str = {0,0,0};
127 
128     if ( args->print_header )
129     {
130         convert_header(args->convert,&str);
131         if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output");
132     }
133 
134     int i,max_convert_unpack = convert_max_unpack(args->convert);
135     while ( bcf_sr_next_line(args->files) )
136     {
137         if ( !bcf_sr_has_line(args->files,0) ) continue;
138         bcf1_t *line = args->files->readers[0].buffer[0];
139         bcf_unpack(line, args->files->max_unpack);
140 
141         if ( args->filter )
142         {
143             int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
144             if ( args->filter_logic & FLT_EXCLUDE )
145             {
146                 // This code addresses this problem:
147                 //  -i can include a site but exclude a sample
148                 //  -e exclude a site but include a sample
149 
150                 if ( pass )
151                 {
152                     if ( !args->smpl_pass ) continue;
153                     if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
154 
155                     pass = 0;
156                     for (i=0; i<line->n_sample; i++)
157                     {
158                         if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
159                         else { args->smpl_pass[i] = 1; pass = 1; }
160                     }
161                     if ( !pass ) continue;
162                 }
163                 else if ( args->smpl_pass )
164                     for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
165             }
166             else if ( !pass ) continue;
167         }
168 
169         str.l = 0;
170         convert_line(args->convert, line, &str);
171         if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output");
172     }
173     if ( str.m ) free(str.s);
174 }
175 
list_columns(args_t * args)176 static void list_columns(args_t *args)
177 {
178     void *has_sample = NULL;
179     if ( args->sample_list )
180     {
181         has_sample = khash_str2int_init();
182         int i, nsmpl;
183         char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
184         for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
185         free(smpl);
186     }
187 
188     int i;
189     bcf_sr_t *reader = &args->files->readers[0];
190     for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
191     {
192         if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
193         printf("%s\n", reader->header->samples[i]);
194     }
195 
196     if ( has_sample )
197         khash_str2int_destroy_free(has_sample);
198 }
199 
copy_header(bcf_hdr_t * hdr,char ** src,int nsrc)200 static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
201 {
202     char **dst = (char**) malloc(sizeof(char*)*nsrc);
203     int i;
204     for (i=0; i<nsrc; i++) dst[i] = strdup(src[i]);
205     return dst;
206 }
compare_header(bcf_hdr_t * hdr,char ** a,int na,char ** b,int nb)207 static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb)
208 {
209     if ( na!=nb ) return na-nb;
210     int i;
211     for (i=0; i<na; i++)
212         if ( strcmp(a[i],b[i]) ) return 1;
213     return 0;
214 }
215 
216 
usage(void)217 static void usage(void)
218 {
219     fprintf(stderr, "\n");
220     fprintf(stderr, "About:   Extracts fields from VCF/BCF file and prints them in user-defined format\n");
221     fprintf(stderr, "Usage:   bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
222     fprintf(stderr, "\n");
223     fprintf(stderr, "Options:\n");
224     fprintf(stderr, "    -e, --exclude EXPR                Exclude sites for which the expression is true (see man page for details)\n");
225     fprintf(stderr, "    -f, --format STRING               See man page for details\n");
226     fprintf(stderr, "    -H, --print-header                Print header\n");
227     fprintf(stderr, "    -i, --include EXPR                Select sites for which the expression is true (see man page for details)\n");
228     fprintf(stderr, "    -l, --list-samples                Print the list of samples and exit\n");
229     fprintf(stderr, "    -o, --output FILE                 Output file name [stdout]\n");
230     fprintf(stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
231     fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
232     fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
233     fprintf(stderr, "    -s, --samples LIST                List of samples to include\n");
234     fprintf(stderr, "    -S, --samples-file FILE           File of samples to include\n");
235     fprintf(stderr, "    -t, --targets REGION              Similar to -r but streams rather than index-jumps\n");
236     fprintf(stderr, "    -T, --targets-file FILE           Similar to -R but streams rather than index-jumps\n");
237     fprintf(stderr, "        --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
238     fprintf(stderr, "    -u, --allow-undef-tags            Print \".\" for undefined tags\n");
239     fprintf(stderr, "    -v, --vcf-list FILE               Process multiple VCFs listed in the file\n");
240     fprintf(stderr, "\n");
241     fprintf(stderr, "Examples:\n");
242     fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
243     fprintf(stderr, "\n");
244     exit(1);
245 }
246 
main_vcfquery(int argc,char * argv[])247 int main_vcfquery(int argc, char *argv[])
248 {
249     int c, collapse = 0;
250     args_t *args = (args_t*) calloc(1,sizeof(args_t));
251     args->argc   = argc; args->argv = argv;
252     int regions_is_file = 0, targets_is_file = 0;
253     int regions_overlap = 1;
254     int targets_overlap = 0;
255 
256     static struct option loptions[] =
257     {
258         {"help",0,0,'h'},
259         {"list-samples",0,0,'l'},
260         {"include",1,0,'i'},
261         {"exclude",1,0,'e'},
262         {"format",1,0,'f'},
263         {"output-file",1,0,'o'},
264         {"output",1,0,'o'},
265         {"regions",1,0,'r'},
266         {"regions-file",1,0,'R'},
267         {"regions-overlap",required_argument,NULL,1},
268         {"targets",1,0,'t'},
269         {"targets-file",1,0,'T'},
270         {"targets-overlap",required_argument,NULL,2},
271         {"annots",1,0,'a'},
272         {"samples",1,0,'s'},
273         {"samples-file",1,0,'S'},
274         {"print-header",0,0,'H'},
275         {"collapse",1,0,'c'},
276         {"vcf-list",1,0,'v'},
277         {"allow-undef-tags",0,0,'u'},
278         {0,0,0,0}
279     };
280     while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
281         switch (c) {
282             case 'o': args->fn_out = optarg; break;
283             case 'f': args->format_str = strdup(optarg); break;
284             case 'H': args->print_header = 1; break;
285             case 'v': args->vcf_list = optarg; break;
286             case 'c':
287                 error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
288                 break;
289             case 'a':
290                 {
291                     kstring_t str = {0,0,0};
292                     kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str);
293                     char *p = optarg;
294                     while ( *p )
295                     {
296                         if ( *p==',' )
297                             kputs("\t%", &str);
298                         else
299                             kputc(*p, &str);
300                         p++;
301                     }
302                     kputc('\n', &str);
303                     args->format_str = str.s;
304                     break;
305                 }
306             case 'e':
307                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
308                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
309             case 'i':
310                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
311                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
312             case 'r': args->regions_list = optarg; break;
313             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
314             case 't': args->targets_list = optarg; break;
315             case 'T': args->targets_list = optarg; targets_is_file = 1; break;
316             case 'l': args->list_columns = 1; break;
317             case 'u': args->allow_undef_tags = 1; break;
318             case 's': args->sample_list = optarg; break;
319             case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
320             case  1 :
321                 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
322                 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
323                 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
324                 else error("Could not parse: --regions-overlap %s\n",optarg);
325                 break;
326             case  2 :
327                 if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
328                 else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
329                 else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
330                 else error("Could not parse: --targets-overlap %s\n",optarg);
331                 break;
332             case 'h':
333             case '?': usage(); break;
334             default: error("Unknown argument: %s\n", optarg);
335         }
336     }
337 
338     char *fname = NULL;
339     if ( optind>=argc )
340     {
341         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
342     }
343     else fname = argv[optind];
344 
345     if ( args->list_columns )
346     {
347         if ( !fname ) error("Missing the VCF file name\n");
348         args->files = bcf_sr_init();
349         if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
350         list_columns(args);
351         bcf_sr_destroy(args->files);
352         free(args);
353         return 0;
354     }
355 
356     if ( !args->format_str )
357     {
358         if ( argc==1 && !fname ) usage();
359         error("Error: Missing the --format option\n");
360     }
361     args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
362     if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
363 
364     if ( !args->vcf_list )
365     {
366         if ( !fname ) usage();
367         args->files = bcf_sr_init();
368         if ( optind+1 < argc ) args->files->require_index = 1;
369         if ( args->regions_list )
370         {
371             bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
372             if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
373                 error("Failed to read the regions: %s\n", args->regions_list);
374         }
375         if ( args->targets_list )
376         {
377             bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
378             if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
379                 error("Failed to read the targets: %s\n", args->targets_list);
380         }
381         while ( fname )
382         {
383             if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
384             fname = ++optind < argc ? argv[optind] : NULL;
385         }
386         init_data(args);
387         query_vcf(args);
388         free(args->format_str);
389         destroy_data(args);
390         bcf_sr_destroy(args->files);
391         if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);
392         free(args);
393         return 0;
394     }
395 
396     // multiple VCFs
397     int i, k, nfiles, prev_nsamples = 0;
398     char **fnames, **prev_samples = NULL;
399     fnames = hts_readlist(args->vcf_list, 1, &nfiles);
400     if ( !nfiles ) error("No files in %s?\n", args->vcf_list);
401     for (i=0; i<nfiles; i++)
402     {
403         args->files = bcf_sr_init();
404         args->files->collapse = collapse;
405         if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
406             error("Failed to read the regions: %s\n", args->regions_list);
407         if ( optind < argc ) args->files->require_index = 1;
408         if ( args->targets_list )
409         {
410             if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
411                 error("Failed to read the targets: %s\n", args->targets_list);
412         }
413         if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum));
414         for (k=optind; k<argc; k++)
415             if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum));
416         init_data(args);
417         if ( i==0 )
418         {
419             prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header));
420             prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
421         }
422         else
423         {
424             args->print_header = 0;
425             if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) )
426                 error("Different samples in %s and %s\n", fnames[i-1],fnames[i]);
427         }
428         query_vcf(args);
429         destroy_data(args);
430         bcf_sr_destroy(args->files);
431     }
432     if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);;
433     destroy_list(fnames, nfiles);
434     destroy_list(prev_samples, prev_nsamples);
435     free(args->format_str);
436     free(args);
437     return 0;
438 }
439 
440 
441