1 /* vcfquery.c -- Extracts fields from VCF/BCF file.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <getopt.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <errno.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 #include <htslib/vcf.h>
34 #include <htslib/synced_bcf_reader.h>
35 #include <htslib/khash_str2int.h>
36 #include <htslib/vcfutils.h>
37 #include "bcftools.h"
38 #include "filter.h"
39 #include "convert.h"
40
41
42 // Logic of the filters: include or exclude sites which match the filters?
43 #define FLT_INCLUDE 1
44 #define FLT_EXCLUDE 2
45
46 typedef struct
47 {
48 filter_t *filter;
49 char *filter_str;
50 int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
51 uint8_t *smpl_pass;
52 convert_t *convert;
53 bcf_srs_t *files;
54 bcf_hdr_t *header;
55 int nsamples, *samples, sample_is_file;
56 char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
57 int argc, list_columns, print_header, allow_undef_tags;
58 FILE *out;
59 }
60 args_t;
61
destroy_list(char ** list,int n)62 static void destroy_list(char **list, int n)
63 {
64 int i;
65 for (i=0; i<n; i++)
66 free(list[i]);
67 free(list);
68 }
69
init_data(args_t * args)70 static void init_data(args_t *args)
71 {
72 args->header = args->files->readers[0].header;
73
74 int i, nsamples = 0, *samples = NULL;
75 if ( args->sample_list && strcmp("-",args->sample_list) )
76 {
77 for (i=0; i<args->files->nreaders; i++)
78 {
79 int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
80 if ( ret<0 ) error("Error parsing the sample list\n");
81 else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
82 }
83
84 if ( args->sample_list[0]!='^' )
85 {
86 // the sample ordering may be different if not negated
87 int n;
88 char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
89 if ( !smpls ) error("Could not parse %s\n", args->sample_list);
90 if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
91 error("The number of samples does not match, perhaps some are present multiple times?\n");
92 nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
93 samples = (int*) malloc(sizeof(int)*nsamples);
94 for (i=0; i<n; i++)
95 {
96 samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
97 free(smpls[i]);
98 }
99 free(smpls);
100 }
101 }
102 args->convert = convert_init(args->header, samples, nsamples, args->format_str);
103 convert_set_option(args->convert, subset_samples, &args->smpl_pass);
104 if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
105 free(samples);
106
107 int max_unpack = convert_max_unpack(args->convert);
108 if ( args->filter_str )
109 {
110 args->filter = filter_init(args->header, args->filter_str);
111 max_unpack |= filter_max_unpack(args->filter);
112 }
113 args->files->max_unpack = max_unpack;
114 }
115
destroy_data(args_t * args)116 static void destroy_data(args_t *args)
117 {
118 convert_destroy(args->convert);
119 if ( args->filter )
120 filter_destroy(args->filter);
121 free(args->samples);
122 }
123
query_vcf(args_t * args)124 static void query_vcf(args_t *args)
125 {
126 kstring_t str = {0,0,0};
127
128 if ( args->print_header )
129 {
130 convert_header(args->convert,&str);
131 if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output");
132 }
133
134 int i,max_convert_unpack = convert_max_unpack(args->convert);
135 while ( bcf_sr_next_line(args->files) )
136 {
137 if ( !bcf_sr_has_line(args->files,0) ) continue;
138 bcf1_t *line = args->files->readers[0].buffer[0];
139 bcf_unpack(line, args->files->max_unpack);
140
141 if ( args->filter )
142 {
143 int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
144 if ( args->filter_logic & FLT_EXCLUDE )
145 {
146 // This code addresses this problem:
147 // -i can include a site but exclude a sample
148 // -e exclude a site but include a sample
149
150 if ( pass )
151 {
152 if ( !args->smpl_pass ) continue;
153 if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
154
155 pass = 0;
156 for (i=0; i<line->n_sample; i++)
157 {
158 if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
159 else { args->smpl_pass[i] = 1; pass = 1; }
160 }
161 if ( !pass ) continue;
162 }
163 else if ( args->smpl_pass )
164 for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
165 }
166 else if ( !pass ) continue;
167 }
168
169 str.l = 0;
170 convert_line(args->convert, line, &str);
171 if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output");
172 }
173 if ( str.m ) free(str.s);
174 }
175
list_columns(args_t * args)176 static void list_columns(args_t *args)
177 {
178 void *has_sample = NULL;
179 if ( args->sample_list )
180 {
181 has_sample = khash_str2int_init();
182 int i, nsmpl;
183 char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
184 for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
185 free(smpl);
186 }
187
188 int i;
189 bcf_sr_t *reader = &args->files->readers[0];
190 for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
191 {
192 if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
193 printf("%s\n", reader->header->samples[i]);
194 }
195
196 if ( has_sample )
197 khash_str2int_destroy_free(has_sample);
198 }
199
copy_header(bcf_hdr_t * hdr,char ** src,int nsrc)200 static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
201 {
202 char **dst = (char**) malloc(sizeof(char*)*nsrc);
203 int i;
204 for (i=0; i<nsrc; i++) dst[i] = strdup(src[i]);
205 return dst;
206 }
compare_header(bcf_hdr_t * hdr,char ** a,int na,char ** b,int nb)207 static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb)
208 {
209 if ( na!=nb ) return na-nb;
210 int i;
211 for (i=0; i<na; i++)
212 if ( strcmp(a[i],b[i]) ) return 1;
213 return 0;
214 }
215
216
usage(void)217 static void usage(void)
218 {
219 fprintf(stderr, "\n");
220 fprintf(stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n");
221 fprintf(stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
222 fprintf(stderr, "\n");
223 fprintf(stderr, "Options:\n");
224 fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
225 fprintf(stderr, " -f, --format STRING See man page for details\n");
226 fprintf(stderr, " -H, --print-header Print header\n");
227 fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
228 fprintf(stderr, " -l, --list-samples Print the list of samples and exit\n");
229 fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
230 fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
231 fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
232 fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
233 fprintf(stderr, " -s, --samples LIST List of samples to include\n");
234 fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
235 fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
236 fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
237 fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
238 fprintf(stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n");
239 fprintf(stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n");
240 fprintf(stderr, "\n");
241 fprintf(stderr, "Examples:\n");
242 fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
243 fprintf(stderr, "\n");
244 exit(1);
245 }
246
main_vcfquery(int argc,char * argv[])247 int main_vcfquery(int argc, char *argv[])
248 {
249 int c, collapse = 0;
250 args_t *args = (args_t*) calloc(1,sizeof(args_t));
251 args->argc = argc; args->argv = argv;
252 int regions_is_file = 0, targets_is_file = 0;
253 int regions_overlap = 1;
254 int targets_overlap = 0;
255
256 static struct option loptions[] =
257 {
258 {"help",0,0,'h'},
259 {"list-samples",0,0,'l'},
260 {"include",1,0,'i'},
261 {"exclude",1,0,'e'},
262 {"format",1,0,'f'},
263 {"output-file",1,0,'o'},
264 {"output",1,0,'o'},
265 {"regions",1,0,'r'},
266 {"regions-file",1,0,'R'},
267 {"regions-overlap",required_argument,NULL,1},
268 {"targets",1,0,'t'},
269 {"targets-file",1,0,'T'},
270 {"targets-overlap",required_argument,NULL,2},
271 {"annots",1,0,'a'},
272 {"samples",1,0,'s'},
273 {"samples-file",1,0,'S'},
274 {"print-header",0,0,'H'},
275 {"collapse",1,0,'c'},
276 {"vcf-list",1,0,'v'},
277 {"allow-undef-tags",0,0,'u'},
278 {0,0,0,0}
279 };
280 while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
281 switch (c) {
282 case 'o': args->fn_out = optarg; break;
283 case 'f': args->format_str = strdup(optarg); break;
284 case 'H': args->print_header = 1; break;
285 case 'v': args->vcf_list = optarg; break;
286 case 'c':
287 error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
288 break;
289 case 'a':
290 {
291 kstring_t str = {0,0,0};
292 kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str);
293 char *p = optarg;
294 while ( *p )
295 {
296 if ( *p==',' )
297 kputs("\t%", &str);
298 else
299 kputc(*p, &str);
300 p++;
301 }
302 kputc('\n', &str);
303 args->format_str = str.s;
304 break;
305 }
306 case 'e':
307 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
308 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
309 case 'i':
310 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
311 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
312 case 'r': args->regions_list = optarg; break;
313 case 'R': args->regions_list = optarg; regions_is_file = 1; break;
314 case 't': args->targets_list = optarg; break;
315 case 'T': args->targets_list = optarg; targets_is_file = 1; break;
316 case 'l': args->list_columns = 1; break;
317 case 'u': args->allow_undef_tags = 1; break;
318 case 's': args->sample_list = optarg; break;
319 case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
320 case 1 :
321 if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
322 else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
323 else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
324 else error("Could not parse: --regions-overlap %s\n",optarg);
325 break;
326 case 2 :
327 if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
328 else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
329 else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
330 else error("Could not parse: --targets-overlap %s\n",optarg);
331 break;
332 case 'h':
333 case '?': usage(); break;
334 default: error("Unknown argument: %s\n", optarg);
335 }
336 }
337
338 char *fname = NULL;
339 if ( optind>=argc )
340 {
341 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
342 }
343 else fname = argv[optind];
344
345 if ( args->list_columns )
346 {
347 if ( !fname ) error("Missing the VCF file name\n");
348 args->files = bcf_sr_init();
349 if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
350 list_columns(args);
351 bcf_sr_destroy(args->files);
352 free(args);
353 return 0;
354 }
355
356 if ( !args->format_str )
357 {
358 if ( argc==1 && !fname ) usage();
359 error("Error: Missing the --format option\n");
360 }
361 args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
362 if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
363
364 if ( !args->vcf_list )
365 {
366 if ( !fname ) usage();
367 args->files = bcf_sr_init();
368 if ( optind+1 < argc ) args->files->require_index = 1;
369 if ( args->regions_list )
370 {
371 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
372 if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
373 error("Failed to read the regions: %s\n", args->regions_list);
374 }
375 if ( args->targets_list )
376 {
377 bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
378 if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
379 error("Failed to read the targets: %s\n", args->targets_list);
380 }
381 while ( fname )
382 {
383 if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
384 fname = ++optind < argc ? argv[optind] : NULL;
385 }
386 init_data(args);
387 query_vcf(args);
388 free(args->format_str);
389 destroy_data(args);
390 bcf_sr_destroy(args->files);
391 if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);
392 free(args);
393 return 0;
394 }
395
396 // multiple VCFs
397 int i, k, nfiles, prev_nsamples = 0;
398 char **fnames, **prev_samples = NULL;
399 fnames = hts_readlist(args->vcf_list, 1, &nfiles);
400 if ( !nfiles ) error("No files in %s?\n", args->vcf_list);
401 for (i=0; i<nfiles; i++)
402 {
403 args->files = bcf_sr_init();
404 args->files->collapse = collapse;
405 if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
406 error("Failed to read the regions: %s\n", args->regions_list);
407 if ( optind < argc ) args->files->require_index = 1;
408 if ( args->targets_list )
409 {
410 if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
411 error("Failed to read the targets: %s\n", args->targets_list);
412 }
413 if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum));
414 for (k=optind; k<argc; k++)
415 if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum));
416 init_data(args);
417 if ( i==0 )
418 {
419 prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header));
420 prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
421 }
422 else
423 {
424 args->print_header = 0;
425 if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) )
426 error("Different samples in %s and %s\n", fnames[i-1],fnames[i]);
427 }
428 query_vcf(args);
429 destroy_data(args);
430 bcf_sr_destroy(args->files);
431 }
432 if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);;
433 destroy_list(fnames, nfiles);
434 destroy_list(prev_samples, prev_nsamples);
435 free(args->format_str);
436 free(args);
437 return 0;
438 }
439
440
441