1 /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Shane McCarthy <sm15@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <strings.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <ctype.h>
30 #include <string.h>
31 #include <errno.h>
32 #include <sys/stat.h>
33 #include <sys/types.h>
34 #include <math.h>
35 #include <inttypes.h>
36 #include <htslib/vcf.h>
37 #include <htslib/synced_bcf_reader.h>
38 #include <htslib/vcfutils.h>
39 #include "bcftools.h"
40 #include "filter.h"
41 #include "htslib/khash_str2int.h"
42
43 #define FLT_INCLUDE 1
44 #define FLT_EXCLUDE 2
45
46 #define ALLELE_NONREF 1
47 #define ALLELE_MINOR 2
48 #define ALLELE_ALT1 3
49 #define ALLELE_MAJOR 4
50 #define ALLELE_NONMAJOR 5
51
52 #define GT_NEED_HOM 1
53 #define GT_NEED_HET 2
54 #define GT_NO_HOM 3
55 #define GT_NO_HET 4
56 #define GT_NEED_MISSING 5
57 #define GT_NO_MISSING 6
58
59 typedef struct _args_t
60 {
61 filter_t *filter;
62 char *filter_str;
63 int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
64
65 bcf_srs_t *files;
66 bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
67 char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
68 int regions_overlap, targets_overlap;
69 int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
70 int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
71 int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
72 int *ac, mac;
73 float min_af, max_af;
74 char *fn_ref, *fn_out, **samples;
75 int sample_is_file, force_samples;
76 char *include_types, *exclude_types;
77 int include, exclude;
78 int record_cmd_line;
79 htsFile *out;
80 }
81 args_t;
82
init_data(args_t * args)83 static void init_data(args_t *args)
84 {
85 int i;
86 args->hdr = args->files->readers[0].header;
87
88 if (args->calc_ac && args->update_info)
89 {
90 if (bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">") < 0)
91 error_errno("[%s] Failed to add \"AC\" INFO header", __func__);
92 if (bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">") < 0)
93 error_errno("[%s] Failed to add \"AN\" INFO header", __func__);
94 }
95 if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
96 else if (bcf_hdr_sync(args->hdr) < 0)
97 error_errno("[%s] Failed to update header", __func__);
98
99 // setup sample data
100 if (args->sample_names)
101 {
102 void *hdr_samples = khash_str2int_init();
103 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
104 khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
105
106 void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
107 int nsmpl;
108 char **smpl = NULL;
109 args->samples = NULL; args->n_samples = 0;
110 smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
111 if ( !smpl )
112 {
113 error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
114 }
115
116 if ( exclude )
117 {
118 for (i=0; i<nsmpl; i++) {
119 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
120 if (args->force_samples) {
121 fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
122 } else {
123 error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
124 }
125 }
126 khash_str2int_inc(exclude, smpl[i]);
127 }
128
129 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
130 {
131 if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue;
132 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
133 args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
134 }
135 khash_str2int_destroy(exclude);
136 }
137 else
138 {
139 for (i=0; i<nsmpl; i++) {
140 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
141 if (args->force_samples) {
142 fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
143 continue;
144 } else {
145 error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
146 }
147 }
148 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
149 args->samples[args->n_samples++] = strdup(smpl[i]);
150 }
151 }
152 for (i=0; i<nsmpl; i++) free(smpl[i]);
153 free(smpl);
154 khash_str2int_destroy(hdr_samples);
155 if (args->n_samples == 0) {
156 fprintf(stderr, "Warn: subsetting has removed all samples\n");
157 args->sites_only = 1;
158 }
159 }
160
161 if (args->n_samples)
162 args->imap = (int*)malloc(args->n_samples * sizeof(int));
163
164 // determine variant types to include/exclude
165 if (args->include_types || args->exclude_types) {
166 if (args->include_types && args->exclude_types) {
167 fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
168 exit(1);
169 }
170 char **type_list = 0;
171 int m = 0, n = 0;
172 const char *q, *p;
173 for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
174 if (*p == ',' || *p == 0) {
175 if (m == n) {
176 m = m? m<<1 : 16;
177 type_list = (char**)realloc(type_list, m * sizeof(char*));
178 }
179 type_list[n] = (char*)calloc(p - q + 1, 1);
180 strncpy(type_list[n++], q, p - q);
181 q = p + 1;
182 if (*p == 0) break;
183 }
184 }
185 type_list = (char**)realloc(type_list, n * sizeof(char*));
186
187 if (args->include_types) {
188 args->include = 0;
189 for (i = 0; i < n; ++i) {
190 if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
191 else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
192 else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
193 else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
194 else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
195 else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
196 else {
197 fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
198 fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
199 exit(1);
200 }
201 }
202 }
203 if (args->exclude_types) {
204 args->exclude = 0;
205 for (i = 0; i < n; ++i) {
206 if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
207 else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
208 else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
209 else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
210 else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
211 else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
212 else {
213 fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
214 fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
215 exit(1);
216 }
217 }
218 }
219 for (i = 0; i < n; ++i)
220 free(type_list[i]);
221 free(type_list);
222 }
223
224 char wmode[8];
225 set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
226 args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
227 if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
228 if ( args->n_threads > 0)
229 hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
230
231 // headers: hdr=full header, hsub=subset header, hnull=sites only header
232 if (args->sites_only){
233 args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
234 bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
235 }
236 if (args->n_samples > 0)
237 {
238 args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
239 if ( !args->hsub ) error("Error occurred while subsetting samples\n");
240 if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
241 {
242 int i;
243 for (i=0; i<args->n_samples; i++)
244 if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
245 }
246 }
247
248 if ( args->filter_str )
249 args->filter = filter_init(args->hdr, args->filter_str);
250 }
251
destroy_data(args_t * args)252 static void destroy_data(args_t *args)
253 {
254 int i;
255 if ( args->imap ) {
256 for (i = 0; i < args->n_samples; ++i)
257 free(args->samples[i]);
258 free(args->samples);
259 free(args->imap);
260 }
261 if (args->hnull) bcf_hdr_destroy(args->hnull);
262 if (args->hsub) bcf_hdr_destroy(args->hsub);
263 if ( args->filter )
264 filter_destroy(args->filter);
265 free(args->ac);
266 }
267
268 // true if all samples are phased.
269 // haploid genotypes are considered phased
270 // ./. => not phased, .|. => phased
bcf_all_phased(const bcf_hdr_t * header,bcf1_t * line)271 int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
272 {
273 bcf_unpack(line, BCF_UN_FMT);
274 bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
275 int all_phased = 1;
276 if ( fmt_ptr )
277 {
278 int i, isample;
279 for (isample=0; isample<line->n_sample; isample++)
280 {
281 int sample_phased = 0;
282 #define BRANCH_INT(type_t,vector_end) { \
283 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
284 for (i=0; i<fmt_ptr->n; i++) \
285 { \
286 if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
287 if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
288 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
289 if ((p[i])&1) { \
290 sample_phased = 1; \
291 break; \
292 } \
293 } \
294 }
295 switch (fmt_ptr->type) {
296 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
297 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
298 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
299 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
300 }
301 #undef BRANCH_INT
302 if (!sample_phased) {
303 all_phased = 0;
304 break;
305 }
306 }
307 }
308 return all_phased;
309 }
310
subset_vcf(args_t * args,bcf1_t * line)311 int subset_vcf(args_t *args, bcf1_t *line)
312 {
313 if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
314 if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
315 if (args->novel || args->known)
316 {
317 if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
318 if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0; // skip sites which are novel, ID == '.'
319 }
320
321 if (args->include || args->exclude)
322 {
323 int line_type = bcf_get_variant_types(line);
324 if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
325 if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
326 }
327
328 if ( args->filter )
329 {
330 int ret = filter_test(args->filter, line, NULL);
331 if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
332 else if ( ret ) return 0;
333 }
334
335 hts_expand(int, line->n_allele, args->mac, args->ac);
336 int i, an = 0, non_ref_ac = 0;
337 if (args->calc_ac) {
338 bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
339 for (i=1; i<line->n_allele; i++)
340 non_ref_ac += args->ac[i];
341 for (i=0; i<line->n_allele; i++)
342 an += args->ac[i];
343 }
344
345 int update_ac = args->calc_ac;
346 if (args->n_samples)
347 {
348 int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
349 bcf_subset(args->hdr, line, args->n_samples, args->imap);
350 if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
351 if ( update_ac )
352 {
353 bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
354 an = 0;
355 for (i=0; i<line->n_allele; i++) {
356 args->ac[i] = ac_sub[i];
357 an += ac_sub[i];
358 }
359 for (i=1; i<line->n_allele; i++)
360 non_ref_ac_sub += ac_sub[i];
361 if (args->private_vars) {
362 if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
363 if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
364 }
365 non_ref_ac = non_ref_ac_sub;
366 }
367 free(ac_sub);
368 }
369
370 bcf_fmt_t *gt_fmt;
371 if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
372 {
373 int nhet = 0, nhom = 0, nmiss = 0;
374 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
375 {
376 int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
377 if ( type==GT_HET_RA || type==GT_HET_AA )
378 {
379 if ( args->gt_type==GT_NO_HET ) return 0;
380 nhet = 1;
381 }
382 else if ( type==GT_UNKN )
383 {
384 if ( args->gt_type==GT_NO_MISSING ) return 0;
385 nmiss = 1;
386 }
387 else
388 {
389 if ( args->gt_type==GT_NO_HOM ) return 0;
390 nhom = 1;
391 }
392 }
393 if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
394 else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
395 else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
396 }
397
398 int minor_ac = 0;
399 int major_ac = 0;
400 if ( args->calc_ac )
401 {
402 minor_ac = args->ac[0];
403 major_ac = args->ac[0];
404 for (i=1; i<line->n_allele; i++){
405 if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
406 if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
407 }
408 }
409
410 if (args->min_ac!=-1)
411 {
412 if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
413 else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
414 else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
415 else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
416 else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
417 }
418 if (args->max_ac!=-1)
419 {
420 if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
421 else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
422 else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
423 else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
424 else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
425 }
426 if (args->min_af!=-1)
427 {
428 if (an == 0) return 0; // freq not defined, skip site
429 if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
430 else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
431 else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
432 else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
433 else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
434 }
435 if (args->max_af!=-1)
436 {
437 if (an == 0) return 0; // freq not defined, skip site
438 if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
439 else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
440 else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
441 else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
442 else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
443 }
444 if (args->uncalled) {
445 if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
446 if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
447 }
448 if (update_ac && args->update_info) {
449 bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
450 bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
451 }
452 if (args->trim_alts)
453 {
454 int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
455 if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
456 }
457 if (args->phased) {
458 int phased = bcf_all_phased(args->hdr, line);
459 if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
460 if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
461 }
462 if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
463 return 1;
464 }
465
set_allele_type(int * atype,char * atype_string)466 void set_allele_type (int *atype, char *atype_string)
467 {
468 *atype = ALLELE_NONREF;
469 if (strcmp(atype_string, "minor") == 0) {
470 *atype = ALLELE_MINOR;
471 }
472 else if (strcmp(atype_string, "alt1") == 0) {
473 *atype = ALLELE_ALT1;
474 }
475 else if (strcmp(atype_string, "nref") == 0) {
476 *atype = ALLELE_NONREF;
477 }
478 else if (strcmp(atype_string, "major") == 0) {
479 *atype = ALLELE_MAJOR;
480 }
481 else if (strcmp(atype_string, "nonmajor") == 0) {
482 *atype = ALLELE_NONMAJOR;
483 }
484 else {
485 error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
486 }
487 }
488
usage(args_t * args)489 static void usage(args_t *args)
490 {
491 fprintf(stderr, "\n");
492 fprintf(stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
493 fprintf(stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
494 fprintf(stderr, "\n");
495 fprintf(stderr, "Output options:\n");
496 fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n");
497 fprintf(stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n");
498 fprintf(stderr, " -H, --no-header Suppress the header in VCF output\n");
499 fprintf(stderr, " --with-header Print both header and records in VCF output [default]\n");
500 fprintf(stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
501 fprintf(stderr, " --no-version Do not append version and command line to the header\n");
502 fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
503 fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
504 fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
505 fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
506 fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
507 fprintf(stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
508 fprintf(stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
509 fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
510 fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
511 fprintf(stderr, "\n");
512 fprintf(stderr, "Subset options:\n");
513 fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
514 fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
515 fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
516 fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
517 fprintf(stderr, " --force-samples Only warn about unknown subset samples\n");
518 fprintf(stderr, "\n");
519 fprintf(stderr, "Filter options:\n");
520 fprintf(stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
521 fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
522 fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
523 fprintf(stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
524 fprintf(stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n");
525 fprintf(stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n");
526 fprintf(stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
527 fprintf(stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n");
528 fprintf(stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
529 fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
530 fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
531 fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
532 fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
533 fprintf(stderr, "\n");
534 exit(1);
535 }
536
main_vcfview(int argc,char * argv[])537 int main_vcfview(int argc, char *argv[])
538 {
539 int c;
540 args_t *args = (args_t*) calloc(1,sizeof(args_t));
541 args->argc = argc; args->argv = argv;
542 args->files = bcf_sr_init();
543 args->clevel = -1;
544 args->print_header = 1;
545 args->update_info = 1;
546 args->output_type = FT_VCF;
547 args->n_threads = 0;
548 args->record_cmd_line = 1;
549 args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
550 args->regions_overlap = 1;
551 args->targets_overlap = 0;
552 int targets_is_file = 0, regions_is_file = 0;
553
554 static struct option loptions[] =
555 {
556 {"genotype",required_argument,NULL,'g'},
557 {"compression-level",required_argument,NULL,'l'},
558 {"threads",required_argument,NULL,9},
559 {"header-only",no_argument,NULL,'h'},
560 {"no-header",no_argument,NULL,'H'},
561 {"with-header",no_argument,NULL,4},
562 {"exclude",required_argument,NULL,'e'},
563 {"include",required_argument,NULL,'i'},
564 {"trim-alt-alleles",no_argument,NULL,'a'},
565 {"no-update",no_argument,NULL,'I'},
566 {"drop-genotypes",no_argument,NULL,'G'},
567 {"private",no_argument,NULL,'x'},
568 {"exclude-private",no_argument,NULL,'X'},
569 {"uncalled",no_argument,NULL,'u'},
570 {"exclude-uncalled",no_argument,NULL,'U'},
571 {"apply-filters",required_argument,NULL,'f'},
572 {"known",no_argument,NULL,'k'},
573 {"novel",no_argument,NULL,'n'},
574 {"min-alleles",required_argument,NULL,'m'},
575 {"max-alleles",required_argument,NULL,'M'},
576 {"samples",required_argument,NULL,'s'},
577 {"samples-file",required_argument,NULL,'S'},
578 {"force-samples",no_argument,NULL,1},
579 {"output-type",required_argument,NULL,'O'},
580 {"output-file",required_argument,NULL,'o'},
581 {"output",required_argument,NULL,'o'},
582 {"types",required_argument,NULL,'v'},
583 {"exclude-types",required_argument,NULL,'V'},
584 {"targets",required_argument,NULL,'t'},
585 {"targets-file",required_argument,NULL,'T'},
586 {"targets-overlap",required_argument,NULL,2},
587 {"regions",required_argument,NULL,'r'},
588 {"regions-file",required_argument,NULL,'R'},
589 {"regions-overlap",required_argument,NULL,3},
590 {"min-ac",required_argument,NULL,'c'},
591 {"max-ac",required_argument,NULL,'C'},
592 {"min-af",required_argument,NULL,'q'},
593 {"max-af",required_argument,NULL,'Q'},
594 {"phased",no_argument,NULL,'p'},
595 {"exclude-phased",no_argument,NULL,'P'},
596 {"no-version",no_argument,NULL,8},
597 {NULL,0,NULL,0}
598 };
599 char *tmp;
600 while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
601 {
602 char allele_type[9] = "nref";
603 switch (c)
604 {
605 case 'O':
606 switch (optarg[0]) {
607 case 'b': args->output_type = FT_BCF_GZ; break;
608 case 'u': args->output_type = FT_BCF; break;
609 case 'z': args->output_type = FT_VCF_GZ; break;
610 case 'v': args->output_type = FT_VCF; break;
611 default:
612 {
613 args->clevel = strtol(optarg,&tmp,10);
614 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
615 }
616 };
617 if ( optarg[1] )
618 {
619 args->clevel = strtol(optarg+1,&tmp,10);
620 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
621 }
622 break;
623 case 'l':
624 args->clevel = strtol(optarg,&tmp,10);
625 if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
626 args->output_type |= FT_GZ;
627 break;
628 case 'o': args->fn_out = optarg; break;
629 case 'H': args->print_header = 0; break;
630 case 'h': args->header_only = 1; break;
631 case 4 : args->print_header = 1; args->header_only = 0; break;
632
633 case 't': args->targets_list = optarg; break;
634 case 'T': args->targets_list = optarg; targets_is_file = 1; break;
635 case 'r': args->regions_list = optarg; break;
636 case 'R': args->regions_list = optarg; regions_is_file = 1; break;
637
638 case 's': args->sample_names = optarg; break;
639 case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
640 case 1 : args->force_samples = 1; break;
641 case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
642 case 'I': args->update_info = 0; break;
643 case 'G': args->sites_only = 1; break;
644
645 case 'f': args->files->apply_filters = optarg; break;
646 case 'k': args->known = 1; break;
647 case 'n': args->novel = 1; break;
648 case 'm':
649 args->min_alleles = strtol(optarg,&tmp,10);
650 if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
651 break;
652 case 'M':
653 args->max_alleles = strtol(optarg,&tmp,10);
654 if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
655 break;
656 case 'v': args->include_types = optarg; break;
657 case 'V': args->exclude_types = optarg; break;
658 case 'e':
659 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
660 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
661 case 'i':
662 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
663 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
664 case 'c':
665 {
666 args->min_ac_type = ALLELE_NONREF;
667 if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
668 error("Error: Could not parse --min-ac %s\n", optarg);
669 set_allele_type(&args->min_ac_type, allele_type);
670 args->calc_ac = 1;
671 break;
672 }
673 case 'C':
674 {
675 args->max_ac_type = ALLELE_NONREF;
676 if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
677 error("Error: Could not parse --max-ac %s\n", optarg);
678 set_allele_type(&args->max_ac_type, allele_type);
679 args->calc_ac = 1;
680 break;
681 }
682 case 'q':
683 {
684 args->min_af_type = ALLELE_NONREF;
685 if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
686 error("Error: Could not parse --min-af %s\n", optarg);
687 set_allele_type(&args->min_af_type, allele_type);
688 args->calc_ac = 1;
689 break;
690 }
691 case 'Q':
692 {
693 args->max_af_type = ALLELE_NONREF;
694 if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
695 error("Error: Could not parse --max-af %s\n", optarg);
696 set_allele_type(&args->max_af_type, allele_type);
697 args->calc_ac = 1;
698 break;
699 }
700
701 case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
702 case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
703 case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
704 case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
705 case 'p': args->phased |= FLT_INCLUDE; break; // phased
706 case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
707 case 'g':
708 {
709 if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
710 else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
711 else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
712 else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
713 else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
714 else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
715 else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
716 break;
717 }
718 case 2 :
719 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
720 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
721 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
722 else error("Could not parse: --targets-overlap %s\n",optarg);
723 break;
724 case 3 :
725 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
726 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
727 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
728 else error("Could not parse: --regions-overlap %s\n",optarg);
729 break;
730 case 9 : args->n_threads = strtol(optarg, 0, 0); break;
731 case 8 : args->record_cmd_line = 0; break;
732 case '?': usage(args); break;
733 default: error("Unknown argument: %s\n", optarg);
734 }
735 }
736
737 if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
738 if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
739 if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
740 if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
741
742 if ( args->sample_names && args->update_info) args->calc_ac = 1;
743
744 char *fname = NULL;
745 if ( optind>=argc )
746 {
747 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
748 else usage(args);
749 }
750 else fname = argv[optind];
751
752 // read in the regions from the command line
753 if ( args->regions_list )
754 {
755 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
756 if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
757 error("Failed to read the regions: %s\n", args->regions_list);
758 }
759 else if ( optind+1 < argc )
760 {
761 int i;
762 kstring_t tmp = {0,0,0};
763 kputs(argv[optind+1],&tmp);
764 for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
765 if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
766 error("Failed to read the regions: %s\n", tmp.s);
767 free(tmp.s);
768 }
769 if ( args->targets_list )
770 {
771 bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
772 if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
773 error("Failed to read the targets: %s\n", args->targets_list);
774 }
775
776 if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
777 if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
778
779 init_data(args);
780 bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
781 if (args->print_header)
782 {
783 if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
784 }
785 else if ( args->output_type & FT_BCF )
786 error("BCF output requires header, cannot proceed with -H\n");
787
788 int ret = 0;
789 if (!args->header_only)
790 {
791 while ( bcf_sr_next_line(args->files) )
792 {
793 bcf1_t *line = args->files->readers[0].buffer[0];
794 if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
795 if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
796 }
797 ret = args->files->errnum;
798 if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
799 }
800 hts_close(args->out);
801 destroy_data(args);
802 bcf_sr_destroy(args->files);
803 free(args);
804 return ret;
805 }
806