1 #include "bcftools.pysam.h"
2
3 /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
4
5 Copyright (C) 2013-2021 Genome Research Ltd.
6
7 Author: Shane McCarthy <sm15@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE. */
26
27 #include <stdio.h>
28 #include <strings.h>
29 #include <unistd.h>
30 #include <getopt.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <math.h>
37 #include <inttypes.h>
38 #include <htslib/vcf.h>
39 #include <htslib/synced_bcf_reader.h>
40 #include <htslib/vcfutils.h>
41 #include "bcftools.h"
42 #include "filter.h"
43 #include "htslib/khash_str2int.h"
44
45 #define FLT_INCLUDE 1
46 #define FLT_EXCLUDE 2
47
48 #define ALLELE_NONREF 1
49 #define ALLELE_MINOR 2
50 #define ALLELE_ALT1 3
51 #define ALLELE_MAJOR 4
52 #define ALLELE_NONMAJOR 5
53
54 #define GT_NEED_HOM 1
55 #define GT_NEED_HET 2
56 #define GT_NO_HOM 3
57 #define GT_NO_HET 4
58 #define GT_NEED_MISSING 5
59 #define GT_NO_MISSING 6
60
61 typedef struct _args_t
62 {
63 filter_t *filter;
64 char *filter_str;
65 int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
66
67 bcf_srs_t *files;
68 bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
69 char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
70 int regions_overlap, targets_overlap;
71 int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
72 int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
73 int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
74 int *ac, mac;
75 float min_af, max_af;
76 char *fn_ref, *fn_out, **samples;
77 int sample_is_file, force_samples;
78 char *include_types, *exclude_types;
79 int include, exclude;
80 int record_cmd_line;
81 htsFile *out;
82 }
83 args_t;
84
init_data(args_t * args)85 static void init_data(args_t *args)
86 {
87 int i;
88 args->hdr = args->files->readers[0].header;
89
90 if (args->calc_ac && args->update_info)
91 {
92 if (bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">") < 0)
93 error_errno("[%s] Failed to add \"AC\" INFO header", __func__);
94 if (bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">") < 0)
95 error_errno("[%s] Failed to add \"AN\" INFO header", __func__);
96 }
97 if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
98 else if (bcf_hdr_sync(args->hdr) < 0)
99 error_errno("[%s] Failed to update header", __func__);
100
101 // setup sample data
102 if (args->sample_names)
103 {
104 void *hdr_samples = khash_str2int_init();
105 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
106 khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
107
108 void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
109 int nsmpl;
110 char **smpl = NULL;
111 args->samples = NULL; args->n_samples = 0;
112 smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
113 if ( !smpl )
114 {
115 error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
116 }
117
118 if ( exclude )
119 {
120 for (i=0; i<nsmpl; i++) {
121 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
122 if (args->force_samples) {
123 fprintf(bcftools_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
124 } else {
125 error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
126 }
127 }
128 khash_str2int_inc(exclude, smpl[i]);
129 }
130
131 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
132 {
133 if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue;
134 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
135 args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
136 }
137 khash_str2int_destroy(exclude);
138 }
139 else
140 {
141 for (i=0; i<nsmpl; i++) {
142 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
143 if (args->force_samples) {
144 fprintf(bcftools_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
145 continue;
146 } else {
147 error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
148 }
149 }
150 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
151 args->samples[args->n_samples++] = strdup(smpl[i]);
152 }
153 }
154 for (i=0; i<nsmpl; i++) free(smpl[i]);
155 free(smpl);
156 khash_str2int_destroy(hdr_samples);
157 if (args->n_samples == 0) {
158 fprintf(bcftools_stderr, "Warn: subsetting has removed all samples\n");
159 args->sites_only = 1;
160 }
161 }
162
163 if (args->n_samples)
164 args->imap = (int*)malloc(args->n_samples * sizeof(int));
165
166 // determine variant types to include/exclude
167 if (args->include_types || args->exclude_types) {
168 if (args->include_types && args->exclude_types) {
169 fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n");
170 bcftools_exit(1);
171 }
172 char **type_list = 0;
173 int m = 0, n = 0;
174 const char *q, *p;
175 for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
176 if (*p == ',' || *p == 0) {
177 if (m == n) {
178 m = m? m<<1 : 16;
179 type_list = (char**)realloc(type_list, m * sizeof(char*));
180 }
181 type_list[n] = (char*)calloc(p - q + 1, 1);
182 strncpy(type_list[n++], q, p - q);
183 q = p + 1;
184 if (*p == 0) break;
185 }
186 }
187 type_list = (char**)realloc(type_list, n * sizeof(char*));
188
189 if (args->include_types) {
190 args->include = 0;
191 for (i = 0; i < n; ++i) {
192 if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
193 else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
194 else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
195 else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
196 else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
197 else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
198 else {
199 fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
200 fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
201 bcftools_exit(1);
202 }
203 }
204 }
205 if (args->exclude_types) {
206 args->exclude = 0;
207 for (i = 0; i < n; ++i) {
208 if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
209 else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
210 else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
211 else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
212 else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
213 else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
214 else {
215 fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
216 fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
217 bcftools_exit(1);
218 }
219 }
220 }
221 for (i = 0; i < n; ++i)
222 free(type_list[i]);
223 free(type_list);
224 }
225
226 char wmode[8];
227 set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
228 args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
229 if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
230 if ( args->n_threads > 0)
231 hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
232
233 // headers: hdr=full header, hsub=subset header, hnull=sites only header
234 if (args->sites_only){
235 args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
236 bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
237 }
238 if (args->n_samples > 0)
239 {
240 args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
241 if ( !args->hsub ) error("Error occurred while subsetting samples\n");
242 if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
243 {
244 int i;
245 for (i=0; i<args->n_samples; i++)
246 if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
247 }
248 }
249
250 if ( args->filter_str )
251 args->filter = filter_init(args->hdr, args->filter_str);
252 }
253
destroy_data(args_t * args)254 static void destroy_data(args_t *args)
255 {
256 int i;
257 if ( args->imap ) {
258 for (i = 0; i < args->n_samples; ++i)
259 free(args->samples[i]);
260 free(args->samples);
261 free(args->imap);
262 }
263 if (args->hnull) bcf_hdr_destroy(args->hnull);
264 if (args->hsub) bcf_hdr_destroy(args->hsub);
265 if ( args->filter )
266 filter_destroy(args->filter);
267 free(args->ac);
268 }
269
270 // true if all samples are phased.
271 // haploid genotypes are considered phased
272 // ./. => not phased, .|. => phased
bcf_all_phased(const bcf_hdr_t * header,bcf1_t * line)273 int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
274 {
275 bcf_unpack(line, BCF_UN_FMT);
276 bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
277 int all_phased = 1;
278 if ( fmt_ptr )
279 {
280 int i, isample;
281 for (isample=0; isample<line->n_sample; isample++)
282 {
283 int sample_phased = 0;
284 #define BRANCH_INT(type_t,vector_end) { \
285 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
286 for (i=0; i<fmt_ptr->n; i++) \
287 { \
288 if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
289 if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
290 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
291 if ((p[i])&1) { \
292 sample_phased = 1; \
293 break; \
294 } \
295 } \
296 }
297 switch (fmt_ptr->type) {
298 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
299 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
300 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
301 default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
302 }
303 #undef BRANCH_INT
304 if (!sample_phased) {
305 all_phased = 0;
306 break;
307 }
308 }
309 }
310 return all_phased;
311 }
312
subset_vcf(args_t * args,bcf1_t * line)313 int subset_vcf(args_t *args, bcf1_t *line)
314 {
315 if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
316 if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
317 if (args->novel || args->known)
318 {
319 if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
320 if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0; // skip sites which are novel, ID == '.'
321 }
322
323 if (args->include || args->exclude)
324 {
325 int line_type = bcf_get_variant_types(line);
326 if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
327 if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
328 }
329
330 if ( args->filter )
331 {
332 int ret = filter_test(args->filter, line, NULL);
333 if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
334 else if ( ret ) return 0;
335 }
336
337 hts_expand(int, line->n_allele, args->mac, args->ac);
338 int i, an = 0, non_ref_ac = 0;
339 if (args->calc_ac) {
340 bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
341 for (i=1; i<line->n_allele; i++)
342 non_ref_ac += args->ac[i];
343 for (i=0; i<line->n_allele; i++)
344 an += args->ac[i];
345 }
346
347 int update_ac = args->calc_ac;
348 if (args->n_samples)
349 {
350 int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
351 bcf_subset(args->hdr, line, args->n_samples, args->imap);
352 if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
353 if ( update_ac )
354 {
355 bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
356 an = 0;
357 for (i=0; i<line->n_allele; i++) {
358 args->ac[i] = ac_sub[i];
359 an += ac_sub[i];
360 }
361 for (i=1; i<line->n_allele; i++)
362 non_ref_ac_sub += ac_sub[i];
363 if (args->private_vars) {
364 if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
365 if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
366 }
367 non_ref_ac = non_ref_ac_sub;
368 }
369 free(ac_sub);
370 }
371
372 bcf_fmt_t *gt_fmt;
373 if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
374 {
375 int nhet = 0, nhom = 0, nmiss = 0;
376 for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
377 {
378 int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
379 if ( type==GT_HET_RA || type==GT_HET_AA )
380 {
381 if ( args->gt_type==GT_NO_HET ) return 0;
382 nhet = 1;
383 }
384 else if ( type==GT_UNKN )
385 {
386 if ( args->gt_type==GT_NO_MISSING ) return 0;
387 nmiss = 1;
388 }
389 else
390 {
391 if ( args->gt_type==GT_NO_HOM ) return 0;
392 nhom = 1;
393 }
394 }
395 if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
396 else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
397 else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
398 }
399
400 int minor_ac = 0;
401 int major_ac = 0;
402 if ( args->calc_ac )
403 {
404 minor_ac = args->ac[0];
405 major_ac = args->ac[0];
406 for (i=1; i<line->n_allele; i++){
407 if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
408 if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
409 }
410 }
411
412 if (args->min_ac!=-1)
413 {
414 if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
415 else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
416 else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
417 else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
418 else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
419 }
420 if (args->max_ac!=-1)
421 {
422 if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
423 else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
424 else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
425 else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
426 else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
427 }
428 if (args->min_af!=-1)
429 {
430 if (an == 0) return 0; // freq not defined, skip site
431 if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
432 else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
433 else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
434 else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
435 else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
436 }
437 if (args->max_af!=-1)
438 {
439 if (an == 0) return 0; // freq not defined, skip site
440 if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
441 else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
442 else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
443 else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
444 else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
445 }
446 if (args->uncalled) {
447 if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
448 if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
449 }
450 if (update_ac && args->update_info) {
451 bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
452 bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
453 }
454 if (args->trim_alts)
455 {
456 int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
457 if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
458 }
459 if (args->phased) {
460 int phased = bcf_all_phased(args->hdr, line);
461 if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
462 if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
463 }
464 if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
465 return 1;
466 }
467
set_allele_type(int * atype,char * atype_string)468 void set_allele_type (int *atype, char *atype_string)
469 {
470 *atype = ALLELE_NONREF;
471 if (strcmp(atype_string, "minor") == 0) {
472 *atype = ALLELE_MINOR;
473 }
474 else if (strcmp(atype_string, "alt1") == 0) {
475 *atype = ALLELE_ALT1;
476 }
477 else if (strcmp(atype_string, "nref") == 0) {
478 *atype = ALLELE_NONREF;
479 }
480 else if (strcmp(atype_string, "major") == 0) {
481 *atype = ALLELE_MAJOR;
482 }
483 else if (strcmp(atype_string, "nonmajor") == 0) {
484 *atype = ALLELE_NONMAJOR;
485 }
486 else {
487 error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
488 }
489 }
490
usage(args_t * args)491 static void usage(args_t *args)
492 {
493 fprintf(bcftools_stderr, "\n");
494 fprintf(bcftools_stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
495 fprintf(bcftools_stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
496 fprintf(bcftools_stderr, "\n");
497 fprintf(bcftools_stderr, "Output options:\n");
498 fprintf(bcftools_stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n");
499 fprintf(bcftools_stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n");
500 fprintf(bcftools_stderr, " -H, --no-header Suppress the header in VCF output\n");
501 fprintf(bcftools_stderr, " --with-header Print both header and records in VCF output [default]\n");
502 fprintf(bcftools_stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
503 fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
504 fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
505 fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
506 fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
507 fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
508 fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
509 fprintf(bcftools_stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
510 fprintf(bcftools_stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
511 fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
512 fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
513 fprintf(bcftools_stderr, "\n");
514 fprintf(bcftools_stderr, "Subset options:\n");
515 fprintf(bcftools_stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
516 fprintf(bcftools_stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
517 fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
518 fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
519 fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n");
520 fprintf(bcftools_stderr, "\n");
521 fprintf(bcftools_stderr, "Filter options:\n");
522 fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
523 fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
524 fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
525 fprintf(bcftools_stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
526 fprintf(bcftools_stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n");
527 fprintf(bcftools_stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n");
528 fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
529 fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n");
530 fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
531 fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
532 fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
533 fprintf(bcftools_stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
534 fprintf(bcftools_stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
535 fprintf(bcftools_stderr, "\n");
536 bcftools_exit(1);
537 }
538
main_vcfview(int argc,char * argv[])539 int main_vcfview(int argc, char *argv[])
540 {
541 int c;
542 args_t *args = (args_t*) calloc(1,sizeof(args_t));
543 args->argc = argc; args->argv = argv;
544 args->files = bcf_sr_init();
545 args->clevel = -1;
546 args->print_header = 1;
547 args->update_info = 1;
548 args->output_type = FT_VCF;
549 args->n_threads = 0;
550 args->record_cmd_line = 1;
551 args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
552 args->regions_overlap = 1;
553 args->targets_overlap = 0;
554 int targets_is_file = 0, regions_is_file = 0;
555
556 static struct option loptions[] =
557 {
558 {"genotype",required_argument,NULL,'g'},
559 {"compression-level",required_argument,NULL,'l'},
560 {"threads",required_argument,NULL,9},
561 {"header-only",no_argument,NULL,'h'},
562 {"no-header",no_argument,NULL,'H'},
563 {"with-header",no_argument,NULL,4},
564 {"exclude",required_argument,NULL,'e'},
565 {"include",required_argument,NULL,'i'},
566 {"trim-alt-alleles",no_argument,NULL,'a'},
567 {"no-update",no_argument,NULL,'I'},
568 {"drop-genotypes",no_argument,NULL,'G'},
569 {"private",no_argument,NULL,'x'},
570 {"exclude-private",no_argument,NULL,'X'},
571 {"uncalled",no_argument,NULL,'u'},
572 {"exclude-uncalled",no_argument,NULL,'U'},
573 {"apply-filters",required_argument,NULL,'f'},
574 {"known",no_argument,NULL,'k'},
575 {"novel",no_argument,NULL,'n'},
576 {"min-alleles",required_argument,NULL,'m'},
577 {"max-alleles",required_argument,NULL,'M'},
578 {"samples",required_argument,NULL,'s'},
579 {"samples-file",required_argument,NULL,'S'},
580 {"force-samples",no_argument,NULL,1},
581 {"output-type",required_argument,NULL,'O'},
582 {"output-file",required_argument,NULL,'o'},
583 {"output",required_argument,NULL,'o'},
584 {"types",required_argument,NULL,'v'},
585 {"exclude-types",required_argument,NULL,'V'},
586 {"targets",required_argument,NULL,'t'},
587 {"targets-file",required_argument,NULL,'T'},
588 {"targets-overlap",required_argument,NULL,2},
589 {"regions",required_argument,NULL,'r'},
590 {"regions-file",required_argument,NULL,'R'},
591 {"regions-overlap",required_argument,NULL,3},
592 {"min-ac",required_argument,NULL,'c'},
593 {"max-ac",required_argument,NULL,'C'},
594 {"min-af",required_argument,NULL,'q'},
595 {"max-af",required_argument,NULL,'Q'},
596 {"phased",no_argument,NULL,'p'},
597 {"exclude-phased",no_argument,NULL,'P'},
598 {"no-version",no_argument,NULL,8},
599 {NULL,0,NULL,0}
600 };
601 char *tmp;
602 while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
603 {
604 char allele_type[9] = "nref";
605 switch (c)
606 {
607 case 'O':
608 switch (optarg[0]) {
609 case 'b': args->output_type = FT_BCF_GZ; break;
610 case 'u': args->output_type = FT_BCF; break;
611 case 'z': args->output_type = FT_VCF_GZ; break;
612 case 'v': args->output_type = FT_VCF; break;
613 default:
614 {
615 args->clevel = strtol(optarg,&tmp,10);
616 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
617 }
618 };
619 if ( optarg[1] )
620 {
621 args->clevel = strtol(optarg+1,&tmp,10);
622 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
623 }
624 break;
625 case 'l':
626 args->clevel = strtol(optarg,&tmp,10);
627 if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
628 args->output_type |= FT_GZ;
629 break;
630 case 'o': args->fn_out = optarg; break;
631 case 'H': args->print_header = 0; break;
632 case 'h': args->header_only = 1; break;
633 case 4 : args->print_header = 1; args->header_only = 0; break;
634
635 case 't': args->targets_list = optarg; break;
636 case 'T': args->targets_list = optarg; targets_is_file = 1; break;
637 case 'r': args->regions_list = optarg; break;
638 case 'R': args->regions_list = optarg; regions_is_file = 1; break;
639
640 case 's': args->sample_names = optarg; break;
641 case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
642 case 1 : args->force_samples = 1; break;
643 case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
644 case 'I': args->update_info = 0; break;
645 case 'G': args->sites_only = 1; break;
646
647 case 'f': args->files->apply_filters = optarg; break;
648 case 'k': args->known = 1; break;
649 case 'n': args->novel = 1; break;
650 case 'm':
651 args->min_alleles = strtol(optarg,&tmp,10);
652 if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
653 break;
654 case 'M':
655 args->max_alleles = strtol(optarg,&tmp,10);
656 if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
657 break;
658 case 'v': args->include_types = optarg; break;
659 case 'V': args->exclude_types = optarg; break;
660 case 'e':
661 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
662 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
663 case 'i':
664 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
665 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
666 case 'c':
667 {
668 args->min_ac_type = ALLELE_NONREF;
669 if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
670 error("Error: Could not parse --min-ac %s\n", optarg);
671 set_allele_type(&args->min_ac_type, allele_type);
672 args->calc_ac = 1;
673 break;
674 }
675 case 'C':
676 {
677 args->max_ac_type = ALLELE_NONREF;
678 if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
679 error("Error: Could not parse --max-ac %s\n", optarg);
680 set_allele_type(&args->max_ac_type, allele_type);
681 args->calc_ac = 1;
682 break;
683 }
684 case 'q':
685 {
686 args->min_af_type = ALLELE_NONREF;
687 if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
688 error("Error: Could not parse --min-af %s\n", optarg);
689 set_allele_type(&args->min_af_type, allele_type);
690 args->calc_ac = 1;
691 break;
692 }
693 case 'Q':
694 {
695 args->max_af_type = ALLELE_NONREF;
696 if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
697 error("Error: Could not parse --max-af %s\n", optarg);
698 set_allele_type(&args->max_af_type, allele_type);
699 args->calc_ac = 1;
700 break;
701 }
702
703 case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
704 case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
705 case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
706 case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
707 case 'p': args->phased |= FLT_INCLUDE; break; // phased
708 case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
709 case 'g':
710 {
711 if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
712 else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
713 else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
714 else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
715 else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
716 else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
717 else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
718 break;
719 }
720 case 2 :
721 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
722 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
723 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
724 else error("Could not parse: --targets-overlap %s\n",optarg);
725 break;
726 case 3 :
727 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
728 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
729 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
730 else error("Could not parse: --regions-overlap %s\n",optarg);
731 break;
732 case 9 : args->n_threads = strtol(optarg, 0, 0); break;
733 case 8 : args->record_cmd_line = 0; break;
734 case '?': usage(args); break;
735 default: error("Unknown argument: %s\n", optarg);
736 }
737 }
738
739 if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
740 if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
741 if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
742 if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
743
744 if ( args->sample_names && args->update_info) args->calc_ac = 1;
745
746 char *fname = NULL;
747 if ( optind>=argc )
748 {
749 if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
750 else usage(args);
751 }
752 else fname = argv[optind];
753
754 // read in the regions from the command line
755 if ( args->regions_list )
756 {
757 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
758 if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
759 error("Failed to read the regions: %s\n", args->regions_list);
760 }
761 else if ( optind+1 < argc )
762 {
763 int i;
764 kstring_t tmp = {0,0,0};
765 kputs(argv[optind+1],&tmp);
766 for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
767 if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
768 error("Failed to read the regions: %s\n", tmp.s);
769 free(tmp.s);
770 }
771 if ( args->targets_list )
772 {
773 bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
774 if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
775 error("Failed to read the targets: %s\n", args->targets_list);
776 }
777
778 if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
779 if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
780
781 init_data(args);
782 bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
783 if (args->print_header)
784 {
785 if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
786 }
787 else if ( args->output_type & FT_BCF )
788 error("BCF output requires header, cannot proceed with -H\n");
789
790 int ret = 0;
791 if (!args->header_only)
792 {
793 while ( bcf_sr_next_line(args->files) )
794 {
795 bcf1_t *line = args->files->readers[0].buffer[0];
796 if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
797 if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
798 }
799 ret = args->files->errnum;
800 if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
801 }
802 hts_close(args->out);
803 destroy_data(args);
804 bcf_sr_destroy(args->files);
805 free(args);
806 return ret;
807 }
808