1 #include "bcftools.pysam.h"
2 
3 /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
4 
5     Copyright (C) 2013-2021 Genome Research Ltd.
6 
7     Author: Shane McCarthy <sm15@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.  */
26 
27 #include <stdio.h>
28 #include <strings.h>
29 #include <unistd.h>
30 #include <getopt.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <math.h>
37 #include <inttypes.h>
38 #include <htslib/vcf.h>
39 #include <htslib/synced_bcf_reader.h>
40 #include <htslib/vcfutils.h>
41 #include "bcftools.h"
42 #include "filter.h"
43 #include "htslib/khash_str2int.h"
44 
45 #define FLT_INCLUDE 1
46 #define FLT_EXCLUDE 2
47 
48 #define ALLELE_NONREF 1
49 #define ALLELE_MINOR 2
50 #define ALLELE_ALT1 3
51 #define ALLELE_MAJOR 4
52 #define ALLELE_NONMAJOR 5
53 
54 #define GT_NEED_HOM 1
55 #define GT_NEED_HET 2
56 #define GT_NO_HOM   3
57 #define GT_NO_HET   4
58 #define GT_NEED_MISSING 5
59 #define GT_NO_MISSING 6
60 
61 typedef struct _args_t
62 {
63     filter_t *filter;
64     char *filter_str;
65     int filter_logic;   // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
66 
67     bcf_srs_t *files;
68     bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
69     char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
70     int regions_overlap, targets_overlap;
71     int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
72     int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
73     int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
74     int *ac, mac;
75     float min_af, max_af;
76     char *fn_ref, *fn_out, **samples;
77     int sample_is_file, force_samples;
78     char *include_types, *exclude_types;
79     int include, exclude;
80     int record_cmd_line;
81     htsFile *out;
82 }
83 args_t;
84 
init_data(args_t * args)85 static void init_data(args_t *args)
86 {
87     int i;
88     args->hdr = args->files->readers[0].header;
89 
90     if (args->calc_ac && args->update_info)
91     {
92         if (bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">") < 0)
93             error_errno("[%s] Failed to add \"AC\" INFO header", __func__);
94         if (bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">") < 0)
95             error_errno("[%s] Failed to add \"AN\" INFO header", __func__);
96     }
97     if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
98     else if (bcf_hdr_sync(args->hdr) < 0)
99         error_errno("[%s] Failed to update header", __func__);
100 
101     // setup sample data
102     if (args->sample_names)
103     {
104         void *hdr_samples = khash_str2int_init();
105         for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
106             khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
107 
108         void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
109         int nsmpl;
110         char **smpl = NULL;
111         args->samples = NULL; args->n_samples = 0;
112         smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
113         if ( !smpl )
114         {
115             error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
116         }
117 
118         if ( exclude )
119         {
120             for (i=0; i<nsmpl; i++) {
121                 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
122                     if (args->force_samples) {
123                         fprintf(bcftools_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
124                     } else {
125                         error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
126                     }
127                 }
128                 khash_str2int_inc(exclude, smpl[i]);
129             }
130 
131             for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
132             {
133                 if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i))  ) continue;
134                 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
135                 args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
136             }
137             khash_str2int_destroy(exclude);
138         }
139         else
140         {
141             for (i=0; i<nsmpl; i++) {
142                 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
143                     if (args->force_samples) {
144                         fprintf(bcftools_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
145                         continue;
146                     } else {
147                         error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
148                     }
149                 }
150                 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
151                 args->samples[args->n_samples++] = strdup(smpl[i]);
152             }
153         }
154         for (i=0; i<nsmpl; i++) free(smpl[i]);
155         free(smpl);
156         khash_str2int_destroy(hdr_samples);
157         if (args->n_samples == 0) {
158             fprintf(bcftools_stderr, "Warn: subsetting has removed all samples\n");
159             args->sites_only = 1;
160         }
161     }
162 
163     if (args->n_samples)
164         args->imap = (int*)malloc(args->n_samples * sizeof(int));
165 
166     // determine variant types to include/exclude
167     if (args->include_types || args->exclude_types) {
168         if (args->include_types && args->exclude_types) {
169             fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n");
170             bcftools_exit(1);
171         }
172         char **type_list = 0;
173         int m = 0, n = 0;
174         const char *q, *p;
175         for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
176             if (*p == ',' || *p == 0) {
177                 if (m == n) {
178                     m = m? m<<1 : 16;
179                     type_list = (char**)realloc(type_list, m * sizeof(char*));
180                 }
181                 type_list[n] = (char*)calloc(p - q + 1, 1);
182                 strncpy(type_list[n++], q, p - q);
183                 q = p + 1;
184                 if (*p == 0) break;
185             }
186         }
187         type_list = (char**)realloc(type_list, n * sizeof(char*));
188 
189         if (args->include_types) {
190             args->include = 0;
191             for (i = 0; i < n; ++i) {
192                 if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
193                 else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
194                 else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
195                 else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
196                 else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
197                 else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
198                 else {
199                     fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
200                     fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
201                     bcftools_exit(1);
202                 }
203             }
204         }
205         if (args->exclude_types) {
206             args->exclude = 0;
207             for (i = 0; i < n; ++i) {
208                 if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
209                 else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
210                 else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
211                 else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
212                 else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
213                 else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
214                 else {
215                     fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
216                     fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
217                     bcftools_exit(1);
218                 }
219             }
220         }
221         for (i = 0; i < n; ++i)
222             free(type_list[i]);
223         free(type_list);
224     }
225 
226     char wmode[8];
227     set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
228     args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
229     if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
230     if ( args->n_threads > 0)
231         hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
232 
233     // headers: hdr=full header, hsub=subset header, hnull=sites only header
234     if (args->sites_only){
235         args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
236         bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
237     }
238     if (args->n_samples > 0)
239     {
240         args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
241         if ( !args->hsub ) error("Error occurred while subsetting samples\n");
242         if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
243         {
244             int i;
245             for (i=0; i<args->n_samples; i++)
246                 if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
247         }
248     }
249 
250     if ( args->filter_str )
251         args->filter = filter_init(args->hdr, args->filter_str);
252 }
253 
destroy_data(args_t * args)254 static void destroy_data(args_t *args)
255 {
256     int i;
257     if ( args->imap ) {
258         for (i = 0; i < args->n_samples; ++i)
259             free(args->samples[i]);
260         free(args->samples);
261         free(args->imap);
262     }
263     if (args->hnull) bcf_hdr_destroy(args->hnull);
264     if (args->hsub) bcf_hdr_destroy(args->hsub);
265     if ( args->filter )
266         filter_destroy(args->filter);
267     free(args->ac);
268 }
269 
270 // true if all samples are phased.
271 // haploid genotypes are considered phased
272 // ./. => not phased, .|. => phased
bcf_all_phased(const bcf_hdr_t * header,bcf1_t * line)273 int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
274 {
275     bcf_unpack(line, BCF_UN_FMT);
276     bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
277     int all_phased = 1;
278     if ( fmt_ptr )
279     {
280         int i, isample;
281         for (isample=0; isample<line->n_sample; isample++)
282         {
283             int sample_phased = 0;
284             #define BRANCH_INT(type_t,vector_end) { \
285                 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
286                 for (i=0; i<fmt_ptr->n; i++) \
287                 { \
288                     if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
289                     if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
290                     if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
291                     if ((p[i])&1) { \
292                         sample_phased = 1; \
293                         break; \
294                     } \
295                 } \
296             }
297             switch (fmt_ptr->type) {
298                 case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
299                 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
300                 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
301                 default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
302             }
303             #undef BRANCH_INT
304             if (!sample_phased) {
305                 all_phased = 0;
306                 break;
307             }
308         }
309     }
310     return all_phased;
311 }
312 
subset_vcf(args_t * args,bcf1_t * line)313 int subset_vcf(args_t *args, bcf1_t *line)
314 {
315     if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
316     if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
317     if (args->novel || args->known)
318     {
319         if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
320         if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
321     }
322 
323     if (args->include || args->exclude)
324     {
325         int line_type = bcf_get_variant_types(line);
326         if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
327         if ( args->exclude &&   (line_type<<1) & args->exclude  ) return 0; // exclude given variant types
328     }
329 
330     if ( args->filter )
331     {
332         int ret = filter_test(args->filter, line, NULL);
333         if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
334         else if ( ret ) return 0;
335     }
336 
337     hts_expand(int, line->n_allele, args->mac, args->ac);
338     int i, an = 0, non_ref_ac = 0;
339     if (args->calc_ac) {
340         bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
341         for (i=1; i<line->n_allele; i++)
342             non_ref_ac += args->ac[i];
343         for (i=0; i<line->n_allele; i++)
344             an += args->ac[i];
345     }
346 
347     int update_ac = args->calc_ac;
348     if (args->n_samples)
349     {
350         int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
351         bcf_subset(args->hdr, line, args->n_samples, args->imap);
352         if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
353         if ( update_ac )
354         {
355             bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
356             an = 0;
357             for (i=0; i<line->n_allele; i++) {
358                 args->ac[i] = ac_sub[i];
359                 an += ac_sub[i];
360             }
361             for (i=1; i<line->n_allele; i++)
362                 non_ref_ac_sub += ac_sub[i];
363             if (args->private_vars) {
364                 if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
365                 if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
366             }
367             non_ref_ac = non_ref_ac_sub;
368         }
369         free(ac_sub);
370     }
371 
372     bcf_fmt_t *gt_fmt;
373     if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
374     {
375         int nhet = 0, nhom = 0, nmiss = 0;
376         for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
377         {
378             int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
379             if ( type==GT_HET_RA || type==GT_HET_AA )
380             {
381                 if ( args->gt_type==GT_NO_HET ) return 0;
382                 nhet = 1;
383             }
384             else if ( type==GT_UNKN )
385             {
386                 if ( args->gt_type==GT_NO_MISSING ) return 0;
387                 nmiss = 1;
388             }
389             else
390             {
391                 if ( args->gt_type==GT_NO_HOM ) return 0;
392                 nhom = 1;
393             }
394         }
395         if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
396         else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
397         else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
398     }
399 
400     int minor_ac = 0;
401     int major_ac = 0;
402     if ( args->calc_ac )
403     {
404         minor_ac = args->ac[0];
405         major_ac = args->ac[0];
406         for (i=1; i<line->n_allele; i++){
407             if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
408             if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
409         }
410     }
411 
412     if (args->min_ac!=-1)
413     {
414         if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
415         else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
416         else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
417         else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
418         else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
419     }
420     if (args->max_ac!=-1)
421     {
422         if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
423         else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
424         else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
425         else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
426         else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
427     }
428     if (args->min_af!=-1)
429     {
430         if (an == 0) return 0; // freq not defined, skip site
431         if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
432         else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
433         else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
434         else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
435         else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
436     }
437     if (args->max_af!=-1)
438     {
439         if (an == 0) return 0; // freq not defined, skip site
440         if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
441         else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
442         else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
443         else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
444         else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
445     }
446     if (args->uncalled) {
447         if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
448         if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
449     }
450     if (update_ac && args->update_info) {
451         bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
452         bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
453     }
454     if (args->trim_alts)
455     {
456         int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
457         if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
458     }
459     if (args->phased) {
460         int phased = bcf_all_phased(args->hdr, line);
461         if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
462         if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
463     }
464     if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
465     return 1;
466 }
467 
set_allele_type(int * atype,char * atype_string)468 void set_allele_type (int *atype, char *atype_string)
469 {
470     *atype = ALLELE_NONREF;
471     if (strcmp(atype_string, "minor") == 0) {
472         *atype = ALLELE_MINOR;
473     }
474     else if (strcmp(atype_string, "alt1") == 0) {
475         *atype = ALLELE_ALT1;
476     }
477     else if (strcmp(atype_string, "nref") == 0) {
478         *atype = ALLELE_NONREF;
479     }
480     else if (strcmp(atype_string, "major") == 0) {
481         *atype = ALLELE_MAJOR;
482     }
483     else if (strcmp(atype_string, "nonmajor") == 0) {
484         *atype = ALLELE_NONMAJOR;
485     }
486     else {
487         error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
488     }
489 }
490 
usage(args_t * args)491 static void usage(args_t *args)
492 {
493     fprintf(bcftools_stderr, "\n");
494     fprintf(bcftools_stderr, "About:   VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
495     fprintf(bcftools_stderr, "Usage:   bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
496     fprintf(bcftools_stderr, "\n");
497     fprintf(bcftools_stderr, "Output options:\n");
498     fprintf(bcftools_stderr, "    -G, --drop-genotypes              Drop individual genotype information (after subsetting if -s option set)\n");
499     fprintf(bcftools_stderr, "    -h, --header-only                 Print only the header in VCF output (equivalent to bcftools head)\n");
500     fprintf(bcftools_stderr, "    -H, --no-header                   Suppress the header in VCF output\n");
501     fprintf(bcftools_stderr, "        --with-header                 Print both header and records in VCF output [default]\n");
502     fprintf(bcftools_stderr, "    -l, --compression-level [0-9]     Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
503     fprintf(bcftools_stderr, "        --no-version                  Do not append version and command line to the header\n");
504     fprintf(bcftools_stderr, "    -o, --output FILE                 Output file name [bcftools_stdout]\n");
505     fprintf(bcftools_stderr, "    -O, --output-type u|b|v|z[0-9]    u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
506     fprintf(bcftools_stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
507     fprintf(bcftools_stderr, "    -R, --regions-file FILE           Restrict to regions listed in FILE\n");
508     fprintf(bcftools_stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
509     fprintf(bcftools_stderr, "    -t, --targets [^]REGION           Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
510     fprintf(bcftools_stderr, "    -T, --targets-file [^]FILE        Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
511     fprintf(bcftools_stderr, "        --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
512     fprintf(bcftools_stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
513     fprintf(bcftools_stderr, "\n");
514     fprintf(bcftools_stderr, "Subset options:\n");
515     fprintf(bcftools_stderr, "    -a, --trim-alt-alleles            Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
516     fprintf(bcftools_stderr, "    -I, --no-update                   Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
517     fprintf(bcftools_stderr, "    -s, --samples [^]LIST             Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
518     fprintf(bcftools_stderr, "    -S, --samples-file [^]FILE        File of samples to include (or exclude with \"^\" prefix)\n");
519     fprintf(bcftools_stderr, "        --force-samples               Only warn about unknown subset samples\n");
520     fprintf(bcftools_stderr, "\n");
521     fprintf(bcftools_stderr, "Filter options:\n");
522     fprintf(bcftools_stderr, "    -c/C, --min-ac/--max-ac INT[:TYPE]     Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
523     fprintf(bcftools_stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
524     fprintf(bcftools_stderr, "    -f,   --apply-filters LIST             Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
525     fprintf(bcftools_stderr, "    -g,   --genotype [^]hom|het|miss       Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
526     fprintf(bcftools_stderr, "    -i/e, --include/--exclude EXPR         Select/exclude sites for which the expression is true (see man page for details)\n");
527     fprintf(bcftools_stderr, "    -k/n, --known/--novel                  Select known/novel sites only (ID is not/is '.')\n");
528     fprintf(bcftools_stderr, "    -m/M, --min-alleles/--max-alleles INT  Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
529     fprintf(bcftools_stderr, "    -p/P, --phased/--exclude-phased        Select/exclude sites where all samples are phased\n");
530     fprintf(bcftools_stderr, "    -q/Q, --min-af/--max-af FLOAT[:TYPE]   Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
531     fprintf(bcftools_stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
532     fprintf(bcftools_stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
533     fprintf(bcftools_stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
534     fprintf(bcftools_stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
535     fprintf(bcftools_stderr, "\n");
536     bcftools_exit(1);
537 }
538 
main_vcfview(int argc,char * argv[])539 int main_vcfview(int argc, char *argv[])
540 {
541     int c;
542     args_t *args  = (args_t*) calloc(1,sizeof(args_t));
543     args->argc    = argc; args->argv = argv;
544     args->files   = bcf_sr_init();
545     args->clevel  = -1;
546     args->print_header = 1;
547     args->update_info = 1;
548     args->output_type = FT_VCF;
549     args->n_threads = 0;
550     args->record_cmd_line = 1;
551     args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
552     args->regions_overlap = 1;
553     args->targets_overlap = 0;
554     int targets_is_file = 0, regions_is_file = 0;
555 
556     static struct option loptions[] =
557     {
558         {"genotype",required_argument,NULL,'g'},
559         {"compression-level",required_argument,NULL,'l'},
560         {"threads",required_argument,NULL,9},
561         {"header-only",no_argument,NULL,'h'},
562         {"no-header",no_argument,NULL,'H'},
563         {"with-header",no_argument,NULL,4},
564         {"exclude",required_argument,NULL,'e'},
565         {"include",required_argument,NULL,'i'},
566         {"trim-alt-alleles",no_argument,NULL,'a'},
567         {"no-update",no_argument,NULL,'I'},
568         {"drop-genotypes",no_argument,NULL,'G'},
569         {"private",no_argument,NULL,'x'},
570         {"exclude-private",no_argument,NULL,'X'},
571         {"uncalled",no_argument,NULL,'u'},
572         {"exclude-uncalled",no_argument,NULL,'U'},
573         {"apply-filters",required_argument,NULL,'f'},
574         {"known",no_argument,NULL,'k'},
575         {"novel",no_argument,NULL,'n'},
576         {"min-alleles",required_argument,NULL,'m'},
577         {"max-alleles",required_argument,NULL,'M'},
578         {"samples",required_argument,NULL,'s'},
579         {"samples-file",required_argument,NULL,'S'},
580         {"force-samples",no_argument,NULL,1},
581         {"output-type",required_argument,NULL,'O'},
582         {"output-file",required_argument,NULL,'o'},
583         {"output",required_argument,NULL,'o'},
584         {"types",required_argument,NULL,'v'},
585         {"exclude-types",required_argument,NULL,'V'},
586         {"targets",required_argument,NULL,'t'},
587         {"targets-file",required_argument,NULL,'T'},
588         {"targets-overlap",required_argument,NULL,2},
589         {"regions",required_argument,NULL,'r'},
590         {"regions-file",required_argument,NULL,'R'},
591         {"regions-overlap",required_argument,NULL,3},
592         {"min-ac",required_argument,NULL,'c'},
593         {"max-ac",required_argument,NULL,'C'},
594         {"min-af",required_argument,NULL,'q'},
595         {"max-af",required_argument,NULL,'Q'},
596         {"phased",no_argument,NULL,'p'},
597         {"exclude-phased",no_argument,NULL,'P'},
598         {"no-version",no_argument,NULL,8},
599         {NULL,0,NULL,0}
600     };
601     char *tmp;
602     while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
603     {
604         char allele_type[9] = "nref";
605         switch (c)
606         {
607             case 'O':
608                 switch (optarg[0]) {
609                     case 'b': args->output_type = FT_BCF_GZ; break;
610                     case 'u': args->output_type = FT_BCF; break;
611                     case 'z': args->output_type = FT_VCF_GZ; break;
612                     case 'v': args->output_type = FT_VCF; break;
613                     default:
614                     {
615                         args->clevel = strtol(optarg,&tmp,10);
616                         if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
617                     }
618                 };
619                 if ( optarg[1] )
620                 {
621                     args->clevel = strtol(optarg+1,&tmp,10);
622                     if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
623                 }
624                 break;
625             case 'l':
626                 args->clevel = strtol(optarg,&tmp,10);
627                 if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
628                 args->output_type |= FT_GZ;
629                 break;
630             case 'o': args->fn_out = optarg; break;
631             case 'H': args->print_header = 0; break;
632             case 'h': args->header_only = 1; break;
633             case  4 : args->print_header = 1; args->header_only = 0; break;
634 
635             case 't': args->targets_list = optarg; break;
636             case 'T': args->targets_list = optarg; targets_is_file = 1; break;
637             case 'r': args->regions_list = optarg; break;
638             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
639 
640             case 's': args->sample_names = optarg; break;
641             case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
642             case  1 : args->force_samples = 1; break;
643             case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
644             case 'I': args->update_info = 0; break;
645             case 'G': args->sites_only = 1; break;
646 
647             case 'f': args->files->apply_filters = optarg; break;
648             case 'k': args->known = 1; break;
649             case 'n': args->novel = 1; break;
650             case 'm':
651                 args->min_alleles = strtol(optarg,&tmp,10);
652                 if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
653                 break;
654             case 'M':
655                 args->max_alleles = strtol(optarg,&tmp,10);
656                 if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
657                 break;
658             case 'v': args->include_types = optarg; break;
659             case 'V': args->exclude_types = optarg; break;
660             case 'e':
661                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
662                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
663             case 'i':
664                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
665                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
666             case 'c':
667             {
668                 args->min_ac_type = ALLELE_NONREF;
669                 if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
670                     error("Error: Could not parse --min-ac %s\n", optarg);
671                 set_allele_type(&args->min_ac_type, allele_type);
672                 args->calc_ac = 1;
673                 break;
674             }
675             case 'C':
676             {
677                 args->max_ac_type = ALLELE_NONREF;
678                 if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
679                     error("Error: Could not parse --max-ac %s\n", optarg);
680                 set_allele_type(&args->max_ac_type, allele_type);
681                 args->calc_ac = 1;
682                 break;
683             }
684             case 'q':
685             {
686                 args->min_af_type = ALLELE_NONREF;
687                 if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
688                     error("Error: Could not parse --min-af %s\n", optarg);
689                 set_allele_type(&args->min_af_type, allele_type);
690                 args->calc_ac = 1;
691                 break;
692             }
693             case 'Q':
694             {
695                 args->max_af_type = ALLELE_NONREF;
696                 if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
697                     error("Error: Could not parse --max-af %s\n", optarg);
698                 set_allele_type(&args->max_af_type, allele_type);
699                 args->calc_ac = 1;
700                 break;
701             }
702 
703             case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
704             case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
705             case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
706             case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
707             case 'p': args->phased |= FLT_INCLUDE; break; // phased
708             case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
709             case 'g':
710             {
711                 if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
712                 else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
713                 else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
714                 else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
715                 else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
716                 else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
717                 else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
718                 break;
719             }
720             case  2 :
721                 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
722                 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
723                 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
724                 else error("Could not parse: --targets-overlap %s\n",optarg);
725                 break;
726             case  3 :
727                 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
728                 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
729                 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
730                 else error("Could not parse: --regions-overlap %s\n",optarg);
731                 break;
732             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
733             case  8 : args->record_cmd_line = 0; break;
734             case '?': usage(args); break;
735             default: error("Unknown argument: %s\n", optarg);
736         }
737     }
738 
739     if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
740     if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
741     if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
742     if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
743 
744     if ( args->sample_names && args->update_info) args->calc_ac = 1;
745 
746     char *fname = NULL;
747     if ( optind>=argc )
748     {
749         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
750         else usage(args);
751     }
752     else fname = argv[optind];
753 
754     // read in the regions from the command line
755     if ( args->regions_list )
756     {
757         bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
758         if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
759             error("Failed to read the regions: %s\n", args->regions_list);
760     }
761     else if ( optind+1 < argc )
762     {
763         int i;
764         kstring_t tmp = {0,0,0};
765         kputs(argv[optind+1],&tmp);
766         for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
767         if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
768             error("Failed to read the regions: %s\n", tmp.s);
769         free(tmp.s);
770     }
771     if ( args->targets_list )
772     {
773         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
774         if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
775             error("Failed to read the targets: %s\n", args->targets_list);
776     }
777 
778     if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
779     if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
780 
781     init_data(args);
782     bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
783     if (args->print_header)
784     {
785         if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
786     }
787     else if ( args->output_type & FT_BCF )
788         error("BCF output requires header, cannot proceed with -H\n");
789 
790     int ret = 0;
791     if (!args->header_only)
792     {
793         while ( bcf_sr_next_line(args->files) )
794         {
795             bcf1_t *line = args->files->readers[0].buffer[0];
796             if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
797             if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
798         }
799         ret = args->files->errnum;
800         if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
801     }
802     hts_close(args->out);
803     destroy_data(args);
804     bcf_sr_destroy(args->files);
805     free(args);
806     return ret;
807 }
808