1 /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Shane McCarthy <sm15@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <strings.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <ctype.h>
30 #include <string.h>
31 #include <errno.h>
32 #include <sys/stat.h>
33 #include <sys/types.h>
34 #include <math.h>
35 #include <inttypes.h>
36 #include <htslib/vcf.h>
37 #include <htslib/synced_bcf_reader.h>
38 #include <htslib/vcfutils.h>
39 #include "bcftools.h"
40 #include "filter.h"
41 #include "htslib/khash_str2int.h"
42 
43 #define FLT_INCLUDE 1
44 #define FLT_EXCLUDE 2
45 
46 #define ALLELE_NONREF 1
47 #define ALLELE_MINOR 2
48 #define ALLELE_ALT1 3
49 #define ALLELE_MAJOR 4
50 #define ALLELE_NONMAJOR 5
51 
52 #define GT_NEED_HOM 1
53 #define GT_NEED_HET 2
54 #define GT_NO_HOM   3
55 #define GT_NO_HET   4
56 #define GT_NEED_MISSING 5
57 #define GT_NO_MISSING 6
58 
59 typedef struct _args_t
60 {
61     filter_t *filter;
62     char *filter_str;
63     int filter_logic;   // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)
64 
65     bcf_srs_t *files;
66     bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
67     char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
68     int regions_overlap, targets_overlap;
69     int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
70     int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
71     int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
72     int *ac, mac;
73     float min_af, max_af;
74     char *fn_ref, *fn_out, **samples;
75     int sample_is_file, force_samples;
76     char *include_types, *exclude_types;
77     int include, exclude;
78     int record_cmd_line;
79     htsFile *out;
80 }
81 args_t;
82 
init_data(args_t * args)83 static void init_data(args_t *args)
84 {
85     int i;
86     args->hdr = args->files->readers[0].header;
87 
88     if (args->calc_ac && args->update_info)
89     {
90         if (bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">") < 0)
91             error_errno("[%s] Failed to add \"AC\" INFO header", __func__);
92         if (bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">") < 0)
93             error_errno("[%s] Failed to add \"AN\" INFO header", __func__);
94     }
95     if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
96     else if (bcf_hdr_sync(args->hdr) < 0)
97         error_errno("[%s] Failed to update header", __func__);
98 
99     // setup sample data
100     if (args->sample_names)
101     {
102         void *hdr_samples = khash_str2int_init();
103         for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
104             khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
105 
106         void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
107         int nsmpl;
108         char **smpl = NULL;
109         args->samples = NULL; args->n_samples = 0;
110         smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
111         if ( !smpl )
112         {
113             error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
114         }
115 
116         if ( exclude )
117         {
118             for (i=0; i<nsmpl; i++) {
119                 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
120                     if (args->force_samples) {
121                         fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
122                     } else {
123                         error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
124                     }
125                 }
126                 khash_str2int_inc(exclude, smpl[i]);
127             }
128 
129             for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
130             {
131                 if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i))  ) continue;
132                 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
133                 args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
134             }
135             khash_str2int_destroy(exclude);
136         }
137         else
138         {
139             for (i=0; i<nsmpl; i++) {
140                 if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
141                     if (args->force_samples) {
142                         fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
143                         continue;
144                     } else {
145                         error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
146                     }
147                 }
148                 args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
149                 args->samples[args->n_samples++] = strdup(smpl[i]);
150             }
151         }
152         for (i=0; i<nsmpl; i++) free(smpl[i]);
153         free(smpl);
154         khash_str2int_destroy(hdr_samples);
155         if (args->n_samples == 0) {
156             fprintf(stderr, "Warn: subsetting has removed all samples\n");
157             args->sites_only = 1;
158         }
159     }
160 
161     if (args->n_samples)
162         args->imap = (int*)malloc(args->n_samples * sizeof(int));
163 
164     // determine variant types to include/exclude
165     if (args->include_types || args->exclude_types) {
166         if (args->include_types && args->exclude_types) {
167             fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
168             exit(1);
169         }
170         char **type_list = 0;
171         int m = 0, n = 0;
172         const char *q, *p;
173         for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
174             if (*p == ',' || *p == 0) {
175                 if (m == n) {
176                     m = m? m<<1 : 16;
177                     type_list = (char**)realloc(type_list, m * sizeof(char*));
178                 }
179                 type_list[n] = (char*)calloc(p - q + 1, 1);
180                 strncpy(type_list[n++], q, p - q);
181                 q = p + 1;
182                 if (*p == 0) break;
183             }
184         }
185         type_list = (char**)realloc(type_list, n * sizeof(char*));
186 
187         if (args->include_types) {
188             args->include = 0;
189             for (i = 0; i < n; ++i) {
190                 if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
191                 else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
192                 else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
193                 else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
194                 else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
195                 else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
196                 else {
197                     fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
198                     fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
199                     exit(1);
200                 }
201             }
202         }
203         if (args->exclude_types) {
204             args->exclude = 0;
205             for (i = 0; i < n; ++i) {
206                 if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
207                 else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
208                 else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
209                 else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
210                 else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
211                 else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
212                 else {
213                     fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
214                     fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
215                     exit(1);
216                 }
217             }
218         }
219         for (i = 0; i < n; ++i)
220             free(type_list[i]);
221         free(type_list);
222     }
223 
224     char wmode[8];
225     set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
226     args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
227     if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
228     if ( args->n_threads > 0)
229         hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
230 
231     // headers: hdr=full header, hsub=subset header, hnull=sites only header
232     if (args->sites_only){
233         args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
234         bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
235     }
236     if (args->n_samples > 0)
237     {
238         args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
239         if ( !args->hsub ) error("Error occurred while subsetting samples\n");
240         if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
241         {
242             int i;
243             for (i=0; i<args->n_samples; i++)
244                 if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
245         }
246     }
247 
248     if ( args->filter_str )
249         args->filter = filter_init(args->hdr, args->filter_str);
250 }
251 
destroy_data(args_t * args)252 static void destroy_data(args_t *args)
253 {
254     int i;
255     if ( args->imap ) {
256         for (i = 0; i < args->n_samples; ++i)
257             free(args->samples[i]);
258         free(args->samples);
259         free(args->imap);
260     }
261     if (args->hnull) bcf_hdr_destroy(args->hnull);
262     if (args->hsub) bcf_hdr_destroy(args->hsub);
263     if ( args->filter )
264         filter_destroy(args->filter);
265     free(args->ac);
266 }
267 
268 // true if all samples are phased.
269 // haploid genotypes are considered phased
270 // ./. => not phased, .|. => phased
bcf_all_phased(const bcf_hdr_t * header,bcf1_t * line)271 int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
272 {
273     bcf_unpack(line, BCF_UN_FMT);
274     bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
275     int all_phased = 1;
276     if ( fmt_ptr )
277     {
278         int i, isample;
279         for (isample=0; isample<line->n_sample; isample++)
280         {
281             int sample_phased = 0;
282             #define BRANCH_INT(type_t,vector_end) { \
283                 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
284                 for (i=0; i<fmt_ptr->n; i++) \
285                 { \
286                     if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
287                     if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
288                     if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
289                     if ((p[i])&1) { \
290                         sample_phased = 1; \
291                         break; \
292                     } \
293                 } \
294             }
295             switch (fmt_ptr->type) {
296                 case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
297                 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
298                 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
299                 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
300             }
301             #undef BRANCH_INT
302             if (!sample_phased) {
303                 all_phased = 0;
304                 break;
305             }
306         }
307     }
308     return all_phased;
309 }
310 
subset_vcf(args_t * args,bcf1_t * line)311 int subset_vcf(args_t *args, bcf1_t *line)
312 {
313     if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
314     if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
315     if (args->novel || args->known)
316     {
317         if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
318         if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
319     }
320 
321     if (args->include || args->exclude)
322     {
323         int line_type = bcf_get_variant_types(line);
324         if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
325         if ( args->exclude &&   (line_type<<1) & args->exclude  ) return 0; // exclude given variant types
326     }
327 
328     if ( args->filter )
329     {
330         int ret = filter_test(args->filter, line, NULL);
331         if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
332         else if ( ret ) return 0;
333     }
334 
335     hts_expand(int, line->n_allele, args->mac, args->ac);
336     int i, an = 0, non_ref_ac = 0;
337     if (args->calc_ac) {
338         bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
339         for (i=1; i<line->n_allele; i++)
340             non_ref_ac += args->ac[i];
341         for (i=0; i<line->n_allele; i++)
342             an += args->ac[i];
343     }
344 
345     int update_ac = args->calc_ac;
346     if (args->n_samples)
347     {
348         int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
349         bcf_subset(args->hdr, line, args->n_samples, args->imap);
350         if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
351         if ( update_ac )
352         {
353             bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
354             an = 0;
355             for (i=0; i<line->n_allele; i++) {
356                 args->ac[i] = ac_sub[i];
357                 an += ac_sub[i];
358             }
359             for (i=1; i<line->n_allele; i++)
360                 non_ref_ac_sub += ac_sub[i];
361             if (args->private_vars) {
362                 if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
363                 if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
364             }
365             non_ref_ac = non_ref_ac_sub;
366         }
367         free(ac_sub);
368     }
369 
370     bcf_fmt_t *gt_fmt;
371     if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
372     {
373         int nhet = 0, nhom = 0, nmiss = 0;
374         for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
375         {
376             int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
377             if ( type==GT_HET_RA || type==GT_HET_AA )
378             {
379                 if ( args->gt_type==GT_NO_HET ) return 0;
380                 nhet = 1;
381             }
382             else if ( type==GT_UNKN )
383             {
384                 if ( args->gt_type==GT_NO_MISSING ) return 0;
385                 nmiss = 1;
386             }
387             else
388             {
389                 if ( args->gt_type==GT_NO_HOM ) return 0;
390                 nhom = 1;
391             }
392         }
393         if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
394         else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
395         else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
396     }
397 
398     int minor_ac = 0;
399     int major_ac = 0;
400     if ( args->calc_ac )
401     {
402         minor_ac = args->ac[0];
403         major_ac = args->ac[0];
404         for (i=1; i<line->n_allele; i++){
405             if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
406             if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
407         }
408     }
409 
410     if (args->min_ac!=-1)
411     {
412         if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
413         else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
414         else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
415         else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
416         else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
417     }
418     if (args->max_ac!=-1)
419     {
420         if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
421         else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
422         else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
423         else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
424         else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
425     }
426     if (args->min_af!=-1)
427     {
428         if (an == 0) return 0; // freq not defined, skip site
429         if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
430         else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
431         else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
432         else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
433         else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
434     }
435     if (args->max_af!=-1)
436     {
437         if (an == 0) return 0; // freq not defined, skip site
438         if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
439         else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
440         else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
441         else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
442         else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
443     }
444     if (args->uncalled) {
445         if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
446         if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
447     }
448     if (update_ac && args->update_info) {
449         bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
450         bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
451     }
452     if (args->trim_alts)
453     {
454         int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
455         if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
456     }
457     if (args->phased) {
458         int phased = bcf_all_phased(args->hdr, line);
459         if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
460         if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
461     }
462     if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
463     return 1;
464 }
465 
set_allele_type(int * atype,char * atype_string)466 void set_allele_type (int *atype, char *atype_string)
467 {
468     *atype = ALLELE_NONREF;
469     if (strcmp(atype_string, "minor") == 0) {
470         *atype = ALLELE_MINOR;
471     }
472     else if (strcmp(atype_string, "alt1") == 0) {
473         *atype = ALLELE_ALT1;
474     }
475     else if (strcmp(atype_string, "nref") == 0) {
476         *atype = ALLELE_NONREF;
477     }
478     else if (strcmp(atype_string, "major") == 0) {
479         *atype = ALLELE_MAJOR;
480     }
481     else if (strcmp(atype_string, "nonmajor") == 0) {
482         *atype = ALLELE_NONMAJOR;
483     }
484     else {
485         error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
486     }
487 }
488 
usage(args_t * args)489 static void usage(args_t *args)
490 {
491     fprintf(stderr, "\n");
492     fprintf(stderr, "About:   VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
493     fprintf(stderr, "Usage:   bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
494     fprintf(stderr, "\n");
495     fprintf(stderr, "Output options:\n");
496     fprintf(stderr, "    -G, --drop-genotypes              Drop individual genotype information (after subsetting if -s option set)\n");
497     fprintf(stderr, "    -h, --header-only                 Print only the header in VCF output (equivalent to bcftools head)\n");
498     fprintf(stderr, "    -H, --no-header                   Suppress the header in VCF output\n");
499     fprintf(stderr, "        --with-header                 Print both header and records in VCF output [default]\n");
500     fprintf(stderr, "    -l, --compression-level [0-9]     Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
501     fprintf(stderr, "        --no-version                  Do not append version and command line to the header\n");
502     fprintf(stderr, "    -o, --output FILE                 Output file name [stdout]\n");
503     fprintf(stderr, "    -O, --output-type u|b|v|z[0-9]    u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
504     fprintf(stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
505     fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in FILE\n");
506     fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
507     fprintf(stderr, "    -t, --targets [^]REGION           Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
508     fprintf(stderr, "    -T, --targets-file [^]FILE        Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
509     fprintf(stderr, "        --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
510     fprintf(stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
511     fprintf(stderr, "\n");
512     fprintf(stderr, "Subset options:\n");
513     fprintf(stderr, "    -a, --trim-alt-alleles            Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
514     fprintf(stderr, "    -I, --no-update                   Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
515     fprintf(stderr, "    -s, --samples [^]LIST             Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
516     fprintf(stderr, "    -S, --samples-file [^]FILE        File of samples to include (or exclude with \"^\" prefix)\n");
517     fprintf(stderr, "        --force-samples               Only warn about unknown subset samples\n");
518     fprintf(stderr, "\n");
519     fprintf(stderr, "Filter options:\n");
520     fprintf(stderr, "    -c/C, --min-ac/--max-ac INT[:TYPE]     Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
521     fprintf(stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
522     fprintf(stderr, "    -f,   --apply-filters LIST             Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
523     fprintf(stderr, "    -g,   --genotype [^]hom|het|miss       Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
524     fprintf(stderr, "    -i/e, --include/--exclude EXPR         Select/exclude sites for which the expression is true (see man page for details)\n");
525     fprintf(stderr, "    -k/n, --known/--novel                  Select known/novel sites only (ID is not/is '.')\n");
526     fprintf(stderr, "    -m/M, --min-alleles/--max-alleles INT  Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
527     fprintf(stderr, "    -p/P, --phased/--exclude-phased        Select/exclude sites where all samples are phased\n");
528     fprintf(stderr, "    -q/Q, --min-af/--max-af FLOAT[:TYPE]   Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
529     fprintf(stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
530     fprintf(stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
531     fprintf(stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
532     fprintf(stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
533     fprintf(stderr, "\n");
534     exit(1);
535 }
536 
main_vcfview(int argc,char * argv[])537 int main_vcfview(int argc, char *argv[])
538 {
539     int c;
540     args_t *args  = (args_t*) calloc(1,sizeof(args_t));
541     args->argc    = argc; args->argv = argv;
542     args->files   = bcf_sr_init();
543     args->clevel  = -1;
544     args->print_header = 1;
545     args->update_info = 1;
546     args->output_type = FT_VCF;
547     args->n_threads = 0;
548     args->record_cmd_line = 1;
549     args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
550     args->regions_overlap = 1;
551     args->targets_overlap = 0;
552     int targets_is_file = 0, regions_is_file = 0;
553 
554     static struct option loptions[] =
555     {
556         {"genotype",required_argument,NULL,'g'},
557         {"compression-level",required_argument,NULL,'l'},
558         {"threads",required_argument,NULL,9},
559         {"header-only",no_argument,NULL,'h'},
560         {"no-header",no_argument,NULL,'H'},
561         {"with-header",no_argument,NULL,4},
562         {"exclude",required_argument,NULL,'e'},
563         {"include",required_argument,NULL,'i'},
564         {"trim-alt-alleles",no_argument,NULL,'a'},
565         {"no-update",no_argument,NULL,'I'},
566         {"drop-genotypes",no_argument,NULL,'G'},
567         {"private",no_argument,NULL,'x'},
568         {"exclude-private",no_argument,NULL,'X'},
569         {"uncalled",no_argument,NULL,'u'},
570         {"exclude-uncalled",no_argument,NULL,'U'},
571         {"apply-filters",required_argument,NULL,'f'},
572         {"known",no_argument,NULL,'k'},
573         {"novel",no_argument,NULL,'n'},
574         {"min-alleles",required_argument,NULL,'m'},
575         {"max-alleles",required_argument,NULL,'M'},
576         {"samples",required_argument,NULL,'s'},
577         {"samples-file",required_argument,NULL,'S'},
578         {"force-samples",no_argument,NULL,1},
579         {"output-type",required_argument,NULL,'O'},
580         {"output-file",required_argument,NULL,'o'},
581         {"output",required_argument,NULL,'o'},
582         {"types",required_argument,NULL,'v'},
583         {"exclude-types",required_argument,NULL,'V'},
584         {"targets",required_argument,NULL,'t'},
585         {"targets-file",required_argument,NULL,'T'},
586         {"targets-overlap",required_argument,NULL,2},
587         {"regions",required_argument,NULL,'r'},
588         {"regions-file",required_argument,NULL,'R'},
589         {"regions-overlap",required_argument,NULL,3},
590         {"min-ac",required_argument,NULL,'c'},
591         {"max-ac",required_argument,NULL,'C'},
592         {"min-af",required_argument,NULL,'q'},
593         {"max-af",required_argument,NULL,'Q'},
594         {"phased",no_argument,NULL,'p'},
595         {"exclude-phased",no_argument,NULL,'P'},
596         {"no-version",no_argument,NULL,8},
597         {NULL,0,NULL,0}
598     };
599     char *tmp;
600     while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
601     {
602         char allele_type[9] = "nref";
603         switch (c)
604         {
605             case 'O':
606                 switch (optarg[0]) {
607                     case 'b': args->output_type = FT_BCF_GZ; break;
608                     case 'u': args->output_type = FT_BCF; break;
609                     case 'z': args->output_type = FT_VCF_GZ; break;
610                     case 'v': args->output_type = FT_VCF; break;
611                     default:
612                     {
613                         args->clevel = strtol(optarg,&tmp,10);
614                         if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
615                     }
616                 };
617                 if ( optarg[1] )
618                 {
619                     args->clevel = strtol(optarg+1,&tmp,10);
620                     if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
621                 }
622                 break;
623             case 'l':
624                 args->clevel = strtol(optarg,&tmp,10);
625                 if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
626                 args->output_type |= FT_GZ;
627                 break;
628             case 'o': args->fn_out = optarg; break;
629             case 'H': args->print_header = 0; break;
630             case 'h': args->header_only = 1; break;
631             case  4 : args->print_header = 1; args->header_only = 0; break;
632 
633             case 't': args->targets_list = optarg; break;
634             case 'T': args->targets_list = optarg; targets_is_file = 1; break;
635             case 'r': args->regions_list = optarg; break;
636             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
637 
638             case 's': args->sample_names = optarg; break;
639             case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
640             case  1 : args->force_samples = 1; break;
641             case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
642             case 'I': args->update_info = 0; break;
643             case 'G': args->sites_only = 1; break;
644 
645             case 'f': args->files->apply_filters = optarg; break;
646             case 'k': args->known = 1; break;
647             case 'n': args->novel = 1; break;
648             case 'm':
649                 args->min_alleles = strtol(optarg,&tmp,10);
650                 if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
651                 break;
652             case 'M':
653                 args->max_alleles = strtol(optarg,&tmp,10);
654                 if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
655                 break;
656             case 'v': args->include_types = optarg; break;
657             case 'V': args->exclude_types = optarg; break;
658             case 'e':
659                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
660                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
661             case 'i':
662                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
663                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
664             case 'c':
665             {
666                 args->min_ac_type = ALLELE_NONREF;
667                 if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
668                     error("Error: Could not parse --min-ac %s\n", optarg);
669                 set_allele_type(&args->min_ac_type, allele_type);
670                 args->calc_ac = 1;
671                 break;
672             }
673             case 'C':
674             {
675                 args->max_ac_type = ALLELE_NONREF;
676                 if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
677                     error("Error: Could not parse --max-ac %s\n", optarg);
678                 set_allele_type(&args->max_ac_type, allele_type);
679                 args->calc_ac = 1;
680                 break;
681             }
682             case 'q':
683             {
684                 args->min_af_type = ALLELE_NONREF;
685                 if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
686                     error("Error: Could not parse --min-af %s\n", optarg);
687                 set_allele_type(&args->min_af_type, allele_type);
688                 args->calc_ac = 1;
689                 break;
690             }
691             case 'Q':
692             {
693                 args->max_af_type = ALLELE_NONREF;
694                 if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
695                     error("Error: Could not parse --max-af %s\n", optarg);
696                 set_allele_type(&args->max_af_type, allele_type);
697                 args->calc_ac = 1;
698                 break;
699             }
700 
701             case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
702             case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
703             case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
704             case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
705             case 'p': args->phased |= FLT_INCLUDE; break; // phased
706             case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
707             case 'g':
708             {
709                 if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
710                 else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
711                 else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
712                 else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
713                 else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
714                 else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
715                 else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
716                 break;
717             }
718             case  2 :
719                 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
720                 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
721                 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
722                 else error("Could not parse: --targets-overlap %s\n",optarg);
723                 break;
724             case  3 :
725                 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
726                 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
727                 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
728                 else error("Could not parse: --regions-overlap %s\n",optarg);
729                 break;
730             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
731             case  8 : args->record_cmd_line = 0; break;
732             case '?': usage(args); break;
733             default: error("Unknown argument: %s\n", optarg);
734         }
735     }
736 
737     if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
738     if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
739     if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
740     if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");
741 
742     if ( args->sample_names && args->update_info) args->calc_ac = 1;
743 
744     char *fname = NULL;
745     if ( optind>=argc )
746     {
747         if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
748         else usage(args);
749     }
750     else fname = argv[optind];
751 
752     // read in the regions from the command line
753     if ( args->regions_list )
754     {
755         bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
756         if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
757             error("Failed to read the regions: %s\n", args->regions_list);
758     }
759     else if ( optind+1 < argc )
760     {
761         int i;
762         kstring_t tmp = {0,0,0};
763         kputs(argv[optind+1],&tmp);
764         for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
765         if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
766             error("Failed to read the regions: %s\n", tmp.s);
767         free(tmp.s);
768     }
769     if ( args->targets_list )
770     {
771         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
772         if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
773             error("Failed to read the targets: %s\n", args->targets_list);
774     }
775 
776     if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
777     if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
778 
779     init_data(args);
780     bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
781     if (args->print_header)
782     {
783         if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
784     }
785     else if ( args->output_type & FT_BCF )
786         error("BCF output requires header, cannot proceed with -H\n");
787 
788     int ret = 0;
789     if (!args->header_only)
790     {
791         while ( bcf_sr_next_line(args->files) )
792         {
793             bcf1_t *line = args->files->readers[0].buffer[0];
794             if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
795             if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
796         }
797         ret = args->files->errnum;
798         if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
799     }
800     hts_close(args->out);
801     destroy_data(args);
802     bcf_sr_destroy(args->files);
803     free(args);
804     return ret;
805 }
806