1 /* The MIT License
2 
3    Copyright (c) 2019-2021 Genome Research Ltd.
4 
5    Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <unistd.h>     // for isatty
32 #include <inttypes.h>
33 #include <htslib/hts.h>
34 #include <htslib/vcf.h>
35 #include <htslib/bgzf.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kseq.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/khash_str2int.h>
40 #include <regex.h>
41 #include "../bcftools.h"
42 #include "../filter.h"
43 #include "../convert.h"
44 #include "../cols.h"
45 
46 
47 // Logic of the filters: include or exclude sites which match the filters?
48 #define FLT_INCLUDE 1
49 #define FLT_EXCLUDE 2
50 
51 #define SELECT_TR_ALL       0
52 #define SELECT_TR_WORST     1
53 #define SELECT_TR_PRIMARY   2
54 #define SELECT_CSQ_ANY      -1
55 
56 typedef struct
57 {
58     regex_t *regex;
59     char *type;
60 }
61 col2type_t;
62 
63 typedef struct
64 {
65     char *field;    // the name of the VEP field, e.g. Consequence,Gene,etc.
66     char *tag;      // the name of the VCF tag: the annot_t.field with the -p prefix
67     int idx;        // 0-based index within the VEP annotation string
68     int type;       // annotation type, one of the BCF_HT_* types
69     kstring_t str;  // annotation value, ready to pass to bcf_update_info_*
70 }
71 annot_t;
72 
73 typedef struct
74 {
75     convert_t *convert;
76     filter_t *filter;
77     int argc, filter_logic, regions_is_file, targets_is_file, list_hdr, record_cmd_line, clevel;
78     int regions_overlap, targets_overlap;
79     kstring_t kstr;
80     char *filter_str,
81         *vep_tag;       // the --annotation INFO tag to process
82     char **argv, *output_fname, *fname, *regions, *targets, *format_str;
83     int output_type;
84     htsFile *fh_vcf;
85     BGZF *fh_bgzf;
86     bcf_srs_t *sr;
87     bcf_hdr_t *hdr, *hdr_out;
88     int nfield;         // number of all available VEP fields
89     char **field;       // list of all available VEP fields
90     int nannot;         // number of requested fields
91     annot_t *annot;     // requested fields
92     int nscale;         // number of items in the severity scale
93     char **scale;       // severity scale (list)
94     int ncsq_str;       // the length of csq_str allocated by bcf_get_info_string()
95     char *csq_str;      // the current bcf_get_info_string() result
96     int csq_idx,        // the index of the Consequence field; for the --select CSQ option
97         primary_id;     // the index of the CANONICAL field; for the --select TR option
98     char *severity,     // the --severity scale option
99         *select,        // the --select option
100         *column_str,    // the --columns option
101         *column_types,  // the --columns-types option
102         *annot_prefix;  // the --annot-prefix option
103     void *field2idx,    // VEP field name to index, used in initialization
104         *csq2severity;  // consequence type to severity score
105     cols_t *cols_tr,    // the current CSQ tag split into transcripts
106         *cols_csq;      // the current CSQ transcript split into fields
107     int min_severity, max_severity;     // ignore consequences outside this severity range
108     int drop_sites;                     // the -x, --drop-sites option
109     int select_tr;                      // one of SELECT_TR_*
110     uint8_t *smpl_pass;                 // for filtering at sample level, used with -f
111     int duplicate;              // the -d, --duplicate option is set
112     char *all_fields_delim;     // the -A, --all-fields option is set
113     float *farr;                // helper arrays for bcf_update_* functions
114     int32_t *iarr;
115     int niarr,miarr, nfarr,mfarr;
116     col2type_t *column2type;
117     int ncolumn2type;
118     int raw_vep_request;        // raw VEP tag requested and will need subsetting
119     int allow_undef_tags;
120 }
121 args_t;
122 
123 args_t args;
124 
about(void)125 const char *about(void)
126 {
127     return "Query structured annotations such as the CSQ created by VEP.\n";
128 }
129 
default_severity(void)130 static const char *default_severity(void)
131 {
132     return
133         "# Default consequence substrings ordered in ascending order by severity.\n"
134         "# Consequences with the same severity can be put on the same line in arbitrary order.\n"
135         "# See also https://m.ensembl.org/info/genome/variation/prediction/predicted_data.htm\n"
136         "intergenic\n"
137         "feature_truncation feature_elongation\n"
138         "regulatory\n"
139         "TF_binding_site TFBS\n"
140         "downstream upstream\n"
141         "non_coding_transcript non_coding\n"
142         "intron NMD_transcript\n"
143         "non_coding_transcript_exon\n"
144         "5_prime_utr 3_prime_utr\n"
145         "coding_sequence mature_miRNA\n"
146         "stop_retained start_retained synonymous\n"
147         "incomplete_terminal_codon\n"
148         "splice_region\n"
149         "missense inframe protein_altering\n"
150         "transcript_amplification\n"
151         "exon_loss\n"
152         "disruptive\n"
153         "start_lost stop_lost stop_gained frameshift\n"
154         "splice_acceptor splice_donor\n"
155         "transcript_ablation\n";
156 }
default_column_types(void)157 static const char *default_column_types(void)
158 {
159     return
160         "# Default CSQ subfield types, unlisted fields are type String.\n"
161         "# Note the use of regular expressions.\n"
162         "cDNA_position              Integer\n"
163         "CDS_position               Integer\n"
164         "Protein_position           Integer\n"
165         "DISTANCE                   Integer\n"
166         "STRAND                     Integer\n"
167         "TSL                        Integer\n"
168         "GENE_PHENO                 Integer\n"
169         "HGVS_OFFSET                Integer\n"
170         "AF                         Float\n"
171         ".*_AF                      Float\n"
172         "MAX_AF_.*                  Float\n"
173         "MOTIF_POS                  Integer\n"
174         "MOTIF_SCORE_CHANGE         Float\n"
175         "existing_InFrame_oORFs     Integer\n"
176         "existing_OutOfFrame_oORFs  Integer\n"
177         "existing_uORFs             Integer\n"
178         "SpliceAI_pred_DP_.*        Integer\n"
179         "SpliceAI_pred_DS_.*        Float\n";
180 }
usage_text(void)181 static const char *usage_text(void)
182 {
183     return
184         "\n"
185         "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n"
186         "   more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n"
187         "Usage: bcftools +split-vep [Plugin Options]\n"
188         "Plugin options:\n"
189         "   -a, --annotation STR            INFO annotation to parse [CSQ]\n"
190         "   -A, --all-fields DELIM          Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n"
191         "                                     filtering expression using the output field delimiter DELIM. This can be\n"
192         "                                     \"tab\", \"space\" or an arbitrary string.\n"
193         "   -c, --columns [LIST|-][:TYPE]   Extract the fields listed either as 0-based indexes or names, \"-\" to extract all\n"
194         "                                     fields. See --columns-types for the defaults. Supported types are String/Str,\n"
195         "                                     Integer/Int and Float/Real. Unlisted fields are set to String.\n"
196         "       --columns-types -|FILE      Pass \"-\" to print the default -c types or FILE to override the presets\n"
197         "   -d, --duplicate                 Output per transcript/allele consequences on a new line rather rather than\n"
198         "                                     as comma-separated fields on a single line\n"
199         "   -f, --format STR                Create non-VCF output; similar to `bcftools query -f` but drops lines w/o consequence\n"
200         "   -l, --list                      Parse the VCF header and list the annotation fields\n"
201         "   -p, --annot-prefix STR          Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n"
202         "   -s, --select TR:CSQ             Select transcripts to extract by type and/or consequence severity. (See also -S and -x.)\n"
203         "                                     TR, transcript:   worst,primary(*),all        [all]\n"
204         "                                     CSQ, consequence: any,missense,missense+,etc  [any]\n"
205         "                                     (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n"
206         "   -S, --severity -|FILE           Pass \"-\" to print the default severity scale or FILE to override\n"
207         "                                     the default scale\n"
208         "   -u, --allow-undef-tags          Print \".\" for undefined tags\n"
209         "   -x, --drop-sites                Drop sites with none of the consequences matching the severity specified by -s.\n"
210         "                                      This switch is intended for use with VCF/BCF output (i.e. -f not given).\n"
211         "Common options:\n"
212         "   -e, --exclude EXPR              Exclude sites and samples for which the expression is true\n"
213         "   -i, --include EXPR              Include sites and samples for which the expression is true\n"
214         "       --no-version                Do not append version and command line to the header\n"
215         "   -o, --output FILE               Output file name [stdout]\n"
216         "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
217         "   -r, --regions REG               Restrict to comma-separated list of regions\n"
218         "   -R, --regions-file FILE         Restrict to regions listed in a file\n"
219         "       --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
220         "   -t, --targets REG               Similar to -r but streams rather than index-jumps\n"
221         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
222         "       --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
223         "\n"
224         "Examples:\n"
225         "   # List available fields of the INFO/CSQ annotation\n"
226         "   bcftools +split-vep -l file.vcf.gz\n"
227         "\n"
228         "   # List the default severity scale\n"
229         "   bcftools +split-vep -S -\n"
230         "\n"
231         "   # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n"
232         "   # INFO annotations starting with the prefix \"vep\". For brevity, the columns can\n"
233         "   # be given also as 0-based indexes\n"
234         "   bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n"
235         "   bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n"
236         "\n"
237         "   # Same as above but use the text output of the \"bcftools query\" format\n"
238         "   bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n"
239         "\n"
240         "   # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n"
241         "   bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n"
242         "\n"
243         "   # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n"
244         "   # numeric filtering can be used.\n"
245         "   bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n"
246         "\n"
247         "   # Similar to above, but add the annotation only if the consequence severity is missense\n"
248         "   # or equivalent. In order to drop sites with different consequences completely, we add\n"
249         "   # the -x switch. See the online documentation referenced above for more examples.\n"
250         "   bcftools +split-vep -c gnomAD_AF:Float -s :missense    file.vcf.gz\n"
251         "   bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n"
252         "\n"
253         "   See also http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n"
254         "\n";
255 }
256 
expand_csq_expression(args_t * args,kstring_t * str)257 static void expand_csq_expression(args_t *args, kstring_t *str)
258 {
259     if ( !args->all_fields_delim ) return;
260 
261     str->l = 0;
262     kputc('%',str);
263     kputs(args->vep_tag,str);
264     char *ptr = strstr(args->format_str,str->s);
265     if ( !ptr ) return;
266     char *end = ptr + str->l, tmp = *end;
267     if ( isalnum(tmp) || tmp=='_' || tmp=='.' ) return;
268     *end = 0;
269 
270     str->l = 0;
271     kputsn(args->format_str, ptr - args->format_str, str);
272 
273     int i;
274     for (i=0; i<args->nfield; i++)
275     {
276         if ( i>0 ) kputs(args->all_fields_delim, str);
277         kputc('%', str);
278         kputs(args->field[i], str);
279     }
280 
281     *end = tmp;
282     kputs(end, str);
283 
284     free(args->format_str);
285     args->format_str = str->s;
286     str->l = str->m = 0;
287     str->s = NULL;
288 }
289 
init_column2type(args_t * args)290 static void init_column2type(args_t *args)
291 {
292     int i, ntype = 0;
293     char **type = NULL;
294     if ( args->column_types && strcmp("-",args->column_types) )
295         type = hts_readlines(args->column_types, &ntype);
296     else
297     {
298         char *str = strdup(default_column_types());
299         char *beg = str;
300         while ( *beg )
301         {
302             char *end = beg;
303             while ( *end && *end!='\n' ) end++;
304             char tmp = *end;
305             *end = 0;
306             ntype++;
307             type = (char**) realloc(type,sizeof(*type)*ntype);
308             type[ntype-1] = strdup(beg);
309             if ( !tmp ) break;
310             beg = end+1;
311         }
312         free(str);
313     }
314     if ( !type || !ntype ) error("Failed to parse the column types\n");
315     for (i=0; i<ntype; i++)
316     {
317         if ( type[i][0]=='#' ) continue;
318         char *tmp = strdup(type[i]);
319         char *ptr = tmp;
320         while ( *ptr && !isspace(*ptr) ) ptr++;
321         if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
322         *ptr = 0;
323         ptr++;
324         while ( *ptr && isspace(*ptr) ) ptr++;
325         if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
326         args->ncolumn2type++;
327         args->column2type = (col2type_t*) realloc(args->column2type,sizeof(*args->column2type)*args->ncolumn2type);
328         col2type_t *ct = &args->column2type[args->ncolumn2type-1];
329         ct->regex = (regex_t *) malloc(sizeof(regex_t));
330         if ( regcomp(ct->regex, tmp, REG_NOSUB) )
331                 error("Error: fail to compile the column type regular expression \"%s\": %s\n", tmp,type[i]);
332         int type_ok = 0;
333         if ( !strcmp(ptr,"Float") ) type_ok = 1;
334         else if ( !strcmp(ptr,"Integer") ) type_ok = 1;
335         else if ( !strcmp(ptr,"Flag") ) type_ok = 1;
336         else if ( !strcmp(ptr,"String") ) type_ok = 1;
337         if ( !type_ok ) error("Error: the column type \"%s\" is not supported: %s\n",ptr,type[i]);
338         ct->type = strdup(ptr);
339         free(tmp);
340     }
341     if ( !args->ncolumn2type ) error("Failed to parse the column types\n");
342     for (i=0; i<ntype; i++) free(type[i]);
343     free(type);
344 }
destroy_column2type(args_t * args)345 static void destroy_column2type(args_t *args)
346 {
347     int i;
348     for (i=0; i<args->ncolumn2type; i++)
349     {
350         regfree(args->column2type[i].regex);
351         free(args->column2type[i].regex);
352         free(args->column2type[i].type);
353     }
354     free(args->column2type);
355 }
get_column_type(args_t * args,char * field)356 static const char *get_column_type(args_t *args, char *field)
357 {
358     if ( !args->column2type ) init_column2type(args);
359     int i;
360     for (i=0; i<args->ncolumn2type; i++)
361     {
362         int match = regexec(args->column2type[i].regex, field, 0,NULL,0) ? 0 : 1;
363         if ( match ) return args->column2type[i].type;
364     }
365     return "String";
366 }
query_has_field(char * fmt,char * field,kstring_t * str)367 static int query_has_field(char *fmt, char *field, kstring_t *str)
368 {
369     str->l = 0;
370     kputc('%',str);
371     kputs(field,str);
372     char end, *ptr = fmt;
373     while ( ptr )
374     {
375         ptr = strstr(ptr,str->s);
376         if ( !ptr ) return 0;
377         end = ptr[str->l];
378         if ( isalnum(end) || end=='_' || end=='.' )
379         {
380             ptr++;
381             continue;
382         }
383         break;
384     }
385     return 1;
386 }
strdup_annot_prefix(args_t * args,const char * str)387 char *strdup_annot_prefix(args_t *args, const char *str)
388 {
389     if ( !args->annot_prefix ) return strdup(str);
390     int str_len = strlen(str);
391     int prefix_len = strlen(args->annot_prefix);
392     char *out = calloc(str_len+prefix_len+1,1);
393     memcpy(out,args->annot_prefix,prefix_len);
394     memcpy(out+prefix_len,str,str_len);
395     return out;
396 }
init_data(args_t * args)397 static void init_data(args_t *args)
398 {
399     args->sr = bcf_sr_init();
400     if ( args->regions )
401     {
402         args->sr->require_index = 1;
403         bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
404         if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
405     }
406     if ( args->targets )
407     {
408         bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
409         if ( bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
410     }
411     if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
412     args->hdr = bcf_sr_get_header(args->sr,0);
413     args->hdr_out = bcf_hdr_dup(args->hdr);
414 
415     // Parse the header CSQ line, must contain Description with "Format: ..." declaration
416     bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL);
417     if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag);
418     int ret = bcf_hrec_find_key(hrec, "Description");
419     if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag);
420     char *format = strstr(hrec->vals[ret], "Format: ");
421     if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]);
422     format += 8;
423     char *ep = format;
424     while ( *ep )
425     {
426         char *bp = ep;
427         while ( *ep && *ep!='|' && *ep!='"' && *ep!='(' ) ep++; // don't include bracket in "pos(1-based)"
428         char tmp = *ep;
429         *ep = 0;
430         args->nfield++;
431         args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field));
432         args->field[args->nfield-1] = strdup_annot_prefix(args,bp);
433         if ( !tmp ) break;
434         *ep = tmp;
435         while ( *ep && *ep!='|' ) ep++;     // skip bracket in "pos(1-based)"
436         if ( *ep ) ep++;
437     }
438     if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]);
439     args->field2idx = khash_str2int_init();
440     int i,j;
441     for (i=0; i<args->nfield; i++)
442     {
443         if ( khash_str2int_has_key(args->field2idx, args->field[i]) )
444         {
445             fprintf(stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]);
446             continue;
447         }
448         khash_str2int_set(args->field2idx, args->field[i], i);
449     }
450 
451     // Severity scale
452     kstring_t str = {0,0,0};
453     args->csq2severity = khash_str2int_init();
454     int severity = 0;
455     if ( args->severity )
456     {
457         kstring_t tmp = {0,0,0};
458         htsFile *fp = hts_open(args->severity,"r");
459         if ( !fp ) error("Cannot read %s\n", args->severity);
460         while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 )
461         {
462             kputs(tmp.s, &str);
463             kputc('\n', &str);
464         }
465         free(tmp.s);
466     }
467     else
468         kputs(default_severity(),&str);
469     ep = str.s;
470     while ( *ep )
471     {
472         if ( *ep=='#' )
473         {
474             while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; }
475             if ( !*ep ) break;
476             ep++;
477             continue;
478         }
479         char *bp = ep;
480         while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; }
481         char tmp = *ep;
482         *ep = 0;
483         args->nscale++;
484         args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale));
485         args->scale[args->nscale-1] = strdup(bp);
486         if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) )
487             khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity);
488         if ( !tmp ) break;
489         if ( tmp=='\n' ) severity++;
490         ep++;
491         while ( *ep && isspace(*ep) ) ep++;
492     }
493 
494     // Transcript and/or consequence selection
495     if ( !args->select ) args->select = "all:any";
496     cols_t *cols = cols_split(args->select, NULL, ':');
497     char *sel_tr  = cols->off[0][0] ? cols->off[0] : "all";
498     char *sel_csq = cols->n==2 && cols->off[1][0] ? cols->off[1] : "any";
499     if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL;
500     else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST;
501     else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY;
502     else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr);
503     if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; }     // to avoid unnecessary lookups
504     else
505     {
506         int len = strlen(sel_csq);
507         int severity, modifier = '=';
508         if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; }
509         else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; }
510         if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 )
511             error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq);
512         if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; }
513         else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; }
514         else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; }
515     }
516     cols_destroy(cols);
517 
518     // The 'CANONICAL' column to look up severity, its name is hardwired for now
519     if ( args->select_tr==SELECT_TR_PRIMARY )
520     {
521         char *tmp = strdup_annot_prefix(args,"CANONICAL");
522         if ( khash_str2int_get(args->field2idx,tmp,&args->primary_id)!=0 )
523             error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]);
524         free(tmp);
525     }
526 
527     // Create a text output as with `bcftools query -f`. For this we need to determine the fields to be extracted
528     // from the formatting expression
529     if ( args->format_str && !args->column_str )
530     {
531         // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present
532         if ( args->all_fields_delim ) expand_csq_expression(args, &str);
533 
534         for (i=0; i<args->nfield; i++)
535         {
536             if ( !query_has_field(args->format_str,args->field[i],&str) ) continue;
537 
538             int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->field[i]);
539             if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) )
540                 fprintf(stderr,"Note: ambiguous key %%%s; using the %s subfield of %s, not the INFO/%s tag\n", args->field[i],args->field[i],args->vep_tag,args->field[i]);
541 
542             int olen = args->column_str ? strlen(args->column_str) : 0;
543             int nlen = strlen(args->field[i]);
544             args->column_str = (char*)realloc(args->column_str, olen + nlen + 2);
545             if ( olen )
546             {
547                 memcpy(args->column_str+olen,",",1);
548                 olen++;
549             }
550             memcpy(args->column_str+olen,args->field[i],nlen);
551             args->column_str[olen+nlen] = 0;
552         }
553         if ( query_has_field(args->format_str,args->vep_tag,&str) ) args->raw_vep_request = 1;
554     }
555 
556     // The "Consequence" column to look up severity, its name is hardwired for now
557     char *tmp = strdup_annot_prefix(args,"Consequence");
558     if ( khash_str2int_get(args->field2idx,tmp,&args->csq_idx)!=0 )
559         error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]);
560     free(tmp);
561 
562     // Columns to extract: given as names, 0-based indexes or ranges of indexes
563     if ( args->column_str )
564     {
565         int *column = NULL;
566         int *types  = NULL;
567         if ( !strcmp("-",args->column_str) )    // all subfields
568         {
569             free(args->column_str);
570             kstring_t str = {0,0,0};
571             ksprintf(&str,"0-%d",args->nfield-1);
572             args->column_str = str.s;
573         }
574         ep = args->column_str;
575         while ( *ep )
576         {
577             char *tp, *bp = ep;
578             while ( *ep && *ep!=',' ) ep++;
579             char keep = *ep;
580             *ep = 0;
581             int type = -1;
582             int idx_beg, idx_end;
583             if ( !strcmp("-",bp) )
584             {
585                 kstring_t str = {0,0,0};
586                 ksprintf(&str,"0-%d",args->nfield-1);
587                 if ( keep ) ksprintf(&str,",%s",ep+1);
588                 free(args->column_str);
589                 args->column_str = str.s;
590                 ep = str.s;
591                 continue;
592             }
593             char *tmp = strdup_annot_prefix(args, bp);
594             if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 || khash_str2int_get(args->field2idx, tmp, &idx_beg)==0 )
595                 idx_end = idx_beg;
596             else if ( (tp=strrchr(bp,':')) )
597             {
598                 *tp = 0;
599                 if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 )
600                 {
601                     *tp = ':';
602                     tp = strrchr(tmp,':');
603                     *tp = 0;
604                     if ( khash_str2int_get(args->field2idx, tmp, &idx_beg)!=0 ) error("No such column: \"%s\"\n", bp);
605                 }
606                 idx_end = idx_beg;
607                 *tp = ':';
608                 if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR;
609                 else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL;
610                 else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT;
611                 else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG;
612                 else error("The type \"%s\" (or column \"%s\"?) not recognised\n", tp+1,bp);
613             }
614             else
615             {
616                 char *mp;
617                 idx_beg = strtol(bp,&mp,10);
618                 if ( !*mp ) idx_end = idx_beg;
619                 else if ( *mp=='-' )
620                     idx_end = strtol(mp+1,&mp,10);
621                 if ( *mp )
622                 {
623                     if ( *mp==':' )
624                     {
625                         idx_end = idx_beg;
626                         if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR;
627                         else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL;
628                         else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT;
629                         else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG;
630                         else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp);
631                     }
632                     else if ( !strcmp(bp,args->vep_tag) )
633                     {
634                         free(tmp);
635                         args->raw_vep_request = 1;
636                         if ( !keep ) break;
637                         ep++;
638                         continue;
639                     }
640                     else
641                         error("No such column: \"%s\"\n", bp);
642                 }
643             }
644             free(tmp);
645 
646             i = args->nannot;
647             args->nannot += idx_end - idx_beg + 1;
648             column = (int*)realloc(column,args->nannot*sizeof(*column));
649             types  = (int*)realloc(types,args->nannot*sizeof(*types));
650             for (j=idx_beg; j<=idx_end; j++)
651             {
652                 if ( j >= args->nfield ) error("The index is too big: %d\n", j);
653                 column[i] = j;
654                 types[i]  = type;
655                 i++;
656             }
657             if ( !keep ) break;
658             ep++;
659         }
660         args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot));
661         for (i=0; i<args->nannot; i++)
662         {
663             annot_t *ann = &args->annot[i];
664             ann->type = types[i];
665             ann->idx = j = column[i];
666             ann->field = strdup(args->field[j]);
667             ann->tag = strdup(args->field[j]);
668             args->kstr.l = 0;
669             const char *type = "String";
670             if ( ann->type==BCF_HT_REAL ) type = "Float";
671             else if ( ann->type==BCF_HT_INT ) type = "Integer";
672             else if ( ann->type==BCF_HT_FLAG ) type = "Flag";
673             else if ( ann->type==BCF_HT_STR ) type = "String";
674             else if ( ann->type==-1 ) type = get_column_type(args, args->field[j]);
675             ksprintf(&args->kstr,"##INFO=<ID=%%s,Number=.,Type=%s,Description=\"The %%s field from INFO/%%s\">",type);
676             bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag);
677         }
678         if ( args->raw_vep_request && args->select_tr==SELECT_TR_ALL ) args->raw_vep_request = 0;
679         if ( args->raw_vep_request )
680         {
681             args->nannot++;
682             args->annot = (annot_t*)realloc(args->annot,args->nannot*sizeof(*args->annot));
683             annot_t *ann = &args->annot[args->nannot-1];
684             ann->type  = BCF_HT_STR;
685             ann->idx   = -1;
686             ann->field = strdup(args->vep_tag);
687             ann->tag   = strdup(args->vep_tag);
688             memset(&ann->str,0,sizeof(ann->str));
689         }
690         free(column);
691         free(types);
692         destroy_column2type(args);
693 
694         if ( bcf_hdr_sync(args->hdr_out)<0 )
695             error_errno("[%s] Failed to update header", __func__);
696     }
697     if ( args->format_str )
698     {
699         if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag);
700         args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str);
701         if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str);
702         if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
703     }
704     if ( args->filter_str )
705     {
706         int max_unpack = args->convert ? convert_max_unpack(args->convert) : 0;
707         args->filter = filter_init(args->hdr_out, args->filter_str);
708         max_unpack |= filter_max_unpack(args->filter);
709         if ( !args->format_str ) max_unpack |= BCF_UN_FMT;      // don't drop FMT fields on VCF input when VCF/BCF is output
710         args->sr->max_unpack = max_unpack;
711         if ( args->convert && (max_unpack & BCF_UN_FMT) )
712             convert_set_option(args->convert, subset_samples, &args->smpl_pass);
713     }
714 
715     free(str.s);
716 }
destroy_data(args_t * args)717 static void destroy_data(args_t *args)
718 {
719     free(args->farr);
720     free(args->iarr);
721     free(args->kstr.s);
722     free(args->column_str);
723     free(args->format_str);
724     cols_destroy(args->cols_csq);
725     cols_destroy(args->cols_tr);
726     int i;
727     for (i=0; i<args->nscale; i++) free(args->scale[i]);
728     free(args->scale);
729     for (i=0; i<args->nfield; i++) free(args->field[i]);
730     free(args->field);
731     for (i=0; i<args->nannot; i++)
732     {
733         annot_t *ann = &args->annot[i];
734         free(ann->field);
735         free(ann->tag);
736         free(ann->str.s);
737     }
738     free(args->annot);
739     if ( args->field2idx ) khash_str2int_destroy(args->field2idx);
740     if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity);
741     bcf_sr_destroy(args->sr);
742     bcf_hdr_destroy(args->hdr_out);
743     free(args->csq_str);
744     if ( args->filter ) filter_destroy(args->filter);
745     if ( args->convert ) convert_destroy(args->convert);
746     if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
747     if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
748     free(args);
749 }
list_header(args_t * args)750 static void list_header(args_t *args)
751 {
752     int i;
753     for (i=0; i<args->nfield; i++) printf("%d\t%s\n", i,args->field[i]);
754 }
755 
csq_to_severity(args_t * args,char * csq,int * min_severity,int * max_severity,int exact_match)756 static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match)
757 {
758     *min_severity = INT_MAX;
759     *max_severity = -1;
760     char *ep = csq;
761     while ( *ep )
762     {
763         char *bp = ep;
764         while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; }
765         char tmp = *ep;
766         *ep = 0;
767 
768         int i, severity = -1;
769         if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 )
770         {
771             for (i=0; i<args->nscale; i++)
772                 if ( strstr(bp,args->scale[i]) ) break;
773 
774             if ( i!=args->nscale )
775                 khash_str2int_get(args->csq2severity, args->scale[i], &severity);
776             else
777                 severity = args->nscale + 1;
778 
779             args->nscale++;
780             args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale));
781             args->scale[args->nscale-1] = strdup(bp);
782             khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity);
783             if ( i==args->nscale )
784                 fprintf(stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale);
785 
786             if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) error("FIXME: failed to look up the consequence \"%s\"\n", bp);
787         }
788         if ( exact_match < 0 )
789         {
790             if ( *min_severity > severity ) *min_severity = severity;
791             if ( *max_severity < severity ) *max_severity = severity;
792         }
793         else
794         {
795             if ( severity==exact_match )
796             {
797                 *min_severity = *max_severity = severity;
798                 *ep = tmp;
799                 return;
800             }
801         }
802 
803         if ( !tmp ) break;
804         *ep = tmp;
805         ep++;
806     }
807 }
808 
csq_severity_pass(args_t * args,char * csq)809 static int csq_severity_pass(args_t *args, char *csq)
810 {
811     if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1;
812 
813     int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1;
814     csq_to_severity(args, csq, &min_severity, &max_severity, exact_match);
815     if ( max_severity < args->min_severity ) return 0;
816     if ( min_severity > args->max_severity ) return 0;
817     return 1;
818 }
819 
get_primary_transcript(args_t * args,bcf1_t * rec,cols_t * cols_tr)820 static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr)    // modifies args->cols_csq!
821 {
822     int i;
823     for (i=0; i<cols_tr->n; i++)
824     {
825         args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|');
826         if ( args->primary_id >= args->cols_csq->n )
827             error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n);
828         if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i;
829     }
830     return -1;
831 }
get_worst_transcript(args_t * args,bcf1_t * rec,cols_t * cols_tr)832 static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr)     // modifies args->cols_csq!
833 {
834     int i, max_severity = -1, imax_severity = 0;
835     for (i=0; i<cols_tr->n; i++)
836     {
837         args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|');
838         if ( args->csq_idx >= args->cols_csq->n )
839             error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n);
840         char *csq = args->cols_csq->off[args->csq_idx];
841 
842         int min, max;
843         csq_to_severity(args, csq, &min, &max, -1);
844         if ( max_severity < max ) { imax_severity = i; max_severity = max; }
845     }
846     return imax_severity;
847 }
annot_reset(annot_t * annot,int nannot)848 static void annot_reset(annot_t *annot, int nannot)
849 {
850     int i;
851     for (i=0; i<nannot; i++) annot[i].str.l = 0;
852 }
annot_append(annot_t * ann,char * value)853 static void annot_append(annot_t *ann, char *value)
854 {
855     if ( ann->str.l ) kputc(',',&ann->str);
856     kputs(value, &ann->str);
857 }
parse_array_real(char * str,float ** arr,int * marr,int * narr)858 static inline void parse_array_real(char *str, float **arr, int *marr, int *narr)
859 {
860     char *bp = str, *ep;
861     float *ptr = *arr;
862     int i, n = 1, m = *marr;
863     for (i=0; *bp; bp++)
864         if ( *bp == ',' ) n++;
865 
866     hts_expand(float*,n,m,ptr);
867 
868     i = 0;
869     bp = str;
870     while ( *bp )
871     {
872         ptr[i] = strtod(bp, &ep);
873         if ( bp==ep )
874             bcf_float_set_missing(ptr[i]);
875         i++;
876         while ( *ep && *ep!=',' ) ep++;
877         bp = *ep ? ep + 1 : ep;
878     }
879     *narr = i;
880     *marr = m;
881     *arr  = ptr;
882 }
parse_array_int32(char * str,int ** arr,int * marr,int * narr)883 static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr)
884 {
885     char *bp = str, *ep;
886     int32_t *ptr = *arr;
887     int i, n = 1, m = *marr;
888     for (i=0; *bp; bp++)
889         if ( *bp == ',' ) n++;
890 
891     hts_expand(int32_t*,n,m,ptr);
892 
893     i = 0;
894     bp = str;
895     while ( *bp )
896     {
897         ptr[i] = strtol(bp, &ep, 10);
898         if ( bp==ep )
899             ptr[i] = bcf_int32_missing;
900         i++;
901         while ( *ep && *ep!=',' ) ep++;
902         bp = *ep ? ep + 1 : ep;
903     }
904     *narr = i;
905     *marr = m;
906     *arr  = ptr;
907 }
filter_and_output(args_t * args,bcf1_t * rec,int severity_pass,int all_missing)908 static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing)
909 {
910     int i, updated = 0;
911     for (i=0; i<args->nannot; i++)
912     {
913         annot_t *ann = &args->annot[i];
914         if ( !ann->str.l ) continue;
915         if ( ann->type==BCF_HT_REAL )
916         {
917             parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr);
918             bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr);
919         }
920         else if ( ann->type==BCF_HT_INT )
921         {
922             parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr);
923             bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr);
924         }
925         else
926             bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s);
927         updated++;
928     }
929     if ( args->filter )
930     {
931         int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass);
932         if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
933         if ( !pass ) return;
934     }
935     if ( args->format_str )
936     {
937         if ( args->nannot )
938         {
939             if ( !updated || all_missing ) return;         // the standard case: using -f to print the CSQ subfields, skipping if missing
940         }
941         else
942         {
943             if ( !severity_pass ) return;   // request to print only non-CSQ tags at sites that pass severity
944         }
945 
946         args->kstr.l = 0;
947         convert_line(args->convert, rec, &args->kstr);
948         if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l )
949             error("Failed to write to %s\n", args->output_fname);
950         return;
951     }
952     if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 )
953         error("Failed to write to %s\n", args->output_fname);
954 }
process_record(args_t * args,bcf1_t * rec)955 static void process_record(args_t *args, bcf1_t *rec)
956 {
957     int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str);
958     if ( len<=0 )
959     {
960         if ( !args->drop_sites )
961         {
962             annot_reset(args->annot, args->nannot);
963             filter_and_output(args,rec,1,1);
964         }
965         return;
966     }
967 
968     args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
969 
970     int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1;
971     if ( args->select_tr==SELECT_TR_PRIMARY )
972     {
973         itr_min = itr_max = get_primary_transcript(args, rec, args->cols_tr);
974         if ( itr_min<0 ) itr_max = itr_min - 1;
975     }
976     else if ( args->select_tr==SELECT_TR_WORST )
977         itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr);
978 
979     annot_reset(args->annot, args->nannot);
980     int severity_pass = 0;  // consequence severity requested via the -s option (BCF record may be output but not annotated)
981     int all_missing   = 1;  // transcripts with all requested annotations missing will be discarded if -f was given
982     static int too_few_fields_warned = 0;
983     for (i=itr_min; i<=itr_max; i++)
984     {
985         args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|');
986         if ( args->csq_idx >= args->cols_csq->n )
987             error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n);
988 
989         char *csq = args->cols_csq->off[args->csq_idx];
990         if ( !csq_severity_pass(args, csq) ) continue;
991         severity_pass = 1;
992 
993         for (j=0; j<args->nannot; j++)
994         {
995             annot_t *ann = &args->annot[j];
996             if ( ann->idx >= args->cols_csq->n )
997             {
998                 if ( !too_few_fields_warned )
999                 {
1000                     fprintf(stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
1001                     too_few_fields_warned = 1;
1002                 }
1003                 annot_append(ann, ".");
1004                 continue;
1005             }
1006 
1007             char *ann_str = NULL;
1008             if ( ann->idx==-1 ) ann_str = args->cols_tr->off[i];
1009             else if ( *args->cols_csq->off[ann->idx] ) ann_str = args->cols_csq->off[ann->idx];
1010             if ( ann_str )
1011             {
1012                 annot_append(ann, ann_str);
1013                 all_missing = 0;
1014             }
1015             else
1016                 annot_append(ann, "."); // missing value
1017         }
1018 
1019         if ( args->duplicate )
1020         {
1021             filter_and_output(args, rec, severity_pass, all_missing);
1022             annot_reset(args->annot, args->nannot);
1023             all_missing   = 1;
1024             severity_pass = 0;
1025         }
1026     }
1027     if ( !severity_pass && args->drop_sites ) return;
1028     if ( !args->duplicate )
1029         filter_and_output(args, rec, severity_pass, all_missing);
1030 }
1031 
run(int argc,char ** argv)1032 int run(int argc, char **argv)
1033 {
1034     args_t *args = (args_t*) calloc(1,sizeof(args_t));
1035     args->argc   = argc; args->argv = argv;
1036     args->output_fname = "-";
1037     args->output_type  = FT_VCF;
1038     args->vep_tag = "CSQ";
1039     args->record_cmd_line = 1;
1040     args->regions_overlap = 1;
1041     args->targets_overlap = 0;
1042     args->clevel = -1;
1043     static struct option loptions[] =
1044     {
1045         {"drop-sites",no_argument,0,'x'},
1046         {"all-fields",no_argument,0,'A'},
1047         {"duplicate",no_argument,0,'d'},
1048         {"format",required_argument,0,'f'},
1049         {"annotation",required_argument,0,'a'},
1050         {"annot-prefix",required_argument,0,'p'},
1051         {"columns",required_argument,0,'c'},
1052         {"columns-types",required_argument,0,1},
1053         {"select",required_argument,0,'s'},
1054         {"severity",required_argument,0,'S'},
1055         {"list",no_argument,0,'l'},
1056         {"include",required_argument,0,'i'},
1057         {"exclude",required_argument,0,'e'},
1058         {"output",required_argument,NULL,'o'},
1059         {"output-type",required_argument,NULL,'O'},
1060         {"regions",1,0,'r'},
1061         {"regions-file",1,0,'R'},
1062         {"regions-overlap",required_argument,NULL,3},
1063         {"targets",1,0,'t'},
1064         {"targets-file",1,0,'T'},
1065         {"targets-overlap",required_argument,NULL,4},
1066         {"no-version",no_argument,NULL,2},
1067         {"allow-undef-tags",no_argument,0,'u'},
1068         {NULL,0,NULL,0}
1069     };
1070     int c;
1071     char *tmp;
1072     while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xu",loptions,NULL)) >= 0)
1073     {
1074         switch (c)
1075         {
1076             case  2 : args->record_cmd_line = 0; break;
1077             case  1 : args->column_types = optarg; break;
1078             case 'A':
1079                 if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t";
1080                 else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " ";
1081                 else args->all_fields_delim = optarg;
1082                 break;
1083             case 'x': args->drop_sites = 1; break;
1084             case 'd': args->duplicate = 1; break;
1085             case 'f': args->format_str = strdup(optarg); break;
1086             case 'a': args->vep_tag = optarg; break;
1087             case 'p': args->annot_prefix = optarg; break;
1088             case 'c': args->column_str = strdup(optarg); break;
1089             case 'S': args->severity = optarg; break;
1090             case 's': args->select = optarg; break;
1091             case 'l': args->list_hdr = 1; break;
1092             case 'u': args->allow_undef_tags = 1; break;
1093             case 'e':
1094                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1095                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1096             case 'i':
1097                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1098                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1099             case 't': args->targets = optarg; break;
1100             case 'T': args->targets = optarg; args->targets_is_file = 1; break;
1101             case 'r': args->regions = optarg; break;
1102             case 'R': args->regions = optarg; args->regions_is_file = 1; break;
1103             case 'o': args->output_fname = optarg; break;
1104             case 'O':
1105                       switch (optarg[0]) {
1106                           case 'b': args->output_type = FT_BCF_GZ; break;
1107                           case 'u': args->output_type = FT_BCF; break;
1108                           case 'z': args->output_type = FT_VCF_GZ; break;
1109                           case 'v': args->output_type = FT_VCF; break;
1110                           default:
1111                           {
1112                               args->clevel = strtol(optarg,&tmp,10);
1113                               if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1114                           }
1115                       }
1116                       if ( optarg[1] )
1117                       {
1118                           args->clevel = strtol(optarg+1,&tmp,10);
1119                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1120                       }
1121                       break;
1122             case  3 :
1123                 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1124                 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1125                 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1126                 else error("Could not parse: --regions-overlap %s\n",optarg);
1127                 break;
1128             case  4 :
1129                 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1130                 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1131                 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1132                 else error("Could not parse: --targets-overlap %s\n",optarg);
1133                 break;
1134             case 'h':
1135             case '?':
1136             default: error("%s", usage_text()); break;
1137         }
1138     }
1139     if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n");
1140     if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n");
1141     if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity());
1142     if ( args->column_types && !strcmp("-",args->column_types) ) error("%s", default_column_types());
1143     if ( optind==argc )
1144     {
1145         if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
1146         else { error("%s", usage_text()); }
1147     }
1148     else if ( optind+1!=argc ) error("%s", usage_text());
1149     else args->fname = argv[optind];
1150 
1151     init_data(args);
1152 
1153     if ( args->list_hdr )
1154         list_header(args);
1155     else
1156     {
1157         if ( !args->format_str && !args->column_str )
1158         {
1159             if ( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY )
1160                 error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n");
1161             else if ( !args->drop_sites )
1162                 error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n");
1163         }
1164 
1165         if ( args->format_str )
1166             args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu");
1167         else
1168         {
1169             char wmode[8];
1170             set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
1171             args->fh_vcf = hts_open(args->output_fname ? args->output_fname : "-", wmode);
1172             if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep");
1173             if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname);
1174         }
1175         while ( bcf_sr_next_line(args->sr) )
1176             process_record(args, bcf_sr_get_line(args->sr,0));
1177     }
1178 
1179     destroy_data(args);
1180 
1181     return 0;
1182 }
1183