1 /* The MIT License
2
3 Copyright (c) 2019-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <unistd.h> // for isatty
32 #include <inttypes.h>
33 #include <htslib/hts.h>
34 #include <htslib/vcf.h>
35 #include <htslib/bgzf.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kseq.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/khash_str2int.h>
40 #include <regex.h>
41 #include "../bcftools.h"
42 #include "../filter.h"
43 #include "../convert.h"
44 #include "../cols.h"
45
46
47 // Logic of the filters: include or exclude sites which match the filters?
48 #define FLT_INCLUDE 1
49 #define FLT_EXCLUDE 2
50
51 #define SELECT_TR_ALL 0
52 #define SELECT_TR_WORST 1
53 #define SELECT_TR_PRIMARY 2
54 #define SELECT_CSQ_ANY -1
55
56 typedef struct
57 {
58 regex_t *regex;
59 char *type;
60 }
61 col2type_t;
62
63 typedef struct
64 {
65 char *field; // the name of the VEP field, e.g. Consequence,Gene,etc.
66 char *tag; // the name of the VCF tag: the annot_t.field with the -p prefix
67 int idx; // 0-based index within the VEP annotation string
68 int type; // annotation type, one of the BCF_HT_* types
69 kstring_t str; // annotation value, ready to pass to bcf_update_info_*
70 }
71 annot_t;
72
73 typedef struct
74 {
75 convert_t *convert;
76 filter_t *filter;
77 int argc, filter_logic, regions_is_file, targets_is_file, list_hdr, record_cmd_line, clevel;
78 int regions_overlap, targets_overlap;
79 kstring_t kstr;
80 char *filter_str,
81 *vep_tag; // the --annotation INFO tag to process
82 char **argv, *output_fname, *fname, *regions, *targets, *format_str;
83 int output_type;
84 htsFile *fh_vcf;
85 BGZF *fh_bgzf;
86 bcf_srs_t *sr;
87 bcf_hdr_t *hdr, *hdr_out;
88 int nfield; // number of all available VEP fields
89 char **field; // list of all available VEP fields
90 int nannot; // number of requested fields
91 annot_t *annot; // requested fields
92 int nscale; // number of items in the severity scale
93 char **scale; // severity scale (list)
94 int ncsq_str; // the length of csq_str allocated by bcf_get_info_string()
95 char *csq_str; // the current bcf_get_info_string() result
96 int csq_idx, // the index of the Consequence field; for the --select CSQ option
97 primary_id; // the index of the CANONICAL field; for the --select TR option
98 char *severity, // the --severity scale option
99 *select, // the --select option
100 *column_str, // the --columns option
101 *column_types, // the --columns-types option
102 *annot_prefix; // the --annot-prefix option
103 void *field2idx, // VEP field name to index, used in initialization
104 *csq2severity; // consequence type to severity score
105 cols_t *cols_tr, // the current CSQ tag split into transcripts
106 *cols_csq; // the current CSQ transcript split into fields
107 int min_severity, max_severity; // ignore consequences outside this severity range
108 int drop_sites; // the -x, --drop-sites option
109 int select_tr; // one of SELECT_TR_*
110 uint8_t *smpl_pass; // for filtering at sample level, used with -f
111 int duplicate; // the -d, --duplicate option is set
112 char *all_fields_delim; // the -A, --all-fields option is set
113 float *farr; // helper arrays for bcf_update_* functions
114 int32_t *iarr;
115 int niarr,miarr, nfarr,mfarr;
116 col2type_t *column2type;
117 int ncolumn2type;
118 int raw_vep_request; // raw VEP tag requested and will need subsetting
119 int allow_undef_tags;
120 }
121 args_t;
122
123 args_t args;
124
about(void)125 const char *about(void)
126 {
127 return "Query structured annotations such as the CSQ created by VEP.\n";
128 }
129
default_severity(void)130 static const char *default_severity(void)
131 {
132 return
133 "# Default consequence substrings ordered in ascending order by severity.\n"
134 "# Consequences with the same severity can be put on the same line in arbitrary order.\n"
135 "# See also https://m.ensembl.org/info/genome/variation/prediction/predicted_data.htm\n"
136 "intergenic\n"
137 "feature_truncation feature_elongation\n"
138 "regulatory\n"
139 "TF_binding_site TFBS\n"
140 "downstream upstream\n"
141 "non_coding_transcript non_coding\n"
142 "intron NMD_transcript\n"
143 "non_coding_transcript_exon\n"
144 "5_prime_utr 3_prime_utr\n"
145 "coding_sequence mature_miRNA\n"
146 "stop_retained start_retained synonymous\n"
147 "incomplete_terminal_codon\n"
148 "splice_region\n"
149 "missense inframe protein_altering\n"
150 "transcript_amplification\n"
151 "exon_loss\n"
152 "disruptive\n"
153 "start_lost stop_lost stop_gained frameshift\n"
154 "splice_acceptor splice_donor\n"
155 "transcript_ablation\n";
156 }
default_column_types(void)157 static const char *default_column_types(void)
158 {
159 return
160 "# Default CSQ subfield types, unlisted fields are type String.\n"
161 "# Note the use of regular expressions.\n"
162 "cDNA_position Integer\n"
163 "CDS_position Integer\n"
164 "Protein_position Integer\n"
165 "DISTANCE Integer\n"
166 "STRAND Integer\n"
167 "TSL Integer\n"
168 "GENE_PHENO Integer\n"
169 "HGVS_OFFSET Integer\n"
170 "AF Float\n"
171 ".*_AF Float\n"
172 "MAX_AF_.* Float\n"
173 "MOTIF_POS Integer\n"
174 "MOTIF_SCORE_CHANGE Float\n"
175 "existing_InFrame_oORFs Integer\n"
176 "existing_OutOfFrame_oORFs Integer\n"
177 "existing_uORFs Integer\n"
178 "SpliceAI_pred_DP_.* Integer\n"
179 "SpliceAI_pred_DS_.* Float\n";
180 }
usage_text(void)181 static const char *usage_text(void)
182 {
183 return
184 "\n"
185 "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n"
186 " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n"
187 "Usage: bcftools +split-vep [Plugin Options]\n"
188 "Plugin options:\n"
189 " -a, --annotation STR INFO annotation to parse [CSQ]\n"
190 " -A, --all-fields DELIM Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n"
191 " filtering expression using the output field delimiter DELIM. This can be\n"
192 " \"tab\", \"space\" or an arbitrary string.\n"
193 " -c, --columns [LIST|-][:TYPE] Extract the fields listed either as 0-based indexes or names, \"-\" to extract all\n"
194 " fields. See --columns-types for the defaults. Supported types are String/Str,\n"
195 " Integer/Int and Float/Real. Unlisted fields are set to String.\n"
196 " --columns-types -|FILE Pass \"-\" to print the default -c types or FILE to override the presets\n"
197 " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n"
198 " as comma-separated fields on a single line\n"
199 " -f, --format STR Create non-VCF output; similar to `bcftools query -f` but drops lines w/o consequence\n"
200 " -l, --list Parse the VCF header and list the annotation fields\n"
201 " -p, --annot-prefix STR Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n"
202 " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence severity. (See also -S and -x.)\n"
203 " TR, transcript: worst,primary(*),all [all]\n"
204 " CSQ, consequence: any,missense,missense+,etc [any]\n"
205 " (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n"
206 " -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n"
207 " the default scale\n"
208 " -u, --allow-undef-tags Print \".\" for undefined tags\n"
209 " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n"
210 " This switch is intended for use with VCF/BCF output (i.e. -f not given).\n"
211 "Common options:\n"
212 " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n"
213 " -i, --include EXPR Include sites and samples for which the expression is true\n"
214 " --no-version Do not append version and command line to the header\n"
215 " -o, --output FILE Output file name [stdout]\n"
216 " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
217 " -r, --regions REG Restrict to comma-separated list of regions\n"
218 " -R, --regions-file FILE Restrict to regions listed in a file\n"
219 " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
220 " -t, --targets REG Similar to -r but streams rather than index-jumps\n"
221 " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
222 " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
223 "\n"
224 "Examples:\n"
225 " # List available fields of the INFO/CSQ annotation\n"
226 " bcftools +split-vep -l file.vcf.gz\n"
227 "\n"
228 " # List the default severity scale\n"
229 " bcftools +split-vep -S -\n"
230 "\n"
231 " # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n"
232 " # INFO annotations starting with the prefix \"vep\". For brevity, the columns can\n"
233 " # be given also as 0-based indexes\n"
234 " bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n"
235 " bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n"
236 "\n"
237 " # Same as above but use the text output of the \"bcftools query\" format\n"
238 " bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n"
239 "\n"
240 " # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n"
241 " bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n"
242 "\n"
243 " # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n"
244 " # numeric filtering can be used.\n"
245 " bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n"
246 "\n"
247 " # Similar to above, but add the annotation only if the consequence severity is missense\n"
248 " # or equivalent. In order to drop sites with different consequences completely, we add\n"
249 " # the -x switch. See the online documentation referenced above for more examples.\n"
250 " bcftools +split-vep -c gnomAD_AF:Float -s :missense file.vcf.gz\n"
251 " bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n"
252 "\n"
253 " See also http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n"
254 "\n";
255 }
256
expand_csq_expression(args_t * args,kstring_t * str)257 static void expand_csq_expression(args_t *args, kstring_t *str)
258 {
259 if ( !args->all_fields_delim ) return;
260
261 str->l = 0;
262 kputc('%',str);
263 kputs(args->vep_tag,str);
264 char *ptr = strstr(args->format_str,str->s);
265 if ( !ptr ) return;
266 char *end = ptr + str->l, tmp = *end;
267 if ( isalnum(tmp) || tmp=='_' || tmp=='.' ) return;
268 *end = 0;
269
270 str->l = 0;
271 kputsn(args->format_str, ptr - args->format_str, str);
272
273 int i;
274 for (i=0; i<args->nfield; i++)
275 {
276 if ( i>0 ) kputs(args->all_fields_delim, str);
277 kputc('%', str);
278 kputs(args->field[i], str);
279 }
280
281 *end = tmp;
282 kputs(end, str);
283
284 free(args->format_str);
285 args->format_str = str->s;
286 str->l = str->m = 0;
287 str->s = NULL;
288 }
289
init_column2type(args_t * args)290 static void init_column2type(args_t *args)
291 {
292 int i, ntype = 0;
293 char **type = NULL;
294 if ( args->column_types && strcmp("-",args->column_types) )
295 type = hts_readlines(args->column_types, &ntype);
296 else
297 {
298 char *str = strdup(default_column_types());
299 char *beg = str;
300 while ( *beg )
301 {
302 char *end = beg;
303 while ( *end && *end!='\n' ) end++;
304 char tmp = *end;
305 *end = 0;
306 ntype++;
307 type = (char**) realloc(type,sizeof(*type)*ntype);
308 type[ntype-1] = strdup(beg);
309 if ( !tmp ) break;
310 beg = end+1;
311 }
312 free(str);
313 }
314 if ( !type || !ntype ) error("Failed to parse the column types\n");
315 for (i=0; i<ntype; i++)
316 {
317 if ( type[i][0]=='#' ) continue;
318 char *tmp = strdup(type[i]);
319 char *ptr = tmp;
320 while ( *ptr && !isspace(*ptr) ) ptr++;
321 if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
322 *ptr = 0;
323 ptr++;
324 while ( *ptr && isspace(*ptr) ) ptr++;
325 if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
326 args->ncolumn2type++;
327 args->column2type = (col2type_t*) realloc(args->column2type,sizeof(*args->column2type)*args->ncolumn2type);
328 col2type_t *ct = &args->column2type[args->ncolumn2type-1];
329 ct->regex = (regex_t *) malloc(sizeof(regex_t));
330 if ( regcomp(ct->regex, tmp, REG_NOSUB) )
331 error("Error: fail to compile the column type regular expression \"%s\": %s\n", tmp,type[i]);
332 int type_ok = 0;
333 if ( !strcmp(ptr,"Float") ) type_ok = 1;
334 else if ( !strcmp(ptr,"Integer") ) type_ok = 1;
335 else if ( !strcmp(ptr,"Flag") ) type_ok = 1;
336 else if ( !strcmp(ptr,"String") ) type_ok = 1;
337 if ( !type_ok ) error("Error: the column type \"%s\" is not supported: %s\n",ptr,type[i]);
338 ct->type = strdup(ptr);
339 free(tmp);
340 }
341 if ( !args->ncolumn2type ) error("Failed to parse the column types\n");
342 for (i=0; i<ntype; i++) free(type[i]);
343 free(type);
344 }
destroy_column2type(args_t * args)345 static void destroy_column2type(args_t *args)
346 {
347 int i;
348 for (i=0; i<args->ncolumn2type; i++)
349 {
350 regfree(args->column2type[i].regex);
351 free(args->column2type[i].regex);
352 free(args->column2type[i].type);
353 }
354 free(args->column2type);
355 }
get_column_type(args_t * args,char * field)356 static const char *get_column_type(args_t *args, char *field)
357 {
358 if ( !args->column2type ) init_column2type(args);
359 int i;
360 for (i=0; i<args->ncolumn2type; i++)
361 {
362 int match = regexec(args->column2type[i].regex, field, 0,NULL,0) ? 0 : 1;
363 if ( match ) return args->column2type[i].type;
364 }
365 return "String";
366 }
query_has_field(char * fmt,char * field,kstring_t * str)367 static int query_has_field(char *fmt, char *field, kstring_t *str)
368 {
369 str->l = 0;
370 kputc('%',str);
371 kputs(field,str);
372 char end, *ptr = fmt;
373 while ( ptr )
374 {
375 ptr = strstr(ptr,str->s);
376 if ( !ptr ) return 0;
377 end = ptr[str->l];
378 if ( isalnum(end) || end=='_' || end=='.' )
379 {
380 ptr++;
381 continue;
382 }
383 break;
384 }
385 return 1;
386 }
strdup_annot_prefix(args_t * args,const char * str)387 char *strdup_annot_prefix(args_t *args, const char *str)
388 {
389 if ( !args->annot_prefix ) return strdup(str);
390 int str_len = strlen(str);
391 int prefix_len = strlen(args->annot_prefix);
392 char *out = calloc(str_len+prefix_len+1,1);
393 memcpy(out,args->annot_prefix,prefix_len);
394 memcpy(out+prefix_len,str,str_len);
395 return out;
396 }
init_data(args_t * args)397 static void init_data(args_t *args)
398 {
399 args->sr = bcf_sr_init();
400 if ( args->regions )
401 {
402 args->sr->require_index = 1;
403 bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
404 if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
405 }
406 if ( args->targets )
407 {
408 bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
409 if ( bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
410 }
411 if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
412 args->hdr = bcf_sr_get_header(args->sr,0);
413 args->hdr_out = bcf_hdr_dup(args->hdr);
414
415 // Parse the header CSQ line, must contain Description with "Format: ..." declaration
416 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL);
417 if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag);
418 int ret = bcf_hrec_find_key(hrec, "Description");
419 if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag);
420 char *format = strstr(hrec->vals[ret], "Format: ");
421 if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]);
422 format += 8;
423 char *ep = format;
424 while ( *ep )
425 {
426 char *bp = ep;
427 while ( *ep && *ep!='|' && *ep!='"' && *ep!='(' ) ep++; // don't include bracket in "pos(1-based)"
428 char tmp = *ep;
429 *ep = 0;
430 args->nfield++;
431 args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field));
432 args->field[args->nfield-1] = strdup_annot_prefix(args,bp);
433 if ( !tmp ) break;
434 *ep = tmp;
435 while ( *ep && *ep!='|' ) ep++; // skip bracket in "pos(1-based)"
436 if ( *ep ) ep++;
437 }
438 if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]);
439 args->field2idx = khash_str2int_init();
440 int i,j;
441 for (i=0; i<args->nfield; i++)
442 {
443 if ( khash_str2int_has_key(args->field2idx, args->field[i]) )
444 {
445 fprintf(stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]);
446 continue;
447 }
448 khash_str2int_set(args->field2idx, args->field[i], i);
449 }
450
451 // Severity scale
452 kstring_t str = {0,0,0};
453 args->csq2severity = khash_str2int_init();
454 int severity = 0;
455 if ( args->severity )
456 {
457 kstring_t tmp = {0,0,0};
458 htsFile *fp = hts_open(args->severity,"r");
459 if ( !fp ) error("Cannot read %s\n", args->severity);
460 while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 )
461 {
462 kputs(tmp.s, &str);
463 kputc('\n', &str);
464 }
465 free(tmp.s);
466 }
467 else
468 kputs(default_severity(),&str);
469 ep = str.s;
470 while ( *ep )
471 {
472 if ( *ep=='#' )
473 {
474 while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; }
475 if ( !*ep ) break;
476 ep++;
477 continue;
478 }
479 char *bp = ep;
480 while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; }
481 char tmp = *ep;
482 *ep = 0;
483 args->nscale++;
484 args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale));
485 args->scale[args->nscale-1] = strdup(bp);
486 if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) )
487 khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity);
488 if ( !tmp ) break;
489 if ( tmp=='\n' ) severity++;
490 ep++;
491 while ( *ep && isspace(*ep) ) ep++;
492 }
493
494 // Transcript and/or consequence selection
495 if ( !args->select ) args->select = "all:any";
496 cols_t *cols = cols_split(args->select, NULL, ':');
497 char *sel_tr = cols->off[0][0] ? cols->off[0] : "all";
498 char *sel_csq = cols->n==2 && cols->off[1][0] ? cols->off[1] : "any";
499 if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL;
500 else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST;
501 else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY;
502 else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr);
503 if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups
504 else
505 {
506 int len = strlen(sel_csq);
507 int severity, modifier = '=';
508 if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; }
509 else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; }
510 if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 )
511 error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq);
512 if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; }
513 else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; }
514 else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; }
515 }
516 cols_destroy(cols);
517
518 // The 'CANONICAL' column to look up severity, its name is hardwired for now
519 if ( args->select_tr==SELECT_TR_PRIMARY )
520 {
521 char *tmp = strdup_annot_prefix(args,"CANONICAL");
522 if ( khash_str2int_get(args->field2idx,tmp,&args->primary_id)!=0 )
523 error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]);
524 free(tmp);
525 }
526
527 // Create a text output as with `bcftools query -f`. For this we need to determine the fields to be extracted
528 // from the formatting expression
529 if ( args->format_str && !args->column_str )
530 {
531 // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present
532 if ( args->all_fields_delim ) expand_csq_expression(args, &str);
533
534 for (i=0; i<args->nfield; i++)
535 {
536 if ( !query_has_field(args->format_str,args->field[i],&str) ) continue;
537
538 int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->field[i]);
539 if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) )
540 fprintf(stderr,"Note: ambiguous key %%%s; using the %s subfield of %s, not the INFO/%s tag\n", args->field[i],args->field[i],args->vep_tag,args->field[i]);
541
542 int olen = args->column_str ? strlen(args->column_str) : 0;
543 int nlen = strlen(args->field[i]);
544 args->column_str = (char*)realloc(args->column_str, olen + nlen + 2);
545 if ( olen )
546 {
547 memcpy(args->column_str+olen,",",1);
548 olen++;
549 }
550 memcpy(args->column_str+olen,args->field[i],nlen);
551 args->column_str[olen+nlen] = 0;
552 }
553 if ( query_has_field(args->format_str,args->vep_tag,&str) ) args->raw_vep_request = 1;
554 }
555
556 // The "Consequence" column to look up severity, its name is hardwired for now
557 char *tmp = strdup_annot_prefix(args,"Consequence");
558 if ( khash_str2int_get(args->field2idx,tmp,&args->csq_idx)!=0 )
559 error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]);
560 free(tmp);
561
562 // Columns to extract: given as names, 0-based indexes or ranges of indexes
563 if ( args->column_str )
564 {
565 int *column = NULL;
566 int *types = NULL;
567 if ( !strcmp("-",args->column_str) ) // all subfields
568 {
569 free(args->column_str);
570 kstring_t str = {0,0,0};
571 ksprintf(&str,"0-%d",args->nfield-1);
572 args->column_str = str.s;
573 }
574 ep = args->column_str;
575 while ( *ep )
576 {
577 char *tp, *bp = ep;
578 while ( *ep && *ep!=',' ) ep++;
579 char keep = *ep;
580 *ep = 0;
581 int type = -1;
582 int idx_beg, idx_end;
583 if ( !strcmp("-",bp) )
584 {
585 kstring_t str = {0,0,0};
586 ksprintf(&str,"0-%d",args->nfield-1);
587 if ( keep ) ksprintf(&str,",%s",ep+1);
588 free(args->column_str);
589 args->column_str = str.s;
590 ep = str.s;
591 continue;
592 }
593 char *tmp = strdup_annot_prefix(args, bp);
594 if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 || khash_str2int_get(args->field2idx, tmp, &idx_beg)==0 )
595 idx_end = idx_beg;
596 else if ( (tp=strrchr(bp,':')) )
597 {
598 *tp = 0;
599 if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 )
600 {
601 *tp = ':';
602 tp = strrchr(tmp,':');
603 *tp = 0;
604 if ( khash_str2int_get(args->field2idx, tmp, &idx_beg)!=0 ) error("No such column: \"%s\"\n", bp);
605 }
606 idx_end = idx_beg;
607 *tp = ':';
608 if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR;
609 else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL;
610 else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT;
611 else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG;
612 else error("The type \"%s\" (or column \"%s\"?) not recognised\n", tp+1,bp);
613 }
614 else
615 {
616 char *mp;
617 idx_beg = strtol(bp,&mp,10);
618 if ( !*mp ) idx_end = idx_beg;
619 else if ( *mp=='-' )
620 idx_end = strtol(mp+1,&mp,10);
621 if ( *mp )
622 {
623 if ( *mp==':' )
624 {
625 idx_end = idx_beg;
626 if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR;
627 else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL;
628 else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT;
629 else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG;
630 else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp);
631 }
632 else if ( !strcmp(bp,args->vep_tag) )
633 {
634 free(tmp);
635 args->raw_vep_request = 1;
636 if ( !keep ) break;
637 ep++;
638 continue;
639 }
640 else
641 error("No such column: \"%s\"\n", bp);
642 }
643 }
644 free(tmp);
645
646 i = args->nannot;
647 args->nannot += idx_end - idx_beg + 1;
648 column = (int*)realloc(column,args->nannot*sizeof(*column));
649 types = (int*)realloc(types,args->nannot*sizeof(*types));
650 for (j=idx_beg; j<=idx_end; j++)
651 {
652 if ( j >= args->nfield ) error("The index is too big: %d\n", j);
653 column[i] = j;
654 types[i] = type;
655 i++;
656 }
657 if ( !keep ) break;
658 ep++;
659 }
660 args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot));
661 for (i=0; i<args->nannot; i++)
662 {
663 annot_t *ann = &args->annot[i];
664 ann->type = types[i];
665 ann->idx = j = column[i];
666 ann->field = strdup(args->field[j]);
667 ann->tag = strdup(args->field[j]);
668 args->kstr.l = 0;
669 const char *type = "String";
670 if ( ann->type==BCF_HT_REAL ) type = "Float";
671 else if ( ann->type==BCF_HT_INT ) type = "Integer";
672 else if ( ann->type==BCF_HT_FLAG ) type = "Flag";
673 else if ( ann->type==BCF_HT_STR ) type = "String";
674 else if ( ann->type==-1 ) type = get_column_type(args, args->field[j]);
675 ksprintf(&args->kstr,"##INFO=<ID=%%s,Number=.,Type=%s,Description=\"The %%s field from INFO/%%s\">",type);
676 bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag);
677 }
678 if ( args->raw_vep_request && args->select_tr==SELECT_TR_ALL ) args->raw_vep_request = 0;
679 if ( args->raw_vep_request )
680 {
681 args->nannot++;
682 args->annot = (annot_t*)realloc(args->annot,args->nannot*sizeof(*args->annot));
683 annot_t *ann = &args->annot[args->nannot-1];
684 ann->type = BCF_HT_STR;
685 ann->idx = -1;
686 ann->field = strdup(args->vep_tag);
687 ann->tag = strdup(args->vep_tag);
688 memset(&ann->str,0,sizeof(ann->str));
689 }
690 free(column);
691 free(types);
692 destroy_column2type(args);
693
694 if ( bcf_hdr_sync(args->hdr_out)<0 )
695 error_errno("[%s] Failed to update header", __func__);
696 }
697 if ( args->format_str )
698 {
699 if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag);
700 args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str);
701 if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str);
702 if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
703 }
704 if ( args->filter_str )
705 {
706 int max_unpack = args->convert ? convert_max_unpack(args->convert) : 0;
707 args->filter = filter_init(args->hdr_out, args->filter_str);
708 max_unpack |= filter_max_unpack(args->filter);
709 if ( !args->format_str ) max_unpack |= BCF_UN_FMT; // don't drop FMT fields on VCF input when VCF/BCF is output
710 args->sr->max_unpack = max_unpack;
711 if ( args->convert && (max_unpack & BCF_UN_FMT) )
712 convert_set_option(args->convert, subset_samples, &args->smpl_pass);
713 }
714
715 free(str.s);
716 }
destroy_data(args_t * args)717 static void destroy_data(args_t *args)
718 {
719 free(args->farr);
720 free(args->iarr);
721 free(args->kstr.s);
722 free(args->column_str);
723 free(args->format_str);
724 cols_destroy(args->cols_csq);
725 cols_destroy(args->cols_tr);
726 int i;
727 for (i=0; i<args->nscale; i++) free(args->scale[i]);
728 free(args->scale);
729 for (i=0; i<args->nfield; i++) free(args->field[i]);
730 free(args->field);
731 for (i=0; i<args->nannot; i++)
732 {
733 annot_t *ann = &args->annot[i];
734 free(ann->field);
735 free(ann->tag);
736 free(ann->str.s);
737 }
738 free(args->annot);
739 if ( args->field2idx ) khash_str2int_destroy(args->field2idx);
740 if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity);
741 bcf_sr_destroy(args->sr);
742 bcf_hdr_destroy(args->hdr_out);
743 free(args->csq_str);
744 if ( args->filter ) filter_destroy(args->filter);
745 if ( args->convert ) convert_destroy(args->convert);
746 if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
747 if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
748 free(args);
749 }
list_header(args_t * args)750 static void list_header(args_t *args)
751 {
752 int i;
753 for (i=0; i<args->nfield; i++) printf("%d\t%s\n", i,args->field[i]);
754 }
755
csq_to_severity(args_t * args,char * csq,int * min_severity,int * max_severity,int exact_match)756 static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match)
757 {
758 *min_severity = INT_MAX;
759 *max_severity = -1;
760 char *ep = csq;
761 while ( *ep )
762 {
763 char *bp = ep;
764 while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; }
765 char tmp = *ep;
766 *ep = 0;
767
768 int i, severity = -1;
769 if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 )
770 {
771 for (i=0; i<args->nscale; i++)
772 if ( strstr(bp,args->scale[i]) ) break;
773
774 if ( i!=args->nscale )
775 khash_str2int_get(args->csq2severity, args->scale[i], &severity);
776 else
777 severity = args->nscale + 1;
778
779 args->nscale++;
780 args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale));
781 args->scale[args->nscale-1] = strdup(bp);
782 khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity);
783 if ( i==args->nscale )
784 fprintf(stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale);
785
786 if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) error("FIXME: failed to look up the consequence \"%s\"\n", bp);
787 }
788 if ( exact_match < 0 )
789 {
790 if ( *min_severity > severity ) *min_severity = severity;
791 if ( *max_severity < severity ) *max_severity = severity;
792 }
793 else
794 {
795 if ( severity==exact_match )
796 {
797 *min_severity = *max_severity = severity;
798 *ep = tmp;
799 return;
800 }
801 }
802
803 if ( !tmp ) break;
804 *ep = tmp;
805 ep++;
806 }
807 }
808
csq_severity_pass(args_t * args,char * csq)809 static int csq_severity_pass(args_t *args, char *csq)
810 {
811 if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1;
812
813 int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1;
814 csq_to_severity(args, csq, &min_severity, &max_severity, exact_match);
815 if ( max_severity < args->min_severity ) return 0;
816 if ( min_severity > args->max_severity ) return 0;
817 return 1;
818 }
819
get_primary_transcript(args_t * args,bcf1_t * rec,cols_t * cols_tr)820 static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq!
821 {
822 int i;
823 for (i=0; i<cols_tr->n; i++)
824 {
825 args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|');
826 if ( args->primary_id >= args->cols_csq->n )
827 error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n);
828 if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i;
829 }
830 return -1;
831 }
get_worst_transcript(args_t * args,bcf1_t * rec,cols_t * cols_tr)832 static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq!
833 {
834 int i, max_severity = -1, imax_severity = 0;
835 for (i=0; i<cols_tr->n; i++)
836 {
837 args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|');
838 if ( args->csq_idx >= args->cols_csq->n )
839 error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n);
840 char *csq = args->cols_csq->off[args->csq_idx];
841
842 int min, max;
843 csq_to_severity(args, csq, &min, &max, -1);
844 if ( max_severity < max ) { imax_severity = i; max_severity = max; }
845 }
846 return imax_severity;
847 }
annot_reset(annot_t * annot,int nannot)848 static void annot_reset(annot_t *annot, int nannot)
849 {
850 int i;
851 for (i=0; i<nannot; i++) annot[i].str.l = 0;
852 }
annot_append(annot_t * ann,char * value)853 static void annot_append(annot_t *ann, char *value)
854 {
855 if ( ann->str.l ) kputc(',',&ann->str);
856 kputs(value, &ann->str);
857 }
parse_array_real(char * str,float ** arr,int * marr,int * narr)858 static inline void parse_array_real(char *str, float **arr, int *marr, int *narr)
859 {
860 char *bp = str, *ep;
861 float *ptr = *arr;
862 int i, n = 1, m = *marr;
863 for (i=0; *bp; bp++)
864 if ( *bp == ',' ) n++;
865
866 hts_expand(float*,n,m,ptr);
867
868 i = 0;
869 bp = str;
870 while ( *bp )
871 {
872 ptr[i] = strtod(bp, &ep);
873 if ( bp==ep )
874 bcf_float_set_missing(ptr[i]);
875 i++;
876 while ( *ep && *ep!=',' ) ep++;
877 bp = *ep ? ep + 1 : ep;
878 }
879 *narr = i;
880 *marr = m;
881 *arr = ptr;
882 }
parse_array_int32(char * str,int ** arr,int * marr,int * narr)883 static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr)
884 {
885 char *bp = str, *ep;
886 int32_t *ptr = *arr;
887 int i, n = 1, m = *marr;
888 for (i=0; *bp; bp++)
889 if ( *bp == ',' ) n++;
890
891 hts_expand(int32_t*,n,m,ptr);
892
893 i = 0;
894 bp = str;
895 while ( *bp )
896 {
897 ptr[i] = strtol(bp, &ep, 10);
898 if ( bp==ep )
899 ptr[i] = bcf_int32_missing;
900 i++;
901 while ( *ep && *ep!=',' ) ep++;
902 bp = *ep ? ep + 1 : ep;
903 }
904 *narr = i;
905 *marr = m;
906 *arr = ptr;
907 }
filter_and_output(args_t * args,bcf1_t * rec,int severity_pass,int all_missing)908 static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing)
909 {
910 int i, updated = 0;
911 for (i=0; i<args->nannot; i++)
912 {
913 annot_t *ann = &args->annot[i];
914 if ( !ann->str.l ) continue;
915 if ( ann->type==BCF_HT_REAL )
916 {
917 parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr);
918 bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr);
919 }
920 else if ( ann->type==BCF_HT_INT )
921 {
922 parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr);
923 bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr);
924 }
925 else
926 bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s);
927 updated++;
928 }
929 if ( args->filter )
930 {
931 int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass);
932 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
933 if ( !pass ) return;
934 }
935 if ( args->format_str )
936 {
937 if ( args->nannot )
938 {
939 if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing
940 }
941 else
942 {
943 if ( !severity_pass ) return; // request to print only non-CSQ tags at sites that pass severity
944 }
945
946 args->kstr.l = 0;
947 convert_line(args->convert, rec, &args->kstr);
948 if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l )
949 error("Failed to write to %s\n", args->output_fname);
950 return;
951 }
952 if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 )
953 error("Failed to write to %s\n", args->output_fname);
954 }
process_record(args_t * args,bcf1_t * rec)955 static void process_record(args_t *args, bcf1_t *rec)
956 {
957 int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str);
958 if ( len<=0 )
959 {
960 if ( !args->drop_sites )
961 {
962 annot_reset(args->annot, args->nannot);
963 filter_and_output(args,rec,1,1);
964 }
965 return;
966 }
967
968 args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
969
970 int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1;
971 if ( args->select_tr==SELECT_TR_PRIMARY )
972 {
973 itr_min = itr_max = get_primary_transcript(args, rec, args->cols_tr);
974 if ( itr_min<0 ) itr_max = itr_min - 1;
975 }
976 else if ( args->select_tr==SELECT_TR_WORST )
977 itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr);
978
979 annot_reset(args->annot, args->nannot);
980 int severity_pass = 0; // consequence severity requested via the -s option (BCF record may be output but not annotated)
981 int all_missing = 1; // transcripts with all requested annotations missing will be discarded if -f was given
982 static int too_few_fields_warned = 0;
983 for (i=itr_min; i<=itr_max; i++)
984 {
985 args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|');
986 if ( args->csq_idx >= args->cols_csq->n )
987 error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n);
988
989 char *csq = args->cols_csq->off[args->csq_idx];
990 if ( !csq_severity_pass(args, csq) ) continue;
991 severity_pass = 1;
992
993 for (j=0; j<args->nannot; j++)
994 {
995 annot_t *ann = &args->annot[j];
996 if ( ann->idx >= args->cols_csq->n )
997 {
998 if ( !too_few_fields_warned )
999 {
1000 fprintf(stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
1001 too_few_fields_warned = 1;
1002 }
1003 annot_append(ann, ".");
1004 continue;
1005 }
1006
1007 char *ann_str = NULL;
1008 if ( ann->idx==-1 ) ann_str = args->cols_tr->off[i];
1009 else if ( *args->cols_csq->off[ann->idx] ) ann_str = args->cols_csq->off[ann->idx];
1010 if ( ann_str )
1011 {
1012 annot_append(ann, ann_str);
1013 all_missing = 0;
1014 }
1015 else
1016 annot_append(ann, "."); // missing value
1017 }
1018
1019 if ( args->duplicate )
1020 {
1021 filter_and_output(args, rec, severity_pass, all_missing);
1022 annot_reset(args->annot, args->nannot);
1023 all_missing = 1;
1024 severity_pass = 0;
1025 }
1026 }
1027 if ( !severity_pass && args->drop_sites ) return;
1028 if ( !args->duplicate )
1029 filter_and_output(args, rec, severity_pass, all_missing);
1030 }
1031
run(int argc,char ** argv)1032 int run(int argc, char **argv)
1033 {
1034 args_t *args = (args_t*) calloc(1,sizeof(args_t));
1035 args->argc = argc; args->argv = argv;
1036 args->output_fname = "-";
1037 args->output_type = FT_VCF;
1038 args->vep_tag = "CSQ";
1039 args->record_cmd_line = 1;
1040 args->regions_overlap = 1;
1041 args->targets_overlap = 0;
1042 args->clevel = -1;
1043 static struct option loptions[] =
1044 {
1045 {"drop-sites",no_argument,0,'x'},
1046 {"all-fields",no_argument,0,'A'},
1047 {"duplicate",no_argument,0,'d'},
1048 {"format",required_argument,0,'f'},
1049 {"annotation",required_argument,0,'a'},
1050 {"annot-prefix",required_argument,0,'p'},
1051 {"columns",required_argument,0,'c'},
1052 {"columns-types",required_argument,0,1},
1053 {"select",required_argument,0,'s'},
1054 {"severity",required_argument,0,'S'},
1055 {"list",no_argument,0,'l'},
1056 {"include",required_argument,0,'i'},
1057 {"exclude",required_argument,0,'e'},
1058 {"output",required_argument,NULL,'o'},
1059 {"output-type",required_argument,NULL,'O'},
1060 {"regions",1,0,'r'},
1061 {"regions-file",1,0,'R'},
1062 {"regions-overlap",required_argument,NULL,3},
1063 {"targets",1,0,'t'},
1064 {"targets-file",1,0,'T'},
1065 {"targets-overlap",required_argument,NULL,4},
1066 {"no-version",no_argument,NULL,2},
1067 {"allow-undef-tags",no_argument,0,'u'},
1068 {NULL,0,NULL,0}
1069 };
1070 int c;
1071 char *tmp;
1072 while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xu",loptions,NULL)) >= 0)
1073 {
1074 switch (c)
1075 {
1076 case 2 : args->record_cmd_line = 0; break;
1077 case 1 : args->column_types = optarg; break;
1078 case 'A':
1079 if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t";
1080 else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " ";
1081 else args->all_fields_delim = optarg;
1082 break;
1083 case 'x': args->drop_sites = 1; break;
1084 case 'd': args->duplicate = 1; break;
1085 case 'f': args->format_str = strdup(optarg); break;
1086 case 'a': args->vep_tag = optarg; break;
1087 case 'p': args->annot_prefix = optarg; break;
1088 case 'c': args->column_str = strdup(optarg); break;
1089 case 'S': args->severity = optarg; break;
1090 case 's': args->select = optarg; break;
1091 case 'l': args->list_hdr = 1; break;
1092 case 'u': args->allow_undef_tags = 1; break;
1093 case 'e':
1094 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1095 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1096 case 'i':
1097 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1098 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1099 case 't': args->targets = optarg; break;
1100 case 'T': args->targets = optarg; args->targets_is_file = 1; break;
1101 case 'r': args->regions = optarg; break;
1102 case 'R': args->regions = optarg; args->regions_is_file = 1; break;
1103 case 'o': args->output_fname = optarg; break;
1104 case 'O':
1105 switch (optarg[0]) {
1106 case 'b': args->output_type = FT_BCF_GZ; break;
1107 case 'u': args->output_type = FT_BCF; break;
1108 case 'z': args->output_type = FT_VCF_GZ; break;
1109 case 'v': args->output_type = FT_VCF; break;
1110 default:
1111 {
1112 args->clevel = strtol(optarg,&tmp,10);
1113 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1114 }
1115 }
1116 if ( optarg[1] )
1117 {
1118 args->clevel = strtol(optarg+1,&tmp,10);
1119 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1120 }
1121 break;
1122 case 3 :
1123 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1124 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1125 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1126 else error("Could not parse: --regions-overlap %s\n",optarg);
1127 break;
1128 case 4 :
1129 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1130 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1131 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1132 else error("Could not parse: --targets-overlap %s\n",optarg);
1133 break;
1134 case 'h':
1135 case '?':
1136 default: error("%s", usage_text()); break;
1137 }
1138 }
1139 if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n");
1140 if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n");
1141 if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity());
1142 if ( args->column_types && !strcmp("-",args->column_types) ) error("%s", default_column_types());
1143 if ( optind==argc )
1144 {
1145 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
1146 else { error("%s", usage_text()); }
1147 }
1148 else if ( optind+1!=argc ) error("%s", usage_text());
1149 else args->fname = argv[optind];
1150
1151 init_data(args);
1152
1153 if ( args->list_hdr )
1154 list_header(args);
1155 else
1156 {
1157 if ( !args->format_str && !args->column_str )
1158 {
1159 if ( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY )
1160 error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n");
1161 else if ( !args->drop_sites )
1162 error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n");
1163 }
1164
1165 if ( args->format_str )
1166 args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu");
1167 else
1168 {
1169 char wmode[8];
1170 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
1171 args->fh_vcf = hts_open(args->output_fname ? args->output_fname : "-", wmode);
1172 if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep");
1173 if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname);
1174 }
1175 while ( bcf_sr_next_line(args->sr) )
1176 process_record(args, bcf_sr_get_line(args->sr,0));
1177 }
1178
1179 destroy_data(args);
1180
1181 return 0;
1182 }
1183