1 /*
2 Copyright (C) 2017-2021 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24 /*
25 Compress gVCF file by resizing gVCF blocks according to specified criteria.
26 */
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <getopt.h>
32 #include <stdarg.h>
33 #include <unistd.h>
34 #include <stdint.h>
35 #include <errno.h>
36 #include <ctype.h>
37 #include <inttypes.h>
38 #include <sys/stat.h>
39 #include <htslib/vcf.h>
40 #include <htslib/vcfutils.h>
41 #include <htslib/synced_bcf_reader.h>
42 #include "bcftools.h"
43 #include "filter.h"
44
45 #define FLT_INCLUDE 1
46 #define FLT_EXCLUDE 2
47
48 #define GQ_KEY_NONE NULL
49 #define GQ_KEY_GQ "GQ"
50 #define GQ_KEY_RGQ "RGQ"
51
52 typedef struct
53 {
54 int32_t end, min_dp, gq, pl[3], grp;
55 char *gq_key;
56 bcf1_t *rec;
57 }
58 block_t;
59 typedef struct
60 {
61 char *expr; // expression
62 int flt_id; // filter id, -1 for PASS
63 filter_t *flt; // filter
64 }
65 grp_t;
66 typedef struct
67 {
68 filter_t *filter;
69 char *filter_str;
70 int filter_logic;
71 block_t gvcf;
72 htsFile *fh_out;
73 int ngrp;
74 grp_t *grp;
75 char *group_by;
76 int argc, region_is_file, target_is_file, output_type, trim_alts, clevel;
77 int32_t *tmpi, mtmpi, mean_min_dp_reported;
78 char **argv, *region, *target, *fname, *output_fname, *keep_tags;
79 bcf_hdr_t *hdr_in, *hdr_out;
80 bcf_srs_t *sr;
81 }
82 args_t;
83
about(void)84 const char *about(void)
85 {
86 return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n";
87 }
88
usage_text(void)89 static const char *usage_text(void)
90 {
91 return
92 "\n"
93 "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"
94 "\n"
95 "Usage: bcftools +gvcfz [Options]\n"
96 "Plugin options:\n"
97 " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n"
98 " -e, --exclude <expr> exclude sites for which the expression is true\n"
99 " -i, --include <expr> include sites for which the expression is true\n"
100 " -g, --group-by EXPR group gVCF blocks according to the expression\n"
101 " -o, --output FILE write gVCF output to the FILE\n"
102 " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
103 "Examples:\n"
104 " # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n"
105 " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n"
106 "\n"
107 " # Compress all non-reference sites into a single block, remove unused alternate alleles\n"
108 " bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n"
109 "\n";
110 }
111
init_groups(args_t * args)112 static void init_groups(args_t *args)
113 {
114 args->hdr_out = bcf_hdr_dup(args->hdr_in);
115 bcf_hdr_printf(args->hdr_out, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"Stop position of the interval\">");
116
117 // avoid nested double quotes in FILTER description
118 char *hdr_str = strdup(args->group_by);
119 char *tmp = hdr_str;
120 while (*tmp)
121 {
122 if ( *tmp=='"' ) *tmp = '\'';
123 tmp++;
124 }
125
126 char *rmme_str = strdup(args->group_by), *beg = rmme_str;
127 while ( *beg )
128 {
129 while ( *beg && isspace(*beg) ) beg++;
130 if ( !beg ) break;
131 char *end = beg;
132 while ( *end && *end!=':' ) end++;
133 if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by);
134 *end = 0;
135 char *flt = beg;
136 beg = ++end;
137 while ( *end && *end!=';' ) end++;
138 char tmp = *end; *end = 0;
139 if ( strcmp(flt,"PASS") )
140 {
141 bcf_hdr_printf(args->hdr_out, "##FILTER=<ID=%s,Description=\"%s\">", flt, hdr_str);
142 if (bcf_hdr_sync(args->hdr_out) < 0)
143 error_errno("[%s] Failed to update header", __func__);
144 }
145 args->ngrp++;
146 args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp);
147 grp_t *grp = args->grp + args->ngrp - 1;
148 grp->expr = strdup(beg);
149 grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
150 if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt);
151 if ( !strcmp(flt,"PASS") ) grp->flt_id = -1;
152
153 // remove trailing spaces
154 beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; }
155 beg = grp->expr; while ( *beg && isspace(*beg) ) beg++;
156
157 grp->flt = strcmp("-",beg) ? filter_init(args->hdr_in, grp->expr) : NULL;
158
159 if ( !tmp ) break;
160 beg = end + 1;
161 }
162 free(rmme_str);
163 free(hdr_str);
164 }
165
destroy_data(args_t * args)166 static void destroy_data(args_t *args)
167 {
168 int i;
169 for (i=0; i<args->ngrp; i++)
170 {
171 if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt);
172 free(args->grp[i].expr);
173 }
174 free(args->grp);
175
176 if ( args->filter ) filter_destroy(args->filter);
177 if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname);
178
179 bcf_sr_destroy(args->sr);
180 if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
181 if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec);
182 free(args->tmpi);
183 free(args);
184 }
185
flush_block(args_t * args,bcf1_t * rec)186 static void flush_block(args_t *args, bcf1_t *rec)
187 {
188 block_t *gvcf = &args->gvcf;
189 if ( gvcf->grp < 0 ) return;
190 if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based
191
192 if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 )
193 error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
194 if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 )
195 error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
196 if ( gvcf->gq_key )
197 {
198 if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 )
199 error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
200 }
201 if ( gvcf->pl[0] >=0 )
202 {
203 if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 )
204 error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
205 }
206 if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 )
207 bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id);
208
209 if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n");
210
211 gvcf->grp = -1;
212 }
process_gvcf(args_t * args)213 static void process_gvcf(args_t *args)
214 {
215 bcf1_t *rec = bcf_sr_get_line(args->sr,0);
216
217 if ( args->filter )
218 {
219 int pass = filter_test(args->filter, rec, NULL);
220 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
221 if ( !pass ) return;
222 }
223
224 if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("<NON_REF>",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) )
225 {
226 if ( args->trim_alts )
227 {
228 bcf_unpack(rec, BCF_UN_ALL);
229 if ( bcf_trim_alleles(args->hdr_in, rec)<0 )
230 error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t) rec->pos+1);
231
232 // trim the ref allele if necessary
233 if ( rec->d.allele[0][1] )
234 {
235 rec->d.allele[0][1] = 0;
236 bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1);
237 }
238
239 }
240 if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("<NON_REF>",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) )
241 {
242 // not a gvcf block
243 flush_block(args, rec);
244 if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n");
245 return;
246 }
247 }
248
249 int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi);
250 int32_t end = ret==1 ? args->tmpi[0] : rec->pos + 1;
251
252 char *gq_key = GQ_KEY_GQ;
253 ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi);
254 if ( ret!=1 )
255 {
256 gq_key = GQ_KEY_RGQ;
257 if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi);
258 if ( ret!=1 ) gq_key = GQ_KEY_NONE;
259 }
260 int32_t gq = ret==1 ? args->tmpi[0] : 0;
261
262 int32_t min_dp = 0;
263 if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 )
264 min_dp = args->tmpi[0];
265 else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 )
266 min_dp = args->tmpi[0];
267 else
268 error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1);
269
270 int32_t pl[3] = {-1,-1,-1};
271 ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi);
272 if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1);
273 else if ( ret==3 )
274 {
275 pl[0] = args->tmpi[0];
276 pl[1] = args->tmpi[1];
277 pl[2] = args->tmpi[2];
278 }
279
280 int i;
281 for (i=0; i<args->ngrp; i++)
282 if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break;
283
284 if ( args->gvcf.grp != i ) flush_block(args, rec); // new block
285 if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL); // new chromosome
286
287 if ( args->gvcf.grp >= 0 ) // extend an existing block
288 {
289 if ( args->gvcf.end < end ) args->gvcf.end = end;
290 if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq;
291 if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp;
292 if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0];
293 if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1];
294 if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2];
295 return;
296 }
297
298 // start a new block
299 args->gvcf.rec = bcf_copy(args->gvcf.rec, rec);
300 args->gvcf.grp = i;
301 args->gvcf.min_dp = min_dp;
302 args->gvcf.end = end;
303 args->gvcf.pl[0] = pl[0];
304 args->gvcf.pl[1] = pl[1];
305 args->gvcf.pl[2] = pl[2];
306 args->gvcf.gq_key = gq_key;
307 if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq;
308 }
309
run(int argc,char ** argv)310 int run(int argc, char **argv)
311 {
312 args_t *args = (args_t*) calloc(1,sizeof(args_t));
313 args->argc = argc; args->argv = argv;
314 args->output_type = FT_VCF;
315 args->output_fname = "-";
316 args->clevel = -1;
317 static struct option loptions[] =
318 {
319 {"trim-alt-alleles",required_argument,0,'a'},
320 {"include",required_argument,0,'i'},
321 {"exclude",required_argument,0,'e'},
322 {"group-by",required_argument,NULL,'g'},
323 {"stats",required_argument,NULL,'s'},
324 {"output",required_argument,NULL,'o'},
325 {"output-type",required_argument,NULL,'O'},
326 {NULL,0,NULL,0}
327 };
328 int c;
329 char *tmp;
330 while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0)
331 {
332 switch (c)
333 {
334 case 'a': args->trim_alts = 1; break;
335 case 'e':
336 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
337 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
338 case 'i':
339 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
340 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
341 case 'g': args->group_by = optarg; break;
342 case 'o': args->output_fname = optarg; break;
343 case 'O':
344 switch (optarg[0]) {
345 case 'b': args->output_type = FT_BCF_GZ; break;
346 case 'u': args->output_type = FT_BCF; break;
347 case 'z': args->output_type = FT_VCF_GZ; break;
348 case 'v': args->output_type = FT_VCF; break;
349 default:
350 {
351 args->clevel = strtol(optarg,&tmp,10);
352 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
353 }
354 }
355 if ( optarg[1] )
356 {
357 args->clevel = strtol(optarg+1,&tmp,10);
358 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
359 }
360 break;
361 case 'h':
362 case '?':
363 default: error("%s", usage_text()); break;
364 }
365 }
366 if ( optind==argc )
367 {
368 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
369 else { error("%s", usage_text()); }
370 }
371 else if ( optind+1!=argc ) error("%s", usage_text());
372 else args->fname = argv[optind];
373
374 if ( !args->group_by ) error("Missing the -g option\n");
375
376 args->gvcf.rec = bcf_init();
377 args->gvcf.grp = -1; // the block is inactive
378 args->sr = bcf_sr_init();
379 if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
380 args->hdr_in = bcf_sr_get_header(args->sr,0);
381 if ( args->filter_str )
382 args->filter = filter_init(args->hdr_in, args->filter_str);
383 init_groups(args);
384 char wmode[8];
385 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
386 args->fh_out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
387 if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n");
388 while ( bcf_sr_next_line(args->sr) ) process_gvcf(args);
389 flush_block(args, NULL);
390
391 destroy_data(args);
392 return 0;
393 }
394
395
396