1 /*
2     Copyright (C) 2017-2021 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 /*
25     Compress gVCF file by resizing gVCF blocks according to specified criteria.
26 */
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <getopt.h>
32 #include <stdarg.h>
33 #include <unistd.h>
34 #include <stdint.h>
35 #include <errno.h>
36 #include <ctype.h>
37 #include <inttypes.h>
38 #include <sys/stat.h>
39 #include <htslib/vcf.h>
40 #include <htslib/vcfutils.h>
41 #include <htslib/synced_bcf_reader.h>
42 #include "bcftools.h"
43 #include "filter.h"
44 
45 #define FLT_INCLUDE 1
46 #define FLT_EXCLUDE 2
47 
48 #define GQ_KEY_NONE NULL
49 #define GQ_KEY_GQ   "GQ"
50 #define GQ_KEY_RGQ  "RGQ"
51 
52 typedef struct
53 {
54     int32_t end, min_dp, gq, pl[3], grp;
55     char *gq_key;
56     bcf1_t *rec;
57 }
58 block_t;
59 typedef struct
60 {
61     char *expr;     // expression
62     int flt_id;     // filter id, -1 for PASS
63     filter_t *flt;  // filter
64 }
65 grp_t;
66 typedef struct
67 {
68     filter_t *filter;
69     char *filter_str;
70     int filter_logic;
71     block_t gvcf;
72     htsFile *fh_out;
73     int ngrp;
74     grp_t *grp;
75     char *group_by;
76     int argc, region_is_file, target_is_file, output_type, trim_alts, clevel;
77     int32_t *tmpi, mtmpi, mean_min_dp_reported;
78     char **argv, *region, *target, *fname, *output_fname, *keep_tags;
79     bcf_hdr_t *hdr_in, *hdr_out;
80     bcf_srs_t *sr;
81 }
82 args_t;
83 
about(void)84 const char *about(void)
85 {
86     return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n";
87 }
88 
usage_text(void)89 static const char *usage_text(void)
90 {
91     return
92         "\n"
93         "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"
94         "\n"
95         "Usage: bcftools +gvcfz [Options]\n"
96         "Plugin options:\n"
97         "   -a, --trim-alt-alleles          trim alternate alleles not seen in the genotypes\n"
98         "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
99         "   -i, --include <expr>            include sites for which the expression is true\n"
100         "   -g, --group-by EXPR             group gVCF blocks according to the expression\n"
101         "   -o, --output FILE               write gVCF output to the FILE\n"
102         "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
103         "Examples:\n"
104         "   # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n"
105         "   bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n"
106         "\n"
107         "   # Compress all non-reference sites into a single block, remove unused alternate alleles\n"
108         "   bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n"
109         "\n";
110 }
111 
init_groups(args_t * args)112 static void init_groups(args_t *args)
113 {
114     args->hdr_out = bcf_hdr_dup(args->hdr_in);
115     bcf_hdr_printf(args->hdr_out, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"Stop position of the interval\">");
116 
117     // avoid nested double quotes in FILTER description
118     char *hdr_str = strdup(args->group_by);
119     char *tmp = hdr_str;
120     while (*tmp)
121     {
122         if ( *tmp=='"' ) *tmp = '\'';
123         tmp++;
124     }
125 
126     char *rmme_str = strdup(args->group_by), *beg = rmme_str;
127     while ( *beg )
128     {
129         while ( *beg && isspace(*beg) ) beg++;
130         if ( !beg ) break;
131         char *end = beg;
132         while ( *end && *end!=':' ) end++;
133         if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by);
134         *end = 0;
135         char *flt = beg;
136         beg = ++end;
137         while ( *end && *end!=';' ) end++;
138         char tmp = *end; *end = 0;
139         if ( strcmp(flt,"PASS") )
140         {
141             bcf_hdr_printf(args->hdr_out, "##FILTER=<ID=%s,Description=\"%s\">", flt, hdr_str);
142             if (bcf_hdr_sync(args->hdr_out) < 0)
143                 error_errno("[%s] Failed to update header", __func__);
144         }
145         args->ngrp++;
146         args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp);
147         grp_t *grp = args->grp + args->ngrp - 1;
148         grp->expr = strdup(beg);
149         grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
150         if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt);
151         if ( !strcmp(flt,"PASS") ) grp->flt_id = -1;
152 
153         // remove trailing spaces
154         beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; }
155         beg = grp->expr; while ( *beg && isspace(*beg) ) beg++;
156 
157         grp->flt = strcmp("-",beg) ? filter_init(args->hdr_in, grp->expr) : NULL;
158 
159         if ( !tmp ) break;
160         beg = end + 1;
161     }
162     free(rmme_str);
163     free(hdr_str);
164 }
165 
destroy_data(args_t * args)166 static void destroy_data(args_t *args)
167 {
168     int i;
169     for (i=0; i<args->ngrp; i++)
170     {
171         if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt);
172         free(args->grp[i].expr);
173     }
174     free(args->grp);
175 
176     if ( args->filter ) filter_destroy(args->filter);
177     if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname);
178 
179     bcf_sr_destroy(args->sr);
180     if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
181     if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec);
182     free(args->tmpi);
183     free(args);
184 }
185 
flush_block(args_t * args,bcf1_t * rec)186 static void flush_block(args_t *args, bcf1_t *rec)
187 {
188     block_t *gvcf = &args->gvcf;
189     if ( gvcf->grp < 0 ) return;
190     if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based
191 
192     if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 )
193         error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
194     if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 )
195         error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
196     if ( gvcf->gq_key )
197     {
198         if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 )
199             error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
200     }
201     if ( gvcf->pl[0] >=0 )
202     {
203         if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 )
204             error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
205     }
206     if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 )
207         bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id);
208 
209     if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n");
210 
211     gvcf->grp = -1;
212 }
process_gvcf(args_t * args)213 static void process_gvcf(args_t *args)
214 {
215     bcf1_t *rec = bcf_sr_get_line(args->sr,0);
216 
217     if ( args->filter )
218     {
219         int pass = filter_test(args->filter, rec, NULL);
220         if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
221         if ( !pass ) return;
222     }
223 
224     if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("<NON_REF>",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) )
225     {
226         if ( args->trim_alts )
227         {
228             bcf_unpack(rec, BCF_UN_ALL);
229             if ( bcf_trim_alleles(args->hdr_in, rec)<0 )
230                 error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t)  rec->pos+1);
231 
232             // trim the ref allele if necessary
233             if ( rec->d.allele[0][1] )
234             {
235                 rec->d.allele[0][1] = 0;
236                 bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1);
237             }
238 
239         }
240         if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("<NON_REF>",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) )
241         {
242             // not a gvcf block
243             flush_block(args, rec);
244             if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n");
245             return;
246         }
247     }
248 
249     int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi);
250     int32_t end = ret==1 ? args->tmpi[0] : rec->pos + 1;
251 
252     char *gq_key = GQ_KEY_GQ;
253     ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi);
254     if ( ret!=1 )
255     {
256         gq_key = GQ_KEY_RGQ;
257         if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi);
258         if ( ret!=1 ) gq_key = GQ_KEY_NONE;
259     }
260     int32_t gq = ret==1 ? args->tmpi[0] : 0;
261 
262     int32_t min_dp = 0;
263     if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 )
264         min_dp = args->tmpi[0];
265     else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 )
266         min_dp = args->tmpi[0];
267     else
268         error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1);
269 
270     int32_t pl[3] = {-1,-1,-1};
271     ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi);
272     if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1);
273     else if ( ret==3 )
274     {
275         pl[0] = args->tmpi[0];
276         pl[1] = args->tmpi[1];
277         pl[2] = args->tmpi[2];
278     }
279 
280     int i;
281     for (i=0; i<args->ngrp; i++)
282         if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break;
283 
284     if ( args->gvcf.grp != i ) flush_block(args, rec);      // new block
285     if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL);  // new chromosome
286 
287     if ( args->gvcf.grp >= 0 ) // extend an existing block
288     {
289         if ( args->gvcf.end < end ) args->gvcf.end = end;
290         if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq;
291         if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp;
292         if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0];
293         if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1];
294         if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2];
295         return;
296     }
297 
298     // start a new block
299     args->gvcf.rec = bcf_copy(args->gvcf.rec, rec);
300     args->gvcf.grp = i;
301     args->gvcf.min_dp   = min_dp;
302     args->gvcf.end      = end;
303     args->gvcf.pl[0]    = pl[0];
304     args->gvcf.pl[1]    = pl[1];
305     args->gvcf.pl[2]    = pl[2];
306     args->gvcf.gq_key   = gq_key;
307     if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq;
308 }
309 
run(int argc,char ** argv)310 int run(int argc, char **argv)
311 {
312     args_t *args = (args_t*) calloc(1,sizeof(args_t));
313     args->argc   = argc; args->argv = argv;
314     args->output_type  = FT_VCF;
315     args->output_fname = "-";
316     args->clevel = -1;
317     static struct option loptions[] =
318     {
319         {"trim-alt-alleles",required_argument,0,'a'},
320         {"include",required_argument,0,'i'},
321         {"exclude",required_argument,0,'e'},
322         {"group-by",required_argument,NULL,'g'},
323         {"stats",required_argument,NULL,'s'},
324         {"output",required_argument,NULL,'o'},
325         {"output-type",required_argument,NULL,'O'},
326         {NULL,0,NULL,0}
327     };
328     int c;
329     char *tmp;
330     while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0)
331     {
332         switch (c)
333         {
334             case 'a': args->trim_alts = 1; break;
335             case 'e':
336                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
337                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
338             case 'i':
339                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
340                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
341             case 'g': args->group_by = optarg; break;
342             case 'o': args->output_fname = optarg; break;
343             case 'O':
344                       switch (optarg[0]) {
345                           case 'b': args->output_type = FT_BCF_GZ; break;
346                           case 'u': args->output_type = FT_BCF; break;
347                           case 'z': args->output_type = FT_VCF_GZ; break;
348                           case 'v': args->output_type = FT_VCF; break;
349                           default:
350                           {
351                               args->clevel = strtol(optarg,&tmp,10);
352                               if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
353                           }
354                       }
355                       if ( optarg[1] )
356                       {
357                           args->clevel = strtol(optarg+1,&tmp,10);
358                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
359                       }
360                       break;
361             case 'h':
362             case '?':
363             default: error("%s", usage_text()); break;
364         }
365     }
366     if ( optind==argc )
367     {
368         if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
369         else { error("%s", usage_text()); }
370     }
371     else if ( optind+1!=argc ) error("%s", usage_text());
372     else args->fname = argv[optind];
373 
374     if ( !args->group_by ) error("Missing the -g option\n");
375 
376     args->gvcf.rec = bcf_init();
377     args->gvcf.grp = -1;            // the block is inactive
378     args->sr = bcf_sr_init();
379     if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
380     args->hdr_in = bcf_sr_get_header(args->sr,0);
381     if ( args->filter_str )
382         args->filter = filter_init(args->hdr_in, args->filter_str);
383     init_groups(args);
384     char wmode[8];
385     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
386     args->fh_out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
387     if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n");
388     while ( bcf_sr_next_line(args->sr) ) process_gvcf(args);
389     flush_block(args, NULL);
390 
391     destroy_data(args);
392     return 0;
393 }
394 
395 
396