1 /* The MIT License
2 
3    Copyright (c) 2019-2021 Genome Research Ltd.
4 
5    Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <math.h>
32 #include <unistd.h>     // for isatty
33 #include <inttypes.h>
34 #include <htslib/hts.h>
35 #include <htslib/vcf.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kseq.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kfunc.h>
41 #include <errno.h>
42 #include "bcftools.h"
43 #include "filter.h"
44 
45 // Logic of the filters: include or exclude sites which match the filters?
46 #define FLT_INCLUDE 1
47 #define FLT_EXCLUDE 2
48 
49 #define CNV_DEL 0
50 #define CNV_DUP 1
51 
52 #define iCHILD  0
53 #define iFATHER 1
54 #define iMOTHER 2
55 
56 typedef struct
57 {
58     int idx[3];     // VCF sample index for child, father, mother
59     int pass;       // do all three pass the filters?
60 }
61 trio_t;
62 
63 typedef struct
64 {
65     int argc, filter_logic, cnv_type, debug, greedy;
66     filter_t *filter;
67     char *filter_str;
68     char **argv, *pfm, *fname, *region;
69     bcf_srs_t *sr;
70     bcf_hdr_t *hdr;
71     trio_t trio;
72     int32_t *pl, *ad, *gt;   // input FMT/PL, AD, and GT values
73     int mpl, mad, mgt;
74     double ppat,pmat;   // method 1: probability of paternal/maternal origin
75     int ntest;          // number of informative sites
76     int nmat, npat;     // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison
77     double min_pbinom;  // minimum binomial probability of paternal hets
78 }
79 args_t;
80 
81 args_t args;
82 
about(void)83 const char *about(void)
84 {
85     return "Determine parental origin of a CNV region in a trio.\n";
86 }
87 
usage_text(void)88 static const char *usage_text(void)
89 {
90     return
91         "\n"
92         "About: Determine parental origin of a CNV region\n"
93         "Usage: bcftools +parental-origin [Plugin Options]\n"
94         "Plugin options:\n"
95         "   -b, --min-binom-prob FLOAT      exclude parental HETs with skewed ALT allele fraction [1e-2]\n"
96         "   -d, --debug                     list informative sites\n"
97         "   -e, --exclude EXPR              exclude sites and samples for which the expression is true\n"
98         "   -g, --greedy                    use also ambiguous sites, e.g. het+hom parents for deletions\n"
99         "   -i, --include EXPR              include sites and samples for which the expression is true\n"
100         "   -p, --pfm P,F,M                 sample names of proband, father, and mother\n"
101         "   -r, --region REGION             chr:beg-end\n"
102         "   -t, --type <del|dup>            the CNV type\n"
103         "\n"
104         "Example:\n"
105         "   bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n"
106         "\n";
107 }
108 
init_data(args_t * args)109 static void init_data(args_t *args)
110 {
111     args->sr = bcf_sr_init();
112     if ( args->region )
113     {
114         args->sr->require_index = 1;
115         if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region);
116     }
117     if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
118     args->hdr = bcf_sr_get_header(args->sr,0);
119 
120     int id;
121     if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
122         error("Error: the tag FORMAT/PL is not present in %s\n", args->fname);
123     if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
124         error("Error: the tag FORMAT/AD is not present in %s\n", args->fname);
125     if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
126         error("Error: the tag FORMAT/GT is not present in %s\n", args->fname);
127 
128     if ( args->filter_str )
129         args->filter = filter_init(args->hdr, args->filter_str);
130 
131     int i, n = 0;
132     char **list;
133     list = hts_readlist(args->pfm, 0, &n);
134     if ( n!=3 ) error("Expected three sample names with -t\n");
135     args->trio.idx[iCHILD]  = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]);
136     args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]);
137     args->trio.idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]);
138     for (i=0; i<n; i++)
139     {
140         if ( args->trio.idx[i] < 0 ) error("The sample is not present: %s\n", list[i]);
141         free(list[i]);
142     }
143     free(list);
144 }
destroy_data(args_t * args)145 static void destroy_data(args_t *args)
146 {
147     if ( args->filter ) filter_destroy(args->filter);
148     free(args->pl);
149     free(args->ad);
150     free(args->gt);
151     bcf_sr_destroy(args->sr);
152     free(args);
153 }
calc_binom_two_sided(int na,int nb,double aprob)154 static inline double calc_binom_two_sided(int na, int nb, double aprob)
155 {
156     double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob);
157     if ( prob > 1 ) prob = 1;
158     return prob;
159 }
calc_binom_one_sided(int na,int nb,double aprob,int ge)160 static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge)
161 {
162     return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob);
163 }
process_record(args_t * args,bcf1_t * rec)164 static void process_record(args_t *args, bcf1_t *rec)
165 {
166     if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return;
167 
168     int i,j;
169     if ( args->filter )
170     {
171         uint8_t *smpl_pass = NULL;
172         int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass);
173         if ( args->filter_logic & FLT_EXCLUDE )
174         {
175             if ( pass_site )
176             {
177                 if ( !smpl_pass ) return;
178                 pass_site = 0;
179                 for (i=0; i<3; i++)
180                 {
181                     if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0;
182                     else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; }
183                 }
184                 if ( !pass_site ) return;
185             }
186             else
187                 for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1;
188         }
189         else if ( !pass_site ) return;
190 
191         if ( smpl_pass )
192         {
193             for (i=0; i<3; i++)
194                 if ( !smpl_pass[args->trio.idx[i]] ) return;
195         }
196     }
197 
198     int nsmpl = bcf_hdr_nsamples(args->hdr);
199     int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad);
200     if ( nret<=0 )
201     {
202         printf("The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
203         return;
204     }
205     int nad1 = nret/nsmpl;
206 
207     nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl);
208     if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
209     int npl1 = nret/nsmpl;
210     if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 )
211     {
212         printf("todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1);
213         return;
214     }
215 
216     nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt);
217     if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
218     int ngt1 = nret/nsmpl;
219     if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
220 
221     // number of ref and alt alleles in the proband
222     int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4;
223     int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2;
224     double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6;
225     for (i=0; i<3; i++)    // trio
226     {
227         int isum = 0;
228         int32_t *src = args->pl + npl1*args->trio.idx[i];
229         double *gl_dst  = gl + 3*i;
230         double sum  = 0;
231         for (j=0; j<3; j++)     // iterate over PL
232         {
233             if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return;
234             gl_dst[j] = pow(10,-0.1*src[j]);
235             sum += gl_dst[j];
236             isum += src[j];
237         }
238         if ( isum==0 ) return;
239         for (j=0; j<3; j++) gl_dst[j] /= sum;
240 
241         int32_t *gt = args->gt + ngt1*args->trio.idx[i];
242         dsg[i] = 0;
243         for (j=0; j<ngt1; j++)
244         {
245             if ( gt[j]==bcf_int32_vector_end ) return;
246             if ( bcf_gt_is_missing(gt[j]) ) return;
247             if ( bcf_gt_allele(gt[j]) ) dsg[i]++;
248         }
249 
250         src = args->ad + nad1*args->trio.idx[i];
251         ad[2*i]   = src[0];
252         ad[2*i+1] = src[1];
253     }
254 
255     #define is_RR(x) (x[0]==0)
256     #define is_RA(x) (x[1]==0)
257     #define is_AA(x) (x[2]==0)
258     if ( args->cnv_type==CNV_DEL )
259     {
260         if ( *dsgP!=0 && *dsgP!=2 ) return;     // proband not a hom
261         if ( *dsgF == *dsgM ) return;           // cannot distinguish between parents
262         if ( !args->greedy )
263         {
264             if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele
265             if ( *dsgM==1 && *dsgP==*dsgF ) return;
266         }
267         double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) +
268                       glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]);
269         double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) +
270                       glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]);
271 
272         // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele;
273         //     args->pmat/ppat is the probability of parental origin of the deleted allele
274         args->pmat += log(ppat);
275         args->ppat += log(pmat);
276         args->ntest++;
277 
278         if ( args->debug )
279         {
280             // output: position, paternal probability, maternal probability, PLs of child, father, mother
281             printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat);
282             for (i=0; i<3; i++)
283             {
284                 for (j=0; j<3; j++)  printf(" %d",args->pl[npl1*args->trio.idx[i]+j]);
285                 printf("\t");
286             }
287             printf("\n");
288         }
289     }
290     if ( args->cnv_type==CNV_DUP )
291     {
292         if ( !adP[0] || !adP[1] ) return;   // proband is homozygous or has no coverage
293         if ( adP[0] == adP[1] ) return;     // proband's alleles are not informative, any or none could have been duplicated
294         if ( *dsgP!=1 ) return;             // the proband's genotype is not a het
295         if ( *dsgF == *dsgM ) return;       // cannot distinguish between parents
296 
297         if ( args->min_pbinom!=0 )
298         {
299             // exclude parental hets with skewed ALT allele proportion
300             if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return;
301             if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return;
302         }
303 
304         double prra = glP[1] * calc_binom_one_sided(adP[1],adP[0],1/3.,1);
305         double praa = glP[1] * calc_binom_one_sided(adP[1],adP[0],2/3.,0);
306         double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) +
307                       praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]);
308         double pmat = prra*(glM[0]*glF[1] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) +
309                       praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]);
310         args->pmat += log(pmat);
311         args->ppat += log(ppat);
312         args->ntest++;
313 
314         if ( args->debug )
315         {
316             // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother
317             printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat);
318             for (i=0; i<3; i++)
319             {
320                 printf("%d %d\t",ad[2*i],ad[2*i+1]);
321             }
322             for (i=0; i<3; i++)
323             {
324                 for (j=0; j<3; j++)  printf(" %d",args->pl[npl1*args->trio.idx[i]+j]);
325                 printf("\t");
326             }
327             printf("\n");
328         }
329     }
330 }
331 
run(int argc,char ** argv)332 int run(int argc, char **argv)
333 {
334     args_t *args = (args_t*) calloc(1,sizeof(args_t));
335     args->argc   = argc; args->argv = argv;
336     args->min_pbinom = 1e-2;
337     static struct option loptions[] =
338     {
339         {"include",required_argument,0,'i'},
340         {"exclude",required_argument,0,'e'},
341         {"pfm",required_argument,NULL,'p'},
342         {"region",required_argument,0,'r'},
343         {"type",required_argument,0,'t'},
344         {"debug",no_argument,0,'d'},
345         {"greedy",no_argument,0,'g'},
346         {"min-binom-prob",required_argument,0,'b'},
347         {NULL,0,NULL,0}
348     };
349     int c;
350     char *tmp;
351     while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0)
352     {
353         switch (c)
354         {
355             case 'e':
356                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
357                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
358             case 'i':
359                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
360                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
361             case 't':
362                 if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP;
363                 else if ( !strcasecmp("del",optarg) ) args->cnv_type = CNV_DEL;
364                 break;
365             case 'r': args->region = optarg; break;
366             case 'p': args->pfm = optarg; break;
367             case 'd': args->debug = 1; break;
368             case 'g': args->greedy = 1; break;
369             case 'b':
370                 args->min_pbinom = strtod(optarg,&tmp);
371                 if ( *tmp ) error("Could not parse: -b %s\n", optarg);
372                 if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n");
373                 break;
374             case 'h':
375             case '?':
376             default: error("%s", usage_text()); break;
377         }
378     }
379     if ( optind==argc )
380     {
381         if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
382         else { error("%s", usage_text()); }
383     }
384     else if ( optind+1!=argc ) error("%s", usage_text());
385     else args->fname = argv[optind];
386 
387     if ( !args->pfm ) error("Missing the -p option\n");
388 
389     init_data(args);
390     if ( args->debug )
391     {
392         if ( args->cnv_type==CNV_DEL ) printf("# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n");
393         else printf("# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n");
394     }
395 
396     while ( bcf_sr_next_line(args->sr) )
397         process_record(args, bcf_sr_get_line(args->sr,0));
398 
399     double qual = 4.3429*fabs(args->ppat - args->pmat);
400     char *origin = "uncertain";
401     if ( args->ppat > args->pmat ) origin = "paternal";
402     else if ( args->ppat < args->pmat ) origin = "maternal";
403 
404     int i;
405     printf("# bcftools +%s", args->argv[0]);
406     for (i=1; i<args->argc; i++) printf(" %s",args->argv[i]);
407     printf("\n");
408     printf("# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n");
409     printf("%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? "dup" : "del", origin, qual, args->ntest);
410 
411     destroy_data(args);
412 
413     return 0;
414 }
415