1 /* The MIT License
2
3 Copyright (c) 2019-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <math.h>
32 #include <unistd.h> // for isatty
33 #include <inttypes.h>
34 #include <htslib/hts.h>
35 #include <htslib/vcf.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kseq.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kfunc.h>
41 #include <errno.h>
42 #include "bcftools.h"
43 #include "filter.h"
44
45 // Logic of the filters: include or exclude sites which match the filters?
46 #define FLT_INCLUDE 1
47 #define FLT_EXCLUDE 2
48
49 #define CNV_DEL 0
50 #define CNV_DUP 1
51
52 #define iCHILD 0
53 #define iFATHER 1
54 #define iMOTHER 2
55
56 typedef struct
57 {
58 int idx[3]; // VCF sample index for child, father, mother
59 int pass; // do all three pass the filters?
60 }
61 trio_t;
62
63 typedef struct
64 {
65 int argc, filter_logic, cnv_type, debug, greedy;
66 filter_t *filter;
67 char *filter_str;
68 char **argv, *pfm, *fname, *region;
69 bcf_srs_t *sr;
70 bcf_hdr_t *hdr;
71 trio_t trio;
72 int32_t *pl, *ad, *gt; // input FMT/PL, AD, and GT values
73 int mpl, mad, mgt;
74 double ppat,pmat; // method 1: probability of paternal/maternal origin
75 int ntest; // number of informative sites
76 int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison
77 double min_pbinom; // minimum binomial probability of paternal hets
78 }
79 args_t;
80
81 args_t args;
82
about(void)83 const char *about(void)
84 {
85 return "Determine parental origin of a CNV region in a trio.\n";
86 }
87
usage_text(void)88 static const char *usage_text(void)
89 {
90 return
91 "\n"
92 "About: Determine parental origin of a CNV region\n"
93 "Usage: bcftools +parental-origin [Plugin Options]\n"
94 "Plugin options:\n"
95 " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n"
96 " -d, --debug list informative sites\n"
97 " -e, --exclude EXPR exclude sites and samples for which the expression is true\n"
98 " -g, --greedy use also ambiguous sites, e.g. het+hom parents for deletions\n"
99 " -i, --include EXPR include sites and samples for which the expression is true\n"
100 " -p, --pfm P,F,M sample names of proband, father, and mother\n"
101 " -r, --region REGION chr:beg-end\n"
102 " -t, --type <del|dup> the CNV type\n"
103 "\n"
104 "Example:\n"
105 " bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n"
106 "\n";
107 }
108
init_data(args_t * args)109 static void init_data(args_t *args)
110 {
111 args->sr = bcf_sr_init();
112 if ( args->region )
113 {
114 args->sr->require_index = 1;
115 if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region);
116 }
117 if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
118 args->hdr = bcf_sr_get_header(args->sr,0);
119
120 int id;
121 if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
122 error("Error: the tag FORMAT/PL is not present in %s\n", args->fname);
123 if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
124 error("Error: the tag FORMAT/AD is not present in %s\n", args->fname);
125 if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) )
126 error("Error: the tag FORMAT/GT is not present in %s\n", args->fname);
127
128 if ( args->filter_str )
129 args->filter = filter_init(args->hdr, args->filter_str);
130
131 int i, n = 0;
132 char **list;
133 list = hts_readlist(args->pfm, 0, &n);
134 if ( n!=3 ) error("Expected three sample names with -t\n");
135 args->trio.idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]);
136 args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]);
137 args->trio.idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]);
138 for (i=0; i<n; i++)
139 {
140 if ( args->trio.idx[i] < 0 ) error("The sample is not present: %s\n", list[i]);
141 free(list[i]);
142 }
143 free(list);
144 }
destroy_data(args_t * args)145 static void destroy_data(args_t *args)
146 {
147 if ( args->filter ) filter_destroy(args->filter);
148 free(args->pl);
149 free(args->ad);
150 free(args->gt);
151 bcf_sr_destroy(args->sr);
152 free(args);
153 }
calc_binom_two_sided(int na,int nb,double aprob)154 static inline double calc_binom_two_sided(int na, int nb, double aprob)
155 {
156 double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob);
157 if ( prob > 1 ) prob = 1;
158 return prob;
159 }
calc_binom_one_sided(int na,int nb,double aprob,int ge)160 static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge)
161 {
162 return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob);
163 }
process_record(args_t * args,bcf1_t * rec)164 static void process_record(args_t *args, bcf1_t *rec)
165 {
166 if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return;
167
168 int i,j;
169 if ( args->filter )
170 {
171 uint8_t *smpl_pass = NULL;
172 int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass);
173 if ( args->filter_logic & FLT_EXCLUDE )
174 {
175 if ( pass_site )
176 {
177 if ( !smpl_pass ) return;
178 pass_site = 0;
179 for (i=0; i<3; i++)
180 {
181 if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0;
182 else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; }
183 }
184 if ( !pass_site ) return;
185 }
186 else
187 for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1;
188 }
189 else if ( !pass_site ) return;
190
191 if ( smpl_pass )
192 {
193 for (i=0; i<3; i++)
194 if ( !smpl_pass[args->trio.idx[i]] ) return;
195 }
196 }
197
198 int nsmpl = bcf_hdr_nsamples(args->hdr);
199 int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad);
200 if ( nret<=0 )
201 {
202 printf("The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
203 return;
204 }
205 int nad1 = nret/nsmpl;
206
207 nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl);
208 if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
209 int npl1 = nret/nsmpl;
210 if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 )
211 {
212 printf("todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1);
213 return;
214 }
215
216 nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt);
217 if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
218 int ngt1 = nret/nsmpl;
219 if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
220
221 // number of ref and alt alleles in the proband
222 int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4;
223 int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2;
224 double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6;
225 for (i=0; i<3; i++) // trio
226 {
227 int isum = 0;
228 int32_t *src = args->pl + npl1*args->trio.idx[i];
229 double *gl_dst = gl + 3*i;
230 double sum = 0;
231 for (j=0; j<3; j++) // iterate over PL
232 {
233 if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return;
234 gl_dst[j] = pow(10,-0.1*src[j]);
235 sum += gl_dst[j];
236 isum += src[j];
237 }
238 if ( isum==0 ) return;
239 for (j=0; j<3; j++) gl_dst[j] /= sum;
240
241 int32_t *gt = args->gt + ngt1*args->trio.idx[i];
242 dsg[i] = 0;
243 for (j=0; j<ngt1; j++)
244 {
245 if ( gt[j]==bcf_int32_vector_end ) return;
246 if ( bcf_gt_is_missing(gt[j]) ) return;
247 if ( bcf_gt_allele(gt[j]) ) dsg[i]++;
248 }
249
250 src = args->ad + nad1*args->trio.idx[i];
251 ad[2*i] = src[0];
252 ad[2*i+1] = src[1];
253 }
254
255 #define is_RR(x) (x[0]==0)
256 #define is_RA(x) (x[1]==0)
257 #define is_AA(x) (x[2]==0)
258 if ( args->cnv_type==CNV_DEL )
259 {
260 if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom
261 if ( *dsgF == *dsgM ) return; // cannot distinguish between parents
262 if ( !args->greedy )
263 {
264 if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele
265 if ( *dsgM==1 && *dsgP==*dsgF ) return;
266 }
267 double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) +
268 glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]);
269 double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) +
270 glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]);
271
272 // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele;
273 // args->pmat/ppat is the probability of parental origin of the deleted allele
274 args->pmat += log(ppat);
275 args->ppat += log(pmat);
276 args->ntest++;
277
278 if ( args->debug )
279 {
280 // output: position, paternal probability, maternal probability, PLs of child, father, mother
281 printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat);
282 for (i=0; i<3; i++)
283 {
284 for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]);
285 printf("\t");
286 }
287 printf("\n");
288 }
289 }
290 if ( args->cnv_type==CNV_DUP )
291 {
292 if ( !adP[0] || !adP[1] ) return; // proband is homozygous or has no coverage
293 if ( adP[0] == adP[1] ) return; // proband's alleles are not informative, any or none could have been duplicated
294 if ( *dsgP!=1 ) return; // the proband's genotype is not a het
295 if ( *dsgF == *dsgM ) return; // cannot distinguish between parents
296
297 if ( args->min_pbinom!=0 )
298 {
299 // exclude parental hets with skewed ALT allele proportion
300 if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return;
301 if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return;
302 }
303
304 double prra = glP[1] * calc_binom_one_sided(adP[1],adP[0],1/3.,1);
305 double praa = glP[1] * calc_binom_one_sided(adP[1],adP[0],2/3.,0);
306 double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) +
307 praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]);
308 double pmat = prra*(glM[0]*glF[1] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) +
309 praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]);
310 args->pmat += log(pmat);
311 args->ppat += log(ppat);
312 args->ntest++;
313
314 if ( args->debug )
315 {
316 // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother
317 printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat);
318 for (i=0; i<3; i++)
319 {
320 printf("%d %d\t",ad[2*i],ad[2*i+1]);
321 }
322 for (i=0; i<3; i++)
323 {
324 for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]);
325 printf("\t");
326 }
327 printf("\n");
328 }
329 }
330 }
331
run(int argc,char ** argv)332 int run(int argc, char **argv)
333 {
334 args_t *args = (args_t*) calloc(1,sizeof(args_t));
335 args->argc = argc; args->argv = argv;
336 args->min_pbinom = 1e-2;
337 static struct option loptions[] =
338 {
339 {"include",required_argument,0,'i'},
340 {"exclude",required_argument,0,'e'},
341 {"pfm",required_argument,NULL,'p'},
342 {"region",required_argument,0,'r'},
343 {"type",required_argument,0,'t'},
344 {"debug",no_argument,0,'d'},
345 {"greedy",no_argument,0,'g'},
346 {"min-binom-prob",required_argument,0,'b'},
347 {NULL,0,NULL,0}
348 };
349 int c;
350 char *tmp;
351 while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0)
352 {
353 switch (c)
354 {
355 case 'e':
356 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
357 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
358 case 'i':
359 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
360 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
361 case 't':
362 if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP;
363 else if ( !strcasecmp("del",optarg) ) args->cnv_type = CNV_DEL;
364 break;
365 case 'r': args->region = optarg; break;
366 case 'p': args->pfm = optarg; break;
367 case 'd': args->debug = 1; break;
368 case 'g': args->greedy = 1; break;
369 case 'b':
370 args->min_pbinom = strtod(optarg,&tmp);
371 if ( *tmp ) error("Could not parse: -b %s\n", optarg);
372 if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n");
373 break;
374 case 'h':
375 case '?':
376 default: error("%s", usage_text()); break;
377 }
378 }
379 if ( optind==argc )
380 {
381 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
382 else { error("%s", usage_text()); }
383 }
384 else if ( optind+1!=argc ) error("%s", usage_text());
385 else args->fname = argv[optind];
386
387 if ( !args->pfm ) error("Missing the -p option\n");
388
389 init_data(args);
390 if ( args->debug )
391 {
392 if ( args->cnv_type==CNV_DEL ) printf("# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n");
393 else printf("# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n");
394 }
395
396 while ( bcf_sr_next_line(args->sr) )
397 process_record(args, bcf_sr_get_line(args->sr,0));
398
399 double qual = 4.3429*fabs(args->ppat - args->pmat);
400 char *origin = "uncertain";
401 if ( args->ppat > args->pmat ) origin = "paternal";
402 else if ( args->ppat < args->pmat ) origin = "maternal";
403
404 int i;
405 printf("# bcftools +%s", args->argv[0]);
406 for (i=1; i<args->argc; i++) printf(" %s",args->argv[i]);
407 printf("\n");
408 printf("# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n");
409 printf("%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? "dup" : "del", origin, qual, args->ntest);
410
411 destroy_data(args);
412
413 return 0;
414 }
415