1 #include "bcftools.pysam.h"
2
3 /* ccall.c -- consensus variant calling.
4
5 Copyright (C) 2013-2014 Genome Research Ltd.
6 Portions copyright (C) 2010 Broad Institute.
7
8 Author: Petr Danecek <pd3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE. */
27
28 #include <math.h>
29 #include <assert.h>
30 #include <htslib/kfunc.h>
31 #include "call.h"
32 #include "kmin.h"
33 #include "prob1.h"
34
35 // Most of the original -c calling was moved to bcftools as it was
36 // and its data structures were wrapped into the ccal_t to make it
37 // functional quickly. This is not the desired state.
38 struct _ccall_t
39 {
40 bcf_p1aux_t *p1;
41 };
42
ccall_init(call_t * call)43 void ccall_init(call_t *call)
44 {
45 call->cdat = (ccall_t*) calloc(1,sizeof(ccall_t));
46 call_init_pl2p(call);
47 call->cdat->p1 = bcf_p1_init(bcf_hdr_nsamples(call->hdr), call->ploidy);
48 call->gts = (int*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int)); // assuming at most diploid everywhere
49 call->nals_map = 5;
50 call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
51
52 bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
53 if ( call->output_tags & CALL_FMT_GQ )
54 {
55 bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
56 call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
57 }
58 if ( call->output_tags & CALL_FMT_GP )
59 error("Sorry, -f GP is not supported with -c\n");
60 bcf_hdr_append(call->hdr,"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">");
61 // Todo: groups not migrated to 'bcftools call' yet
62 bcf_hdr_append(call->hdr,"##INFO=<ID=AF2,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first and second group ALT allele frequency (assuming HWE)\">");
63 bcf_hdr_append(call->hdr,"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">");
64 bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n");
65 bcf_hdr_append(call->hdr,"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n");
66 bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
67 bcf_hdr_append(call->hdr,"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n");
68 bcf_hdr_append(call->hdr,"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n");
69 // bcf_hdr_append(call->hdr,);
70 // bcf_hdr_append(call->hdr,);
71 bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
72
73 return;
74 }
ccall_destroy(call_t * call)75 void ccall_destroy(call_t *call)
76 {
77 free(call->itmp);
78 free(call->als_map);
79 free(call->gts);
80 free(call->anno16);
81 free(call->PLs);
82 free(call->GQs);
83 free(call->pdg);
84 bcf_p1_destroy(call->cdat->p1);
85 free(call->cdat);
86 return;
87 }
88
89 // Inits P(D|G): convert PLs from log space, only two alleles (three PLs) are used.
90 // NB: The original samtools calling code uses pdgs in reverse order (AA comes
91 // first, RR last), while the -m calling model uses the canonical order.
set_pdg3(double * pl2p,int * PLs,double * pdg,int n_smpl,int n_gt)92 static void set_pdg3(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt)
93 {
94 int i;
95 for (i=0; i<n_smpl; i++)
96 {
97 pdg[2] = pl2p[ PLs[0] ];
98 pdg[1] = pl2p[ PLs[1] ];
99 pdg[0] = pl2p[ PLs[2] ];
100 PLs += n_gt;
101 pdg += 3;
102 }
103 }
104
ttest(int n1,int n2,float a[4])105 static double ttest(int n1, int n2, float a[4])
106 {
107 extern double kf_betai(double a, double b, double x);
108 double t, v, u1, u2;
109 if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
110 u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
111 if (u1 <= u2) return 1.;
112 t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
113 v = n1 + n2 - 2;
114 return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
115 }
116
test16_core(float anno[16],anno16_t * a)117 static int test16_core(float anno[16], anno16_t *a)
118 {
119 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
120 double left, right;
121 int i;
122 a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
123 for (i=0; i<4; i++) a->d[i] = anno[i];
124 a->depth = anno[0] + anno[1] + anno[2] + anno[3];
125 a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
126 if (a->depth == 0) return -1;
127 a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
128 kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
129 for (i = 1; i < 4; ++i)
130 a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
131 return 0;
132 }
133
test16(float * anno16,anno16_t * a)134 int test16(float *anno16, anno16_t *a)
135 {
136 a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
137 a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
138 a->mq = a->depth = a->is_tested = 0;
139 return test16_core(anno16, a);
140 }
update_bcf1(call_t * call,bcf1_t * rec,const bcf_p1rst_t * pr,double em[10])141 static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double em[10])
142 {
143 int has_I16, is_var;
144 float fq, r;
145 anno16_t a;
146 float tmpf[4], tmpi;
147
148 bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16);
149
150 has_I16 = test16(call->anno16, &a) >= 0? 1 : 0;
151
152 // print EM
153 if (em[0] >= 0)
154 {
155 tmpf[0] = 1 - em[0];
156 bcf_update_info_float(call->hdr, rec, "AF1", tmpf, 1);
157 }
158 if (em[4] >= 0 && em[4] <= 0.05)
159 {
160 tmpf[0] = em[3]; tmpf[1] = em[2]; tmpf[2] = em[1]; tmpf[3] = em[4];
161 bcf_update_info_float(call->hdr, rec, "G3", tmpf, 3);
162 bcf_update_info_float(call->hdr, rec, "HWE", &tmpf[3], 1);
163 }
164 if (em[5] >= 0 && em[6] >= 0)
165 {
166 tmpf[0] = 1 - em[5]; tmpf[1] = 1 - em[6];
167 bcf_update_info_float(call->hdr, rec, "AF2", tmpf, 2);
168 }
169 if (em[7] >= 0)
170 {
171 tmpf[0] = em[7];
172 bcf_update_info_float(call->hdr, rec, "LRT", tmpf, 1);
173 }
174 if (em[8] >= 0)
175 {
176 tmpf[0] = em[8];
177 bcf_update_info_float(call->hdr, rec, "LRT2", tmpf, 1);
178 }
179
180 bcf_p1aux_t *p1 = call->cdat->p1;
181 if (p1->cons_llr > 0)
182 {
183 tmpi = p1->cons_llr;
184 bcf_update_info_int32(call->hdr, rec, "CLR", &tmpi, 1);
185 // todo: trio calling with -c
186 if (p1->cons_gt > 0)
187 {
188 char tmp[4];
189 tmp[0] = p1->cons_gt&0xff; tmp[1] = p1->cons_gt>>8&0xff; tmp[2] = p1->cons_gt>>16&0xff; tmp[3] = 0;
190 bcf_update_info_string(call->hdr, rec, "UGT", tmp);
191 tmp[0] = p1->cons_gt>>32&0xff; tmp[1] = p1->cons_gt>>40&0xff; tmp[2] = p1->cons_gt>>48&0xff;
192 bcf_update_info_string(call->hdr, rec, "CGT", tmp);
193 }
194 }
195 is_var = (pr->p_ref < call->pref);
196 r = is_var? pr->p_ref : pr->p_var;
197
198 bcf_update_info_int32(call->hdr, rec, "AC1", &pr->ac, 1);
199 int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
200 bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
201 bcf_update_info_int32(call->hdr, rec, "MQ", &a.mq, 1);
202
203 fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
204 if (fq < -999) fq = -999;
205 if (fq > 999) fq = 999;
206 bcf_update_info_float(call->hdr, rec, "FQ", &fq, 1);
207
208 assert( pr->cmp[0]<0 );
209 // todo
210 // if (pr->cmp[0] >= 0.) { // two sample groups
211 // int i, q[3];
212 // for (i = 1; i < 3; ++i) {
213 // double x = pr->cmp[i] + pr->cmp[0]/2.;
214 // q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
215 // if (q[i] > 255) q[i] = 255;
216 // }
217 // if (pr->perm_rank >= 0) ksprintf(&s, "PR=%d;", pr->perm_rank);
218 //
219 // ksprintf(&s, "PCHI2=%.3g;PC2=%d,%d;", q[1], q[2], pr->p_chi2);
220 // }
221
222 if (has_I16 && a.is_tested)
223 {
224 int i;
225 for (i=0; i<4; i++) tmpf[i] = a.p[i];
226 bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
227 }
228 bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
229 bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
230
231 rec->qual = r < 1e-100? 999 : -4.343 * log(r);
232 if (rec->qual > 999) rec->qual = 999;
233
234 // Remove unused alleles
235 int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
236 if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
237
238 if ( nals<rec->n_allele )
239 {
240 bcf_update_alleles(call->hdr, rec, (const char**)rec->d.allele, nals);
241
242 // Update PLs
243 int npls_src = call->nPLs / rec->n_sample, npls_dst = nals*(nals+1)/2;
244 int *pls_src = call->PLs - npls_src, *pls_dst = call->PLs - npls_dst;
245 int isample, i;
246 for (isample = 0; isample < rec->n_sample; isample++)
247 {
248 pls_src += npls_src;
249 pls_dst += npls_dst;
250 if ( !call->ploidy || call->ploidy[isample]==2 )
251 {
252 for (i=0; i<npls_dst; i++)
253 pls_dst[i] = pls_src[i];
254 }
255 else
256 {
257 for (i=0; i<nals; i++)
258 {
259 int isrc = (i+1)*(i+2)/2-1;
260 pls_dst[i] = pls_src[isrc];
261 }
262 if (i<npls_dst) pls_dst[i] = bcf_int32_vector_end;
263 }
264 }
265 bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*rec->n_sample);
266 }
267
268 // Call genotypes
269 int i;
270 for (i=0; i<rec->n_sample; i++)
271 {
272 int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
273 int gt = x&3;
274 if ( !call->ploidy || call->ploidy[i]==2 )
275 {
276 if ( gt==1 )
277 {
278 call->gts[2*i] = bcf_gt_unphased(0);
279 call->gts[2*i+1] = bcf_gt_unphased(1);
280 }
281 else if ( gt==0 )
282 {
283 call->gts[2*i] = bcf_gt_unphased(1);
284 call->gts[2*i+1] = bcf_gt_unphased(1);
285 }
286 else
287 {
288 call->gts[2*i] = bcf_gt_unphased(0);
289 call->gts[2*i+1] = bcf_gt_unphased(0);
290 }
291 if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = x>>2;
292 }
293 else
294 {
295 if ( gt==0 ) call->gts[2*i] = bcf_gt_unphased(1);
296 else call->gts[2*i] = bcf_gt_unphased(0);
297 call->gts[2*i+1] = bcf_int32_vector_end;
298 if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = bcf_int32_missing;
299 }
300 }
301 bcf_update_genotypes(call->hdr, rec, call->gts, rec->n_sample*2);
302 if ( call->output_tags & CALL_FMT_GQ )
303 bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, rec->n_sample);
304
305 // trim Number=R tags
306 int out_als = 0;
307 for (i=0; i<nals; i++) out_als |= 1<<i;
308 init_allele_trimming_maps(call, nals_ori, out_als);
309 mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
310
311 return is_var;
312 }
313
314
ccall(call_t * call,bcf1_t * rec)315 int ccall(call_t *call, bcf1_t *rec)
316 {
317 int nsmpl = bcf_hdr_nsamples(call->hdr);
318
319 // Get the genotype likelihoods
320 int nals = rec->n_allele;
321 call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
322 if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // diploid+haploid or haploid only
323 error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
324
325 // Convert PLs to probabilities, only first two alleles are considered
326 int ngts = nals*(nals+1)/2;
327 hts_expand(double, 3*nsmpl, call->npdg, call->pdg);
328 set_pdg3(call->pl2p, call->PLs, call->pdg, nsmpl, ngts);
329
330 double em[10] = {-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.};
331 int ret = bcf_em1(call, rec, call->ngrp1_samples, 0x1ff, em);
332
333 bcf_p1rst_t pr;
334 int do_contrast = (em[7] >= 0 && em[7] < call->min_lrt) ? 1 : 0;
335 ret = bcf_p1_cal(call, rec, do_contrast, call->cdat->p1, &pr);
336 if (pr.p_ref >= call->pref && (call->flag & CALL_VARONLY)) return 0;
337 if (ret >= 0) ret = update_bcf1(call, rec, &pr, em);
338 return ret;
339 }
340
341