1 #include "bcftools.pysam.h"
2 
3 /*  ccall.c -- consensus variant calling.
4 
5     Copyright (C) 2013-2014 Genome Research Ltd.
6     Portions copyright (C) 2010 Broad Institute.
7 
8     Author: Petr Danecek <pd3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.  */
27 
28 #include <math.h>
29 #include <assert.h>
30 #include <htslib/kfunc.h>
31 #include "call.h"
32 #include "kmin.h"
33 #include "prob1.h"
34 
35 // Most of the original -c calling was moved to bcftools as it was
36 // and its data structures were wrapped into the ccal_t to make it
37 // functional quickly. This is not the desired state.
38 struct _ccall_t
39 {
40     bcf_p1aux_t *p1;
41 };
42 
ccall_init(call_t * call)43 void ccall_init(call_t *call)
44 {
45     call->cdat = (ccall_t*) calloc(1,sizeof(ccall_t));
46     call_init_pl2p(call);
47     call->cdat->p1 = bcf_p1_init(bcf_hdr_nsamples(call->hdr), call->ploidy);
48     call->gts = (int*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int));   // assuming at most diploid everywhere
49     call->nals_map = 5;
50     call->als_map  = (int*) malloc(sizeof(int)*call->nals_map);
51 
52     bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
53     if ( call->output_tags & CALL_FMT_GQ )
54     {
55         bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
56         call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
57     }
58     if ( call->output_tags & CALL_FMT_GP )
59         error("Sorry, -f GP is not supported with -c\n");
60     bcf_hdr_append(call->hdr,"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">");
61     // Todo: groups not migrated to 'bcftools call' yet
62     bcf_hdr_append(call->hdr,"##INFO=<ID=AF2,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first and second group ALT allele frequency (assuming HWE)\">");
63     bcf_hdr_append(call->hdr,"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">");
64     bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n");
65     bcf_hdr_append(call->hdr,"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n");
66     bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
67     bcf_hdr_append(call->hdr,"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n");
68     bcf_hdr_append(call->hdr,"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n");
69     // bcf_hdr_append(call->hdr,);
70     // bcf_hdr_append(call->hdr,);
71     bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
72 
73     return;
74 }
ccall_destroy(call_t * call)75 void ccall_destroy(call_t *call)
76 {
77     free(call->itmp);
78     free(call->als_map);
79     free(call->gts);
80     free(call->anno16);
81     free(call->PLs);
82     free(call->GQs);
83     free(call->pdg);
84     bcf_p1_destroy(call->cdat->p1);
85     free(call->cdat);
86     return;
87 }
88 
89 // Inits P(D|G): convert PLs from log space, only two alleles (three PLs) are used.
90 // NB: The original samtools calling code uses pdgs in reverse order (AA comes
91 // first, RR last), while the -m calling model uses the canonical order.
set_pdg3(double * pl2p,int * PLs,double * pdg,int n_smpl,int n_gt)92 static void set_pdg3(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt)
93 {
94     int i;
95     for (i=0; i<n_smpl; i++)
96     {
97         pdg[2] = pl2p[ PLs[0] ];
98         pdg[1] = pl2p[ PLs[1] ];
99         pdg[0] = pl2p[ PLs[2] ];
100         PLs += n_gt;
101         pdg += 3;
102     }
103 }
104 
ttest(int n1,int n2,float a[4])105 static double ttest(int n1, int n2, float a[4])
106 {
107     extern double kf_betai(double a, double b, double x);
108     double t, v, u1, u2;
109     if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
110     u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
111     if (u1 <= u2) return 1.;
112     t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
113     v = n1 + n2 - 2;
114     return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
115 }
116 
test16_core(float anno[16],anno16_t * a)117 static int test16_core(float anno[16], anno16_t *a)
118 {
119     extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
120     double left, right;
121     int i;
122     a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
123     for (i=0; i<4; i++) a->d[i] = anno[i];
124     a->depth = anno[0] + anno[1] + anno[2] + anno[3];
125     a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
126     if (a->depth == 0) return -1;
127     a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
128     kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
129     for (i = 1; i < 4; ++i)
130         a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
131     return 0;
132 }
133 
test16(float * anno16,anno16_t * a)134 int test16(float *anno16, anno16_t *a)
135 {
136     a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
137     a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
138     a->mq = a->depth = a->is_tested = 0;
139     return test16_core(anno16, a);
140 }
update_bcf1(call_t * call,bcf1_t * rec,const bcf_p1rst_t * pr,double em[10])141 static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double em[10])
142 {
143     int has_I16, is_var;
144     float fq, r;
145     anno16_t a;
146     float tmpf[4], tmpi;
147 
148     bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16);
149 
150     has_I16 = test16(call->anno16, &a) >= 0? 1 : 0;
151 
152     // print EM
153     if (em[0] >= 0)
154     {
155         tmpf[0] = 1 - em[0];
156         bcf_update_info_float(call->hdr, rec, "AF1", tmpf, 1);
157     }
158     if (em[4] >= 0 && em[4] <= 0.05)
159     {
160         tmpf[0] = em[3]; tmpf[1] = em[2]; tmpf[2] = em[1]; tmpf[3] = em[4];
161         bcf_update_info_float(call->hdr, rec, "G3", tmpf, 3);
162         bcf_update_info_float(call->hdr, rec, "HWE", &tmpf[3], 1);
163     }
164     if (em[5] >= 0 && em[6] >= 0)
165     {
166         tmpf[0] = 1 - em[5]; tmpf[1] = 1 - em[6];
167         bcf_update_info_float(call->hdr, rec, "AF2", tmpf, 2);
168     }
169     if (em[7] >= 0)
170     {
171         tmpf[0] = em[7];
172         bcf_update_info_float(call->hdr, rec, "LRT", tmpf, 1);
173     }
174     if (em[8] >= 0)
175     {
176         tmpf[0] = em[8];
177         bcf_update_info_float(call->hdr, rec, "LRT2", tmpf, 1);
178     }
179 
180     bcf_p1aux_t *p1 = call->cdat->p1;
181     if (p1->cons_llr > 0)
182     {
183         tmpi = p1->cons_llr;
184         bcf_update_info_int32(call->hdr, rec, "CLR", &tmpi, 1);
185         // todo: trio calling with -c
186         if (p1->cons_gt > 0)
187         {
188             char tmp[4];
189             tmp[0] = p1->cons_gt&0xff; tmp[1] = p1->cons_gt>>8&0xff; tmp[2] = p1->cons_gt>>16&0xff; tmp[3] = 0;
190             bcf_update_info_string(call->hdr, rec, "UGT", tmp);
191             tmp[0] = p1->cons_gt>>32&0xff; tmp[1] = p1->cons_gt>>40&0xff; tmp[2] = p1->cons_gt>>48&0xff;
192             bcf_update_info_string(call->hdr, rec, "CGT", tmp);
193         }
194     }
195     is_var = (pr->p_ref < call->pref);
196     r = is_var? pr->p_ref : pr->p_var;
197 
198     bcf_update_info_int32(call->hdr, rec, "AC1", &pr->ac, 1);
199     int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
200     bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
201     bcf_update_info_int32(call->hdr, rec, "MQ", &a.mq, 1);
202 
203     fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
204     if (fq < -999) fq = -999;
205     if (fq > 999) fq = 999;
206     bcf_update_info_float(call->hdr, rec, "FQ", &fq, 1);
207 
208     assert( pr->cmp[0]<0 );
209     // todo
210     //  if (pr->cmp[0] >= 0.) { // two sample groups
211     //      int i, q[3];
212     //      for (i = 1; i < 3; ++i) {
213     //          double x = pr->cmp[i] + pr->cmp[0]/2.;
214     //          q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
215     //          if (q[i] > 255) q[i] = 255;
216     //      }
217     //      if (pr->perm_rank >= 0) ksprintf(&s, "PR=%d;", pr->perm_rank);
218     //
219     //      ksprintf(&s, "PCHI2=%.3g;PC2=%d,%d;", q[1], q[2], pr->p_chi2);
220     //  }
221 
222     if (has_I16 && a.is_tested)
223     {
224         int i;
225         for (i=0; i<4; i++) tmpf[i] = a.p[i];
226         bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
227     }
228     bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
229     bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0);      // remove QS tag
230 
231     rec->qual = r < 1e-100? 999 : -4.343 * log(r);
232     if (rec->qual > 999) rec->qual = 999;
233 
234     // Remove unused alleles
235     int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
236     if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
237 
238     if ( nals<rec->n_allele )
239     {
240         bcf_update_alleles(call->hdr, rec, (const char**)rec->d.allele, nals);
241 
242         // Update PLs
243         int npls_src = call->nPLs / rec->n_sample, npls_dst = nals*(nals+1)/2;
244         int *pls_src = call->PLs - npls_src, *pls_dst = call->PLs - npls_dst;
245         int isample, i;
246         for (isample = 0; isample < rec->n_sample; isample++)
247         {
248             pls_src += npls_src;
249             pls_dst += npls_dst;
250             if ( !call->ploidy || call->ploidy[isample]==2 )
251             {
252                 for (i=0; i<npls_dst; i++)
253                     pls_dst[i] =  pls_src[i];
254             }
255             else
256             {
257                 for (i=0; i<nals; i++)
258                 {
259                     int isrc = (i+1)*(i+2)/2-1;
260                     pls_dst[i] = pls_src[isrc];
261                 }
262                 if (i<npls_dst) pls_dst[i] = bcf_int32_vector_end;
263             }
264         }
265         bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*rec->n_sample);
266     }
267 
268     // Call genotypes
269     int i;
270     for (i=0; i<rec->n_sample; i++)
271     {
272         int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
273         int gt = x&3;
274         if ( !call->ploidy || call->ploidy[i]==2 )
275         {
276             if ( gt==1 )
277             {
278                 call->gts[2*i]   = bcf_gt_unphased(0);
279                 call->gts[2*i+1] = bcf_gt_unphased(1);
280             }
281             else if ( gt==0 )
282             {
283                 call->gts[2*i]   = bcf_gt_unphased(1);
284                 call->gts[2*i+1] = bcf_gt_unphased(1);
285             }
286             else
287             {
288                 call->gts[2*i]   = bcf_gt_unphased(0);
289                 call->gts[2*i+1] = bcf_gt_unphased(0);
290             }
291             if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = x>>2;
292         }
293         else
294         {
295             if ( gt==0 ) call->gts[2*i] = bcf_gt_unphased(1);
296             else call->gts[2*i] = bcf_gt_unphased(0);
297             call->gts[2*i+1] = bcf_int32_vector_end;
298             if ( call->output_tags & CALL_FMT_GQ ) call->GQs[i] = bcf_int32_missing;
299         }
300     }
301     bcf_update_genotypes(call->hdr, rec, call->gts, rec->n_sample*2);
302     if ( call->output_tags & CALL_FMT_GQ )
303         bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, rec->n_sample);
304 
305     // trim Number=R tags
306     int out_als = 0;
307     for (i=0; i<nals; i++) out_als |= 1<<i;
308     init_allele_trimming_maps(call, nals_ori, out_als);
309     mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
310 
311     return is_var;
312 }
313 
314 
ccall(call_t * call,bcf1_t * rec)315 int ccall(call_t *call, bcf1_t *rec)
316 {
317     int nsmpl = bcf_hdr_nsamples(call->hdr);
318 
319     // Get the genotype likelihoods
320     int nals = rec->n_allele;
321     call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
322     if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals )  // diploid+haploid or haploid only
323         error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
324 
325     // Convert PLs to probabilities, only first two alleles are considered
326     int ngts = nals*(nals+1)/2;
327     hts_expand(double, 3*nsmpl, call->npdg, call->pdg);
328     set_pdg3(call->pl2p, call->PLs, call->pdg, nsmpl, ngts);
329 
330     double em[10] = {-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.,-1.};
331     int ret = bcf_em1(call, rec, call->ngrp1_samples, 0x1ff, em);
332 
333     bcf_p1rst_t pr;
334     int do_contrast = (em[7] >= 0 && em[7] < call->min_lrt) ? 1 : 0;
335     ret = bcf_p1_cal(call, rec, do_contrast, call->cdat->p1, &pr);
336     if (pr.p_ref >= call->pref && (call->flag & CALL_VARONLY)) return 0;
337     if (ret >= 0) ret = update_bcf1(call, rec, &pr, em);
338     return ret;
339 }
340 
341