1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "estimator.h"
25 
26 /**
27  * Computes allele frequencies using hard calls.
28  *
29  * @gts        - genotypes
30  * @no_samples - number of samples
31  * @ploidy     - ploidy
32  * @no_alleles - number of alleles
33  * @AC         - alternate allele counts
34  * @AN         - total number of allele counts
35  * @AF         - alternate allele frequency
36  * @GC         - genotype counts
37  * @GN         - total number of genotype counts
38  * @GF         - genotype frequency
39  * @NS         - number of samples with data
40  */
compute_af(int32_t * gts,int32_t no_samples,int32_t ploidy,int32_t no_alleles,int32_t * AC,int32_t & AN,float * AF,int32_t * GC,int32_t & GN,float * GF,int32_t & NS)41 void Estimator::compute_af(int32_t *gts, int32_t no_samples, int32_t ploidy,
42                 int32_t no_alleles, int32_t *AC, int32_t& AN, float *AF,
43                 int32_t *GC,  int32_t& GN, float *GF, int32_t& NS)
44 {
45     int32_t iter = 0;
46 
47     if (no_alleles==2 && ploidy==2)
48     {
49         NS = 0;
50         int32_t no_genotypes = 3;
51         AC[0] = 0;
52         AC[1] = 0;
53 
54         GC[0] = 0;
55         GC[1] = 0;
56         GC[2] = 0;
57 
58         GN=0;
59 
60         for (size_t k=0; k<no_samples; ++k)
61         {
62             size_t offset = k*ploidy;
63 
64             int32_t g1 = bcf_gt_allele(gts[offset]);
65             if (g1>=0)
66             {
67                 ++AC[g1];
68                 ++AN;
69             }
70             int32_t g2 = bcf_gt_allele(gts[offset+1]);
71             if (g2>=0)
72             {
73                 ++AC[g2];
74                 ++AN;
75             }
76 
77             if (g1>=0 && g2>=0)
78             {
79                 ++GC[g1+g2];
80                 ++GN;
81             }
82 
83             if (g1>=0||g2>=0) ++NS;
84         }
85 
86         if (!NS)
87         {
88             return;
89         }
90 
91         AF[0] = (float)AC[0]/AN;
92         AF[1] = (float)AC[1]/AN;
93 
94         GF[0] = (float)GC[0]/GN;
95         GF[1] = (float)GC[1]/GN;
96         GF[2] = (float)GC[2]/GN;
97 
98 //        std::cerr << "AC: " << AC[0] << "," << AC[1] << "\n";
99 //        std::cerr << "AF: " << AF[0] << "," << AF[1] << "\n";
100 //        std::cerr << "GC: " << AC[0] << AC[0] << "," << AC[1] << "\n";
101 //        std::cerr << "GC: " << AC[0] << AC[0] << "," << AC[1] << "\n";
102 
103     }
104     else
105     {
106         NS = 0;
107         AN = 0;
108         GN = 0;
109         int32_t no_genotypes = bcf_an2gn(no_alleles);
110         for (size_t i=0; i<no_alleles; ++i)
111         {
112             AC[i] = 0;
113         }
114 
115         for (size_t i=0; i<no_genotypes; ++i)
116         {
117             GC[i] = 0;
118         }
119 
120         int32_t gt_indiv[ploidy];
121 
122         for (size_t k=0; k<no_samples; ++k)
123         {
124             size_t offset = k*ploidy;
125 
126             int32_t last_AN = AN;
127             for (size_t i=0; i<ploidy; ++i)
128             {
129                 gt_indiv[i] = bcf_gt_allele(gts[offset+i]);
130 
131                 if (gt_indiv[i]>=0)
132                 {
133                     ++AC[gt_indiv[i]];
134                     ++AN;
135                 }
136             }
137 
138             if (last_AN<AN) ++NS;
139 
140             if (ploidy==2 && gt_indiv[0]>=0 && gt_indiv[1]>=0)
141             {
142                 if (gt_indiv[1]<gt_indiv[0])
143                 {
144                     gt_indiv[0] += gt_indiv[1];
145                     gt_indiv[1] = gt_indiv[0] - gt_indiv[1];
146                     gt_indiv[0] -= gt_indiv[1];
147                 }
148 
149                 ++GC[bcf_alleles2gt(gt_indiv[0],gt_indiv[1])];
150                 ++GN;
151             }
152         }
153 
154         for (size_t i=0; i<no_alleles; ++i)
155         {
156             AF[i] = (float)AC[i]/AN;
157         }
158 
159         for (size_t i=0; i<no_genotypes; ++i)
160         {
161             GF[i] = (float)GC[i]/GN;
162         }
163     }
164 }
165 
166 /**
167  * Computes allele frequencies using EM algorithm from genotype likelihoods
168  * under assumption of Hardy-Weinberg Equilibrium.
169  *
170  * @pls        - PHRED genotype likelihoods
171  * @no_samples - number of samples
172  * @ploidy     - ploidy
173  * @no_alleles - number of alleles
174  * @MLE_HWE_AF - estimated AF
175  * @MLE_HWE_GF - estimated GF
176  * @n          - effective sample size
177  * @e          - error
178  */
compute_gl_af_hwe(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float * MLE_HWE_AF,float * MLE_HWE_GF,int32_t & n,double e)179 void Estimator::compute_gl_af_hwe(int32_t *pls, int32_t no_samples, int32_t ploidy,
180                 int32_t no_alleles, float *MLE_HWE_AF, float *MLE_HWE_GF, int32_t& n,
181                 double e)
182 {
183     int32_t iter = 0;
184 
185     if (ploidy!=2)
186     {
187         return;
188     }
189 
190     if (no_alleles==2)
191     {
192         n = 0;
193         int32_t imap[no_samples];
194 
195         for (size_t i=0; i<no_samples; ++i)
196         {
197             if (pls[i*3]!=bcf_int32_missing)
198             {
199                 imap[n] = i;
200                 ++n;
201             }
202         }
203 
204         if (!n)
205         {
206             return;
207         }
208 
209         float af[2] = {0.5, 0.5};
210         float gf[3];
211         float gf_indiv[3];
212 
213         float mse = e+1;
214         float diff = 0;
215         while (mse>e && iter<50)
216         {
217             gf[0] = af[0]*af[0];
218             gf[1] = 2*af[0]*af[1];
219             gf[2] = af[1]*af[1];
220 
221             MLE_HWE_AF[0] = 0;
222             MLE_HWE_AF[1] = 0;
223 
224             for (size_t i=0; i<n; ++i)
225             {
226                 size_t offset = imap[i]*3;
227 
228                 float prob_data = (gf_indiv[0] = gf[0]*LogTool::pl2prob(pls[offset]));
229                 prob_data += (gf_indiv[1] = gf[1]*LogTool::pl2prob(pls[offset+1]));
230                 prob_data += (gf_indiv[2] = gf[2]*LogTool::pl2prob(pls[offset+2]));
231 
232                 gf_indiv[0] /= prob_data;
233                 gf_indiv[1] /= prob_data;
234                 gf_indiv[2] /= prob_data;
235 
236                 MLE_HWE_AF[0] += gf_indiv[0] + 0.5*gf_indiv[1];
237                 MLE_HWE_AF[1] += gf_indiv[2] + 0.5*gf_indiv[1];
238             }
239 
240             MLE_HWE_AF[0] /= n;
241             MLE_HWE_AF[1] /= n;
242 
243             diff = (af[0]-MLE_HWE_AF[0]);
244             mse = diff*diff;
245             diff = (af[1]-MLE_HWE_AF[1]);
246             mse += diff*diff;
247 
248             af[0] = MLE_HWE_AF[0];
249             af[1] = MLE_HWE_AF[1];
250 
251             ++iter;
252         }
253 
254         MLE_HWE_GF[0] = MLE_HWE_AF[0]*MLE_HWE_AF[0];
255         MLE_HWE_GF[1] = 2*MLE_HWE_AF[0]*MLE_HWE_AF[1];
256         MLE_HWE_GF[2] = MLE_HWE_AF[1]*MLE_HWE_AF[1];
257     }
258     else
259     {
260         n = 0;
261         int32_t imap[no_samples];
262         int32_t no_genotypes = bcf_an2gn(no_alleles);
263         for (size_t k=0; k<no_samples; ++k)
264         {
265             if (pls[k*no_genotypes]!=bcf_int32_missing)
266             {
267                 imap[n] = k;
268                 ++n;
269             }
270         }
271 
272         if (!n) return;
273 
274         float af[no_alleles];
275         float p = 1.0/no_alleles;
276         for (size_t i=0; i<no_alleles; ++i)
277         {
278             af[i] = p;
279         }
280         float gf[no_genotypes];
281         float gf_indiv[no_genotypes];
282 
283         bool debug = false;
284 
285         float mse = e+1;
286         while (mse>e && iter<50)
287         {
288             for (size_t i=0; i<no_alleles; ++i)
289             {
290                 MLE_HWE_AF[i] = 0;
291                 for (size_t j=0; j<=i; ++j)
292                 {
293                     gf[bcf_alleles2gt(i,j)] = (i!=j?2:1)*af[i]*af[j];
294                 }
295             }
296 
297             //iterate through individuals
298             for (size_t k=0; k<n; ++k)
299             {
300                 size_t offset = imap[k]*no_genotypes;
301 
302                 float prob_data = 0;
303                 for (size_t i=0; i<no_genotypes; ++i)
304                 {
305                     prob_data += (gf_indiv[i] = gf[i]*LogTool::pl2prob(pls[offset+i]));
306                 }
307 
308                 for (size_t i=0; i<no_genotypes; ++i)
309                 {
310                     gf_indiv[i] /= prob_data;
311                 }
312 
313                 for (size_t i=0; i<no_alleles; ++i)
314                 {
315                     for (size_t j=0; j<=i; ++j)
316                     {
317                         int32_t gf_index = bcf_alleles2gt(i,j);
318                         MLE_HWE_AF[i] += 0.5*gf_indiv[gf_index];
319                         MLE_HWE_AF[j] += 0.5*gf_indiv[gf_index];
320                     }
321                 }
322             }
323 
324             //normalize to frequency
325             mse = 0;
326             float diff;
327             for (size_t i=0; i<no_alleles; ++i)
328             {
329                 MLE_HWE_AF[i] /= n;
330                 diff = af[i]-MLE_HWE_AF[i];
331                 mse += (diff*diff);
332                 af[i] = MLE_HWE_AF[i];
333             }
334 
335             ++iter;
336         }
337 
338         for (size_t i=0; i<no_alleles; ++i)
339         {
340             for (size_t j=0; j<=i; ++j)
341             {
342                 MLE_HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*MLE_HWE_AF[i]*MLE_HWE_AF[j];
343             }
344         }
345     }
346 }
347 
348 /**
349  * Computes allele frequencies using EM algorithm from genotype likelihoods.
350  *
351  * @pls        - PHRED genotype likelihoods
352  * @no_samples - number of samples
353  * @ploidy     - ploidy
354  * @n_alleles  - number of alleles
355  * @MLE_AF     - estimated AF
356  * @MLE_GF     - estimated GF
357  * @n          - effective sample size
358  * @e          - error
359  */
compute_gl_af(int32_t * pls,int32_t nsamples,int32_t ploidy,int32_t n_allele,float * MLE_AF,float * MLE_GF,int32_t & n,double e)360 void Estimator::compute_gl_af(int32_t *pls, int32_t nsamples, int32_t ploidy,
361                 int32_t n_allele, float *MLE_AF, float *MLE_GF, int32_t& n,
362                 double e)
363 {
364     int32_t iter = 0;
365 
366     if (n_allele==2 && ploidy==2)
367     {
368         n = 0;
369         int32_t imap[nsamples];
370 
371         for (size_t i=0; i<nsamples; ++i)
372         {
373             if (pls[i*3]!=bcf_int32_missing)
374             {
375                 imap[n] = i;
376                 ++n;
377             }
378         }
379 
380         if (!n)
381         {
382             return;
383         }
384 
385         float gf[3];
386         float gf_indiv[3];
387 
388         float mse = e+1;
389         float diff = 0;
390 
391         gf[0] = 1.0/3;
392         gf[1] = gf[0];
393         gf[2] = gf[1];
394 
395         while (mse>e && iter<50)
396         {
397             MLE_GF[0] = 0;
398             MLE_GF[1] = 0;
399             MLE_GF[2] = 0;
400 
401             for (size_t i=0; i<n; ++i)
402             {
403                 size_t offset = imap[i]*3;
404 
405                 float prob_data = (gf_indiv[0] = gf[0]*LogTool::pl2prob(pls[offset]));
406                 prob_data += (gf_indiv[1] = gf[1]*LogTool::pl2prob(pls[offset+1]));
407                 prob_data += (gf_indiv[2] = gf[2]*LogTool::pl2prob(pls[offset+2]));
408 
409                 MLE_GF[0] += (gf_indiv[0] /= prob_data);
410                 MLE_GF[1] += (gf_indiv[1] /= prob_data);
411                 MLE_GF[2] += (gf_indiv[2] /= prob_data);
412             }
413 
414             MLE_GF[0] /= n;
415             MLE_GF[1] /= n;
416             MLE_GF[2] /= n;
417 
418             diff = (gf[0]-MLE_GF[0]);
419             mse = diff*diff;
420             diff = (gf[1]-MLE_GF[1]);
421             mse += diff*diff;
422             diff = (gf[2]-MLE_GF[2]);
423             mse += diff*diff;
424 
425             gf[0] = MLE_GF[0];
426             gf[1] = MLE_GF[1];
427             gf[2] = MLE_GF[2];
428 
429 
430             ++iter;
431         }
432 
433         MLE_AF[0] = MLE_GF[0]+0.5*MLE_GF[1];
434         MLE_AF[1] = MLE_GF[2]+0.5*MLE_GF[1];
435     }
436     else
437     {
438         n = 0;
439         int32_t imap[nsamples];
440         int32_t no_genotypes = bcf_an2gn(n_allele);
441         for (size_t i=0; i<nsamples; ++i)
442         {
443             if (pls[i*no_genotypes]!=bcf_int32_missing)
444             {
445                 imap[n] = i;
446                 ++n;
447             }
448         }
449 
450         if (!n) return;
451 
452         float gf[no_genotypes];
453         float gf_indiv[no_genotypes];
454 
455         //initialization
456         gf[0] = 1.0/no_genotypes;
457         for (size_t i=1; i<no_genotypes; ++i)
458         {
459             gf[i] = gf[0];
460         }
461 
462         float mse = e+1;
463         while (mse>e && iter<50)
464         {
465             //initialization
466             for (size_t i=0; i<no_genotypes; ++i)
467             {
468                 MLE_GF[i] = 0;
469             }
470 
471             //iterate through individuals
472             for (size_t i=0; i<n; ++i)
473             {
474                 size_t offset = imap[i]*no_genotypes;
475 
476                 float prob_data = 0;
477                 for (size_t j=0; j<no_genotypes; ++j)
478                 {
479                     prob_data += (gf_indiv[j] = gf[j]*LogTool::pl2prob(pls[offset+j]));
480                 }
481 
482                 for (size_t j=0; j<no_genotypes; ++j)
483                 {
484                     MLE_GF[j] += (gf_indiv[j] /= prob_data);
485                 }
486             }
487 
488             mse = 0;
489             float diff;
490             for (size_t i=0; i<no_genotypes; ++i)
491             {
492                 MLE_GF[i] /= n;
493                 diff = gf[i]-MLE_GF[i];
494                 mse += (diff *= diff);
495                 gf[i] = MLE_GF[i];
496             }
497 
498             ++iter;
499         }
500 
501         for (size_t i=0; i<n_allele; ++i)
502         {
503             for (size_t j=0; j<=i; ++j)
504             {
505                 int32_t index = bcf_alleles2gt(i,j);
506                 MLE_AF[i] += 0.5*MLE_GF[index];
507                 MLE_AF[i] += 0.5*MLE_GF[index];
508             }
509         }
510     }
511 }
512 
513 /**
514  * Computes the Hardy-Weinberg Likelihood Ratio Test Statistic
515  *
516  * @pls        - PHRED genotype likelihoods
517  * @no_samples - number of samples
518  * @ploidy     - ploidy
519  * @no_alleles - number of alleles
520  * @MLE_HWE_AF - estimated AF assuming HWE
521  * @MLE_GF     - estimated GF
522  * @n          - effective sample size
523  * @lr         - log10 likelihood ratio
524  * @logp       - likelihood ratio test log p-value
525  * @df         - degrees of freedom
526  */
compute_hwe_lrt(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float * MLE_HWE_GF,float * MLE_GF,int32_t & n,float & lr,float & logp,int32_t & df)527 void Estimator::compute_hwe_lrt(int32_t *pls, int32_t no_samples, int32_t ploidy,
528             int32_t no_alleles, float *MLE_HWE_GF, float *MLE_GF, int32_t& n,
529             float& lr, float& logp, int32_t& df)
530 {
531     n = 0;
532     if (ploidy==2)
533     {
534         if (no_alleles==2)
535         {
536             int32_t no_genotypes = 3;
537 
538             float l0=0, la=0;
539             for (size_t k=0; k<no_samples; ++k)
540             {
541                 size_t offset = k*3;
542 
543                 if (pls[offset]==bcf_int32_missing) continue;
544 
545                 ++n;
546 
547                 float p = LogTool::pl2prob(pls[offset]);
548                 float l0i = MLE_HWE_GF[0] * p;
549                 float lai = MLE_GF[0] * p;
550                 p = LogTool::pl2prob(pls[offset+1]);
551                 l0i += MLE_HWE_GF[1] * p;
552                 lai += MLE_GF[1] * p;
553                 p = LogTool::pl2prob(pls[offset+2]);
554                 l0i += MLE_HWE_GF[2] * p;
555                 lai += MLE_GF[2] * p;
556 
557                 l0 += log(l0i);
558                 la += log(lai);
559             }
560 
561             if (!n) return;
562 
563             lr = l0-la;
564             float lrts = lr>0 ? 0 : -2*lr;
565             df = 1;
566             logp = pchisq(lrts, 1, 0, 1);
567         }
568         else
569         {
570             int32_t no_genotypes = bcf_an2gn(no_alleles);
571 
572             float l0=0, la=0;
573             for (size_t k=0; k<no_samples; ++k)
574             {
575                 size_t offset = k*no_genotypes;
576 
577                 if (pls[offset]==bcf_int32_missing) continue;
578 
579                 ++n;
580 
581                 float l0i=0, lai=0;
582                 for (size_t j=0; j<no_genotypes; ++j)
583                 {
584                     float p = LogTool::pl2prob(pls[offset+j]);
585                     l0i += MLE_HWE_GF[j]*p;
586                     lai += MLE_GF[j]*p;
587                 }
588 
589                 l0 += log(l0i);
590                 la += log(lai);
591             }
592 
593             if (!n) return;
594 
595             lr = l0-la;
596             float lrts = lr>0 ? 0 : -2*lr;
597             df = no_genotypes-no_alleles;
598             logp = pchisq(lrts, df, 0, 1);
599         }
600     }
601 };
602 
603 /**
604  * Computes the Inbreeding Coefficient Statistic from Genotype likelihoods.
605  *
606  * @pls        - PHRED genotype likelihoods
607  * @no_samples - number of samples
608  * @ploidy     - ploidy
609  * @GF         - GF
610  * @HWE_AF     - AF under HWE assumption
611  * @no_alleles - number of alleles
612  * @F          - estimated inbreeding coefficient
613  * @n          - effective sample size
614  */
compute_gl_fic(int32_t * pls,int32_t no_samples,int32_t ploidy,float * HWE_AF,int32_t no_alleles,float * GF,float & F,int32_t & n)615 void Estimator::compute_gl_fic(int32_t * pls, int32_t no_samples, int32_t ploidy,
616                                float* HWE_AF, int32_t no_alleles, float* GF,
617                                float& F, int32_t& n)
618 {
619     n = 0;
620 
621     if (ploidy!=2)
622     {
623         return;
624     }
625 
626     float HWE_GF[3];
627     HWE_GF[0] = HWE_AF[0]*HWE_AF[0];
628     HWE_GF[1] = 2*HWE_AF[0]*HWE_AF[1];
629     HWE_GF[2] = HWE_AF[1]*HWE_AF[1];
630 
631     if (no_alleles==2)
632     {
633         int32_t no_genotypes = 3;
634 
635         float num = 0, denum=0;
636         for (size_t k=0; k<no_samples; ++k)
637         {
638             size_t offset = k*no_genotypes;
639 
640             if (pls[offset]==bcf_int32_missing)
641             {
642                 continue;
643             }
644 
645             ++n;
646 
647             float o_het_sum = LogTool::pl2prob(pls[offset+1])*GF[1];
648             float o_sum = LogTool::pl2prob(pls[offset])*GF[0];
649             o_sum += o_het_sum;
650             o_sum += LogTool::pl2prob(pls[offset+2])*GF[2];
651 
652             float e_het_sum = LogTool::pl2prob(pls[offset+1])*HWE_GF[1];
653             float e_sum = LogTool::pl2prob(pls[offset])*HWE_GF[0];
654             e_sum += e_het_sum;
655             e_sum += LogTool::pl2prob(pls[offset+2])*HWE_GF[2];
656 
657             num += o_het_sum/o_sum;
658             denum += e_het_sum/e_sum;
659         }
660 
661         F = 1-num/denum;
662     }
663     else
664     {
665         int32_t no_genotypes = bcf_an2gn(no_alleles);
666 
667         float HWE_GF[no_genotypes];
668 
669         for (size_t i=0; i<no_alleles; ++i)
670         {
671             for (size_t j=0; j<=i; ++j)
672             {
673                 HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*HWE_AF[i]*HWE_AF[j];
674             }
675         }
676 
677         float num=0, denum=0;
678         float o_het_sum;
679         float o_sum;
680         float e_het_sum;
681         float e_sum;
682         for (size_t k=0; k<no_samples; ++k)
683         {
684             size_t offset = k*no_genotypes;
685             if (pls[offset]==bcf_int32_missing)
686             {
687                 continue;
688             }
689 
690             ++n;
691 
692             o_het_sum = 0;
693             o_sum = 0;
694             e_het_sum = 0;
695             e_sum = 0;
696             int32_t gt_index = 0;
697             for (size_t i=0; i<no_alleles; ++i)
698             {
699                 for (size_t j=0; j<i; ++j)
700                 {
701                     float p = LogTool::pl2prob(pls[offset+gt_index]);
702                     o_het_sum += p * GF[gt_index];
703                     o_sum += p * GF[gt_index];
704 
705                     e_het_sum += p * HWE_GF[gt_index];
706                     e_sum += p * HWE_GF[gt_index];
707 
708                     ++gt_index;
709                 }
710 
711                 //for homozygote
712                 o_sum += LogTool::pl2prob(pls[offset+gt_index]) * GF[gt_index];
713                 e_sum += LogTool::pl2prob(pls[offset+gt_index]) * HWE_GF[gt_index];
714                 ++gt_index;
715             }
716 
717             num += o_het_sum/o_sum;
718             denum += e_het_sum/e_sum;
719         }
720 
721         F = 1-num/denum;
722     }
723 };
724 
725 /**
726  * Computes Allele Balance from genotype likelihoods.
727  *
728  * @pls        - PHRED genotype likelihoods
729  * @no_samples - number of samples
730  * @ploidy     - ploidy
731  * @dps        - depths
732  * @GF         - estimated GF
733  * @no_alleles - number of alleles
734  * @ab         - estimate of allele balance
735  * @n          - effective sample size
736  */
compute_gl_ab(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t * dps,float * GF,int32_t no_alleles,float & ab,int32_t & n)737 void Estimator::compute_gl_ab(int32_t *pls, int32_t no_samples, int32_t ploidy,
738                               int32_t *dps,
739                               float* GF, int32_t no_alleles,
740                               float& ab, int32_t& n)
741 {
742     n = 0;
743 
744     if (ploidy!=2) return;
745 
746     if (no_alleles==2)
747     {
748         float num = 0, denum = 0;
749         for (size_t k=0; k<no_samples; ++k)
750         {
751             size_t offset = k*3;
752             if(pls[offset]!=bcf_int32_missing && dps[k]!=0)
753             {
754                 float nrefnum = pls[offset+2]-pls[offset+0];
755                 float nrefdenum = pls[offset]+pls[offset+2]-2*pls[offset+1] +6*dps[k];
756                 float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));
757                 float phet = LogTool::pl2prob(pls[offset+1])*GF[1] /
758                              ( LogTool::pl2prob(pls[offset])*GF[0]
759                               +LogTool::pl2prob(pls[offset+1])*GF[1]
760                               +LogTool::pl2prob(pls[offset+2])*GF[2]);
761 
762                 num += phet*nref;
763                 denum += phet*dps[k];
764                 ++n;
765             }
766         }
767 
768         ab = (0.05+num)/(0.10+denum);
769     }
770     else
771     {
772         int32_t no_genotypes = bcf_an2gn(no_alleles);
773         float num = 0, denum = 0;
774         for (size_t k=0; k<no_samples; ++k)
775         {
776             size_t offset = k*no_genotypes;
777             if(pls[offset]!=bcf_int32_missing)
778             {
779                 float prob_data, p_ref;
780                 int32_t gt_index = 0;
781 
782                 for (size_t j=1; j<no_alleles; ++j)
783                 {
784                     size_t het_index = bcf_alleles2gt(0,j);
785                     size_t homalt_index = bcf_alleles2gt(j,j);
786                     float nrefnum = pls[offset+homalt_index]-pls[offset];
787                     float nrefdenum = pls[offset]+pls[offset+homalt_index]-2*pls[offset+het_index] +6*dps[k];
788                     float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));
789 
790                     float n = LogTool::pl2prob(pls[offset+het_index])*GF[het_index] ;
791                     float d = (LogTool::pl2prob(pls[offset])*GF[0]
792                                +n
793                                +LogTool::pl2prob(pls[offset+homalt_index])*GF[homalt_index]);
794                     float phet = d?n/d:0.333;
795                     num += phet*nref;
796                     denum += phet*dps[k];
797                 }
798 
799                 ++n;
800             }
801         }
802 
803         ab = (0.05+num)/(0.10+denum);
804     }
805 };
806 
807 /**
808  * Computes the phred scaled QUAL for a variant.
809  *
810  * @pls        - PHRED genotype likelihoods
811  * @no_samples - number of samples
812  * @ploidy     - ploidy
813  * @no_alleles - number of alleles
814  * @n          - effective sample size
815  * @qual       - PHRED scaled QUAL
816  */
compute_qual(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float & qual,int32_t & n)817 void Estimator::compute_qual(int32_t *pls, int32_t no_samples, int32_t ploidy,
818             int32_t no_alleles, float &qual, int32_t &n)
819 {
820     n = 0;
821     if (ploidy==2)
822     {
823         if (no_alleles==2)
824         {
825             int32_t no_genotypes = 3;
826 
827             qual = 0;
828             for (size_t k=0; k<no_samples; ++k)
829             {
830                 size_t offset = k*3;
831 
832                 if (pls[offset]==bcf_int32_missing) continue;
833 
834                 ++n;
835 
836                 qual += LogTool::log10((1-LogTool::pl2prob(pls[offset])/(LogTool::pl2prob(pls[offset])+LogTool::pl2prob(pls[offset+1])+LogTool::pl2prob(pls[offset+2]))));
837 
838             }
839 
840             if (!n) return;
841 
842             qual = LogTool::round(-qual*10);
843         }
844         else
845         {
846             //works only for ploidy of 2
847 
848             int32_t no_genotypes = bcf_an2gn(no_alleles);
849 
850             float gq = 0;
851             for (size_t k=0; k<no_samples; ++k)
852             {
853                 size_t offset = k*no_genotypes;
854 
855                 if (pls[offset]==bcf_int32_missing) continue;
856 
857                 ++n;
858 
859                 float denom = 0;
860                 for (size_t j=0; j<no_genotypes; ++j)
861                 {
862                     denom += LogTool::pl2prob(pls[offset]);
863                 }
864 
865                 qual += LogTool::log10((1-LogTool::pl2prob(pls[offset])/denom));
866             }
867 
868             if (!n) return;
869 
870             qual = LogTool::round(-qual*10);
871         }
872     }
873 };
874 
875 /**
876  * Compute correlation from sufficient statistics for integral observations
877  * @n  - number of observations
878  * @xy - sum of x_i * y_i
879  * @x1 - sum of x_i
880  * @x2 - sum of x_i^2
881  * @y1 - sum of y_i
882  * @y2 - sum of y_i^2
883  */
compute_correlation(int32_t n,int32_t xy,int32_t x1,int32_t x2,int32_t y1,int32_t y2,float buffer)884 float Estimator::compute_correlation(int32_t n, int32_t xy, int32_t x1, int32_t x2, int32_t y1, int32_t y2, float buffer)
885 {
886     if ( n == 0 ) return 0;
887     float xvar = x2/(float)n - (x1/(float)n)*(x1/(float)n);
888     float yvar = y2/(float)n - (y1/(float)n)*(y1/(float)n);
889     return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xvar * yvar + buffer ) );
890 }
891 
892 /**
893  * Compute correlation from sufficient statistics for float observations
894  * @n  - number of observations
895  * @xy - sum of x_i * y_i
896  * @x1 - sum of x_i
897  * @x2 - sum of x_i^2
898  * @y1 - sum of y_i
899  * @y2 - sum of y_i^2
900  */
compute_correlation_f(int32_t n,float xy,float x1,float x2,float y1,float y2,float buffer)901 float Estimator::compute_correlation_f(int32_t n, float xy, float x1, float x2, float y1, float y2, float buffer)
902 {
903     if ( n == 0 ) return 0;
904     float xvar = x2/(float)n - (x1/(float)n)*(x1/(float)n);
905     float yvar = y2/(float)n - (y1/(float)n)*(y1/(float)n);
906     return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xvar * yvar + buffer ) );
907 }