1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "estimator.h"
25
26 /**
27 * Computes allele frequencies using hard calls.
28 *
29 * @gts - genotypes
30 * @no_samples - number of samples
31 * @ploidy - ploidy
32 * @no_alleles - number of alleles
33 * @AC - alternate allele counts
34 * @AN - total number of allele counts
35 * @AF - alternate allele frequency
36 * @GC - genotype counts
37 * @GN - total number of genotype counts
38 * @GF - genotype frequency
39 * @NS - number of samples with data
40 */
compute_af(int32_t * gts,int32_t no_samples,int32_t ploidy,int32_t no_alleles,int32_t * AC,int32_t & AN,float * AF,int32_t * GC,int32_t & GN,float * GF,int32_t & NS)41 void Estimator::compute_af(int32_t *gts, int32_t no_samples, int32_t ploidy,
42 int32_t no_alleles, int32_t *AC, int32_t& AN, float *AF,
43 int32_t *GC, int32_t& GN, float *GF, int32_t& NS)
44 {
45 int32_t iter = 0;
46
47 if (no_alleles==2 && ploidy==2)
48 {
49 NS = 0;
50 int32_t no_genotypes = 3;
51 AC[0] = 0;
52 AC[1] = 0;
53
54 GC[0] = 0;
55 GC[1] = 0;
56 GC[2] = 0;
57
58 GN=0;
59
60 for (size_t k=0; k<no_samples; ++k)
61 {
62 size_t offset = k*ploidy;
63
64 int32_t g1 = bcf_gt_allele(gts[offset]);
65 if (g1>=0)
66 {
67 ++AC[g1];
68 ++AN;
69 }
70 int32_t g2 = bcf_gt_allele(gts[offset+1]);
71 if (g2>=0)
72 {
73 ++AC[g2];
74 ++AN;
75 }
76
77 if (g1>=0 && g2>=0)
78 {
79 ++GC[g1+g2];
80 ++GN;
81 }
82
83 if (g1>=0||g2>=0) ++NS;
84 }
85
86 if (!NS)
87 {
88 return;
89 }
90
91 AF[0] = (float)AC[0]/AN;
92 AF[1] = (float)AC[1]/AN;
93
94 GF[0] = (float)GC[0]/GN;
95 GF[1] = (float)GC[1]/GN;
96 GF[2] = (float)GC[2]/GN;
97
98 // std::cerr << "AC: " << AC[0] << "," << AC[1] << "\n";
99 // std::cerr << "AF: " << AF[0] << "," << AF[1] << "\n";
100 // std::cerr << "GC: " << AC[0] << AC[0] << "," << AC[1] << "\n";
101 // std::cerr << "GC: " << AC[0] << AC[0] << "," << AC[1] << "\n";
102
103 }
104 else
105 {
106 NS = 0;
107 AN = 0;
108 GN = 0;
109 int32_t no_genotypes = bcf_an2gn(no_alleles);
110 for (size_t i=0; i<no_alleles; ++i)
111 {
112 AC[i] = 0;
113 }
114
115 for (size_t i=0; i<no_genotypes; ++i)
116 {
117 GC[i] = 0;
118 }
119
120 int32_t gt_indiv[ploidy];
121
122 for (size_t k=0; k<no_samples; ++k)
123 {
124 size_t offset = k*ploidy;
125
126 int32_t last_AN = AN;
127 for (size_t i=0; i<ploidy; ++i)
128 {
129 gt_indiv[i] = bcf_gt_allele(gts[offset+i]);
130
131 if (gt_indiv[i]>=0)
132 {
133 ++AC[gt_indiv[i]];
134 ++AN;
135 }
136 }
137
138 if (last_AN<AN) ++NS;
139
140 if (ploidy==2 && gt_indiv[0]>=0 && gt_indiv[1]>=0)
141 {
142 if (gt_indiv[1]<gt_indiv[0])
143 {
144 gt_indiv[0] += gt_indiv[1];
145 gt_indiv[1] = gt_indiv[0] - gt_indiv[1];
146 gt_indiv[0] -= gt_indiv[1];
147 }
148
149 ++GC[bcf_alleles2gt(gt_indiv[0],gt_indiv[1])];
150 ++GN;
151 }
152 }
153
154 for (size_t i=0; i<no_alleles; ++i)
155 {
156 AF[i] = (float)AC[i]/AN;
157 }
158
159 for (size_t i=0; i<no_genotypes; ++i)
160 {
161 GF[i] = (float)GC[i]/GN;
162 }
163 }
164 }
165
166 /**
167 * Computes allele frequencies using EM algorithm from genotype likelihoods
168 * under assumption of Hardy-Weinberg Equilibrium.
169 *
170 * @pls - PHRED genotype likelihoods
171 * @no_samples - number of samples
172 * @ploidy - ploidy
173 * @no_alleles - number of alleles
174 * @MLE_HWE_AF - estimated AF
175 * @MLE_HWE_GF - estimated GF
176 * @n - effective sample size
177 * @e - error
178 */
compute_gl_af_hwe(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float * MLE_HWE_AF,float * MLE_HWE_GF,int32_t & n,double e)179 void Estimator::compute_gl_af_hwe(int32_t *pls, int32_t no_samples, int32_t ploidy,
180 int32_t no_alleles, float *MLE_HWE_AF, float *MLE_HWE_GF, int32_t& n,
181 double e)
182 {
183 int32_t iter = 0;
184
185 if (ploidy!=2)
186 {
187 return;
188 }
189
190 if (no_alleles==2)
191 {
192 n = 0;
193 int32_t imap[no_samples];
194
195 for (size_t i=0; i<no_samples; ++i)
196 {
197 if (pls[i*3]!=bcf_int32_missing)
198 {
199 imap[n] = i;
200 ++n;
201 }
202 }
203
204 if (!n)
205 {
206 return;
207 }
208
209 float af[2] = {0.5, 0.5};
210 float gf[3];
211 float gf_indiv[3];
212
213 float mse = e+1;
214 float diff = 0;
215 while (mse>e && iter<50)
216 {
217 gf[0] = af[0]*af[0];
218 gf[1] = 2*af[0]*af[1];
219 gf[2] = af[1]*af[1];
220
221 MLE_HWE_AF[0] = 0;
222 MLE_HWE_AF[1] = 0;
223
224 for (size_t i=0; i<n; ++i)
225 {
226 size_t offset = imap[i]*3;
227
228 float prob_data = (gf_indiv[0] = gf[0]*LogTool::pl2prob(pls[offset]));
229 prob_data += (gf_indiv[1] = gf[1]*LogTool::pl2prob(pls[offset+1]));
230 prob_data += (gf_indiv[2] = gf[2]*LogTool::pl2prob(pls[offset+2]));
231
232 gf_indiv[0] /= prob_data;
233 gf_indiv[1] /= prob_data;
234 gf_indiv[2] /= prob_data;
235
236 MLE_HWE_AF[0] += gf_indiv[0] + 0.5*gf_indiv[1];
237 MLE_HWE_AF[1] += gf_indiv[2] + 0.5*gf_indiv[1];
238 }
239
240 MLE_HWE_AF[0] /= n;
241 MLE_HWE_AF[1] /= n;
242
243 diff = (af[0]-MLE_HWE_AF[0]);
244 mse = diff*diff;
245 diff = (af[1]-MLE_HWE_AF[1]);
246 mse += diff*diff;
247
248 af[0] = MLE_HWE_AF[0];
249 af[1] = MLE_HWE_AF[1];
250
251 ++iter;
252 }
253
254 MLE_HWE_GF[0] = MLE_HWE_AF[0]*MLE_HWE_AF[0];
255 MLE_HWE_GF[1] = 2*MLE_HWE_AF[0]*MLE_HWE_AF[1];
256 MLE_HWE_GF[2] = MLE_HWE_AF[1]*MLE_HWE_AF[1];
257 }
258 else
259 {
260 n = 0;
261 int32_t imap[no_samples];
262 int32_t no_genotypes = bcf_an2gn(no_alleles);
263 for (size_t k=0; k<no_samples; ++k)
264 {
265 if (pls[k*no_genotypes]!=bcf_int32_missing)
266 {
267 imap[n] = k;
268 ++n;
269 }
270 }
271
272 if (!n) return;
273
274 float af[no_alleles];
275 float p = 1.0/no_alleles;
276 for (size_t i=0; i<no_alleles; ++i)
277 {
278 af[i] = p;
279 }
280 float gf[no_genotypes];
281 float gf_indiv[no_genotypes];
282
283 bool debug = false;
284
285 float mse = e+1;
286 while (mse>e && iter<50)
287 {
288 for (size_t i=0; i<no_alleles; ++i)
289 {
290 MLE_HWE_AF[i] = 0;
291 for (size_t j=0; j<=i; ++j)
292 {
293 gf[bcf_alleles2gt(i,j)] = (i!=j?2:1)*af[i]*af[j];
294 }
295 }
296
297 //iterate through individuals
298 for (size_t k=0; k<n; ++k)
299 {
300 size_t offset = imap[k]*no_genotypes;
301
302 float prob_data = 0;
303 for (size_t i=0; i<no_genotypes; ++i)
304 {
305 prob_data += (gf_indiv[i] = gf[i]*LogTool::pl2prob(pls[offset+i]));
306 }
307
308 for (size_t i=0; i<no_genotypes; ++i)
309 {
310 gf_indiv[i] /= prob_data;
311 }
312
313 for (size_t i=0; i<no_alleles; ++i)
314 {
315 for (size_t j=0; j<=i; ++j)
316 {
317 int32_t gf_index = bcf_alleles2gt(i,j);
318 MLE_HWE_AF[i] += 0.5*gf_indiv[gf_index];
319 MLE_HWE_AF[j] += 0.5*gf_indiv[gf_index];
320 }
321 }
322 }
323
324 //normalize to frequency
325 mse = 0;
326 float diff;
327 for (size_t i=0; i<no_alleles; ++i)
328 {
329 MLE_HWE_AF[i] /= n;
330 diff = af[i]-MLE_HWE_AF[i];
331 mse += (diff*diff);
332 af[i] = MLE_HWE_AF[i];
333 }
334
335 ++iter;
336 }
337
338 for (size_t i=0; i<no_alleles; ++i)
339 {
340 for (size_t j=0; j<=i; ++j)
341 {
342 MLE_HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*MLE_HWE_AF[i]*MLE_HWE_AF[j];
343 }
344 }
345 }
346 }
347
348 /**
349 * Computes allele frequencies using EM algorithm from genotype likelihoods.
350 *
351 * @pls - PHRED genotype likelihoods
352 * @no_samples - number of samples
353 * @ploidy - ploidy
354 * @n_alleles - number of alleles
355 * @MLE_AF - estimated AF
356 * @MLE_GF - estimated GF
357 * @n - effective sample size
358 * @e - error
359 */
compute_gl_af(int32_t * pls,int32_t nsamples,int32_t ploidy,int32_t n_allele,float * MLE_AF,float * MLE_GF,int32_t & n,double e)360 void Estimator::compute_gl_af(int32_t *pls, int32_t nsamples, int32_t ploidy,
361 int32_t n_allele, float *MLE_AF, float *MLE_GF, int32_t& n,
362 double e)
363 {
364 int32_t iter = 0;
365
366 if (n_allele==2 && ploidy==2)
367 {
368 n = 0;
369 int32_t imap[nsamples];
370
371 for (size_t i=0; i<nsamples; ++i)
372 {
373 if (pls[i*3]!=bcf_int32_missing)
374 {
375 imap[n] = i;
376 ++n;
377 }
378 }
379
380 if (!n)
381 {
382 return;
383 }
384
385 float gf[3];
386 float gf_indiv[3];
387
388 float mse = e+1;
389 float diff = 0;
390
391 gf[0] = 1.0/3;
392 gf[1] = gf[0];
393 gf[2] = gf[1];
394
395 while (mse>e && iter<50)
396 {
397 MLE_GF[0] = 0;
398 MLE_GF[1] = 0;
399 MLE_GF[2] = 0;
400
401 for (size_t i=0; i<n; ++i)
402 {
403 size_t offset = imap[i]*3;
404
405 float prob_data = (gf_indiv[0] = gf[0]*LogTool::pl2prob(pls[offset]));
406 prob_data += (gf_indiv[1] = gf[1]*LogTool::pl2prob(pls[offset+1]));
407 prob_data += (gf_indiv[2] = gf[2]*LogTool::pl2prob(pls[offset+2]));
408
409 MLE_GF[0] += (gf_indiv[0] /= prob_data);
410 MLE_GF[1] += (gf_indiv[1] /= prob_data);
411 MLE_GF[2] += (gf_indiv[2] /= prob_data);
412 }
413
414 MLE_GF[0] /= n;
415 MLE_GF[1] /= n;
416 MLE_GF[2] /= n;
417
418 diff = (gf[0]-MLE_GF[0]);
419 mse = diff*diff;
420 diff = (gf[1]-MLE_GF[1]);
421 mse += diff*diff;
422 diff = (gf[2]-MLE_GF[2]);
423 mse += diff*diff;
424
425 gf[0] = MLE_GF[0];
426 gf[1] = MLE_GF[1];
427 gf[2] = MLE_GF[2];
428
429
430 ++iter;
431 }
432
433 MLE_AF[0] = MLE_GF[0]+0.5*MLE_GF[1];
434 MLE_AF[1] = MLE_GF[2]+0.5*MLE_GF[1];
435 }
436 else
437 {
438 n = 0;
439 int32_t imap[nsamples];
440 int32_t no_genotypes = bcf_an2gn(n_allele);
441 for (size_t i=0; i<nsamples; ++i)
442 {
443 if (pls[i*no_genotypes]!=bcf_int32_missing)
444 {
445 imap[n] = i;
446 ++n;
447 }
448 }
449
450 if (!n) return;
451
452 float gf[no_genotypes];
453 float gf_indiv[no_genotypes];
454
455 //initialization
456 gf[0] = 1.0/no_genotypes;
457 for (size_t i=1; i<no_genotypes; ++i)
458 {
459 gf[i] = gf[0];
460 }
461
462 float mse = e+1;
463 while (mse>e && iter<50)
464 {
465 //initialization
466 for (size_t i=0; i<no_genotypes; ++i)
467 {
468 MLE_GF[i] = 0;
469 }
470
471 //iterate through individuals
472 for (size_t i=0; i<n; ++i)
473 {
474 size_t offset = imap[i]*no_genotypes;
475
476 float prob_data = 0;
477 for (size_t j=0; j<no_genotypes; ++j)
478 {
479 prob_data += (gf_indiv[j] = gf[j]*LogTool::pl2prob(pls[offset+j]));
480 }
481
482 for (size_t j=0; j<no_genotypes; ++j)
483 {
484 MLE_GF[j] += (gf_indiv[j] /= prob_data);
485 }
486 }
487
488 mse = 0;
489 float diff;
490 for (size_t i=0; i<no_genotypes; ++i)
491 {
492 MLE_GF[i] /= n;
493 diff = gf[i]-MLE_GF[i];
494 mse += (diff *= diff);
495 gf[i] = MLE_GF[i];
496 }
497
498 ++iter;
499 }
500
501 for (size_t i=0; i<n_allele; ++i)
502 {
503 for (size_t j=0; j<=i; ++j)
504 {
505 int32_t index = bcf_alleles2gt(i,j);
506 MLE_AF[i] += 0.5*MLE_GF[index];
507 MLE_AF[i] += 0.5*MLE_GF[index];
508 }
509 }
510 }
511 }
512
513 /**
514 * Computes the Hardy-Weinberg Likelihood Ratio Test Statistic
515 *
516 * @pls - PHRED genotype likelihoods
517 * @no_samples - number of samples
518 * @ploidy - ploidy
519 * @no_alleles - number of alleles
520 * @MLE_HWE_AF - estimated AF assuming HWE
521 * @MLE_GF - estimated GF
522 * @n - effective sample size
523 * @lr - log10 likelihood ratio
524 * @logp - likelihood ratio test log p-value
525 * @df - degrees of freedom
526 */
compute_hwe_lrt(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float * MLE_HWE_GF,float * MLE_GF,int32_t & n,float & lr,float & logp,int32_t & df)527 void Estimator::compute_hwe_lrt(int32_t *pls, int32_t no_samples, int32_t ploidy,
528 int32_t no_alleles, float *MLE_HWE_GF, float *MLE_GF, int32_t& n,
529 float& lr, float& logp, int32_t& df)
530 {
531 n = 0;
532 if (ploidy==2)
533 {
534 if (no_alleles==2)
535 {
536 int32_t no_genotypes = 3;
537
538 float l0=0, la=0;
539 for (size_t k=0; k<no_samples; ++k)
540 {
541 size_t offset = k*3;
542
543 if (pls[offset]==bcf_int32_missing) continue;
544
545 ++n;
546
547 float p = LogTool::pl2prob(pls[offset]);
548 float l0i = MLE_HWE_GF[0] * p;
549 float lai = MLE_GF[0] * p;
550 p = LogTool::pl2prob(pls[offset+1]);
551 l0i += MLE_HWE_GF[1] * p;
552 lai += MLE_GF[1] * p;
553 p = LogTool::pl2prob(pls[offset+2]);
554 l0i += MLE_HWE_GF[2] * p;
555 lai += MLE_GF[2] * p;
556
557 l0 += log(l0i);
558 la += log(lai);
559 }
560
561 if (!n) return;
562
563 lr = l0-la;
564 float lrts = lr>0 ? 0 : -2*lr;
565 df = 1;
566 logp = pchisq(lrts, 1, 0, 1);
567 }
568 else
569 {
570 int32_t no_genotypes = bcf_an2gn(no_alleles);
571
572 float l0=0, la=0;
573 for (size_t k=0; k<no_samples; ++k)
574 {
575 size_t offset = k*no_genotypes;
576
577 if (pls[offset]==bcf_int32_missing) continue;
578
579 ++n;
580
581 float l0i=0, lai=0;
582 for (size_t j=0; j<no_genotypes; ++j)
583 {
584 float p = LogTool::pl2prob(pls[offset+j]);
585 l0i += MLE_HWE_GF[j]*p;
586 lai += MLE_GF[j]*p;
587 }
588
589 l0 += log(l0i);
590 la += log(lai);
591 }
592
593 if (!n) return;
594
595 lr = l0-la;
596 float lrts = lr>0 ? 0 : -2*lr;
597 df = no_genotypes-no_alleles;
598 logp = pchisq(lrts, df, 0, 1);
599 }
600 }
601 };
602
603 /**
604 * Computes the Inbreeding Coefficient Statistic from Genotype likelihoods.
605 *
606 * @pls - PHRED genotype likelihoods
607 * @no_samples - number of samples
608 * @ploidy - ploidy
609 * @GF - GF
610 * @HWE_AF - AF under HWE assumption
611 * @no_alleles - number of alleles
612 * @F - estimated inbreeding coefficient
613 * @n - effective sample size
614 */
compute_gl_fic(int32_t * pls,int32_t no_samples,int32_t ploidy,float * HWE_AF,int32_t no_alleles,float * GF,float & F,int32_t & n)615 void Estimator::compute_gl_fic(int32_t * pls, int32_t no_samples, int32_t ploidy,
616 float* HWE_AF, int32_t no_alleles, float* GF,
617 float& F, int32_t& n)
618 {
619 n = 0;
620
621 if (ploidy!=2)
622 {
623 return;
624 }
625
626 float HWE_GF[3];
627 HWE_GF[0] = HWE_AF[0]*HWE_AF[0];
628 HWE_GF[1] = 2*HWE_AF[0]*HWE_AF[1];
629 HWE_GF[2] = HWE_AF[1]*HWE_AF[1];
630
631 if (no_alleles==2)
632 {
633 int32_t no_genotypes = 3;
634
635 float num = 0, denum=0;
636 for (size_t k=0; k<no_samples; ++k)
637 {
638 size_t offset = k*no_genotypes;
639
640 if (pls[offset]==bcf_int32_missing)
641 {
642 continue;
643 }
644
645 ++n;
646
647 float o_het_sum = LogTool::pl2prob(pls[offset+1])*GF[1];
648 float o_sum = LogTool::pl2prob(pls[offset])*GF[0];
649 o_sum += o_het_sum;
650 o_sum += LogTool::pl2prob(pls[offset+2])*GF[2];
651
652 float e_het_sum = LogTool::pl2prob(pls[offset+1])*HWE_GF[1];
653 float e_sum = LogTool::pl2prob(pls[offset])*HWE_GF[0];
654 e_sum += e_het_sum;
655 e_sum += LogTool::pl2prob(pls[offset+2])*HWE_GF[2];
656
657 num += o_het_sum/o_sum;
658 denum += e_het_sum/e_sum;
659 }
660
661 F = 1-num/denum;
662 }
663 else
664 {
665 int32_t no_genotypes = bcf_an2gn(no_alleles);
666
667 float HWE_GF[no_genotypes];
668
669 for (size_t i=0; i<no_alleles; ++i)
670 {
671 for (size_t j=0; j<=i; ++j)
672 {
673 HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*HWE_AF[i]*HWE_AF[j];
674 }
675 }
676
677 float num=0, denum=0;
678 float o_het_sum;
679 float o_sum;
680 float e_het_sum;
681 float e_sum;
682 for (size_t k=0; k<no_samples; ++k)
683 {
684 size_t offset = k*no_genotypes;
685 if (pls[offset]==bcf_int32_missing)
686 {
687 continue;
688 }
689
690 ++n;
691
692 o_het_sum = 0;
693 o_sum = 0;
694 e_het_sum = 0;
695 e_sum = 0;
696 int32_t gt_index = 0;
697 for (size_t i=0; i<no_alleles; ++i)
698 {
699 for (size_t j=0; j<i; ++j)
700 {
701 float p = LogTool::pl2prob(pls[offset+gt_index]);
702 o_het_sum += p * GF[gt_index];
703 o_sum += p * GF[gt_index];
704
705 e_het_sum += p * HWE_GF[gt_index];
706 e_sum += p * HWE_GF[gt_index];
707
708 ++gt_index;
709 }
710
711 //for homozygote
712 o_sum += LogTool::pl2prob(pls[offset+gt_index]) * GF[gt_index];
713 e_sum += LogTool::pl2prob(pls[offset+gt_index]) * HWE_GF[gt_index];
714 ++gt_index;
715 }
716
717 num += o_het_sum/o_sum;
718 denum += e_het_sum/e_sum;
719 }
720
721 F = 1-num/denum;
722 }
723 };
724
725 /**
726 * Computes Allele Balance from genotype likelihoods.
727 *
728 * @pls - PHRED genotype likelihoods
729 * @no_samples - number of samples
730 * @ploidy - ploidy
731 * @dps - depths
732 * @GF - estimated GF
733 * @no_alleles - number of alleles
734 * @ab - estimate of allele balance
735 * @n - effective sample size
736 */
compute_gl_ab(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t * dps,float * GF,int32_t no_alleles,float & ab,int32_t & n)737 void Estimator::compute_gl_ab(int32_t *pls, int32_t no_samples, int32_t ploidy,
738 int32_t *dps,
739 float* GF, int32_t no_alleles,
740 float& ab, int32_t& n)
741 {
742 n = 0;
743
744 if (ploidy!=2) return;
745
746 if (no_alleles==2)
747 {
748 float num = 0, denum = 0;
749 for (size_t k=0; k<no_samples; ++k)
750 {
751 size_t offset = k*3;
752 if(pls[offset]!=bcf_int32_missing && dps[k]!=0)
753 {
754 float nrefnum = pls[offset+2]-pls[offset+0];
755 float nrefdenum = pls[offset]+pls[offset+2]-2*pls[offset+1] +6*dps[k];
756 float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));
757 float phet = LogTool::pl2prob(pls[offset+1])*GF[1] /
758 ( LogTool::pl2prob(pls[offset])*GF[0]
759 +LogTool::pl2prob(pls[offset+1])*GF[1]
760 +LogTool::pl2prob(pls[offset+2])*GF[2]);
761
762 num += phet*nref;
763 denum += phet*dps[k];
764 ++n;
765 }
766 }
767
768 ab = (0.05+num)/(0.10+denum);
769 }
770 else
771 {
772 int32_t no_genotypes = bcf_an2gn(no_alleles);
773 float num = 0, denum = 0;
774 for (size_t k=0; k<no_samples; ++k)
775 {
776 size_t offset = k*no_genotypes;
777 if(pls[offset]!=bcf_int32_missing)
778 {
779 float prob_data, p_ref;
780 int32_t gt_index = 0;
781
782 for (size_t j=1; j<no_alleles; ++j)
783 {
784 size_t het_index = bcf_alleles2gt(0,j);
785 size_t homalt_index = bcf_alleles2gt(j,j);
786 float nrefnum = pls[offset+homalt_index]-pls[offset];
787 float nrefdenum = pls[offset]+pls[offset+homalt_index]-2*pls[offset+het_index] +6*dps[k];
788 float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));
789
790 float n = LogTool::pl2prob(pls[offset+het_index])*GF[het_index] ;
791 float d = (LogTool::pl2prob(pls[offset])*GF[0]
792 +n
793 +LogTool::pl2prob(pls[offset+homalt_index])*GF[homalt_index]);
794 float phet = d?n/d:0.333;
795 num += phet*nref;
796 denum += phet*dps[k];
797 }
798
799 ++n;
800 }
801 }
802
803 ab = (0.05+num)/(0.10+denum);
804 }
805 };
806
807 /**
808 * Computes the phred scaled QUAL for a variant.
809 *
810 * @pls - PHRED genotype likelihoods
811 * @no_samples - number of samples
812 * @ploidy - ploidy
813 * @no_alleles - number of alleles
814 * @n - effective sample size
815 * @qual - PHRED scaled QUAL
816 */
compute_qual(int32_t * pls,int32_t no_samples,int32_t ploidy,int32_t no_alleles,float & qual,int32_t & n)817 void Estimator::compute_qual(int32_t *pls, int32_t no_samples, int32_t ploidy,
818 int32_t no_alleles, float &qual, int32_t &n)
819 {
820 n = 0;
821 if (ploidy==2)
822 {
823 if (no_alleles==2)
824 {
825 int32_t no_genotypes = 3;
826
827 qual = 0;
828 for (size_t k=0; k<no_samples; ++k)
829 {
830 size_t offset = k*3;
831
832 if (pls[offset]==bcf_int32_missing) continue;
833
834 ++n;
835
836 qual += LogTool::log10((1-LogTool::pl2prob(pls[offset])/(LogTool::pl2prob(pls[offset])+LogTool::pl2prob(pls[offset+1])+LogTool::pl2prob(pls[offset+2]))));
837
838 }
839
840 if (!n) return;
841
842 qual = LogTool::round(-qual*10);
843 }
844 else
845 {
846 //works only for ploidy of 2
847
848 int32_t no_genotypes = bcf_an2gn(no_alleles);
849
850 float gq = 0;
851 for (size_t k=0; k<no_samples; ++k)
852 {
853 size_t offset = k*no_genotypes;
854
855 if (pls[offset]==bcf_int32_missing) continue;
856
857 ++n;
858
859 float denom = 0;
860 for (size_t j=0; j<no_genotypes; ++j)
861 {
862 denom += LogTool::pl2prob(pls[offset]);
863 }
864
865 qual += LogTool::log10((1-LogTool::pl2prob(pls[offset])/denom));
866 }
867
868 if (!n) return;
869
870 qual = LogTool::round(-qual*10);
871 }
872 }
873 };
874
875 /**
876 * Compute correlation from sufficient statistics for integral observations
877 * @n - number of observations
878 * @xy - sum of x_i * y_i
879 * @x1 - sum of x_i
880 * @x2 - sum of x_i^2
881 * @y1 - sum of y_i
882 * @y2 - sum of y_i^2
883 */
compute_correlation(int32_t n,int32_t xy,int32_t x1,int32_t x2,int32_t y1,int32_t y2,float buffer)884 float Estimator::compute_correlation(int32_t n, int32_t xy, int32_t x1, int32_t x2, int32_t y1, int32_t y2, float buffer)
885 {
886 if ( n == 0 ) return 0;
887 float xvar = x2/(float)n - (x1/(float)n)*(x1/(float)n);
888 float yvar = y2/(float)n - (y1/(float)n)*(y1/(float)n);
889 return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xvar * yvar + buffer ) );
890 }
891
892 /**
893 * Compute correlation from sufficient statistics for float observations
894 * @n - number of observations
895 * @xy - sum of x_i * y_i
896 * @x1 - sum of x_i
897 * @x2 - sum of x_i^2
898 * @y1 - sum of y_i
899 * @y2 - sum of y_i^2
900 */
compute_correlation_f(int32_t n,float xy,float x1,float x2,float y1,float y2,float buffer)901 float Estimator::compute_correlation_f(int32_t n, float xy, float x1, float x2, float y1, float y2, float buffer)
902 {
903 if ( n == 0 ) return 0;
904 float xvar = x2/(float)n - (x1/(float)n)*(x1/(float)n);
905 float yvar = y2/(float)n - (y1/(float)n)*(y1/(float)n);
906 return ( ( xy/(float)n - x1 * y1 / (float) n / (float) n ) / sqrt( xvar * yvar + buffer ) );
907 }