1 #include "GenotypePriors.h"
2
3 /*
4 long double alleleFrequencyProbability(const map<int, int>& alleleFrequencyCounts, long double theta) {
5
6 int M = 0;
7 long double p = 1;
8
9 for (map<int, int>::const_iterator f = alleleFrequencyCounts.begin(); f != alleleFrequencyCounts.end(); ++f) {
10 int frequency = f->first;
11 int count = f->second;
12 M += frequency * count;
13 p *= (double) pow((double) theta, (double) count) / (double) pow((double) frequency, (double) count) * factorial(count);
14 }
15
16 long double thetaH = 1;
17 for (int h = 1; h < M; ++h)
18 thetaH *= theta + h;
19
20 return factorial(M) / (theta * thetaH) * p;
21
22 }
23
24 AlleleFrequencyProbabilityCache alleleFrequencyProbabilityCache;
25
26 long double alleleFrequencyProbabilityln(const map<int, int>& alleleFrequencyCounts, long double theta) {
27 return alleleFrequencyProbabilityCache.alleleFrequencyProbabilityln(alleleFrequencyCounts, theta);
28 }
29
30 // Implements Ewens' Sampling Formula, which provides probability of a given
31 // partition of alleles in a sample from a population
32 long double __alleleFrequencyProbabilityln(const map<int, int>& alleleFrequencyCounts, long double theta) {
33
34 int M = 0; // multiplicity of site
35 long double p = 0;
36 long double thetaln = log(theta);
37
38 for (map<int, int>::const_iterator f = alleleFrequencyCounts.begin(); f != alleleFrequencyCounts.end(); ++f) {
39 int frequency = f->first;
40 int count = f->second;
41 M += frequency * count;
42 p += powln(thetaln, count) - powln(log(frequency), count) + factorialln(count);
43 }
44
45 long double thetaH = 0;
46 for (int h = 1; h < M; ++h)
47 thetaH += log(theta + h);
48
49 return factorialln(M) - (thetaln + thetaH) + p;
50
51 }
52 */
53
54
probabilityGenotypeComboGivenAlleleFrequencyln(GenotypeCombo & genotypeCombo,Allele & allele)55 long double probabilityGenotypeComboGivenAlleleFrequencyln(GenotypeCombo& genotypeCombo, Allele& allele) {
56
57 int n = genotypeCombo.numberOfAlleles();
58 long double lnhetscalar = 0;
59
60 for (GenotypeCombo::iterator gc = genotypeCombo.begin(); gc != genotypeCombo.end(); ++gc) {
61 SampleDataLikelihood& sgp = **gc;
62 if (!sgp.genotype->homozygous) {
63 lnhetscalar += multinomialCoefficientLn(sgp.genotype->ploidy, sgp.genotype->counts());
64 }
65 }
66
67 return lnhetscalar - multinomialCoefficientLn(n, genotypeCombo.counts());
68
69 }
70
71
72 // core calculation of genotype combination likelihoods
73 //
74 GenotypeComboResult
genotypeCombinationPriorProbability(GenotypeCombo * combo,Allele & refAllele,long double theta,bool pooled,bool binomialObsPriors,bool alleleBalancePriors,long double diffusionPriorScalar)75 genotypeCombinationPriorProbability(
76 GenotypeCombo* combo,
77 Allele& refAllele,
78 long double theta,
79 bool pooled,
80 bool binomialObsPriors,
81 bool alleleBalancePriors,
82 long double diffusionPriorScalar) {
83
84 // when we are operating on pooled samples, we will not be able to
85 // ascertain the number of heterozygotes in the pool,
86 // rendering P(Genotype combo | Allele frequency) meaningless
87 long double priorProbabilityOfGenotypeComboG_Af = 0;
88 if (!pooled) {
89 priorProbabilityOfGenotypeComboG_Af = probabilityGenotypeComboGivenAlleleFrequencyln(*combo, refAllele);
90 }
91
92 long double priorObservationExpectationProb = 0;
93
94 if (binomialObsPriors) {
95 // for each alternate and the reference allele
96 // calculate the binomial probability that we see the given strand balance and read placement prob
97 vector<string> alleles = combo->alleles();
98 // cerr << *combo << endl;
99 for (vector<string>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
100 const string& allele = *a;
101 map<string, AlleleCounter>::iterator ac = combo->alleleCounters.find(allele);
102 if (ac != combo->alleleCounters.end()) {
103 const AlleleCounter& alleleCounter = ac->second;
104 int obs = alleleCounter.observations;
105 /*
106 cerr << allele << " counts: " << alleleCounter.frequency
107 << " observations " << alleleCounter.observations
108 << " " << alleleCounter.forwardStrand
109 << "," << alleleCounter.reverseStrand
110 << " " << alleleCounter.placedLeft
111 << "," << alleleCounter.placedRight
112 << " " << alleleCounter.placedStart
113 << "," << alleleCounter.placedEnd
114 << endl;
115 cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
116 cerr << "binprobln strand = " << binomialProbln(alleleCounter.forwardStrand, obs, 0.5) << endl;
117 cerr << "binprobln position = " << binomialProbln(alleleCounter.placedLeft, obs, 0.5) << endl;
118 cerr << "binprobln start = " << binomialProbln(alleleCounter.placedStart, obs, 0.5) << endl;
119 cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
120 */
121
122 priorObservationExpectationProb
123 += binomialProbln(alleleCounter.forwardStrand, obs, 0.5)
124 + binomialProbln(alleleCounter.placedLeft, obs, 0.5)
125 + binomialProbln(alleleCounter.placedStart, obs, 0.5);
126
127 }
128 }
129 // ok... now do the same move for the observation counts
130 // --- this should capture "Allele Balance"
131 }
132
133 if (alleleBalancePriors) {
134 priorObservationExpectationProb += multinomialSamplingProbLn(combo->alleleProbs(), combo->observationCounts());
135 }
136
137 // with larger population samples, the effect of
138 // P(Genotype combo | Allele frequency) may bias us against reporting
139 // true variants which are under selection despite overwhelming evidence
140 // for variation. this allows us to scale the effect of this prior
141 if (diffusionPriorScalar != 1) {
142 priorProbabilityOfGenotypeComboG_Af /= diffusionPriorScalar;
143 }
144
145 // Ewens' Sampling Formula
146 long double priorProbabilityOfGenotypeComboAf =
147 alleleFrequencyProbabilityln(combo->countFrequencies(), theta);
148 long double priorProbabilityOfGenotypeCombo =
149 priorProbabilityOfGenotypeComboG_Af + priorProbabilityOfGenotypeComboAf;
150 long double priorComboProb = priorProbabilityOfGenotypeCombo + combo->prob + priorObservationExpectationProb;
151
152 return GenotypeComboResult(combo,
153 priorComboProb,
154 combo->prob,
155 priorProbabilityOfGenotypeCombo,
156 priorProbabilityOfGenotypeComboG_Af,
157 priorProbabilityOfGenotypeComboAf,
158 priorObservationExpectationProb);
159
160 }
161
162 void
genotypeCombinationsPriorProbability(vector<GenotypeComboResult> & genotypeComboProbs,vector<GenotypeCombo> & bandedCombos,Allele & refAllele,long double theta,bool pooled,bool binomialObsPriors,bool alleleBalancePriors,long double diffusionPriorScalar)163 genotypeCombinationsPriorProbability(
164 vector<GenotypeComboResult>& genotypeComboProbs,
165 vector<GenotypeCombo>& bandedCombos,
166 Allele& refAllele,
167 long double theta,
168 bool pooled,
169 bool binomialObsPriors,
170 bool alleleBalancePriors,
171 long double diffusionPriorScalar) {
172
173 for (vector<GenotypeCombo>::iterator c = bandedCombos.begin(); c != bandedCombos.end(); ++c) {
174
175 GenotypeCombo* combo = &*c;
176
177 genotypeComboProbs.push_back(
178 genotypeCombinationPriorProbability(
179 combo,
180 refAllele,
181 theta,
182 pooled,
183 binomialObsPriors,
184 alleleBalancePriors,
185 diffusionPriorScalar));
186
187 }
188 }
189