1 #include "GenotypePriors.h"
2 
3 /*
4 long double alleleFrequencyProbability(const map<int, int>& alleleFrequencyCounts, long double theta) {
5 
6     int M = 0;
7     long double p = 1;
8 
9     for (map<int, int>::const_iterator f = alleleFrequencyCounts.begin(); f != alleleFrequencyCounts.end(); ++f) {
10         int frequency = f->first;
11         int count = f->second;
12         M += frequency * count;
13         p *= (double) pow((double) theta, (double) count) / (double) pow((double) frequency, (double) count) * factorial(count);
14     }
15 
16     long double thetaH = 1;
17     for (int h = 1; h < M; ++h)
18         thetaH *= theta + h;
19 
20     return factorial(M) / (theta * thetaH) * p;
21 
22 }
23 
24 AlleleFrequencyProbabilityCache alleleFrequencyProbabilityCache;
25 
26 long double alleleFrequencyProbabilityln(const map<int, int>& alleleFrequencyCounts, long double theta) {
27     return alleleFrequencyProbabilityCache.alleleFrequencyProbabilityln(alleleFrequencyCounts, theta);
28 }
29 
30 // Implements Ewens' Sampling Formula, which provides probability of a given
31 // partition of alleles in a sample from a population
32 long double __alleleFrequencyProbabilityln(const map<int, int>& alleleFrequencyCounts, long double theta) {
33 
34     int M = 0; // multiplicity of site
35     long double p = 0;
36     long double thetaln = log(theta);
37 
38     for (map<int, int>::const_iterator f = alleleFrequencyCounts.begin(); f != alleleFrequencyCounts.end(); ++f) {
39         int frequency = f->first;
40         int count = f->second;
41         M += frequency * count;
42         p += powln(thetaln, count) - powln(log(frequency), count) + factorialln(count);
43     }
44 
45     long double thetaH = 0;
46     for (int h = 1; h < M; ++h)
47         thetaH += log(theta + h);
48 
49     return factorialln(M) - (thetaln + thetaH) + p;
50 
51 }
52 */
53 
54 
probabilityGenotypeComboGivenAlleleFrequencyln(GenotypeCombo & genotypeCombo,Allele & allele)55 long double probabilityGenotypeComboGivenAlleleFrequencyln(GenotypeCombo& genotypeCombo, Allele& allele) {
56 
57     int n = genotypeCombo.numberOfAlleles();
58     long double lnhetscalar = 0;
59 
60     for (GenotypeCombo::iterator gc = genotypeCombo.begin(); gc != genotypeCombo.end(); ++gc) {
61         SampleDataLikelihood& sgp = **gc;
62         if (!sgp.genotype->homozygous) {
63             lnhetscalar += multinomialCoefficientLn(sgp.genotype->ploidy, sgp.genotype->counts());
64         }
65     }
66 
67     return lnhetscalar - multinomialCoefficientLn(n, genotypeCombo.counts());
68 
69 }
70 
71 
72 // core calculation of genotype combination likelihoods
73 //
74 GenotypeComboResult
genotypeCombinationPriorProbability(GenotypeCombo * combo,Allele & refAllele,long double theta,bool pooled,bool binomialObsPriors,bool alleleBalancePriors,long double diffusionPriorScalar)75 genotypeCombinationPriorProbability(
76         GenotypeCombo* combo,
77         Allele& refAllele,
78         long double theta,
79         bool pooled,
80         bool binomialObsPriors,
81         bool alleleBalancePriors,
82         long double diffusionPriorScalar) {
83 
84         // when we are operating on pooled samples, we will not be able to
85         // ascertain the number of heterozygotes in the pool,
86         // rendering P(Genotype combo | Allele frequency) meaningless
87         long double priorProbabilityOfGenotypeComboG_Af = 0;
88         if (!pooled) {
89             priorProbabilityOfGenotypeComboG_Af = probabilityGenotypeComboGivenAlleleFrequencyln(*combo, refAllele);
90         }
91 
92         long double priorObservationExpectationProb = 0;
93 
94         if (binomialObsPriors) {
95             // for each alternate and the reference allele
96             // calculate the binomial probability that we see the given strand balance and read placement prob
97             vector<string> alleles = combo->alleles();
98             // cerr << *combo << endl;
99             for (vector<string>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
100                 const string& allele = *a;
101                 map<string, AlleleCounter>::iterator ac = combo->alleleCounters.find(allele);
102                 if (ac != combo->alleleCounters.end()) {
103                     const AlleleCounter& alleleCounter = ac->second;
104                     int obs = alleleCounter.observations;
105                     /*
106                     cerr << allele <<  " counts: " << alleleCounter.frequency
107                         << " observations " << alleleCounter.observations
108                         << " " << alleleCounter.forwardStrand
109                         << "," << alleleCounter.reverseStrand
110                         << " " << alleleCounter.placedLeft
111                         << "," << alleleCounter.placedRight
112                         << " " << alleleCounter.placedStart
113                         << "," << alleleCounter.placedEnd
114                         << endl;
115                     cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
116                     cerr << "binprobln strand = " << binomialProbln(alleleCounter.forwardStrand, obs, 0.5) << endl;
117                     cerr << "binprobln position = " << binomialProbln(alleleCounter.placedLeft, obs, 0.5) << endl;
118                     cerr << "binprobln start = " << binomialProbln(alleleCounter.placedStart, obs, 0.5) << endl;
119                     cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
120                     */
121 
122                     priorObservationExpectationProb
123                         += binomialProbln(alleleCounter.forwardStrand, obs, 0.5)
124                         +  binomialProbln(alleleCounter.placedLeft, obs, 0.5)
125                         +  binomialProbln(alleleCounter.placedStart, obs, 0.5);
126 
127                 }
128             }
129             // ok... now do the same move for the observation counts
130             // --- this should capture "Allele Balance"
131         }
132 
133         if (alleleBalancePriors) {
134             priorObservationExpectationProb += multinomialSamplingProbLn(combo->alleleProbs(), combo->observationCounts());
135         }
136 
137         // with larger population samples, the effect of
138         // P(Genotype combo | Allele frequency) may bias us against reporting
139         // true variants which are under selection despite overwhelming evidence
140         // for variation.  this allows us to scale the effect of this prior
141         if (diffusionPriorScalar != 1) {
142             priorProbabilityOfGenotypeComboG_Af /= diffusionPriorScalar;
143         }
144 
145         // Ewens' Sampling Formula
146         long double priorProbabilityOfGenotypeComboAf =
147             alleleFrequencyProbabilityln(combo->countFrequencies(), theta);
148         long double priorProbabilityOfGenotypeCombo =
149             priorProbabilityOfGenotypeComboG_Af + priorProbabilityOfGenotypeComboAf;
150         long double priorComboProb = priorProbabilityOfGenotypeCombo + combo->prob + priorObservationExpectationProb;
151 
152         return GenotypeComboResult(combo,
153                     priorComboProb,
154                     combo->prob,
155                     priorProbabilityOfGenotypeCombo,
156                     priorProbabilityOfGenotypeComboG_Af,
157                     priorProbabilityOfGenotypeComboAf,
158                     priorObservationExpectationProb);
159 
160 }
161 
162 void
genotypeCombinationsPriorProbability(vector<GenotypeComboResult> & genotypeComboProbs,vector<GenotypeCombo> & bandedCombos,Allele & refAllele,long double theta,bool pooled,bool binomialObsPriors,bool alleleBalancePriors,long double diffusionPriorScalar)163 genotypeCombinationsPriorProbability(
164         vector<GenotypeComboResult>& genotypeComboProbs,
165         vector<GenotypeCombo>& bandedCombos,
166         Allele& refAllele,
167         long double theta,
168         bool pooled,
169         bool binomialObsPriors,
170         bool alleleBalancePriors,
171         long double diffusionPriorScalar) {
172 
173     for (vector<GenotypeCombo>::iterator c = bandedCombos.begin(); c != bandedCombos.end(); ++c) {
174 
175         GenotypeCombo* combo = &*c;
176 
177         genotypeComboProbs.push_back(
178                 genotypeCombinationPriorProbability(
179                     combo,
180                     refAllele,
181                     theta,
182                     pooled,
183                     binomialObsPriors,
184                     alleleBalancePriors,
185                     diffusionPriorScalar));
186 
187     }
188 }
189