1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010 - 2012, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // models.cc -- routines to detect polymorphism (snp) and detect a lack of polymorphism (fixed).
23 //
24 // Julian Catchen
25 // jcatchen@uoregon.edu
26 // University of Oregon
27 //
28 // $Id$
29 //
30 #include "models.h"
31
32 using namespace std;
33
34 vector<pair<char, int>> sort_acgt(const map<char, int>&);
35 void record_snp(SNP& snp, snp_type type, uint col, double l_ratio, const vector<pair<char, int>>& nuc);
36 void record_dummy_snp(SNP& snp, uint col);
37
38 int barcode_size = 5;
39 double barcode_err_freq = 0.0;
40
41 double heterozygote_limit = -qchisq(0.05,1);
42 double homozygote_limit = qchisq(0.05,1);
43 double bound_low = 0.0;
44 double bound_high = 1.0;
45 double p_freq = 0.5;
46
47 const map<string,modelt> model_strings = {
48 {"snp", ::snp},
49 {"bounded", ::bounded},
50 {"fixed", ::fixed},
51 {"marukihigh", ::marukihigh},
52 {"marukilow", ::marukilow},
53 };
54
55 // Quantiles of the Chi2 distribution for VCF's GQ.
56 //$ Rscript -e "cat(paste(qchisq(1-10^(-(1:40)/10), 1), '\n', sep=''))"
57 const std::array<double,40> chisq1ddf_gq = {{
58 0.0679615365255401, 0.230764766661193, 0.452421556097698, 0.714036104152315, 1.00448471501902,
59 1.31668063342863, 1.64583886397469, 1.98858107774449, 2.34243609502603, 2.70554345409542,
60 3.07646857913928, 3.45408274293754, 3.83748240184564, 4.22593339019369, 4.61883133337202,
61 5.01567294625387, 5.41603482049954, 5.81955747770787, 6.22593319775977, 6.63489660102121,
62 7.04621727098416, 7.45969391025114, 7.87514966368955, 8.29242834051146, 8.71139133617301,
63 9.13191510451069, 9.55388906647803, 9.97721386826398, 10.4017999212068, 10.8275661706627,
64 11.2544390521783, 11.6823516018745, 12.1112426945654, 12.5410563882771, 12.9717413578738,
65 13.403250403671, 13.8355400234795, 14.2685700385079, 14.7023032652319, 15.1367052266236
66 }};
67
qchisq(double alpha,size_t df)68 double qchisq(double alpha, size_t df) {
69 //
70 // Quantiles (1-alpha) of the Chi2 distribution, as given by
71 //$ alphas="1e-1,5e-2,1e-2,5e-3,1e-3,5e-4,1e-4,5e-5,1e-5,5e-6,1e-6"
72 //$ df=1
73 //$ R -e "for(a in c($alphas)) {q=qchisq(1-a, $df); cat(paste('{',a,',',q,'}, ',sep=''));}"
74 //
75 // The value for a=0.05,df=1 is 3.84 for backward compatiblity (instead of
76 // 3.84145882069412).
77 //
78 static const vector<map<double,double>> table = {
79 { //df=1
80 {0.1,2.70554345409542}, {0.05,3.84}, {0.01,6.63489660102121},
81 {0.005,7.87943857662241}, {0.001,10.8275661706627}, {0.0005,12.1156651463974},
82 {0.0001,15.1367052266236}, {0.00005,16.4481102100082}, {0.00001,19.5114209646663},
83 {0.000005,20.8372870225104}, {0.000001,23.9281269768795}
84 },{ //df=2
85 {0.1,4.60517018598809}, {0.05,5.99146454710798}, {0.01,9.21034037197618},
86 {0.005,10.5966347330961}, {0.001,13.8155105579643}, {0.0005,15.2018049190844},
87 {0.0001,18.4206807439526}, {0.00005,19.8069751050725}, {0.00001,23.0258509299496},
88 {0.000005,24.4121452910472}, {0.000001,27.631021115871}
89 }
90 };
91
92 assert(df > 0 && df <= table.size());
93 try {
94 return table[df-1].at(alpha);
95 } catch (std::out_of_range&) {
96 cerr << "Error: Unsupported alpha value '" << alpha << "'; pick one of 0.05, 0.01, 0.005, 0.001...\n";
97 throw std::invalid_argument("qchisq");
98 }
99 }
100
parse_model_type(const string & arg)101 modelt parse_model_type(const string& arg) {
102 try {
103 return model_strings.at(arg);
104 } catch (std::out_of_range&) {
105 cerr << "Error: Unknown model '" << arg << "'.\n";
106 throw std::invalid_argument("set_model_type");
107 }
108 }
109
set_model_thresholds(double alpha)110 void set_model_thresholds(double alpha) {
111 double x = qchisq(alpha, 1);
112 homozygote_limit = x;
113 heterozygote_limit = -x;
114 }
115
to_string(modelt model_type)116 string to_string(modelt model_type) {
117 for (auto& m : model_strings)
118 if (model_type == m.second)
119 return m.first;
120 DOES_NOT_HAPPEN;
121 return string();
122 }
123
sort_acgt(const map<char,int> & counts)124 vector<pair<char, int>> sort_acgt(const map<char, int>& counts) {
125 vector<pair<char, int>> nuc;
126 for (map<char, int>::const_iterator i=counts.begin(); i!=counts.end(); i++)
127 if (i->first != 'N')
128 nuc.push_back(make_pair(i->first, i->second));
129 sort(nuc.begin(), nuc.end(), compare_pair);
130 return nuc;
131 }
132
record_snp(SNP & snp,snp_type type,uint col,double l_ratio,const vector<pair<char,int>> & nuc)133 void record_snp(SNP& snp, snp_type type, uint col, double l_ratio, const vector<pair<char, int>>& nuc) {
134
135 snp.type = type;
136 snp.col = col;
137 snp.lratio = l_ratio;
138 snp.rank_1 = nuc[0].first;
139
140 switch (type) {
141 case snp_type_het:
142 snp.rank_2 = nuc[1].first;
143 break;
144 case snp_type_hom:
145 snp.rank_2 = '-';
146 break;
147 default:
148 // snp_type_unk, with at least one observation (otherwise record_dummy_snp
149 // would have been called)
150 // A rank_2 nucleotide is set only if at least two nucleotides were observed.
151 snp.rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
152 break;
153 }
154
155 snp.rank_3 = 0;
156 snp.rank_4 = 0;
157 }
158
record_dummy_snp(SNP & snp,uint col)159 void record_dummy_snp(SNP& snp, uint col) {
160 snp.type = snp_type_unk;
161 snp.col = col;
162 snp.lratio = 0.0;
163 snp.rank_1 = 'N';
164 snp.rank_2 = '-';
165 snp.rank_3 = 0;
166 snp.rank_4 = 0;
167 }
168
call_multinomial_snp(MergedStack * tag,int col,map<char,int> & n,bool record_snps)169 void call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps) {
170 vector<pair<char, int> > nuc = sort_acgt(n);
171 if (nuc[0].second == 0) {
172 if (record_snps) {
173 tag->snps.push_back(new SNP());
174 record_dummy_snp(*tag->snps.back(), col);
175 }
176 return;
177 }
178
179 double l_ratio = lr_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
180 snp_type type = call_snp(l_ratio);
181 if (record_snps) {
182 tag->snps.push_back(new SNP());
183 record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
184 }
185 }
186
call_bounded_multinomial_snp(MergedStack * tag,int col,map<char,int> & n,bool record_snps)187 void call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps) {
188 vector<pair<char, int> > nuc = sort_acgt(n);
189 if (nuc[0].second == 0) {
190 if (record_snps) {
191 tag->snps.push_back(new SNP());
192 record_dummy_snp(*tag->snps.back(), col);
193 }
194 return;
195 }
196
197 double l_ratio = lr_bounded_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
198 snp_type type = call_snp(l_ratio);
199 if (record_snps) {
200 tag->snps.push_back(new SNP());
201 record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
202 }
203 }
204
call_multinomial_snp(Locus * tag,int col,map<char,int> & n)205 void call_multinomial_snp(Locus *tag, int col, map<char, int> &n) {
206 vector<pair<char, int> > nuc = sort_acgt(n);
207 if (nuc[0].second == 0) {
208 record_dummy_snp(*tag->snps[col], col);
209 return;
210 }
211
212 double l_ratio = lr_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
213 snp_type type = call_snp(l_ratio);
214 record_snp(*tag->snps[col], type, col, l_ratio, nuc);
215 }
216
call_bounded_multinomial_snp(Locus * tag,int col,map<char,int> & n)217 void call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n) {
218 vector<pair<char, int> > nuc = sort_acgt(n);
219 if (nuc[0].second == 0) {
220 record_dummy_snp(*tag->snps[col], col);
221 return;
222 }
223
224 double l_ratio = lr_bounded_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
225 snp_type type = call_snp(l_ratio);
226 record_snp(*tag->snps[col], type, col, l_ratio, nuc);
227 }
228
call_multinomial_fixed(MergedStack * tag,int col,map<char,int> & n)229 void call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n) {
230 const double nucleotide_fixed_limit = 1.92;
231
232 vector<pair<char, int> > nuc = sort_acgt(n);
233 if (nuc[0].second == 0) {
234 tag->snps.push_back(new SNP());
235 record_dummy_snp(*tag->snps.back(), col);
236 return;
237 }
238
239 double l_ratio;
240 double nuc_1 = nuc[0].second;
241 double nuc_2 = nuc[1].second;
242 {
243 //
244 // Method of Paul Hohenlohe <hohenlo@uoregon.edu>, personal communication.
245 //
246 // Each population sample contains DNA from 6 individuals, so a
247 // sample of 12 alleles from the population. We want to assign a
248 // nucleotide (A,C,G,T) to each position where the population is
249 // fixed or nearly so, and N to each position that is either
250 // polymorphic within the population or has insufficient coverage
251 // depth to make a call. We can do this with a likelihood ratio
252 // test of the read counts, testing whether the allele frequency
253 // of the dominant allele is significantly larger than some
254 // threshold p) , stepping through each nucleotide position across
255 // RAD tags.
256 //
257
258 double epsilon = -1 * (log(1 - barcode_err_freq) / barcode_size);
259
260 l_ratio =
261 nuc_1 * log( ((4 * nuc_1 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
262 ((4 * p_freq * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
263
264 l_ratio +=
265 nuc_2 * log( ((4 * nuc_2 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
266 ((4 * (1 - p_freq) * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );
267
268 //cerr << "Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Likelihood ratio: " << l_ratio << "\n";
269 }
270
271 double n_ratio = nuc_1 / (nuc_1 + nuc_2);
272
273 snp_type type = n_ratio < p_freq || l_ratio < nucleotide_fixed_limit ? snp_type_unk : snp_type_hom;
274
275 tag->snps.push_back(new SNP());
276 record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
277 }
278
279 //
280 // ln L(1/2) = ln(n! / n_1!n_2!n_3!n_4!) +
281 // (n_1 + n_2) * ln(n_1 + n_2 / 2n) +
282 // (n_3 + n_4) * ln(n_3 + n_4 / 2n)
283 //
284 double
heterozygous_likelihood(int col,map<char,int> & nuc)285 heterozygous_likelihood(int col, map<char, int> &nuc)
286 {
287 vector<pair<char, int> > cnts;
288 map<char, int>::iterator i;
289
290 double n = 0;
291 for (i = nuc.begin(); i != nuc.end(); i++) {
292 n += i->second;
293 cnts.push_back(make_pair(i->first, i->second));
294 }
295
296 sort(cnts.begin(), cnts.end(), compare_pair);
297
298 double n_1 = cnts[0].second;
299 double n_2 = cnts[1].second;
300 double n_3 = cnts[2].second;
301 double n_4 = cnts[3].second;
302
303 double term_1 =
304 reduced_log_factorial(n, n_1) -
305 (log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));
306
307 double term_3 = (n_3 + n_4 > 0) ? log((n_3 + n_4) / (2 * n)) : 0;
308
309 double lnl =
310 term_1 +
311 ((n_1 + n_2) * log((n_1 + n_2) / (2 * n))) +
312 ((n_3 + n_4) * term_3);
313
314 return lnl;
315 }
316
317 //
318 // ln L(1/1) = ln(n! / n_1!n_2!n_3!n_4!) +
319 // n_1 * ln(n_1 / n) +
320 // (n - n_1) * ln(n - n_1 / 3n)
321 //
322 double
homozygous_likelihood(int col,map<char,int> & nuc)323 homozygous_likelihood(int col, map<char, int> &nuc)
324 {
325 vector<pair<char, int> > cnts;
326 map<char, int>::iterator i;
327
328 double n = 0;
329 for (i = nuc.begin(); i != nuc.end(); i++) {
330 n += i->second;
331 cnts.push_back(make_pair(i->first, i->second));
332 }
333
334 sort(cnts.begin(), cnts.end(), compare_pair);
335
336 double n_1 = cnts[0].second;
337 double n_2 = cnts[1].second;
338 double n_3 = cnts[2].second;
339 double n_4 = cnts[3].second;
340
341 double term_1 =
342 reduced_log_factorial(n, n_1) -
343 (log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));
344
345 double term_3 = n - n_1 > 0 ? log((n - n_1) / (3 * n)) : 0;
346
347 double lnl =
348 term_1 +
349 (n_1 * log(n_1 / n)) +
350 ((n - n_1) * term_3);
351
352 return lnl;
353 }
354
most_frequent_allele() const355 Nt2 SiteCall::most_frequent_allele() const {
356 assert(!alleles().empty());
357 auto a = alleles().begin();
358 auto best = a;
359 for(; a!=alleles().end(); ++a)
360 if (a->second > best->second)
361 best = a;
362 return best->first;
363 }
364
filter_mac(size_t min_mac)365 void SiteCall::filter_mac(size_t min_mac) {
366 assert(alleles().size() > 1);
367 Counts<Nt2> allele_counts = tally_allele_counts(sample_calls());
368 if (allele_counts.sorted()[1].first < min_mac)
369 *this = SiteCall(most_frequent_allele());
370 }
371
tally_allele_counts(const vector<SampleCall> & spldata)372 Counts<Nt2> SiteCall::tally_allele_counts(const vector<SampleCall>& spldata) {
373 Counts<Nt2> counts;
374 for (const SampleCall& sd : spldata) {
375 switch (sd.call()) {
376 case snp_type_hom :
377 counts.increment(sd.nt0());
378 counts.increment(sd.nt0());
379 break;
380 case snp_type_het :
381 counts.increment(sd.nt0());
382 counts.increment(sd.nt1());
383 break;
384 default:
385 // snp_type_unk
386 break;
387 }
388 }
389 return counts;
390 }
391
tally_allele_freqs(const vector<SampleCall> & spldata)392 map<Nt2,double> SiteCall::tally_allele_freqs(const vector<SampleCall>& spldata) {
393 //
394 // Tally the existing alleles and their frequencies.
395 //
396 map<Nt2,double> allele_freqs;
397 Counts<Nt2> counts = tally_allele_counts(spldata);
398 array<pair<size_t,Nt2>,4> sorted_alleles = counts.sorted();
399 size_t tot = counts.sum();
400 for (auto& a : sorted_alleles) {
401 if (a.first == 0)
402 break;
403 allele_freqs.insert({a.second, double(a.first)/tot});
404 }
405 return allele_freqs;
406 }
407
operator <<(ostream & os,const SampleCall & c)408 ostream& operator<<(ostream& os, const SampleCall& c) {
409 ostream copy (os.rdbuf());
410 copy << std::setprecision(4);
411 // Genotype.
412 switch(c.call()) {
413 case snp_type_hom: os << c.nt0() << "/" << c.nt0(); break;
414 case snp_type_het: os << std::min(c.nt0(), c.nt1()) << "/" << std::max(c.nt0(), c.nt1()); break;
415 case snp_type_unk: os << "u"; break;
416 case snp_type_discarded: os << "d"; break;
417 default: DOES_NOT_HAPPEN; break;
418 }
419 // Likelihoods.
420 os << "\t{" << c.lnls() << "}";
421 return os;
422 }
423
print(ostream & os,const SiteCounts & depths)424 void SiteCall::print(ostream& os, const SiteCounts& depths) {
425 // Total depths.
426 os << "tot depths {" << depths.tot << "}\n";
427 // Alleles.
428 os << "alleles";
429 for (auto& a : alleles())
430 os << " " << a.first << "(" << a.second << ")";
431 os << "\n";
432 // Samples.
433 os << "samples";
434 for (size_t s=0; s<depths.samples.size(); ++s) {
435 // Index.
436 os << "\n" << s;
437 auto& dp = depths.samples[s];
438 if (dp.sum() == 0) {
439 os << "\t.";
440 } else {
441 // Depths.
442 os << "\t{" << dp << "}";
443 if (alleles().size() >= 2)
444 os << "\t" << sample_calls()[s];
445 }
446 }
447 }
448
call(const SiteCounts & depths) const449 SiteCall MultinomialModel::call(const SiteCounts& depths) const {
450
451 size_t n_samples = depths.mpopi->samples().size();
452
453 //
454 // Make genotype calls.
455 //
456 vector<SampleCall> sample_calls (n_samples);
457 array<pair<size_t,Nt2>,4> sorted;
458 for (size_t sample=0; sample<n_samples; ++sample) {
459 const Counts<Nt2>& sdepths = depths.samples[sample];
460 size_t dp = sdepths.sum();
461 if (dp == 0)
462 continue;
463
464 sorted = sdepths.sorted();
465 SampleCall& c = sample_calls[sample];
466 double lnl_hom = lnl_multinomial_model_hom(dp, sorted[0].first);
467 double lnl_het = lnl_multinomial_model_het(dp, sorted[0].first+sorted[1].first);
468 c.lnls().set(sorted[0].second, sorted[0].second, lnl_hom);
469 c.lnls().set(sorted[0].second, sorted[1].second, lnl_het);
470
471 snp_type gt_call = call_snp(lnl_hom, lnl_het);
472 long gq = vcf_lnl2gq(std::max(lnl_hom, lnl_het), std::min(lnl_hom, lnl_het));
473 c.set_call(gt_call, sorted[0].second, sorted[1].second, gq);
474 }
475
476 //
477 // Record the existing alleles and their frequencies.
478 //
479 map<Nt2,double> allele_freqs = SiteCall::tally_allele_freqs(sample_calls);
480
481 //
482 // Compute the missing genotype likelihoods, if any.
483 //
484 if (allele_freqs.size() < 2) {
485 sample_calls = vector<SampleCall>();
486 } else {
487 for (size_t sample=0; sample<n_samples; ++sample) {
488 const Counts<Nt2>& sdepths = depths.samples[sample];
489 size_t dp = sdepths.sum();
490 if (dp == 0)
491 continue;
492
493 SampleCall& c = sample_calls[sample];
494 for (auto nt1_it=allele_freqs.begin(); nt1_it!=allele_freqs.end(); ++nt1_it) {
495 Nt2 nt1 (nt1_it->first);
496 // Homozygote.
497 if (!c.lnls().has_lik(nt1, nt1)) {
498 double lnl = lnl_multinomial_model_hom(dp, sdepths[nt1]);
499 c.lnls().set(nt1, nt1, lnl);
500 }
501 // Heterozygote(s).
502 auto nt2_it = nt1_it;
503 ++nt2_it;
504 for (; nt2_it!=allele_freqs.end(); ++nt2_it) {
505 Nt2 nt2 (nt2_it->first);
506 if (!c.lnls().has_lik(nt1, nt2)) {
507 double lnl = lnl_multinomial_model_het(dp, sdepths[nt1]+sdepths[nt2]);
508 c.lnls().set(nt1, nt2, lnl);
509 }
510 }
511 }
512 }
513 }
514
515 return SiteCall(move(allele_freqs), move(sample_calls));
516 }
517
calc_hom_lnl(double n,double n1) const518 double MarukiHighModel::calc_hom_lnl(double n, double n1) const {
519 // This returns the same value as the Hohenlohe ('snp/binomial') model except
520 // when the error rate estimate is bounded at 1.0 (i.e. `n1 < 0.25*n` c.f.
521 // Hohenlohe equations).
522 if (n1 == n)
523 return 0.0;
524 else if (n1 == 0.0)
525 return n * log(1.0/3.0);
526 else
527 return n1 * log(n1/n) + (n-n1) * log( (n-n1)/(3.0*n) );
528 }
529
calc_het_lnl(double n,double n1n2) const530 double MarukiHighModel::calc_het_lnl(double n, double n1n2) const {
531 // This returns the same value as the Hohenlohe ('snp/binomial') model except
532 // when the error rate estimate is bounded at 1.0 (i.e. `n1n2 < 0.5*n` c.f.
533 // Hohenlohe equations).
534 if (n1n2 == n)
535 return n * log(0.5);
536 else if (n1n2 < (1.0/3.0) * n)
537 return n1n2 * log(1.0/6.0) + (n-n1n2) * log(1.0/3.0);
538 else
539 return n1n2 * log( n1n2/(2.0*n) ) + (n-n1n2) * log((n-n1n2)/(2.0*n) );
540 }
541
call(const SiteCounts & depths) const542 SiteCall MarukiHighModel::call(const SiteCounts& depths) const {
543
544 /*
545 * For this model the procedure is:
546 * I. Obtain the most commonly seen nucleotide M.
547 * II. Look for alternative alleles, if any:
548 * For each sample:
549 * If there is a genotype significantly better than MM:
550 * (This genotype can be Mm, mm or mn.)
551 * The site is polymorphic.
552 * Record m as an alternative allele.
553 * If the genotype is mn AND is significantly better than mm:
554 * Record n as an allele.
555 * III. Given the known alleles, compute the likelihoods for all possible
556 * genotypes (n.b. most of the non-trivial ones have already been
557 * computed), and call genotypes.
558 */
559
560 const size_t n_samples = depths.mpopi->samples().size();
561
562 if (depths.tot.sum() == 0)
563 return SiteCall(map<Nt2,double>(), vector<SampleCall>());
564
565 //
566 // I.
567 // Count the observed nucleotides of the site for all samples; set
568 // `SampleCall::depths_`.
569 // Then find the most common nucleotide across the population.
570 //
571 Nt2 nt_ref = depths.tot.sorted()[0].second;
572
573 //
574 // II.
575 // Look for alternative alleles; start filling SampleCall::lnls_.
576 //
577 set<Nt2> alleles;
578 vector<SampleCall> sample_calls (n_samples);
579 alleles.insert(nt_ref);
580 array<pair<size_t,Nt2>,4> sorted;
581 for (size_t sample=0; sample<n_samples; ++sample) {
582 const Counts<Nt2>& sdepths = depths.samples[sample];
583 size_t dp = sdepths.sum();
584 if (dp == 0)
585 continue;
586
587 // Find the best genotype for the sample -- this is either the homzygote
588 // for the rank0 nucleotide or the heterozygote for the rank0 and rank1
589 // nucleotides.
590 sorted = sdepths.sorted();
591 Nt2 nt0 = sorted[0].second;
592 Nt2 nt1 = sorted[1].second;
593 size_t dp0 = sorted[0].first;
594 size_t dp1 = sorted[1].first;
595
596 if (dp1 == 0 && nt0 == nt_ref)
597 // We can wait until we know whether the site is fixed.
598 continue;
599
600 SampleCall& c = sample_calls[sample];
601 double lnl_hom = calc_hom_lnl(dp, dp0);
602 double lnl_het = calc_het_lnl(dp, dp0+dp1);
603 c.lnls().set(nt0, nt0, lnl_hom);
604 c.lnls().set(nt0, nt1, lnl_het);
605
606 // Make sure the sample would have a significant genotype call provided
607 // the site was polymorphic (otherwise a genotype can be significant in
608 // comparison with the ref homozygote but not in comparison with the
609 // second best genotype). This is a slight modification to Maruki &
610 // Lynch's method to avoid low-coverage weirdnesses. Note that the
611 // polymorphism discovery alpha could be set lower than the genotype call
612 // alpha (e.g. to account for multiple testing).
613 if (lnl_hom > lnl_het) {
614 if(!lrtest(lnl_hom, lnl_het, gt_threshold_))
615 continue;
616 } else {
617 if(!lrtest(lnl_het, lnl_hom, gt_threshold_))
618 continue;
619 }
620
621 // Compare this likelihood to that of the ref,ref homozygote.
622 if (nt0 == nt_ref) {
623 if (lrtest(lnl_het, lnl_hom, var_threshold_))
624 // Record the alternative allele.
625 alleles.insert(nt1);
626 } else {
627 double lnl_ref = calc_hom_lnl(dp, sdepths[nt_ref]);
628 c.lnls().set(nt_ref, nt_ref, lnl_ref);
629 double lnl_best = std::max(lnl_hom, lnl_het);
630 if (lrtest(lnl_best, lnl_ref, var_threshold_)) {
631 // Record one alternative allele.
632 alleles.insert(nt0);
633 if (nt1!=nt_ref && lrtest(lnl_het, lnl_hom, var_threshold_))
634 // Record a second alternative allele (the SNP is at least ternary).
635 alleles.insert(nt1);
636 }
637 }
638 }
639
640 //
641 // III.
642 // Compute the likelihoods for all possible genotypes & call genotypes.
643 //
644 if (alleles.size() == 1) {
645 sample_calls = vector<SampleCall>();
646 } else {
647 for (size_t sample=0; sample<n_samples; ++sample) {
648 const Counts<Nt2>& sdepths = depths.samples[sample];
649 size_t dp = sdepths.sum();
650 if (dp == 0)
651 continue;
652
653 SampleCall& c = sample_calls[sample];
654 for (auto nt1=alleles.begin(); nt1!=alleles.end(); ++nt1) {
655 // Homozygote.
656 if (!c.lnls().has_lik(*nt1, *nt1))
657 c.lnls().set(*nt1, *nt1, calc_hom_lnl(dp, sdepths[*nt1]));
658 // Heterozygote(s).
659 auto nt2 = nt1;
660 ++nt2;
661 for (; nt2!=alleles.end(); ++nt2)
662 if (!c.lnls().has_lik(*nt1, *nt2))
663 c.lnls().set(*nt1, *nt2, calc_het_lnl(dp, sdepths[*nt1]+sdepths[*nt2]));
664 }
665
666 // Call the genotype -- skiping ignored alleles.
667 sorted = sdepths.sorted();
668 auto nt0 = sorted.begin();
669 while(!alleles.count(nt0->second)) {
670 ++nt0;
671 assert(nt0 != sorted.end());
672 }
673 auto nt1 = nt0;
674 ++nt1;
675 while(!alleles.count(nt1->second)) {
676 ++nt1;
677 assert(nt1 != sorted.end());
678 }
679 double lnl_hom = c.lnls().at(nt0->second, nt0->second);
680 double lnl_het = c.lnls().at(nt0->second, nt1->second);
681 snp_type call;
682 long gq;
683 if (lnl_hom > lnl_het) {
684 call = lrtest(lnl_hom, lnl_het, gt_threshold_) ? snp_type_hom : snp_type_unk;
685 gq = vcf_lnl2gq(lnl_hom, lnl_het);
686 } else {
687 call = lrtest(lnl_het, lnl_hom, gt_threshold_) ? snp_type_het : snp_type_unk;
688 gq = vcf_lnl2gq(lnl_het, lnl_hom);
689 }
690
691 c.set_call(call, nt0->second, nt1->second, gq);
692 }
693 }
694
695 //
696 // Finally, compute allele frequencies.
697 //
698 map<Nt2,double> allele_freqs;
699 if (alleles.size() == 1) {
700 allele_freqs.insert({*alleles.begin(), 1.0});
701 } else {
702 allele_freqs = SiteCall::tally_allele_freqs(sample_calls);
703 assert(allele_freqs.size() == alleles.size()
704 || (allele_freqs.size() == alleles.size()-1 && !allele_freqs.count(nt_ref)));
705 // Note: There are limit cases (esp. low coverage) where nt_ref does
706 // not appear in any of the significant genotypes.
707 }
708
709 return SiteCall(move(allele_freqs), move(sample_calls));
710 }
711
calc_fixed_lnl(double n_tot,double n_M_tot) const712 double MarukiLowModel::calc_fixed_lnl(double n_tot, double n_M_tot) const {
713 assert(n_tot > 0.0);
714 assert(n_M_tot > 0.0);
715 if (n_M_tot == n_tot)
716 return 0.0;
717 else
718 return n_M_tot * log(n_M_tot/n_tot) + (n_tot-n_M_tot) * log((n_tot-n_M_tot)/n_tot/3.0);
719 }
720
calc_dimorph_lnl(double freq_MM,double freq_Mm,double freq_mm,const vector<LikData> & liks) const721 double MarukiLowModel::calc_dimorph_lnl(double freq_MM, double freq_Mm, double freq_mm, const vector<LikData>& liks) const {
722 double lnl = 0.0;
723 // Sum over samples.
724 for (const LikData& s_liks : liks)
725 if (s_liks.has_data)
726 // If !has_data, the sum is 1 and its log 0.
727 lnl += calc_ln_weighted_sum(freq_MM, freq_Mm, freq_mm, s_liks);
728 assert(lnl <= 0.0);
729 return lnl;
730 }
731
calc_ln_weighted_sum(double freq_MM,double freq_Mm,double freq_mm,const LikData & s_liks) const732 double MarukiLowModel::calc_ln_weighted_sum(double freq_MM, double freq_Mm, double freq_mm, const LikData& s_liks) const {
733 double weighted_sum = freq_MM * s_liks.l_MM + freq_Mm * s_liks.l_Mm + freq_mm * s_liks.l_mm;
734 #pragma omp atomic
735 ++n_wsum_tot_;
736 if (weighted_sum >= std::numeric_limits<double>::min()) {
737 return log(weighted_sum);
738 } else {
739 // `weigted_sum` is subnormal or zero.
740 #pragma omp atomic
741 ++n_wsum_underflows_;
742 return calc_ln_weighted_sum_safe(freq_MM, freq_Mm, freq_mm, s_liks);
743 }
744 }
745
calc_ln_weighted_sum_safe(double freq_MM,double freq_Mm,double freq_mm,const LikData & s_liks) const746 double MarukiLowModel::calc_ln_weighted_sum_safe(double freq_MM, double freq_Mm, double freq_mm, const LikData& s_liks) const {
747 array<pair<double,double>,3> s = {{
748 {s_liks.lnl_MM, freq_MM},
749 {s_liks.lnl_Mm, freq_Mm},
750 {s_liks.lnl_mm, freq_mm}
751 }};
752 std::sort(s.begin(), s.end()); // `s` is sorted by increasing lnl.
753 if (s[2].second > 0.0)
754 return s[2].first + log(s[2].second
755 + s[1].second * exp(s[1].first-s[2].first)
756 + s[0].second * exp(s[0].first-s[2].first)
757 );
758 else if (s[1].second > 0.0)
759 return s[1].first + log(s[1].second + s[0].second * exp(s[0].first-s[1].first));
760 else
761 return s[0].first + log(s[0].second);
762 }
763
call(const SiteCounts & depths) const764 SiteCall MarukiLowModel::call(const SiteCounts& depths) const {
765
766 /*
767 * For this model the procedure is:
768 * I. Compute the maximum likelihood for the fixed-site hypothesis (straightforward).
769 * II. Compute the maximum likelihood for the dimorphic-site hypothesis and
770 * estimate the genotype frequencies:
771 * 1. Compute the error rate estimate.
772 * 2. Compute the base likelihoods for all samples, all genotypes.
773 * 3. Compute the starting major allele frequency (`p`) value.
774 * 4. Compute the likelihoods for the site for all possible disequilibrium
775 * coefficients (`d_a`); keep the best one.
776 * 5. Optimize the genotype frequencies by finding a local maximum.
777 * III. Test whether the site is polymorphic.
778 * IV. Compute the likelihoods for all samples, all genotypes using Bayes'
779 * theorem, and call genotypes.
780 */
781
782 const size_t n_samples = depths.mpopi->samples().size();
783
784 size_t dp_tot = depths.tot.sum();
785 if (dp_tot == 0)
786 return SiteCall();
787
788 //
789 // I. Likelihood for the fixed-site hypothesis.
790 //
791
792 array<pair<size_t,Nt2>,4> sorted = depths.tot.sorted();
793 Nt2 nt_M = sorted[0].second;
794 Nt2 nt_m = sorted[1].second;
795 size_t n_M_tot = sorted[0].first;
796 size_t n_m_tot = sorted[1].first;
797
798 double lnl_fixed = calc_fixed_lnl(dp_tot, n_M_tot);
799
800 if (!lrtest(0.0, lnl_fixed, var_threshold_))
801 // Fixed: `lnl_fixed` is high enough than even when `lnl_dimorphic` is
802 // at its maximum value of 0, dimorphism isn't significant. This happens
803 // when there are either very few non-major-allele reads or very few reads
804 // overall.
805 return SiteCall(nt_M);
806
807 //
808 // II. Compute and optimize the likelihood for the dimorphic-site hypothesis
809 //
810
811 // 1. Error rate.
812 // Note: @Nick, Apr 2017: Maruki's description uses a simple estimator where
813 // the number of invisible errors (from M to m and inversely) is estimated
814 // as being half the number of observable errors. But this when few
815 // observable errors this is an underestimate as ideally we would like to
816 // integrate; in particular the error rate is estimated to zero when
817 // there aren't any observable errors and this leads to meaningless null
818 // likelihoods. Doing the integration isn't practical but we can reduce
819 // the problem by adding (see edit) to the numerator and denominator.
820 // Edit. Oct 2017: 1 is way too large; adding 0.1 yields a more reasonable
821 // prior.
822 double e = 3.0 / 2.0 * (dp_tot - n_M_tot - n_m_tot + 0.1) / double(dp_tot + 0.1);
823 assert(e > 0.0 && e < 1.0);
824 #pragma omp atomic
825 ++n_called_sites_;
826 #pragma omp atomic
827 sum_site_err_rates_ += e;
828
829 // 2. Base likelihoods.
830 vector<LikData> liks;
831 {
832 liks.reserve(n_samples);
833 double ln_err_hom = log(e/3.0);
834 double ln_hit_hom = log(1-e);
835 double ln_err_het = log(e/3.0);
836 double ln_hit_het = log(0.5-e/3.0);
837 for (size_t sample=0; sample<n_samples; ++sample) {
838 const Counts<Nt2>& sdepths = depths.samples[sample];
839 size_t dp = sdepths.sum();
840 if (dp == 0) {
841 liks.push_back(LikData());
842 continue;
843 }
844
845 double lnl_MM = sdepths[nt_M] * ln_hit_hom + (dp-sdepths[nt_M]) * ln_err_hom;
846 double lnl_mm = sdepths[nt_m] * ln_hit_hom + (dp-sdepths[nt_m]) * ln_err_hom;
847 double lnl_Mm = (sdepths[nt_M]+sdepths[nt_m]) * ln_hit_het + (dp-(sdepths[nt_M]+sdepths[nt_m])) * ln_err_het;;
848 liks.push_back(LikData(lnl_MM, lnl_Mm, lnl_mm));
849 }
850 }
851
852 // 3. Initial major allele frequency.
853 double p = (2*n_M_tot + 2*n_m_tot > dp_tot) ?
854 0.5 * double(3*n_M_tot + n_m_tot - dp_tot) / (2*n_M_tot + 2*n_m_tot - dp_tot)
855 : 0.5; // if `n_M_tot == n_m_tot == dp_tot / 4`.
856
857 // 4. Initial disequilibrium coefficient.
858 auto flush_freq = [](double& f) {
859 if (f < 1e-12)
860 f = 0.0;
861 else if (f > 1.0-1e-12)
862 f = 1.0;
863 };
864 double d_a_best = 0.0;
865 double lnl_dimorph = std::numeric_limits<double>::lowest();
866 {
867 double d_a_min = max(-p*p, -(1-p)*(1-p));
868 double d_a_max = p*(1-p);
869 for (double d_a = d_a_min; d_a < d_a_max + 1e-12; d_a += 1.0/n_samples) {
870 double f_MM = p*p + d_a;
871 double f_Mm = 2 * (p*(1-p) - d_a);
872 double f_mm = (1-p) * (1-p) + d_a;
873 flush_freq(f_MM);
874 flush_freq(f_Mm);
875 flush_freq(f_mm);
876 double lnl = calc_dimorph_lnl(f_MM, f_Mm, f_mm, liks);
877 if (lnl > lnl_dimorph) {
878 d_a_best = d_a;
879 lnl_dimorph = lnl;
880 }
881 }
882 }
883 assert(lnl_dimorph > std::numeric_limits<double>::lowest());
884
885 // 5. Find a local maximum.
886 double freq_MM = p*p + d_a_best;
887 double freq_Mm = 2 * (p*(1-p) - d_a_best);
888 double freq_mm = (1-p) * (1-p) + d_a_best;
889 flush_freq(freq_MM);
890 flush_freq(freq_Mm);
891 flush_freq(freq_mm);
892 double x = 1.0/n_samples;
893 {
894 double lnl_prev;
895 do {
896 lnl_prev = lnl_dimorph;
897 array<array<double,3>,6> neighbors = {{
898 {{freq_MM+x, freq_Mm-x, freq_mm}},
899 {{freq_MM-x, freq_Mm+x, freq_mm}},
900 {{freq_MM+x, freq_Mm, freq_mm-x}},
901 {{freq_MM-x, freq_Mm, freq_mm+x}},
902 {{freq_MM, freq_Mm+x, freq_mm-x}},
903 {{freq_MM, freq_Mm-x, freq_mm+x}}
904 }};
905 for(auto& n : neighbors) {
906 double f_MM = n[0];
907 double f_Mm = n[1];
908 double f_mm = n[2];
909 if (f_MM < 0.0-1e-12 || f_MM > 1.0+1e-12
910 || f_Mm < 0.0-1e-12 || f_Mm > 1.0+1e-12
911 || f_mm < 0.0-1e-12 || f_mm > 1.0+1e-12)
912 // Out of bounds.
913 continue;
914 flush_freq(f_MM);
915 flush_freq(f_Mm);
916 flush_freq(f_mm);
917
918 double lnl = calc_dimorph_lnl(f_MM, f_Mm, f_mm, liks);
919 if (lnl > lnl_dimorph) {
920 // Update the frequencies.
921 freq_MM = f_MM;
922 freq_Mm = f_Mm;
923 freq_mm = f_mm;
924 lnl_dimorph = lnl;
925 }
926 }
927 } while (lnl_dimorph > lnl_prev);
928 }
929 assert(almost_equal(freq_MM+freq_Mm+freq_mm, 1.0));
930
931 //
932 // III. Test whether the site is polymorphic.
933 //
934
935 if (!lrtest(lnl_dimorph, lnl_fixed, var_threshold_))
936 return SiteCall(nt_M);
937
938 map<Nt2,double> allele_freqs = {
939 {nt_M, freq_MM+0.5*freq_Mm},
940 {nt_m, freq_mm+0.5*freq_Mm},
941 };
942 for (auto& a : allele_freqs)
943 flush_freq(a.second);
944
945 long dimorph_qual = vcf_lnl2gq(lnl_dimorph, lnl_fixed);
946
947 //
948 // IV. Bayes-corrected likelihoods & genotypes.
949 //
950 // Note: @Nick Oct 2017: Optimal genotype frequencies may be 0.0 when the MAC
951 // or the number of samples is low. To avoid feeding null frequencies into
952 // the genotype equations, which may lead to distorted likelihood ratios
953 // (e.g. for a candidate het with more coverage for the minor allele, when
954 // the minor homozygote frequency is 0.0), we add 1/n_samples to all three
955 // frequencies.
956 //
957
958 vector<SampleCall> sample_calls (n_samples);
959 {
960 sample_calls.reserve(n_samples);
961 freq_MM = (freq_MM + x) / (1.0 + 3 * x);
962 freq_Mm = (freq_Mm + x) / (1.0 + 3 * x);
963 freq_mm = (freq_mm + x) / (1.0 + 3 * x);
964 double log_f_MM = log(freq_MM);
965 double log_f_Mm = log(freq_Mm);
966 double log_f_mm = log(freq_mm);
967 size_t gt_MM = GtLiks::gt_index(nt_M, nt_M);
968 size_t gt_Mm = GtLiks::gt_index(nt_M, nt_m);
969 size_t gt_mm = GtLiks::gt_index(nt_m, nt_m);
970 for (size_t sample=0; sample<n_samples; ++sample) {
971 const LikData& s_liks = liks[sample];
972 SampleCall& s_call = sample_calls[sample];
973
974 double w_sum = calc_ln_weighted_sum(freq_MM, freq_Mm, freq_mm, s_liks);
975
976 double lnl_MM = s_liks.lnl_MM + log_f_MM - w_sum;
977 double lnl_Mm = s_liks.lnl_Mm + log_f_Mm - w_sum;
978 double lnl_mm = s_liks.lnl_mm + log_f_mm - w_sum;
979 assert(lnl_MM < 0.0+1e-12);
980 assert(lnl_Mm < 0.0+1e-12);
981 assert(lnl_mm < 0.0+1e-12);
982 if (lnl_MM > 0.0)
983 lnl_MM = 0.0;
984 if (lnl_Mm > 0.0)
985 lnl_Mm = 0.0;
986 if (lnl_mm > 0.0)
987 lnl_mm = 0.0;
988 s_call.lnls().set(gt_MM, lnl_MM);
989 s_call.lnls().set(gt_Mm, lnl_Mm);
990 s_call.lnls().set(gt_mm, lnl_mm);
991
992 array<pair<double,pair<Nt2,Nt2>>,3> lnls {{
993 {lnl_MM, {nt_M,nt_M}},
994 {lnl_Mm, {nt_M,nt_m}},
995 {lnl_mm, {nt_m,nt_m}}
996 }};
997 sort(lnls.rbegin(), lnls.rend());
998
999 if (lrtest(lnls[0].first, lnls[1].first, gt_threshold_)) {
1000 pair<Nt2,Nt2>& nts = lnls[0].second;
1001 long gq = vcf_lnl2gq(lnls[0].first, lnls[1].first);
1002 s_call.set_call(nts.first == nts.second ? snp_type_hom : snp_type_het, nts.first, nts.second, gq);
1003 }
1004 }
1005 }
1006
1007 return SiteCall(move(allele_freqs), dimorph_qual, move(sample_calls));
1008 }
1009