// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
// Copyright 2010 - 2012, Julian Catchen <jcatchen@uoregon.edu>
//
// This file is part of Stacks.
//
// Stacks is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Stacks is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
//

//
// models.cc -- routines to detect polymorphism (snp) and detect a lack of polymorphism (fixed).
//
// Julian Catchen
// jcatchen@uoregon.edu
// University of Oregon
//
// $Id$
//
#include "models.h"

using namespace std;

vector<pair<char, int>> sort_acgt(const map<char, int>&);
void record_snp(SNP& snp, snp_type type, uint col, double l_ratio, const vector<pair<char, int>>& nuc);
void record_dummy_snp(SNP& snp, uint col);

int    barcode_size     = 5;
double barcode_err_freq = 0.0;

double heterozygote_limit = -qchisq(0.05,1);
double homozygote_limit   = qchisq(0.05,1);
double bound_low          = 0.0;
double bound_high         = 1.0;
double p_freq             = 0.5;

const map<string,modelt> model_strings = {
    {"snp", ::snp},
    {"bounded", ::bounded},
    {"fixed", ::fixed},
    {"marukihigh", ::marukihigh},
    {"marukilow", ::marukilow},
};
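//
// These are the names accepted by parse_model_type() below; e.g. (illustrative):
//   modelt m = parse_model_type("marukilow");   // yields ::marukilow
// An unknown name makes parse_model_type() print an error and throw
// std::invalid_argument.
//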

// Quantiles of the Chi2 distribution for VCF's GQ.
//$ Rscript -e "cat(paste(qchisq(1-10^(-(1:40)/10), 1), '\n', sep=''))"
const std::array<double,40> chisq1ddf_gq = {{
    0.0679615365255401, 0.230764766661193, 0.452421556097698, 0.714036104152315, 1.00448471501902,
    1.31668063342863, 1.64583886397469, 1.98858107774449, 2.34243609502603, 2.70554345409542,
    3.07646857913928, 3.45408274293754, 3.83748240184564, 4.22593339019369, 4.61883133337202,
    5.01567294625387, 5.41603482049954, 5.81955747770787, 6.22593319775977, 6.63489660102121,
    7.04621727098416, 7.45969391025114, 7.87514966368955, 8.29242834051146, 8.71139133617301,
    9.13191510451069, 9.55388906647803, 9.97721386826398, 10.4017999212068, 10.8275661706627,
    11.2544390521783, 11.6823516018745, 12.1112426945654, 12.5410563882771, 12.9717413578738,
    13.403250403671, 13.8355400234795, 14.2685700385079, 14.7023032652319, 15.1367052266236
}};
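//
// Per the R command above, entry i-1 is the chi-square(1 d.f.) statistic whose
// p-value is 10^(-i/10), i.e. the likelihood-ratio value that corresponds to a
// phred-scaled score (GQ) of i, capped at 40. (That vcf_lnl2gq() maps a
// likelihood-ratio statistic onto this table is an assumption; it is not shown
// in this file.)
//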

double qchisq(double alpha, size_t df) {
    //
    // Quantiles (1-alpha) of the Chi2 distribution, as given by
    //$ alphas="1e-1,5e-2,1e-2,5e-3,1e-3,5e-4,1e-4,5e-5,1e-5,5e-6,1e-6"
    //$ df=1
    //$ R -e "for(a in c($alphas)) {q=qchisq(1-a, $df); cat(paste('{',a,',',q,'}, ',sep=''));}"
    //
    // The value for a=0.05,df=1 is 3.84 for backward compatibility (instead of
    // 3.84145882069412).
    //
    static const vector<map<double,double>> table = {
        { //df=1
            {0.1,2.70554345409542}, {0.05,3.84}, {0.01,6.63489660102121},
            {0.005,7.87943857662241}, {0.001,10.8275661706627}, {0.0005,12.1156651463974},
            {0.0001,15.1367052266236}, {0.00005,16.4481102100082}, {0.00001,19.5114209646663},
            {0.000005,20.8372870225104}, {0.000001,23.9281269768795}
        },{ //df=2
            {0.1,4.60517018598809}, {0.05,5.99146454710798}, {0.01,9.21034037197618},
            {0.005,10.5966347330961}, {0.001,13.8155105579643}, {0.0005,15.2018049190844},
            {0.0001,18.4206807439526}, {0.00005,19.8069751050725}, {0.00001,23.0258509299496},
            {0.000005,24.4121452910472}, {0.000001,27.631021115871}
        }
    };

    assert(df > 0 && df <= table.size());
    try {
        return table[df-1].at(alpha);
    } catch (std::out_of_range&) {
        cerr << "Error: Unsupported alpha value '" << alpha << "'; pick one of 0.05, 0.01, 0.005, 0.001...\n";
        throw std::invalid_argument("qchisq");
    }
}

modelt parse_model_type(const string& arg) {
    try {
        return model_strings.at(arg);
    } catch (std::out_of_range&) {
        cerr << "Error: Unknown model '" << arg << "'.\n";
        throw std::invalid_argument("set_model_type");
    }
}

void set_model_thresholds(double alpha) {
    double x = qchisq(alpha, 1);
    homozygote_limit = x;
    heterozygote_limit = -x;
}
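//
// For reference, using the table above:
//   set_model_thresholds(0.05);    // homozygote_limit = 3.84, heterozygote_limit = -3.84
//                                  // (the initial values of the globals above)
//   set_model_thresholds(0.001);   // homozygote_limit = 10.8275661706627, heterozygote_limit = -10.8275661706627
//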

string to_string(modelt model_type) {
    for (auto& m : model_strings)
        if (model_type == m.second)
            return m.first;
    DOES_NOT_HAPPEN;
    return string();
}

vector<pair<char, int>> sort_acgt(const map<char, int>& counts) {
    vector<pair<char, int>> nuc;
    for (map<char, int>::const_iterator i=counts.begin(); i!=counts.end(); i++)
        if (i->first != 'N')
            nuc.push_back(make_pair(i->first, i->second));
    sort(nuc.begin(), nuc.end(), compare_pair);
    return nuc;
}

void record_snp(SNP& snp, snp_type type, uint col, double l_ratio, const vector<pair<char, int>>& nuc) {

    snp.type   = type;
    snp.col    = col;
    snp.lratio = l_ratio;
    snp.rank_1 = nuc[0].first;

    switch (type) {
    case snp_type_het:
        snp.rank_2 = nuc[1].first;
        break;
    case snp_type_hom:
        snp.rank_2 = '-';
        break;
    default:
        // snp_type_unk, with at least one observation (otherwise record_dummy_snp
        // would have been called).
        // A rank_2 nucleotide is set only if at least two nucleotides were observed.
        snp.rank_2 = nuc[1].second > 0 ? nuc[1].first : '-';
        break;
    }

    snp.rank_3 = 0;
    snp.rank_4 = 0;
}

void record_dummy_snp(SNP& snp, uint col) {
    snp.type   = snp_type_unk;
    snp.col    = col;
    snp.lratio = 0.0;
    snp.rank_1 = 'N';
    snp.rank_2 = '-';
    snp.rank_3 = 0;
    snp.rank_4 = 0;
}

void call_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps) {
    vector<pair<char, int> > nuc = sort_acgt(n);
    if (nuc[0].second == 0) {
        if (record_snps) {
            tag->snps.push_back(new SNP());
            record_dummy_snp(*tag->snps.back(), col);
        }
        return;
    }

    double l_ratio = lr_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
    snp_type type = call_snp(l_ratio);
    if (record_snps) {
        tag->snps.push_back(new SNP());
        record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
    }
}

void call_bounded_multinomial_snp(MergedStack *tag, int col, map<char, int> &n, bool record_snps) {
    vector<pair<char, int> > nuc = sort_acgt(n);
    if (nuc[0].second == 0) {
        if (record_snps) {
            tag->snps.push_back(new SNP());
            record_dummy_snp(*tag->snps.back(), col);
        }
        return;
    }

    double l_ratio = lr_bounded_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
    snp_type type = call_snp(l_ratio);
    if (record_snps) {
        tag->snps.push_back(new SNP());
        record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
    }
}

void call_multinomial_snp(Locus *tag, int col, map<char, int> &n) {
    vector<pair<char, int> > nuc = sort_acgt(n);
    if (nuc[0].second == 0) {
        record_dummy_snp(*tag->snps[col], col);
        return;
    }

    double l_ratio = lr_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
    snp_type type = call_snp(l_ratio);
    record_snp(*tag->snps[col], type, col, l_ratio, nuc);
}

void call_bounded_multinomial_snp(Locus *tag, int col, map<char, int> &n) {
    vector<pair<char, int> > nuc = sort_acgt(n);
    if (nuc[0].second == 0) {
        record_dummy_snp(*tag->snps[col], col);
        return;
    }

    double l_ratio = lr_bounded_multinomial_model(nuc[0].second, nuc[1].second, nuc[2].second, nuc[3].second);
    snp_type type = call_snp(l_ratio);
    record_snp(*tag->snps[col], type, col, l_ratio, nuc);
}

void call_multinomial_fixed (MergedStack *tag, int col, map<char, int> &n) {
    const double nucleotide_fixed_limit = 1.92;

    vector<pair<char, int> > nuc = sort_acgt(n);
    if (nuc[0].second == 0) {
        tag->snps.push_back(new SNP());
        record_dummy_snp(*tag->snps.back(), col);
        return;
    }

    double l_ratio;
    double nuc_1 = nuc[0].second;
    double nuc_2 = nuc[1].second;
    {
        //
        // Method of Paul Hohenlohe <hohenlo@uoregon.edu>, personal communication.
        //
        // Each population sample contains DNA from 6 individuals, so a
        // sample of 12 alleles from the population. We want to assign a
        // nucleotide (A,C,G,T) to each position where the population is
        // fixed or nearly so, and N to each position that is either
        // polymorphic within the population or has insufficient coverage
        // depth to make a call. We can do this with a likelihood ratio
        // test of the read counts, testing whether the allele frequency
        // of the dominant allele is significantly larger than some
        // threshold (p), stepping through each nucleotide position across
        // RAD tags.
        //
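        //
        // Added note (for orientation, not part of the original description):
        // with the error rate epsilon set to 0, the expression below reduces to
        // the familiar binomial log-likelihood ratio
        //   l_ratio = n_1 * ln(n_1 / (p * (n_1 + n_2)))
        //           + n_2 * ln(n_2 / ((1 - p) * (n_1 + n_2))),
        // i.e. a test of the observed major-allele ratio against the threshold p_freq.
        //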

        double epsilon = -1 * (log(1 - barcode_err_freq) / barcode_size);

        l_ratio  =
            nuc_1 * log( ((4 * nuc_1 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
                         ((4 * p_freq * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );

        l_ratio +=
            nuc_2 * log( ((4 * nuc_2 * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) /
                         ((4 * (1 - p_freq) * (nuc_1 + nuc_2) * (1 - epsilon)) + ((nuc_1 + nuc_2) * epsilon)) );

        //cerr << "Nuc_1: " << nuc_1 << " Nuc_2: " << nuc_2 << " Likelihood ratio: " << l_ratio << "\n";
    }

    double n_ratio = nuc_1 / (nuc_1 + nuc_2);

    snp_type type = n_ratio < p_freq || l_ratio < nucleotide_fixed_limit ? snp_type_unk : snp_type_hom;

    tag->snps.push_back(new SNP());
    record_snp(*tag->snps.back(), type, col, l_ratio, nuc);
}

//
// ln L(1/2) = ln(n! / (n_1! n_2! n_3! n_4!)) +
//               (n_1 + n_2) * ln((n_1 + n_2) / 2n) +
//               (n_3 + n_4) * ln((n_3 + n_4) / 2n)
//
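//
// Worked example (approximate, for illustration): for counts {A:7, C:5, G:0, T:0},
// n = 12, so
//   ln L(1/2) = ln(12!/(7!5!)) + 12 * ln(12/24) ~= 6.67 - 8.32 ~= -1.64.
//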
double
heterozygous_likelihood(int col, map<char, int> &nuc)
{
    vector<pair<char, int> > cnts;
    map<char, int>::iterator i;

    double n = 0;
    for (i = nuc.begin(); i != nuc.end(); i++) {
        n += i->second;
        cnts.push_back(make_pair(i->first, i->second));
    }

    sort(cnts.begin(), cnts.end(), compare_pair);

    double n_1 = cnts[0].second;
    double n_2 = cnts[1].second;
    double n_3 = cnts[2].second;
    double n_4 = cnts[3].second;

    double term_1 =
        reduced_log_factorial(n, n_1) -
        (log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));

    double term_3 = (n_3 + n_4 > 0) ? log((n_3 + n_4) / (2 * n)) : 0;

    double lnl =
        term_1 +
        ((n_1 + n_2) * log((n_1 + n_2) / (2 * n))) +
        ((n_3 + n_4) * term_3);

    return lnl;
}

//
// ln L(1/1) = ln(n! / (n_1! n_2! n_3! n_4!)) +
//               n_1 * ln(n_1 / n) +
//               (n - n_1) * ln((n - n_1) / 3n)
//
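//
// Worked example (approximate, for illustration): for counts {A:11, C:1, G:0, T:0},
// n = 12, so
//   ln L(1/1) = ln(12!/11!) + 11 * ln(11/12) + 1 * ln(1/36) ~= 2.48 - 0.96 - 3.58 ~= -2.06.
//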
double
homozygous_likelihood(int col, map<char, int> &nuc)
{
    vector<pair<char, int> > cnts;
    map<char, int>::iterator i;

    double n = 0;
    for (i = nuc.begin(); i != nuc.end(); i++) {
        n += i->second;
        cnts.push_back(make_pair(i->first, i->second));
    }

    sort(cnts.begin(), cnts.end(), compare_pair);

    double n_1 = cnts[0].second;
    double n_2 = cnts[1].second;
    double n_3 = cnts[2].second;
    double n_4 = cnts[3].second;

    double term_1 =
        reduced_log_factorial(n, n_1) -
        (log_factorial(n_2) + log_factorial(n_3) + log_factorial(n_4));

    double term_3 = n - n_1 > 0 ? log((n - n_1) / (3 * n)) : 0;

    double lnl =
        term_1 +
        (n_1 * log(n_1 / n)) +
        ((n - n_1) * term_3);

    return lnl;
}

Nt2 SiteCall::most_frequent_allele() const {
    assert(!alleles().empty());
    auto a = alleles().begin();
    auto best = a;
    for(; a!=alleles().end(); ++a)
        if (a->second > best->second)
            best = a;
    return best->first;
}

void SiteCall::filter_mac(size_t min_mac) {
    assert(alleles().size() > 1);
    Counts<Nt2> allele_counts = tally_allele_counts(sample_calls());
    if (allele_counts.sorted()[1].first < min_mac)
        *this = SiteCall(most_frequent_allele());
}

Counts<Nt2> SiteCall::tally_allele_counts(const vector<SampleCall>& spldata) {
    Counts<Nt2> counts;
    for (const SampleCall& sd : spldata) {
        switch (sd.call()) {
        case snp_type_hom :
            counts.increment(sd.nt0());
            counts.increment(sd.nt0());
            break;
        case snp_type_het :
            counts.increment(sd.nt0());
            counts.increment(sd.nt1());
            break;
        default:
            // snp_type_unk
            break;
        }
    }
    return counts;
}

map<Nt2,double> SiteCall::tally_allele_freqs(const vector<SampleCall>& spldata) {
    //
    // Tally the existing alleles and their frequencies.
    //
    map<Nt2,double> allele_freqs;
    Counts<Nt2> counts = tally_allele_counts(spldata);
    array<pair<size_t,Nt2>,4> sorted_alleles = counts.sorted();
    size_t tot = counts.sum();
    for (auto& a : sorted_alleles) {
        if (a.first == 0)
            break;
        allele_freqs.insert({a.second, double(a.first)/tot});
    }
    return allele_freqs;
}

ostream& operator<<(ostream& os, const SampleCall& c) {
    ostream copy (os.rdbuf());
    copy << std::setprecision(4);
    // Genotype.
    switch(c.call()) {
    case snp_type_hom: os << c.nt0() << "/" << c.nt0(); break;
    case snp_type_het: os << std::min(c.nt0(), c.nt1()) << "/" << std::max(c.nt0(), c.nt1()); break;
    case snp_type_unk: os << "u"; break;
    case snp_type_discarded: os << "d"; break;
    default: DOES_NOT_HAPPEN; break;
    }
    // Likelihoods (written through `copy` so that setprecision(4) applies
    // without disturbing the formatting state of `os`).
    copy << "\t{" << c.lnls() << "}";
    return os;
}

void SiteCall::print(ostream& os, const SiteCounts& depths) {
    // Total depths.
    os << "tot depths {" << depths.tot << "}\n";
    // Alleles.
    os << "alleles";
    for (auto& a : alleles())
        os << " " << a.first << "(" << a.second << ")";
    os << "\n";
    // Samples.
    os << "samples";
    for (size_t s=0; s<depths.samples.size(); ++s) {
        // Index.
        os << "\n" << s;
        auto& dp = depths.samples[s];
        if (dp.sum() == 0) {
            os << "\t.";
        } else {
            // Depths.
            os << "\t{" << dp << "}";
            if (alleles().size() >= 2)
                os << "\t" << sample_calls()[s];
        }
    }
}

SiteCall MultinomialModel::call(const SiteCounts& depths) const {

    size_t n_samples = depths.mpopi->samples().size();

    //
    // Make genotype calls.
    //
    vector<SampleCall> sample_calls (n_samples);
    array<pair<size_t,Nt2>,4> sorted;
    for (size_t sample=0; sample<n_samples; ++sample) {
        const Counts<Nt2>& sdepths = depths.samples[sample];
        size_t dp = sdepths.sum();
        if (dp == 0)
            continue;

        sorted = sdepths.sorted();
        SampleCall& c = sample_calls[sample];
        double lnl_hom = lnl_multinomial_model_hom(dp, sorted[0].first);
        double lnl_het = lnl_multinomial_model_het(dp, sorted[0].first+sorted[1].first);
        c.lnls().set(sorted[0].second, sorted[0].second, lnl_hom);
        c.lnls().set(sorted[0].second, sorted[1].second, lnl_het);

        snp_type gt_call = call_snp(lnl_hom, lnl_het);
        long gq = vcf_lnl2gq(std::max(lnl_hom, lnl_het), std::min(lnl_hom, lnl_het));
        c.set_call(gt_call, sorted[0].second, sorted[1].second, gq);
    }

    //
    // Record the existing alleles and their frequencies.
    //
    map<Nt2,double> allele_freqs = SiteCall::tally_allele_freqs(sample_calls);

    //
    // Compute the missing genotype likelihoods, if any.
    //
    if (allele_freqs.size() < 2) {
        sample_calls = vector<SampleCall>();
    } else {
        for (size_t sample=0; sample<n_samples; ++sample) {
            const Counts<Nt2>& sdepths = depths.samples[sample];
            size_t dp = sdepths.sum();
            if (dp == 0)
                continue;

            SampleCall& c = sample_calls[sample];
            for (auto nt1_it=allele_freqs.begin(); nt1_it!=allele_freqs.end(); ++nt1_it) {
                Nt2 nt1 (nt1_it->first);
                // Homozygote.
                if (!c.lnls().has_lik(nt1, nt1)) {
                    double lnl = lnl_multinomial_model_hom(dp, sdepths[nt1]);
                    c.lnls().set(nt1, nt1, lnl);
                }
                // Heterozygote(s).
                auto nt2_it = nt1_it;
                ++nt2_it;
                for (; nt2_it!=allele_freqs.end(); ++nt2_it) {
                    Nt2 nt2 (nt2_it->first);
                    if (!c.lnls().has_lik(nt1, nt2)) {
                        double lnl = lnl_multinomial_model_het(dp, sdepths[nt1]+sdepths[nt2]);
                        c.lnls().set(nt1, nt2, lnl);
                    }
                }
            }
        }
    }

    return SiteCall(move(allele_freqs), move(sample_calls));
}

double MarukiHighModel::calc_hom_lnl(double n, double n1) const {
    // This returns the same value as the Hohenlohe ('snp/binomial') model except
    // when the error rate estimate is bounded at 1.0 (i.e. `n1 < 0.25*n` c.f.
    // Hohenlohe equations).
    if (n1 == n)
        return 0.0;
    else if (n1 == 0.0)
        return n * log(1.0/3.0);
    else
        return n1 * log(n1/n) + (n-n1) * log( (n-n1)/(3.0*n) );
}

double MarukiHighModel::calc_het_lnl(double n, double n1n2) const {
    // This returns the same value as the Hohenlohe ('snp/binomial') model except
    // when the error rate estimate is bounded at 1.0 (i.e. `n1n2 < 0.5*n` c.f.
    // Hohenlohe equations).
    if (n1n2 == n)
        return n * log(0.5);
    else if (n1n2 < (1.0/3.0) * n)
        return n1n2 * log(1.0/6.0) + (n-n1n2) * log(1.0/3.0);
    else
        return n1n2 * log( n1n2/(2.0*n) ) + (n-n1n2) * log((n-n1n2)/(2.0*n) );
}

SiteCall MarukiHighModel::call(const SiteCounts& depths) const {

    /*
     * For this model the procedure is:
     * I. Obtain the most commonly seen nucleotide M.
     * II. Look for alternative alleles, if any:
     *     For each sample:
     *         If there is a genotype significantly better than MM:
     *             (This genotype can be Mm, mm or mn.)
     *             The site is polymorphic.
     *             Record m as an alternative allele.
     *             If the genotype is mn AND is significantly better than mm:
     *                 Record n as an allele.
     * III. Given the known alleles, compute the likelihoods for all possible
     *      genotypes (n.b. most of the non-trivial ones have already been
     *      computed), and call genotypes.
     */
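
    // (Reader's note, an assumption not stated in this file: `lrtest(lnl_alt,
    // lnl_null, threshold)` is understood to test whether 2*(lnl_alt - lnl_null)
    // exceeds the given chi-square threshold, i.e. whether the alternative model
    // is significantly better at the corresponding alpha.)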

    const size_t n_samples = depths.mpopi->samples().size();

    if (depths.tot.sum() == 0)
        return SiteCall(map<Nt2,double>(), vector<SampleCall>());

    //
    // I.
    // Count the observed nucleotides of the site for all samples; set
    // `SampleCall::depths_`.
    // Then find the most common nucleotide across the population.
    //
    Nt2 nt_ref = depths.tot.sorted()[0].second;

    //
    // II.
    // Look for alternative alleles; start filling SampleCall::lnls_.
    //
    set<Nt2> alleles;
    vector<SampleCall> sample_calls (n_samples);
    alleles.insert(nt_ref);
    array<pair<size_t,Nt2>,4> sorted;
    for (size_t sample=0; sample<n_samples; ++sample) {
        const Counts<Nt2>& sdepths = depths.samples[sample];
        size_t dp = sdepths.sum();
        if (dp == 0)
            continue;

        // Find the best genotype for the sample -- this is either the homozygote
        // for the rank0 nucleotide or the heterozygote for the rank0 and rank1
        // nucleotides.
        sorted = sdepths.sorted();
        Nt2 nt0 = sorted[0].second;
        Nt2 nt1 = sorted[1].second;
        size_t dp0 = sorted[0].first;
        size_t dp1 = sorted[1].first;

        if (dp1 == 0 && nt0 == nt_ref)
            // We can wait until we know whether the site is fixed.
            continue;

        SampleCall& c = sample_calls[sample];
        double lnl_hom = calc_hom_lnl(dp, dp0);
        double lnl_het = calc_het_lnl(dp, dp0+dp1);
        c.lnls().set(nt0, nt0, lnl_hom);
        c.lnls().set(nt0, nt1, lnl_het);

        // Make sure the sample would have a significant genotype call provided
        // the site was polymorphic (otherwise a genotype can be significant in
        // comparison with the ref homozygote but not in comparison with the
        // second best genotype). This is a slight modification to Maruki &
        // Lynch's method to avoid low-coverage weirdnesses. Note that the
        // polymorphism discovery alpha could be set lower than the genotype call
        // alpha (e.g. to account for multiple testing).
        if (lnl_hom > lnl_het) {
            if(!lrtest(lnl_hom, lnl_het, gt_threshold_))
                continue;
        } else {
            if(!lrtest(lnl_het, lnl_hom, gt_threshold_))
                continue;
        }

        // Compare this likelihood to that of the ref,ref homozygote.
        if (nt0 == nt_ref) {
            if (lrtest(lnl_het, lnl_hom, var_threshold_))
                // Record the alternative allele.
                alleles.insert(nt1);
        } else {
            double lnl_ref = calc_hom_lnl(dp, sdepths[nt_ref]);
            c.lnls().set(nt_ref, nt_ref, lnl_ref);
            double lnl_best = std::max(lnl_hom, lnl_het);
            if (lrtest(lnl_best, lnl_ref, var_threshold_)) {
                // Record one alternative allele.
                alleles.insert(nt0);
                if (nt1!=nt_ref && lrtest(lnl_het, lnl_hom, var_threshold_))
                    // Record a second alternative allele (the SNP is at least ternary).
                    alleles.insert(nt1);
            }
        }
    }

    //
    // III.
    // Compute the likelihoods for all possible genotypes & call genotypes.
    //
    if (alleles.size() == 1) {
        sample_calls = vector<SampleCall>();
    } else {
        for (size_t sample=0; sample<n_samples; ++sample) {
            const Counts<Nt2>& sdepths = depths.samples[sample];
            size_t dp = sdepths.sum();
            if (dp == 0)
                continue;

            SampleCall& c = sample_calls[sample];
            for (auto nt1=alleles.begin(); nt1!=alleles.end(); ++nt1) {
                // Homozygote.
                if (!c.lnls().has_lik(*nt1, *nt1))
                    c.lnls().set(*nt1, *nt1, calc_hom_lnl(dp, sdepths[*nt1]));
                // Heterozygote(s).
                auto nt2 = nt1;
                ++nt2;
                for (; nt2!=alleles.end(); ++nt2)
                    if (!c.lnls().has_lik(*nt1, *nt2))
                        c.lnls().set(*nt1, *nt2, calc_het_lnl(dp, sdepths[*nt1]+sdepths[*nt2]));
            }

            // Call the genotype -- skipping ignored alleles.
            sorted = sdepths.sorted();
            auto nt0 = sorted.begin();
            while(!alleles.count(nt0->second)) {
                ++nt0;
                assert(nt0 != sorted.end());
            }
            auto nt1 = nt0;
            ++nt1;
            while(!alleles.count(nt1->second)) {
                ++nt1;
                assert(nt1 != sorted.end());
            }
            double lnl_hom = c.lnls().at(nt0->second, nt0->second);
            double lnl_het = c.lnls().at(nt0->second, nt1->second);
            snp_type call;
            long gq;
            if (lnl_hom > lnl_het) {
                call = lrtest(lnl_hom, lnl_het, gt_threshold_) ? snp_type_hom : snp_type_unk;
                gq = vcf_lnl2gq(lnl_hom, lnl_het);
            } else {
                call = lrtest(lnl_het, lnl_hom, gt_threshold_) ? snp_type_het : snp_type_unk;
                gq = vcf_lnl2gq(lnl_het, lnl_hom);
            }

            c.set_call(call, nt0->second, nt1->second, gq);
        }
    }

    //
    // Finally, compute allele frequencies.
    //
    map<Nt2,double> allele_freqs;
    if (alleles.size() == 1) {
        allele_freqs.insert({*alleles.begin(), 1.0});
    } else {
        allele_freqs = SiteCall::tally_allele_freqs(sample_calls);
        assert(allele_freqs.size() == alleles.size()
               || (allele_freqs.size() == alleles.size()-1 && !allele_freqs.count(nt_ref)));
        // Note: There are limit cases (esp. low coverage) where nt_ref does
        // not appear in any of the significant genotypes.
    }

    return SiteCall(move(allele_freqs), move(sample_calls));
}

double MarukiLowModel::calc_fixed_lnl(double n_tot, double n_M_tot) const {
    assert(n_tot > 0.0);
    assert(n_M_tot > 0.0);
    if (n_M_tot == n_tot)
        return 0.0;
    else
        return n_M_tot * log(n_M_tot/n_tot) + (n_tot-n_M_tot) * log((n_tot-n_M_tot)/n_tot/3.0);
}

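//
// calc_dimorph_lnl() computes the dimorphic-site log-likelihood for a given set of
// genotype frequencies, i.e. the sum over samples s (that have data) of
//   ln( freq_MM * L_s(MM) + freq_Mm * L_s(Mm) + freq_mm * L_s(mm) ).
//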
double MarukiLowModel::calc_dimorph_lnl(double freq_MM, double freq_Mm, double freq_mm, const vector<LikData>& liks) const {
    double lnl = 0.0;
    // Sum over samples.
    for (const LikData& s_liks : liks)
        if (s_liks.has_data)
            // If !has_data, the sum is 1 and its log 0.
            lnl += calc_ln_weighted_sum(freq_MM, freq_Mm, freq_mm, s_liks);
    assert(lnl <= 0.0);
    return lnl;
}

double MarukiLowModel::calc_ln_weighted_sum(double freq_MM, double freq_Mm, double freq_mm, const LikData& s_liks) const {
    double weighted_sum = freq_MM * s_liks.l_MM + freq_Mm * s_liks.l_Mm + freq_mm * s_liks.l_mm;
    #pragma omp atomic
    ++n_wsum_tot_;
    if (weighted_sum >= std::numeric_limits<double>::min()) {
        return log(weighted_sum);
    } else {
        // `weighted_sum` is subnormal or zero.
        #pragma omp atomic
        ++n_wsum_underflows_;
        return calc_ln_weighted_sum_safe(freq_MM, freq_Mm, freq_mm, s_liks);
    }
}

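//
// Numerically stable fallback: evaluates
//   ln( freq_MM*exp(lnl_MM) + freq_Mm*exp(lnl_Mm) + freq_mm*exp(lnl_mm) )
// by factoring out the largest log-likelihood that has a non-zero weight (a
// weighted log-sum-exp), so that very small likelihoods do not underflow.
//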
double MarukiLowModel::calc_ln_weighted_sum_safe(double freq_MM, double freq_Mm, double freq_mm, const LikData& s_liks) const {
    array<pair<double,double>,3> s = {{
        {s_liks.lnl_MM, freq_MM},
        {s_liks.lnl_Mm, freq_Mm},
        {s_liks.lnl_mm, freq_mm}
    }};
    std::sort(s.begin(), s.end()); // `s` is sorted by increasing lnl.
    if (s[2].second > 0.0)
        return s[2].first + log(s[2].second
                                + s[1].second * exp(s[1].first-s[2].first)
                                + s[0].second * exp(s[0].first-s[2].first)
                                );
    else if (s[1].second > 0.0)
        return s[1].first + log(s[1].second + s[0].second * exp(s[0].first-s[1].first));
    else
        return s[0].first + log(s[0].second);
}

SiteCall MarukiLowModel::call(const SiteCounts& depths) const {

    /*
     * For this model the procedure is:
     * I. Compute the maximum likelihood for the fixed-site hypothesis (straightforward).
     * II. Compute the maximum likelihood for the dimorphic-site hypothesis and
     *     estimate the genotype frequencies:
     *     1. Compute the error rate estimate.
     *     2. Compute the base likelihoods for all samples, all genotypes.
     *     3. Compute the starting major allele frequency (`p`) value.
     *     4. Compute the likelihoods for the site for all possible disequilibrium
     *         coefficients (`d_a`); keep the best one.
     *     5. Optimize the genotype frequencies by finding a local maximum.
     * III. Test whether the site is polymorphic.
     * IV. Compute the likelihoods for all samples, all genotypes using Bayes'
     *      theorem, and call genotypes.
     */
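
    // Genotype-frequency parameterization used in step II below: with major allele
    // frequency `p` and disequilibrium coefficient `d_a`,
    //   f_MM = p*p + d_a,  f_Mm = 2*(p*(1-p) - d_a),  f_mm = (1-p)*(1-p) + d_a.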

    const size_t n_samples = depths.mpopi->samples().size();

    size_t dp_tot = depths.tot.sum();
    if (dp_tot == 0)
        return SiteCall();

    //
    // I. Likelihood for the fixed-site hypothesis.
    //

    array<pair<size_t,Nt2>,4> sorted = depths.tot.sorted();
    Nt2 nt_M = sorted[0].second;
    Nt2 nt_m = sorted[1].second;
    size_t n_M_tot = sorted[0].first;
    size_t n_m_tot = sorted[1].first;

    double lnl_fixed = calc_fixed_lnl(dp_tot, n_M_tot);

    if (!lrtest(0.0, lnl_fixed, var_threshold_))
        // Fixed: `lnl_fixed` is high enough that even when `lnl_dimorph` is
        // at its maximum value of 0, dimorphism isn't significant. This happens
        // when there are either very few non-major-allele reads or very few reads
        // overall.
        return SiteCall(nt_M);

    //
    // II. Compute and optimize the likelihood for the dimorphic-site hypothesis.
    //

    // 1. Error rate.
    // Note: @Nick, Apr 2017: Maruki's description uses a simple estimator where
    // the number of invisible errors (from M to m and inversely) is estimated
    // as being half the number of observable errors. But when there are few
    // observable errors this is an underestimate, as ideally we would like to
    // integrate; in particular, the error rate is estimated as zero when
    // there aren't any observable errors, which leads to meaningless null
    // likelihoods. Doing the integration isn't practical, but we can reduce
    // the problem by adding (see edit) to the numerator and denominator.
    // Edit. Oct 2017: 1 is way too large; adding 0.1 yields a more reasonable
    // prior.
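    // Illustrative numbers (not from the original note): with dp_tot=100,
    // n_M_tot=90, n_m_tot=8, e = 1.5 * (2 + 0.1) / 100.1 ~= 0.031.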
    double e = 3.0 / 2.0 * (dp_tot - n_M_tot - n_m_tot + 0.1) / double(dp_tot + 0.1);
    assert(e > 0.0 && e < 1.0);
    #pragma omp atomic
    ++n_called_sites_;
    #pragma omp atomic
    sum_site_err_rates_ += e;

    // 2. Base likelihoods.
    vector<LikData> liks;
    {
        liks.reserve(n_samples);
        double ln_err_hom = log(e/3.0);
        double ln_hit_hom = log(1-e);
        double ln_err_het = log(e/3.0);
        double ln_hit_het = log(0.5-e/3.0);
        for (size_t sample=0; sample<n_samples; ++sample) {
            const Counts<Nt2>& sdepths = depths.samples[sample];
            size_t dp = sdepths.sum();
            if (dp == 0) {
                liks.push_back(LikData());
                continue;
            }

            double lnl_MM = sdepths[nt_M] * ln_hit_hom + (dp-sdepths[nt_M]) * ln_err_hom;
            double lnl_mm = sdepths[nt_m] * ln_hit_hom + (dp-sdepths[nt_m]) * ln_err_hom;
            double lnl_Mm = (sdepths[nt_M]+sdepths[nt_m]) * ln_hit_het + (dp-(sdepths[nt_M]+sdepths[nt_m])) * ln_err_het;
            liks.push_back(LikData(lnl_MM, lnl_Mm, lnl_mm));
        }
    }

    // 3. Initial major allele frequency.
    double p = (2*n_M_tot + 2*n_m_tot > dp_tot) ?
            0.5 * double(3*n_M_tot + n_m_tot - dp_tot) / (2*n_M_tot + 2*n_m_tot - dp_tot)
            : 0.5; // if `n_M_tot == n_m_tot == dp_tot / 4`.

    // 4. Initial disequilibrium coefficient.
    auto flush_freq = [](double& f) {
        if (f < 1e-12)
            f = 0.0;
        else if (f > 1.0-1e-12)
            f = 1.0;
    };
    double d_a_best = 0.0;
    double lnl_dimorph = std::numeric_limits<double>::lowest();
    {
        double d_a_min = max(-p*p, -(1-p)*(1-p));
        double d_a_max = p*(1-p);
        for (double d_a = d_a_min; d_a < d_a_max + 1e-12; d_a += 1.0/n_samples) {
            double f_MM = p*p + d_a;
            double f_Mm = 2 * (p*(1-p) - d_a);
            double f_mm = (1-p) * (1-p) + d_a;
            flush_freq(f_MM);
            flush_freq(f_Mm);
            flush_freq(f_mm);
            double lnl = calc_dimorph_lnl(f_MM, f_Mm, f_mm, liks);
            if (lnl > lnl_dimorph) {
                d_a_best = d_a;
                lnl_dimorph = lnl;
            }
        }
    }
    assert(lnl_dimorph > std::numeric_limits<double>::lowest());

    // 5. Find a local maximum.
    double freq_MM = p*p + d_a_best;
    double freq_Mm = 2 * (p*(1-p) - d_a_best);
    double freq_mm = (1-p) * (1-p) + d_a_best;
    flush_freq(freq_MM);
    flush_freq(freq_Mm);
    flush_freq(freq_mm);
    double x = 1.0/n_samples;
    {
        double lnl_prev;
        do {
            lnl_prev = lnl_dimorph;
            array<array<double,3>,6> neighbors = {{
                {{freq_MM+x, freq_Mm-x, freq_mm}},
                {{freq_MM-x, freq_Mm+x, freq_mm}},
                {{freq_MM+x, freq_Mm,   freq_mm-x}},
                {{freq_MM-x, freq_Mm,   freq_mm+x}},
                {{freq_MM,   freq_Mm+x, freq_mm-x}},
                {{freq_MM,   freq_Mm-x, freq_mm+x}}
            }};
            for(auto& n : neighbors) {
                double f_MM = n[0];
                double f_Mm = n[1];
                double f_mm = n[2];
                if (f_MM < 0.0-1e-12 || f_MM > 1.0+1e-12
                 || f_Mm < 0.0-1e-12 || f_Mm > 1.0+1e-12
                 || f_mm < 0.0-1e-12 || f_mm > 1.0+1e-12)
                    // Out of bounds.
                    continue;
                flush_freq(f_MM);
                flush_freq(f_Mm);
                flush_freq(f_mm);

                double lnl = calc_dimorph_lnl(f_MM, f_Mm, f_mm, liks);
                if (lnl > lnl_dimorph) {
                    // Update the frequencies.
                    freq_MM = f_MM;
                    freq_Mm = f_Mm;
                    freq_mm = f_mm;
                    lnl_dimorph = lnl;
                }
            }
        } while (lnl_dimorph > lnl_prev);
    }
    assert(almost_equal(freq_MM+freq_Mm+freq_mm, 1.0));

    //
    // III. Test whether the site is polymorphic.
    //

    if (!lrtest(lnl_dimorph, lnl_fixed, var_threshold_))
        return SiteCall(nt_M);

    map<Nt2,double> allele_freqs = {
        {nt_M, freq_MM+0.5*freq_Mm},
        {nt_m, freq_mm+0.5*freq_Mm},
    };
    for (auto& a : allele_freqs)
        flush_freq(a.second);

    long dimorph_qual = vcf_lnl2gq(lnl_dimorph, lnl_fixed);

    //
    // IV. Bayes-corrected likelihoods & genotypes.
    //
    // Note: @Nick Oct 2017: Optimal genotype frequencies may be 0.0 when the MAC
    // or the number of samples is low. To avoid feeding null frequencies into
    // the genotype equations, which may lead to distorted likelihood ratios
    // (e.g. for a candidate het with more coverage for the minor allele, when
    // the minor homozygote frequency is 0.0), we add 1/n_samples to all three
    // frequencies.
    //

    vector<SampleCall> sample_calls (n_samples);
    {
        sample_calls.reserve(n_samples);
        freq_MM = (freq_MM + x) / (1.0 + 3 * x);
        freq_Mm = (freq_Mm + x) / (1.0 + 3 * x);
        freq_mm = (freq_mm + x) / (1.0 + 3 * x);
        double log_f_MM = log(freq_MM);
        double log_f_Mm = log(freq_Mm);
        double log_f_mm = log(freq_mm);
        size_t gt_MM = GtLiks::gt_index(nt_M, nt_M);
        size_t gt_Mm = GtLiks::gt_index(nt_M, nt_m);
        size_t gt_mm = GtLiks::gt_index(nt_m, nt_m);
        for (size_t sample=0; sample<n_samples; ++sample) {
            const LikData& s_liks = liks[sample];
            SampleCall& s_call = sample_calls[sample];

            double w_sum = calc_ln_weighted_sum(freq_MM, freq_Mm, freq_mm, s_liks);

            double lnl_MM = s_liks.lnl_MM + log_f_MM - w_sum;
            double lnl_Mm = s_liks.lnl_Mm + log_f_Mm - w_sum;
            double lnl_mm = s_liks.lnl_mm + log_f_mm - w_sum;
            assert(lnl_MM < 0.0+1e-12);
            assert(lnl_Mm < 0.0+1e-12);
            assert(lnl_mm < 0.0+1e-12);
            if (lnl_MM > 0.0)
                lnl_MM = 0.0;
            if (lnl_Mm > 0.0)
                lnl_Mm = 0.0;
            if (lnl_mm > 0.0)
                lnl_mm = 0.0;
            s_call.lnls().set(gt_MM, lnl_MM);
            s_call.lnls().set(gt_Mm, lnl_Mm);
            s_call.lnls().set(gt_mm, lnl_mm);

            array<pair<double,pair<Nt2,Nt2>>,3> lnls {{
                {lnl_MM, {nt_M,nt_M}},
                {lnl_Mm, {nt_M,nt_m}},
                {lnl_mm, {nt_m,nt_m}}
            }};
            sort(lnls.rbegin(), lnls.rend());

            if (lrtest(lnls[0].first, lnls[1].first, gt_threshold_)) {
                pair<Nt2,Nt2>& nts = lnls[0].second;
                long gq = vcf_lnl2gq(lnls[0].first, lnls[1].first);
                s_call.set_call(nts.first == nts.second ? snp_type_hom : snp_type_het, nts.first, nts.second, gq);
            }
        }
    }

    return SiteCall(move(allele_freqs), dimorph_qual, move(sample_calls));
}