1 // $Id: dlmodel.cpp,v 1.137 2012/05/22 20:25:32 jyamato Exp $
2 
3 /*
4   Copyright 2002  Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 #include <algorithm>
13 #include <cstring>
14 #include <numeric>
15 #include <iostream>
16 
17 #include "local_build.h"
18 
19 #include "calculators.h"
20 #include "datapack.h"
21 #include "datatype.h"
22 #include "defaults.h"                   // for defaults::threshhold in StepwiseModel::ctor
23 #include "dlcell.h"
24 #include "dlmodel.h"
25 #include "errhandling.h"
26 #include "funcMax.h"
27 #include "locus.h"
28 #include "mathx.h"
29 #include "registry.h"
30 #include "runreport.h"
31 #include "stringx.h"
32 #include "xml_strings.h"                // for ToXML()
33 #include "xmlsum_strings.h"             // For WriteAlpha()
34 
35 #ifdef DMALLOC_FUNC_CHECK
36 #include "/usr/local/include/dmalloc.h"
37 #endif
38 
39 // turns on detailed local variables for debugging
40 // JRM 3/10
41 //#define DEBUG_VARIABLES
42 #ifdef DEBUG_VARIABLES
43     int printdbgvar = 0;
44 #endif
45 
46 using namespace std;
47 
48 //------------------------------------------------------------------------------------
49 //------------------------------------------------------------------------------------
50 
// Base-class constructor for all data (mutation) models.
//
// nmarkers                 -- number of markers (sites/loci) this model covers
// numCategories            -- number of rate categories
// categoryRates            -- relative rate for each category (rescaled to
//                             a weighted mean of 1.0 by ScaleCatRates())
// categoryProbabilities    -- probability of each category (rescaled to sum
//                             to 1.0 by ScaleCatProbabilities())
// userAutoCorrelationValue -- expected run length of a rate category;
//                             stored as its reciprocal in m_acratio.
//                             NOTE(review): a value of 0 would divide by
//                             zero here -- presumably validated upstream;
//                             confirm against the XML reader.
// doNormalize              -- whether to normalize data likelihoods
// numBins                  -- number of data bins (e.g. 4 for nucleotides)
// relmurate                -- relative mutation rate of this segment
DataModel::DataModel(long nmarkers,
                     long numCategories,
                     DoubleVec1d categoryRates,
                     DoubleVec1d categoryProbabilities,
                     double userAutoCorrelationValue,
                     bool doNormalize,
                     long numBins,
                     double relmurate)
    :
    m_ncategories(numCategories),
    m_catrates(categoryRates),
    m_catprobs(categoryProbabilities),
    m_acratio(1.0 / userAutoCorrelationValue),
    m_normalize(doNormalize),
    m_nbins(numBins),
    m_nmarkers(nmarkers),
    m_relmurate(relmurate)
{
    // m_ucratio is the probability of *keeping* the current category
    // when walking along the sequence (complement of m_acratio).
    m_ucratio = 1.0 - m_acratio;
    ScaleCatProbabilities();
    ScaleCatRates();
    // Explicitly qualified so a derived class's override is not called
    // from within the base-class constructor.
    assert(DataModel::IsValidDataModel());

} // DataModel constructor
75 
76 //------------------------------------------------------------------------------------
77 
GetDataModelName() const78 string DataModel::GetDataModelName() const
79 {
80     // "true" argument gets long version of name
81     return ToString(GetModelType(),true);
82 }
83 
84 //------------------------------------------------------------------------------------
85 
GetDataModelShortName() const86 string DataModel::GetDataModelShortName() const
87 {
88     // "false" argument gets short version of name
89     return ToString(GetModelType(),false);
90 }
91 
92 //------------------------------------------------------------------------------------
93 
ChooseRandomRates(long nsites) const94 DoubleVec1d DataModel::ChooseRandomRates(long nsites) const
95 {
96     DoubleVec1d rates;
97     Random& rand = registry.GetRandom();
98     double rate = m_catrates[0];
99     //choose a random rate to start
100     double r = rand.Float();
101     for (long i = 0; i < m_ncategories; ++i)
102     {
103         if (r < m_catprobs[i]) rate = m_catrates[i];
104         else r -= m_catprobs[i];
105     }
106 
107     for (long site=0; site<nsites; site++)
108     {
109         if (rand.Float() <= m_acratio)
110         {
111             //choose a new rate.
112             r = rand.Float();
113             for (long i = 0; i < m_ncategories; ++i)
114             {
115                 if (r < m_catprobs[i]) rate = m_catrates[i];
116                 else r -= m_catprobs[i];
117             }
118         }
119         rates.push_back(rate);
120     }
121     return rates;
122 
123 } // ChooseRandomRate
124 
125 //------------------------------------------------------------------------------------
126 
#if 0 // Vestigial but possibly future code
// Would enable gamma-distributed rate categories:  replaces the current
// category rates with gamma quantile rates and equal probabilities.
// Disabled; retained in case gamma support is revived.
void DataModel::SetGamma(bool gam)
{
    usegamma = gam;
    if (usegamma)
    {
        // DEBUG debug warning WARNING
        // need to error check presence of gammashape and ncats > 1?
        SetCatRates(gamma_rates(gammashape,m_ncategories));
        DoubleVec1d newprobs(m_ncategories,1.0/m_ncategories);
        SetCatProbabilities(newprobs);
    }
}
#endif
141 
142 //------------------------------------------------------------------------------------
143 
IsValidDataModel() const144 bool DataModel::IsValidDataModel() const
145 {
146     if (m_nbins < 1) return false;
147     if (m_nmarkers < 1) return false;
148 
149     if (m_acratio < 0) return false;
150 
151     if (m_ncategories < 1) return false;
152     size_t ncats = m_ncategories;
153     if (m_catrates.size() != ncats) return false;
154     if (m_catprobs.size() != ncats) return false;
155 
156     size_t i;
157     double totalprob = 0.0;
158     for (i = 0; i < ncats; ++i)
159     {
160         if (m_catrates[i] < 0.0) return false;
161         if (m_catprobs[i] <= 0.0) return false;
162         if (m_catprobs[i] > 1.0) return false;
163         totalprob += m_catprobs[i];
164     }
165 
166     if (fabs(1.0 - totalprob) > EPSILON) return false;
167 
168     return true;
169 } // DataModel::IsValidDataModel
170 
171 //------------------------------------------------------------------------------------
172 
CreateDataModelReport() const173 StringVec1d DataModel::CreateDataModelReport() const
174 {
175 
176     StringVec1d report;
177     string line;
178 
179     if (m_ncategories > 1)
180     {
181         line = ToString(m_ncategories) + " rate categories with correlated length " +
182             ToString(1.0 / m_acratio);
183         report.push_back(line);
184         long cat;
185         for (cat = 0; cat < m_ncategories; ++cat)
186         {
187             line = "Relative rate " + ToString(m_catrates[cat]) + "  Frequency " +
188                 ToString(m_catprobs[cat]);
189             report.push_back(line);
190         }
191     }
192     if (m_relmurate != 1)
193     {
194         line = "The relative marker mutation rate for this model's segment was ";
195         line += ToString(m_relmurate) + ".";
196         report.push_back(line);
197     }
198 
199     return report;
200 } // CreateDataModelReport
201 
202 //------------------------------------------------------------------------------------
203 
// Serialize the shared data-model settings to XML lines, indented by
// "nspaces".  Emits:
//   <model name="...">
//     <normalize>...</normalize>
//     <categories>
//       <num-categories/rates/probabilities/autocorrelation>
//     </categories>
//     <relative-murate>...</relative-murate>
//   </model>
// Derived classes (e.g. NucModel::ToXML) insert their extra tags just
// before the closing </model> line, so the tag order here matters.
StringVec1d DataModel::ToXML(size_t nspaces) const
{
    StringVec1d xmllines;
    string line = MakeIndent(
        MakeTagWithName(xmlstr::XML_TAG_MODEL,GetDataModelShortName()),
        nspaces);
    xmllines.push_back(line);

    // one level in:  <normalize> and the <categories> opening tag
    nspaces += INDENT_DEPTH;
    string mytag(MakeTag(xmlstr::XML_TAG_NORMALIZE));
    line = MakeIndent(mytag,nspaces) + ToStringTF(ShouldNormalize()) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);
    line = MakeIndent(MakeTag(xmlstr::XML_TAG_CATEGORIES),nspaces);
    xmllines.push_back(line);

    // two levels in:  the contents of <categories>
    nspaces += INDENT_DEPTH;
    mytag = MakeTag(xmlstr::XML_TAG_NUM_CATEGORIES);
    line = MakeIndent(mytag,nspaces) + ToString(GetNcategories()) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);
    mytag = MakeTag(xmlstr::XML_TAG_RATES);
    line = MakeIndent(mytag,nspaces) + ToString(GetCatRates(),6) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);
    mytag = MakeTag(xmlstr::XML_TAG_PROBABILITIES);
    line = MakeIndent(mytag,nspaces) + ToString(GetCatProbabilities()) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);
    mytag = MakeTag(xmlstr::XML_TAG_AUTOCORRELATION);
    line = MakeIndent(mytag,nspaces) + ToString(GetUserAcratio()) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);
    nspaces -= INDENT_DEPTH;

    line = MakeIndent(MakeCloseTag(xmlstr::XML_TAG_CATEGORIES),nspaces);
    xmllines.push_back(line);

    // back at one level in:  <relative-murate>
    mytag = MakeTag(xmlstr::XML_TAG_RELATIVE_MURATE);
    line = MakeIndent(mytag,nspaces) + ToString(m_relmurate) +
        MakeCloseTag(mytag);
    xmllines.push_back(line);

    nspaces -= INDENT_DEPTH;
    line = MakeIndent(MakeCloseTag(xmlstr::XML_TAG_MODEL),nspaces);
    xmllines.push_back(line);

    return xmllines;
} // ToXML
253 
254 //------------------------------------------------------------------------------------
255 
ResetCatCells()256 void DataModel::ResetCatCells()
257 {
258     m_catcells.assign(m_ncategories,1.0);
259 } // DataModel::ResetCatCells
260 
261 //------------------------------------------------------------------------------------
262 
ScaleCatProbabilities()263 void DataModel::ScaleCatProbabilities()
264 {
265     long cat;
266     double totalprob = 0.0;
267     m_ncategories = m_catprobs.size();
268     for(cat = 0; cat < m_ncategories; cat++)
269     {
270         totalprob += m_catprobs[cat];
271     }
272     if (fabs(1.0 - totalprob) > EPSILON)
273     {
274         for(cat = 0; cat < m_ncategories; cat++)
275         {
276             m_catprobs[cat] = m_catprobs[cat] / totalprob;
277         }
278     }
279 } // DataModel::ScaleCatProbabilities
280 
281 //------------------------------------------------------------------------------------
282 
ScaleCatRates()283 void DataModel::ScaleCatRates()
284 {
285     // We renormalize the category rates to a weighted mean of 1.0
286     double meanrate = 0.0;
287     size_t rate;
288 
289     if (m_catrates.size() != m_catprobs.size())
290     {
291         string msg = "DataModel::ScaleCatRates() was called before ";
292         msg += "m_catrates and m_catprobs were properly initialized.";
293         throw implementation_error(msg);
294     }
295 
296     size_t numrates = m_catrates.size();
297 
298     for (rate = 0; rate < numrates; ++rate)
299     {
300         meanrate += m_catrates[rate] * m_catprobs[rate];
301     }
302 
303     for (rate = 0; rate < numrates; ++rate)
304     {
305         m_catrates[rate] /= meanrate;
306     }
307 
308 } // DataModel::ScaleCatRates
309 
310 //------------------------------------------------------------------------------------
311 
// Called when a subtree of zero data likelihood was found at position
// "posn".  If normalization is already on, the tree itself is considered
// bad and zero_dl_error is thrown so the caller can reject it.  If
// normalization was off, it is switched on and datalikenorm_error is
// thrown so the likelihood can be recomputed with normalization.
// This function always throws (for the listed model types).
void DataModel::TryToNormalizeAndThrow(long posn, model_type mtype)
{
    if (ShouldNormalize())
    {
        string errmsg("Encountered a subtree of zero likelihood at ");
        errmsg += "position " + ToString(posn) + ".";
        // The case labels below deliberately fall through to the single
        // throw at the bottom.
        switch (mtype)
        {
            case F84:
            case GTR:
            case KAllele:
                assert(false);
                //Encountering a tree of zero data likelihood on a tree with DNA
                // is almost certainly a programming error, though occasionally
                // one gets a tree so large that mispairing tips can be
                // catastrophic.  Likewise, a K-allele model should allow mutations
                // of a single step to have reasonable likelihoods even if the
                // microsats are very disparate.
                //
                // However, a *user* is much more likely to have such a data set,
                // (and hopefully we debug the program thoroughly before release)
                // so we fall through here, throw the 'bad tree' error, and try
                // a different tree.  --Lucian
            case MixedKS:
                // The MixedKS model has been seen to have problems here when it
                // is optimizing.
            case Brownian:
            case Stepwise:
                throw zero_dl_error(errmsg);
            // NOTE(review): there is no default case -- a model_type not
            // listed above would return without throwing.  Confirm that
            // every member of model_type is covered here.
        }
    }
    else
    {
        SetNormalize(true);
        throw datalikenorm_error("Datalikelihood normalization turned on.");
    }
}
349 
350 //------------------------------------------------------------------------------------
351 //------------------------------------------------------------------------------------
352 
// Constructor for nucleotide-based models.  Forwards the shared settings
// to DataModel (always using defaults::nucleotideBins bins), records the
// four base frequencies, and normalizes them to sum to 1.0.
//
// freqA..freqT        -- relative frequencies of the four bases
// calcFreqsFromData   -- if true, frequencies are recomputed from data
//                        elsewhere (affects ToXML's output)
// perBaseErrorRate    -- probability that an observed base is a
//                        sequencing error (used by DataToLikes)
NucModel::NucModel(long nmarkers,
                   long numCategories,
                   DoubleVec1d categoryRates,
                   DoubleVec1d categoryProbabilities,
                   double userAutoCorrelationValue,
                   bool doNormalize,
                   double relMuRate,
                   double freqA,
                   double freqC,
                   double freqG,
                   double freqT,
                   bool calcFreqsFromData,
                   double perBaseErrorRate)
    : DataModel(nmarkers,
                numCategories,
                categoryRates,
                categoryProbabilities,
                userAutoCorrelationValue,
                doNormalize,
                defaults::nucleotideBins,
                relMuRate),
      m_basefreqs(defaults::nucleotideBins),
      m_freqsfromdata(calcFreqsFromData),
      m_perBaseErrorRate(perBaseErrorRate)
{
    // Store the frequencies in bin order, then rescale so they sum to 1.
    m_basefreqs[baseA] = freqA;
    m_basefreqs[baseC] = freqC;
    m_basefreqs[baseG] = freqG;
    m_basefreqs[baseT] = freqT;
    NormalizeBaseFrequencies();
    // Qualified call:  derived-class overrides are not used during
    // base-class construction.
    assert(NucModel::IsValidDataModel());
} // NucModel constructor
385 
386 //------------------------------------------------------------------------------------
387 
NormalizeBaseFrequencies()388 void NucModel::NormalizeBaseFrequencies()
389 {
390     double sumOfFreqs = accumulate(m_basefreqs.begin(),m_basefreqs.end(),0.0);
391     transform(m_basefreqs.begin(), m_basefreqs.end(),
392               m_basefreqs.begin(),
393               bind2nd(divides<double>(),sumOfFreqs));
394 }
395 
396 //------------------------------------------------------------------------------------
397 
IsValidDataModel() const398 bool NucModel::IsValidDataModel() const
399 {
400     size_t index;
401     double totalfreqs = 0.0;
402     if(m_basefreqs.size() != 4) return false;
403     for (index = 0; index < m_basefreqs.size(); index++)
404     {
405         double thisFreq = m_basefreqs[index];
406         if(thisFreq <= 0.0) return false;
407         if(thisFreq >= 1.0) return false;
408         totalfreqs += thisFreq;
409     }
410     if (fabs(1.0 - totalfreqs) > EPSILON) return false;
411     return DataModel::IsValidDataModel();
412 } // NucModel::ValidBaseFrequencies
413 
414 //------------------------------------------------------------------------------------
415 
DataToLikes(const string & datum,long) const416 vector<double> NucModel::DataToLikes(const string& datum, long) const
417 {
418     return NucModel::StaticDataToLikes(datum, GetPerBaseErrorRate());
419 }
420 
421 //------------------------------------------------------------------------------------
422 // This function implements the standard ambiguity codes for nucleotide data,
423 // returning a vector of four doubles indicating the likelihood for A, C, G, T in that order.
424 
StaticDataToLikes(const string & datum,double perBaseErrorRate)425 vector<double> NucModel::StaticDataToLikes(const string& datum, double perBaseErrorRate)
426 {
427     // We assume this is a single nucleotide base, passed as a string only for generality
428     assert(datum.size() == 1);
429 
430     vector<double> likes(BASES, 0.0);  // initialize to zero
431     char nucleotide = datum[0];
432 
433     // resolve code
434     switch(nucleotide)
435     {
436         case 'A':
437             likes[baseA] = 1.0;
438             break;
439 
440         case 'C':
441             likes[baseC] = 1.0;
442             break;
443 
444         case 'G':
445             likes[baseG] = 1.0;
446             break;
447 
448         case 'T':
449         case 'U':
450             likes[baseT] = 1.0;
451         break;
452 
453         case 'M':
454             likes[baseA] = 1.0;
455             likes[baseC] = 1.0;
456             break;
457 
458         case 'R':
459             likes[baseA] = 1.0;
460             likes[baseG] = 1.0;
461             break;
462 
463         case 'W':
464             likes[baseA] = 1.0;
465             likes[baseT] = 1.0;
466             break;
467 
468         case 'S':
469             likes[baseC] = 1.0;
470             likes[baseG] = 1.0;
471             break;
472 
473         case 'Y':
474             likes[baseC] = 1.0;
475             likes[baseT] = 1.0;
476             break;
477 
478         case 'K':
479             likes[baseG] = 1.0;
480             likes[baseT] = 1.0;
481             break;
482 
483         case 'V':
484             likes[baseA] = 1.0;
485             likes[baseC] = 1.0;
486             likes[baseG] = 1.0;
487             break;
488 
489         case 'H':
490             likes[baseA] = 1.0;
491             likes[baseC] = 1.0;
492             likes[baseT] = 1.0;
493             break;
494 
495         case 'D':
496             likes[baseA] = 1.0;
497             likes[baseG] = 1.0;
498             likes[baseT] = 1.0;
499             break;
500 
501         case 'B':
502             likes[baseC] = 1.0;
503             likes[baseG] = 1.0;
504             likes[baseT] = 1.0;
505             break;
506 
507         case 'N':
508         case 'O':
509         case 'X':
510         case '?':
511         case '-':
512             likes[baseA] = 1.0;
513         likes[baseC] = 1.0;
514         likes[baseG] = 1.0;
515         likes[baseT] = 1.0;
516         break;
517 
518         default:
519             assert(false);    // how did an unknown nucleotide get past proofreading?
520             likes[baseA] = 1.0;
521             likes[baseC] = 1.0;
522             likes[baseG] = 1.0;
523             likes[baseT] = 1.0;
524             break;
525     }
526 
527     long   num_ones  = (long)(likes[baseA] + likes[baseC] + likes[baseG] + likes[baseT]);
528     double new_0     = (double)num_ones * perBaseErrorRate / 3.0;
529     double new_1     = 1.0 - (double)(4-num_ones) * perBaseErrorRate / 3.0;
530     likes[baseA] = (likes[baseA] > 0.5) ? new_1 : new_0;
531     likes[baseC] = (likes[baseC] > 0.5) ? new_1 : new_0;
532     likes[baseG] = (likes[baseG] > 0.5) ? new_1 : new_0;
533     likes[baseT] = (likes[baseT] > 0.5) ? new_1 : new_0;
534 
535     return(likes);
536 
537 } // DataToLikes
538 
539 //------------------------------------------------------------------------------------
540 
CreateDataModelReport() const541 StringVec1d NucModel::CreateDataModelReport() const
542 {
543     StringVec1d report = DataModel::CreateDataModelReport();
544 
545     string line = "Base frequencies: " + ToString(m_basefreqs[baseA]) + ", ";
546     line += ToString(m_basefreqs[baseC]) + ", ";
547     line += ToString(m_basefreqs[baseG]) + ", ";
548     line += ToString(m_basefreqs[baseT]);
549     report.push_back(line);
550 
551     return report;
552 
553 } // NucModel::CreateDataModelReport
554 
555 //------------------------------------------------------------------------------------
556 
// Extend DataModel::ToXML with the nucleotide-specific tags.  The base
// class produced a complete <model>...</model> block, so the extra tags
// (<base-freqs> and <per-base-error-rate>) are spliced in just before
// the final closing </model> line.
StringVec1d NucModel::ToXML(size_t nspaces) const
{
    StringVec1d xmllines(DataModel::ToXML(nspaces));

    nspaces += INDENT_DEPTH;

    // <base-freqs>:  either the literal frequencies or the marker that
    // says they were calculated from the data.
    string line(MakeIndent(MakeTag(xmlstr::XML_TAG_BASE_FREQS),nspaces));
    if (m_freqsfromdata)
        line += " " + xmlstr::XML_TAG_CALCULATED + " ";
    else
        line += ToString(m_basefreqs,6);
    line += MakeCloseTag(xmlstr::XML_TAG_BASE_FREQS);
    // insert before the last line (the closing </model> tag)
    StringVec1d::iterator endtag = --xmllines.end();
    xmllines.insert(endtag,line);

    string line2(MakeIndent(MakeTag(xmlstr::XML_TAG_PER_BASE_ERROR_RATE),nspaces));
    line2 += ToString(m_perBaseErrorRate);
    line2 += MakeCloseTag(xmlstr::XML_TAG_PER_BASE_ERROR_RATE);
    // recompute the iterator:  the previous insert invalidated it
    endtag = --xmllines.end();
    xmllines.insert(endtag,line2);

    // nspaces is local, so this final decrement has no external effect;
    // kept for symmetry with the increment above.
    nspaces -= INDENT_DEPTH;
    return xmllines;

} // NucModel::ToXML
582 
583 //------------------------------------------------------------------------------------
584 
ChooseAncestralState(long marker)585 DoubleVec1d NucModel::ChooseAncestralState(long marker)
586 {
587     DoubleVec1d result(BASES, 0.0);
588     double r = registry.GetRandom().Float();
589     long i;
590     for (i = 0; i < BASES; ++i)
591     {
592         if (r < m_basefreqs[i])
593         {
594             result[i] = 1.0;
595             return result;
596         }
597         else
598         {
599             r -= m_basefreqs[i];
600         }
601     }
602 
603     // this code could be reached due to rounding errors
604     result[0] = 1.0;
605     return result;
606 } // ChooseAncestralState
607 
608 //------------------------------------------------------------------------------------
609 
CellToData(Cell_ptr cell,long marker) const610 string NucModel::CellToData(Cell_ptr cell, long marker) const
611 {
612     LongVec1d ones = cell->GetOnes(marker);
613     if (ones.size() == 1)
614     {
615         switch (ones[0])
616         {
617             case 0:
618                 return "A";
619             case 1:
620                 return "C";
621             case 2:
622                 return "G";
623             case 3:
624                 return "T";
625             default:
626                 throw data_error("Tried to convert " + ToString(ones[0]+1)
627                                  + " to a nucleotide, but there should only be four bins.");
628         }
629     }
630     throw implementation_error("Cannot convert nucleotide data from the internal format"
631                                " that is not simply one of the four bases.");
632 }
633 
634 //------------------------------------------------------------------------------------
635 //------------------------------------------------------------------------------------
636 
// Constructor for the F84 (Felsenstein 1984) nucleotide model.  Forwards
// the shared settings to NucModel, stores the transition/transversion
// ratio, initializes the cached likelihood-computation state to safe
// defaults, and calls Finalize() to build the derived quantities and
// scratch buffers.
F84Model::F84Model(
    long nmarkers,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate,
    double freqA,
    double freqC,
    double freqG,
    double freqT,
    double ttRatio,
    bool calculateFreqsFromData,
    double perBaseErrorRate)
    : NucModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        relMuRate,
        freqA,
        freqC,
        freqG,
        freqT,
        calculateFreqsFromData,
        perBaseErrorRate),
      m_ttratio(ttRatio),
      // set some potentially useful defaults
      computed1(true), computed2(true),
      freqar(FLAGDOUBLE), freqcy(FLAGDOUBLE),
      freqgr(FLAGDOUBLE), freqty(FLAGDOUBLE),
      // raw buffers stay NULL until Finalize()/AllocateBuffers() runs
      basefreqarray(NULL),
      daughter1(NULL), daughter2(NULL), target(NULL)
{
    // The data-likelihood buffer vectors such as expA1 are default-constructed
    // to an empty state, which is fine.
    assert(F84Model::IsValidDataModel());
    //cout << "*****new F84 model created*****" << this << endl;
    Finalize();

} // F84Model::F84Model
681 
682 //------------------------------------------------------------------------------------
683 
EmptyBuffers()684 void F84Model::EmptyBuffers()
685 {
686     if (basefreqarray)
687     {
688         delete [] basefreqarray[0];
689         delete [] basefreqarray;
690     }
691     if (daughter1)
692     {
693         delete [] daughter1[0];
694         delete [] daughter1;
695     }
696     if (daughter2)
697     {
698         delete [] daughter2[0];
699         delete [] daughter2;
700     }
701     if (target)
702     {
703         delete [] target[0];
704         delete [] target;
705     }
706 
707 } // F84Model::EmptyBuffers
708 
709 //------------------------------------------------------------------------------------
710 
Clone() const711 DataModel* F84Model::Clone() const
712 {
713     DataModel* newmodel = new F84Model(*this);
714     return newmodel;
715 }
716 
717 //------------------------------------------------------------------------------------
718 
// Allocate the per-category scratch buffers used during likelihood
// calculation.  Each buffer is laid out as a single contiguous block of
// m_ncategories*BASES doubles, owned by element [0], with the remaining
// row pointers aimed into that block -- so EmptyBuffers() need only
// delete [x][0] and [x].  basefreqarray rows are seeded from the
// current m_basefreqs.
void F84Model::AllocateBuffers()
{
    basefreqarray   = new double* [m_ncategories];
    daughter1 = new double* [m_ncategories];
    daughter2 = new double* [m_ncategories];
    target = new double* [m_ncategories];
    // one contiguous block per buffer, owned by row 0
    basefreqarray[0]   = new double [m_ncategories*BASES];
    daughter1[0] = new double [m_ncategories*BASES];
    daughter2[0] = new double [m_ncategories*BASES];
    target[0] = new double [m_ncategories*BASES];
    long cat;

    for (cat = 0; cat < m_ncategories; ++cat)
    {
        // aim each row pointer at its slice of the contiguous block
        // (cat == 0 harmlessly reassigns row 0 to itself)
        basefreqarray[cat]   = basefreqarray[0] + cat*BASES;
        daughter1[cat] = daughter1[0] + cat*BASES;
        daughter2[cat] = daughter2[0] + cat*BASES;
        target[cat] = target[0] + cat*BASES;
        basefreqarray[cat][baseA] = m_basefreqs[baseA];
        basefreqarray[cat][baseC] = m_basefreqs[baseC];
        basefreqarray[cat][baseG] = m_basefreqs[baseG];
        basefreqarray[cat][baseT] = m_basefreqs[baseT];
    }

} // F84Model::AllocateBuffers
744 
745 //------------------------------------------------------------------------------------
746 
// Destructor:  releases the raw likelihood buffers.
F84Model::~F84Model()
{
    // if the array does not exist we suppose that the object is
    // not Finalized, and don't try any deallocations.
    // (EmptyBuffers checks each pointer before deleting.)
    EmptyBuffers();

} // F84Model::~F84Model
754 
755 //------------------------------------------------------------------------------------
756 
CopyMembers(const F84Model & src)757 void F84Model::CopyMembers(const F84Model& src)
758 {
759     m_ttratio   = src.m_ttratio;
760 
761     freqar    = src.freqar;
762     freqcy    = src.freqcy;
763     freqgr    = src.freqgr;
764     freqty    = src.freqty;
765 
766     computed1 = src.computed1;
767     computed2 = src.computed2;
768 
769     // don't copy the data likelihood buffers
770 }
771 
772 //------------------------------------------------------------------------------------
773 
IsValidDataModel() const774 bool F84Model::IsValidDataModel() const
775 {
776     if (m_ttratio <= 0.5) return false;
777     return NucModel::IsValidDataModel();
778 
779 } // F84Model::IsValidDataModel
780 
781 //------------------------------------------------------------------------------------
782 
// Copy the likelihood-computation buffers from "src":  the vector
// members are copied directly; the raw arrays are freed, reallocated at
// this object's current m_ncategories, and then memcpy'd from src where
// src has them.  basefreqarray's contents are not copied because
// AllocateBuffers() re-seeds it from m_basefreqs.
void F84Model::CopyBuffers(const F84Model& src)
{
    xcatrates = src.xcatrates;
    ycatrates = src.ycatrates;
    expA1     = src.expA1;
    expB1     = src.expB1;
    expC1     = src.expC1;
    expA2     = src.expA2;
    expB2     = src.expB2;
    expC2     = src.expC2;
    catlikes = src.catlikes;

    // drop our old buffers, then rebuild at the current category count
    EmptyBuffers();
    AllocateBuffers();

    // Do not try to copy the buffers if they are unallocated
    if (src.daughter1)
        memcpy(daughter1[0],src.daughter1[0],m_ncategories*BASES*sizeof(double));
    if (src.daughter2)
        memcpy(daughter2[0],src.daughter2[0],m_ncategories*BASES*sizeof(double));
    if (src.target)
        memcpy(target[0],src.target[0],m_ncategories*BASES*sizeof(double));

} // F84Model::CopyBuffers
807 
808 //------------------------------------------------------------------------------------
809 
// Copy constructor:  copies the NucModel base, then the scalar members
// and the likelihood buffers.  The raw pointers are first initialized to
// NULL so CopyBuffers()'s EmptyBuffers() call is safe on the fresh object.
F84Model::F84Model(const F84Model& src)
    : NucModel(src), basefreqarray(NULL), daughter1(NULL), daughter2(NULL),
      target(NULL)
{
    //cout << "*****F84 model copied*****" << this << endl;

    CopyMembers(src);
    CopyBuffers(src);
}
819 
820 //------------------------------------------------------------------------------------
821 
// Assignment operator:  copies the NucModel base, the scalar members,
// and the likelihood buffers.
// BUGFIX:  added a self-assignment guard.  Without it, "a = a" called
// CopyBuffers(a), whose EmptyBuffers() freed the buffers and whose
// AllocateBuffers() replaced them with fresh (unseeded) storage before
// the memcpy read from that same storage -- corrupting the buffers.
F84Model& F84Model::operator=(const F84Model& src)
{
    if (this != &src)
    {
        NucModel::operator=(src);
        CopyMembers(src);
        CopyBuffers(src);
    }
    return *this;
}
829 
830 //------------------------------------------------------------------------------------
831 
SetTTratio(double tr)832 void F84Model::SetTTratio(double tr)
833 {
834     if (tr <= 0.5)
835     {
836         data_error e("Transition/transversion ratio must be > 0.5");
837         throw e;
838     }
839     m_ttratio = tr;
840 }
841 
842 //------------------------------------------------------------------------------------
843 
// Precompute the F84 derived quantities and allocate the scratch
// buffers.  Must be called once (the constructor does so) before any
// likelihood computation; the math follows Felsenstein's DNAMLK.
void F84Model::Finalize()
{
    double pur, pyr, ag, ct, m, n, x, y;
    long cat;

    double freqa(m_basefreqs[baseA]), freqc(m_basefreqs[baseC]),
        freqg(m_basefreqs[baseG]), freqt(m_basefreqs[baseT]);

    // purine/pyrimidine totals and within-class conditional frequencies
    pur    = freqa + freqg;
    pyr    = freqc + freqt;
    freqar = freqa / pur;
    freqcy = freqc / pyr;
    freqgr = freqg / pur;
    freqty = freqt / pyr;

    // split overall change into transition (y) and general (x) parts,
    // driven by the transition/transversion ratio
    ag = freqa * freqg;
    ct = freqc * freqt;
    m  = m_ttratio * pur * pyr - (ag + ct);
    n  = ag / pur + ct / pyr;

    // code stolen from DNAMLK here
    y = m / (m + n);
    // if globalfreqs has failed....
    if (y < 0) y = 0;
    else if (y > 1) y = 1;
    x = 1.0 - y;
    // fracchange is the expected fraction of sites changed per unit
    // branch length; dividing by -fracchange both rescales time and
    // makes x, y negative so that exp(length * xcatrates[cat]) decays.
    double fracchange = y * (2.0 * freqa * freqgr + 2.0 * freqc * freqty)
        + x * (1.0 - freqa * freqa - freqc * freqc - freqg * freqg - freqt * freqt);
    y /= - fracchange;
    x /= - fracchange;

    // per-category exponent coefficients (rate-scaled)
    for (cat = 0; cat < m_ncategories; ++cat)
    {
        xcatrates.push_back(x*m_catrates[cat]);
        ycatrates.push_back(y*m_catrates[cat]);
    }

    // Allocate additional space for likelihood calculations.
    expA1.insert(expA1.begin(),m_ncategories,0.0);
    expA2.insert(expA2.begin(),m_ncategories,0.0);
    expB1.insert(expB1.begin(),m_ncategories,0.0);
    expB2.insert(expB2.begin(),m_ncategories,0.0);
    expC1.insert(expC1.begin(),m_ncategories,0.0);
    expC2.insert(expC2.begin(),m_ncategories,0.0);

    AllocateBuffers();

    // per-marker, per-category likelihood workspace, zero-initialized
    double zero = 0.0;
    catlikes = CreateVec2d(m_nmarkers,m_ncategories,zero);

} // F84Model::Finalize
895 
896 //------------------------------------------------------------------------------------
897 
// Precompute the per-category exponential terms (expA1/expB1/expC1) for
// the first daughter branch of length "length1".  A length of FLAGDOUBLE
// (or less) marks the branch as absent:  computed1 is cleared and the
// terms are left untouched.  The branch length is scaled by this
// segment's relative mutation rate before use.
void F84Model::RescaleLength1(double length1)
{
    long cat;
    double n;

    if (length1 > FLAGDOUBLE)
    {
#ifdef DEBUG_VARIABLES
        if (printdbgvar > 0)
        {
            cout << "RescaleLength1 raw length1: " << length1 << endl;
            //cout << " m_relmurate: " << m_relmurate << endl;
        }
#endif
        length1 *= m_relmurate;
        computed1 = true;

        for(cat = 0; cat < m_ncategories; cat++)
        {
            // xcatrates/ycatrates are negative (see Finalize), so these
            // exponentials decay with branch length.
            n = exp(length1 * xcatrates[cat]);
            expA1[cat] = 1.0 - n;
            expB1[cat] = n * exp(length1 * ycatrates[cat]);
            expC1[cat] = n - expB1[cat];
#ifdef DEBUG_VARIABLES
            if (0)
            //if (printdbgvar > 0)
            {
                cout << " cat: " << cat << endl;
                cout << " n: " << n << endl;
                cout << " xcatrates: " << xcatrates[cat] << endl;
                cout << " ycatrates: " << ycatrates[cat] << endl;
                cout << " expA1: "<< expA1[cat] << endl;
                cout << " expB1: "<< expB1[cat] << endl;
                cout << " expC1: "<< expC1[cat] << endl;
            }
#endif
        }
    }
    else
    {
        computed1 = false;
    }
#ifdef DEBUG_VARIABLES
    if (printdbgvar > 0)
    {
        cout << " scaled length1: " << length1 << " computed1: "<< computed1 << endl << endl;
    }
#endif

} // F84Model::RescaleLength1
948 
949 //------------------------------------------------------------------------------------
950 
RescaleLength2(double length2)951 void F84Model::RescaleLength2(double length2)
952 {
953     long cat;
954     double n;
955 
956     if (length2 > FLAGDOUBLE)
957     {
958 #ifdef DEBUG_VARIABLES
959         if (printdbgvar > 0)
960         {
961             cout << "RescaleLength2 raw length2: " << length2 << endl;
962             //cout << " m_relmurate: " << m_relmurate << endl;
963         }
964 #endif
965         length2 *= m_relmurate;
966         computed2 = true;
967 
968         for(cat = 0; cat < m_ncategories; cat++)
969         {
970             n = exp(length2 * xcatrates[cat]);
971             expA2[cat] = 1.0 - n;
972             expB2[cat] = n * exp(length2 * ycatrates[cat]);
973             expC2[cat] = n - expB2[cat];
974 #ifdef DEBUG_VARIABLES
975             if (0)
976             //if (printdbgvar > 0)
977             {
978                 cout << " cat: " << cat << endl;
979                 cout << " n: " << n << endl;
980                 cout << " xcatrates: " << xcatrates[cat] << endl;
981                 cout << " ycatrates: " << ycatrates[cat] << endl;
982                 cout << " expA2: "<< expA2[cat] << endl;
983                 cout << " expB2: "<< expB2[cat] << endl;
984                 cout << " expC2: "<< expC2[cat] << endl;
985             }
986 #endif
987         }
988     }
989     else
990     {
991         computed2 = false;
992     }
993 #ifdef DEBUG_VARIABLES
994     if (printdbgvar > 0)
995     {
996         cout << " scaled length2: " << length2 << " computed2: "<< computed2 << endl << endl;
997     }
998 #endif
999 }
1000 
1001 //------------------------------------------------------------------------------------
1002 
// Precompute the per-category F84 probability terms for both daughter
// branches at once; see RescaleLength1()/RescaleLength2() for details
// (including the FLAGDOUBLE one-legged-coalescence convention).
void F84Model::RescaleLengths(double length1, double length2)
{
    RescaleLength1(length1);
    RescaleLength2(length2);

} // F84Model::RescaleLengths
1009 
1010 //------------------------------------------------------------------------------------
1011 
// Combine the site data likelihoods of two daughter branches into the
// likelihood array for their parent node under the F84 model.
//
// RescaleLengths() must have been called first: it fills expA/expB/expC
// for each branch and sets computed1/computed2.  A branch flagged as
// not computed (one-legged coalescence) contributes the equilibrium
// base frequencies instead of a propagated likelihood.
//
// Returns the member buffer "target", an [category][base] array that is
// overwritten on every call; callers must copy it to keep it.
double** F84Model::ComputeSiteDLs(double** siteDLs1, double** siteDLs2)
{
    double sumAll, sumPur, sumPyr;

    long cat;
#ifdef DEBUG_VARIABLES
    if (0)
    //if (printdbgvar > 0)
    {
        cout << "basefreqA: "<< m_basefreqs[baseA] << endl;
        cout << "basefreqC: "<< m_basefreqs[baseC] << endl;
        cout << "basefreqG: "<< m_basefreqs[baseG] << endl;
        cout << "basefreqT: "<< m_basefreqs[baseT] << endl;
        if (computed1)
            cout << "computed1: true"<< endl;
        else
            cout << "computed1: false"<< endl;
        if (computed2)
            cout << "computed2: true"<< endl;
        else
            cout << "computed2: false"<< endl;
    }
#endif

    if (computed1)
    {
        for (cat = 0; cat < m_ncategories; ++cat)
        {
            double* catDLs1 = siteDLs1[cat];
            // frequency-weighted sum over all four bases ("any base" term)
            sumAll  = expA1[cat] *
                (m_basefreqs[baseA]*catDLs1[baseA] +
                 m_basefreqs[baseC]*catDLs1[baseC] +
                 m_basefreqs[baseG]*catDLs1[baseG] +
                 m_basefreqs[baseT]*catDLs1[baseT]);

            // within-purine and within-pyrimidine pool sums
            // (freqar etc. are the base frequencies normalized within
            // their purine/pyrimidine pool; see Finalize())
            sumPur  = freqar*catDLs1[baseA] + freqgr*catDLs1[baseG];
            sumPyr  = freqcy*catDLs1[baseC] + freqty*catDLs1[baseT];

#ifdef DEBUG_VARIABLES
            if (printdbgvar > 0)
            {
                //cout<< "cat: " << cat << endl;
                //cout<< "sumA1: " << sumAll << endl;
                //cout<< "freqar: " << freqar << endl;
                //cout<< "freqcy: " << freqcy << endl;
                //cout<< "freqgr: " << freqgr << endl;
                //cout<< "freqty: " << freqty << endl;
                //cout<< "sumPur1: " << sumPur << endl;
                //cout<< "sumPyr1: " << sumPyr << endl;
                cout<< "dls1: " << catDLs1[baseA] << " "<< catDLs1[baseC] << " "<< catDLs1[baseG] << " "<< catDLs1[baseT]  << endl;
                //cout<< "dls1A: " << catDLs1[baseA] << endl;
                //cout<< "dls1C: " << catDLs1[baseC] << endl;
                //cout<< "dls1G: " << catDLs1[baseG] << endl;
                //cout<< "dls1T: " << catDLs1[baseT] << endl;
                //cout<< "expA1: " << expA1[cat] <<endl;
                //cout<< "expB1: " << expB1[cat] <<endl;
                //cout<< "expC1: " << expC1[cat] <<endl<<endl;
            }
#endif
            // propagate daughter 1's likelihoods up its branch:
            // "any base" term + "no change" term + transition-pool term
            double expC1cat = expC1[cat];
            daughter1[cat][baseA] = sumAll + expB1[cat]*catDLs1[baseA] +
                expC1cat*sumPur;
            daughter1[cat][baseC] = sumAll + expB1[cat]*catDLs1[baseC] +
                expC1cat*sumPyr;
            daughter1[cat][baseG] = sumAll + expB1[cat]*catDLs1[baseG] +
                expC1cat*sumPur;
            daughter1[cat][baseT] = sumAll + expB1[cat]*catDLs1[baseT] +
                expC1cat*sumPyr;
        }
    }
    else
    {
        // one-legged coalescence: this branch contributes only the
        // equilibrium base frequencies (all categories at once)
        memcpy(daughter1[0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    if (computed2)
    {
        for (cat = 0; cat < m_ncategories; cat++)
        {
            double* catDLs2 = siteDLs2[cat];
            // same computation as above, for daughter branch 2
            sumAll  = expA2[cat] *
                (m_basefreqs[baseA]*catDLs2[baseA] +
                 m_basefreqs[baseC]*catDLs2[baseC] +
                 m_basefreqs[baseG]*catDLs2[baseG] +
                 m_basefreqs[baseT]*catDLs2[baseT]);

            sumPur  = freqar*catDLs2[baseA] + freqgr*catDLs2[baseG];
            sumPyr  = freqcy*catDLs2[baseC] + freqty*catDLs2[baseT];

#ifdef DEBUG_VARIABLES
            if (printdbgvar > 0)
            {
                //cout<< "cat: " << cat << endl;
                //cout<< "sumA2: " << sumAll << endl;// JRM debug
                //cout<< "sumPur2: " << sumPur << endl;
                //cout<< "sumPyr2: " << sumPyr << endl;
                //cout<< "dls2A: " << catDLs2[baseA] << endl;
                //cout<< "dls2C: " << catDLs2[baseC] << endl;
                //cout<< "dls2G: " << catDLs2[baseG] << endl;
                //cout<< "dls2T: " << catDLs2[baseT] << endl;
                //cout<< "expA2: " << expA2[cat] <<endl;
                //cout<< "expB2: " << expB2[cat] <<endl;
                //cout<< "expC2: " << expC2[cat] <<endl<<endl;
                cout<< "dls2: " << catDLs2[baseA] << " "<< catDLs2[baseC] << " "<< catDLs2[baseG] << " "<< catDLs2[baseT]  << endl;
           }
#endif

            double expC2cat = expC2[cat];
            daughter2[cat][baseA] = sumAll + expB2[cat]*catDLs2[baseA] +
                expC2cat*sumPur;
            daughter2[cat][baseC] = sumAll + expB2[cat]*catDLs2[baseC] +
                expC2cat*sumPyr;
            daughter2[cat][baseG] = sumAll + expB2[cat]*catDLs2[baseG] +
                expC2cat*sumPur;
            daughter2[cat][baseT] = sumAll + expB2[cat]*catDLs2[baseT] +
                expC2cat*sumPyr;
        }
    }
    else
    {
        memcpy(daughter2[0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    // the parent's likelihood is the product of the two branch contributions
    for (cat = 0; cat < m_ncategories; cat++)
    {
        target[cat][baseA] = daughter1[cat][baseA] *
            daughter2[cat][baseA];
        target[cat][baseC] = daughter1[cat][baseC] *
            daughter2[cat][baseC];
        target[cat][baseG] = daughter1[cat][baseG] *
            daughter2[cat][baseG];
        target[cat][baseT] = daughter1[cat][baseT] *
            daughter2[cat][baseT];
#ifdef DEBUG_VARIABLES
        if (printdbgvar > 0)
        {
            //cout<< "cat: " << cat << endl;
            //cout<< "targetA: " << target[cat][baseA] <<endl;
            //cout<< "targetC: " << target[cat][baseC] <<endl;
            //cout<< "targetG: " << target[cat][baseG] <<endl;
            //cout<< "targetT: " << target[cat][baseT] <<endl<<endl;
                cout<< "target: " << target[cat][baseA] << " "<< target[cat][baseC] << " "<< target[cat][baseG] << " "<< target[cat][baseT]  << endl;
        }
#endif
    }

    return target;

} // F84Model::ComputeSiteDLs
1161 
1162 //------------------------------------------------------------------------------------
1163 
// Compute the log data likelihood of a subtree, summing over the markers
// from startmarker up to (not including) endmarker, whose first site
// position is "posn".  "rootdls" supplies marker traversal
// (GetNextMarker) and the per-site normalization constants (GetNorms).
//
// Side effect: when more than one rate category is in use, fills
// catlikes[posn][cat] with per-category likelihood ratios, which
// ComputeCatDL() then folds into the total.
//
// Throws (via TryToNormalizeAndThrow) if any site's likelihood is
// exactly zero.
double F84Model::ComputeSubtreeDL(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat;
    DoubleVec1d prior(m_ncategories);
    long firstposn = posn;              // remembered for the ComputeCatDL() pass
    int im = 0;                         // marker counter, used only by the debug dump below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;

        for (cat = 0; cat < m_ncategories; cat++)
        {
            // frequency-weighted likelihood of this marker in this category
            prior[cat] = m_basefreqs[baseA]*marker[cat][baseA] +
                m_basefreqs[baseC]*marker[cat][baseC] +
                m_basefreqs[baseG]*marker[cat][baseG] +
                m_basefreqs[baseT]*marker[cat][baseT];

            // mix over categories with their prior probabilities
            subtotal += m_catprobs[cat] * prior[cat];
#if 0
            cout << "marker " << im
                 << " cat " << cat
                 << " probs " << marker[cat][baseA]
                 << " " << marker[cat][baseC]
                 << " " << marker[cat][baseG]
                 << " " << marker[cat][baseT]
                 << " prior " << prior[cat]
                 << endl;
#endif
        }
        im++;

        // a site likelihood of exactly zero is unrecoverable here
        if (!subtotal)
        {
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        if (ShouldNormalize())
        {
            // add back the log normalization constant factored out earlier
            total += (log(subtotal) + rootdls.GetNorms(posn));
        }
        else
        {
            total +=  log(subtotal);
        }

        // de-normalization not needed here, since we are only interested in
        // the ratio
        if (m_ncategories > 1)
        {
            for (cat = 0; cat < m_ncategories; cat++)
                catlikes[posn][cat] = prior[cat]/subtotal;
        }
        ++posn;
    }

    // correction term for (auto-correlated) rate categories
    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    //    cout << "total: " << total << endl;
    return total;

} // F84Model::ComputeSubtreeDL
1228 
1229 //------------------------------------------------------------------------------------
1230 
// Compute the log-likelihood correction for multiple rate categories
// over markers [startmarker, endmarker), using the per-marker category
// likelihood ratios that ComputeSubtreeDL() stored in catlikes.
//
// This is a recurrence along the sequence: "like" carries the running
// per-category values from marker to marker.  m_acratio and m_ucratio
// presumably weight the category-change vs. category-stay paths of the
// autocorrelation model (set elsewhere; confirm against model setup).
//
// Side effect: m_catcells is updated to the final per-category state
// for long-term storage.
double F84Model::ComputeCatDL(long startmarker, long endmarker)
{
    double subtotal;
    long marker, cat;

    DoubleVec1d like = m_catcells;      // start from the stored state
    DoubleVec1d nulike(m_ncategories);

    for (marker = startmarker; marker != endmarker; ++marker)
    {
        // mixture over category priors, scaled by m_acratio
        subtotal = 0.0;
        for (cat = 0; cat < m_ncategories; cat++)
            subtotal += m_catprobs[cat] * like[cat];

        subtotal *= m_acratio;

        // combine the mixed term with the same-category term (m_ucratio)
        // and this marker's category likelihood ratio
        for (cat = 0; cat < m_ncategories; cat++)
            nulike[cat] = catlikes[marker][cat] *
                (subtotal + m_ucratio * like[cat]);

        // the following puts the nulike values into like.  It
        // also puts the like values into nulike, but we will not
        // be using values from nulike so we don't care.
        like.swap(nulike);
    }

    // final mixture over category priors
    subtotal = 0.0;
    for (cat = 0; cat < m_ncategories; cat++)
        subtotal += m_catprobs[cat] * like[cat];

    // the following puts the like values into catcells for
    // long-term storage.  It also puts the catcells values into
    // like, but we don't care.
    m_catcells.swap(like);

    return log(subtotal);

} // F84Model::ComputeCatDL
1269 
1270 //------------------------------------------------------------------------------------
1271 
CreateDataModelReport() const1272 StringVec1d F84Model::CreateDataModelReport() const
1273 {
1274     StringVec1d report = NucModel::CreateDataModelReport();
1275 
1276     string line = "Transition/transversion ratio: " + ToString(m_ttratio);
1277     report.push_back(line);
1278 
1279     return report;
1280 
1281 } // F84Model::CreateDataModelReport
1282 
1283 //------------------------------------------------------------------------------------
1284 
ToXML(size_t nspaces) const1285 StringVec1d F84Model::ToXML(size_t nspaces) const
1286 {
1287     StringVec1d xmllines(NucModel::ToXML(nspaces));
1288 
1289     nspaces += INDENT_DEPTH;
1290     string line(MakeIndent(MakeTag(xmlstr::XML_TAG_TTRATIO),nspaces));
1291     line += ToString(GetTTratio());
1292     line += MakeCloseTag(xmlstr::XML_TAG_TTRATIO);
1293     nspaces -= INDENT_DEPTH;
1294 
1295     StringVec1d::iterator endtag = --xmllines.end();
1296     xmllines.insert(endtag,line);
1297 
1298     return xmllines;
1299 
1300 } // F84Model::ToXML
1301 
1302 //------------------------------------------------------------------------------------
1303 
1304 // This routine simulates data for a single node under the F84
1305 // model.  It assumes that the given branch lengths have already
1306 // been rescaled for the rate category desired.
1307 // Nomenclature follows _Inferring Phylogenies_ pp. 202-203,
1308 //  printing date 2004 (earlier printings have different page numberings)
1309 
1310 // The second argument is unused since nucleotides don't have differing
1311 // number of states by marker position
SimulateMarker(double branchlength,long whichmarker,const DoubleVec1d & state) const1312 DoubleVec1d F84Model::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
1313 {
1314     double freqa(m_basefreqs[baseA]), freqc(m_basefreqs[baseC]), freqg(m_basefreqs[baseG]), freqt(m_basefreqs[baseT]);
1315     double pur = freqa + freqg;
1316     double pyr = freqc + freqt;
1317     double beta = 1.0 / (2.0 * pur * pyr * (1.0 + m_ttratio));
1318     double alpha = (pur * pyr * m_ttratio - freqa * freqg - freqc * freqt) /
1319         (2.0 * (1.0 + m_ttratio) * (pyr * freqa * freqg + pur * freqc * freqt));
1320 
1321     DoubleVec1d cumprob;
1322     cumprob.push_back(freqa);
1323     cumprob.push_back(freqa + freqc);
1324     cumprob.push_back(freqa + freqc + freqg);
1325 
1326     double expB = exp(-beta * branchlength);
1327 
1328     DoubleVec1d answer(BASES, 0.0);
1329 
1330     double general = 1.0 - expB;
1331     double r = registry.GetRandom().Float();
1332 
1333     // We compute a chance of drawing from a pool of all four nucleotides
1334     // in proportion to their frequency; if that doesn't happen, we
1335     // compute a chance of drawing from a pool of only purines or only
1336     // pyrimidines; if that doesn't happen either there is no change
1337     // and we return the initial state.
1338 
1339     if (r < general)
1340     {
1341         // perform a draw from the general pool
1342         double r2 = registry.GetRandom().Float();
1343         long i;
1344         for (i = 0; i < BASES-1; ++i)
1345         {
1346             if (r2 < cumprob[i])
1347             {
1348                 answer[i] = 1.0;
1349                 return answer;
1350             }
1351         }
1352         // flowthrough if previous bases not picked
1353         answer[BASES-1] = 1.0;
1354         return answer;
1355     }
1356     else
1357     {
1358         r -= general;
1359         double transition = expB * (1.0 - exp(-alpha * branchlength));
1360         if (r < transition)
1361         {
1362             // perform a draw from the transition pool
1363             double r2 = registry.GetRandom().Float();
1364             if (state[baseA] == 1.0 || state[baseG] == 1.0) // purine
1365             {
1366                 if (r2 < freqa / (freqa + freqg))
1367                 {
1368                     answer[baseA] = 1.0;
1369                 }
1370                 else
1371                 {
1372                     answer[baseG] = 1.0;
1373                 }
1374             }
1375             else                        // pyrimidine
1376             {
1377                 if (r2 < freqc / (freqc + freqt))
1378                 {
1379                     answer[baseC] = 1.0;
1380                 }
1381                 else
1382                 {
1383                     answer[baseT] = 1.0;
1384                 }
1385             }
1386             return answer;
1387         }
1388     }
1389 
1390     // otherwise, no event happens
1391     return state;
1392 
1393 } // SimulateMarke
1394 
1395 //------------------------------------------------------------------------------------
1396 //------------------------------------------------------------------------------------
1397 
// Construct a GTR model from explicit base frequencies and the six
// pairwise exchange rates (AC, AG, AT, CG, CT, TG).  The frequencies
// are handed to NucModel with "calculate freqs from data" disabled.
// Finalize() is called at the end to allocate the working buffers and
// build the rate-matrix eigensystem.
GTRModel::GTRModel(long nmarkers,
                   long numCategories,
                   DoubleVec1d categoryRates,
                   DoubleVec1d categoryProbabilities,
                   double userAutoCorrelationValue,
                   bool doNormalize,
                   double relMuRate,
                   double freqA,
                   double freqC,
                   double freqG,
                   double freqT,
                   double freqAC,
                   double freqAG,
                   double freqAT,
                   double freqCG,
                   double freqCT,
                   double freqTG,
                   double perBaseErrorRate)
    : NucModel(nmarkers,
               numCategories,
               categoryRates,
               categoryProbabilities,
               userAutoCorrelationValue,
               doNormalize,
               relMuRate,
               freqA,
               freqC,
               freqG,
               freqT,
               false,          // calculate freqs from data
               perBaseErrorRate),
      AC(freqAC),
      AG(freqAG),
      AT(freqAT),
      CG(freqCG),
      CT(freqCT),
      TG(freqTG),
      // set some potentially useful defaults
      basefreqarray(NULL),
      daughter(NULL),target(NULL)
{
    // mark both daughter branches "computed"; RescaleLengths() resets
    // these flags on every call
    computed.assign(2,true);
    // BASES x BASES workspace matrix used by Finalize()/RescaleLengths()
    DoubleVec1d empty(BASES,0.0);
    scratch.assign(BASES,empty);
    Finalize();

    assert(GTRModel::IsValidDataModel());
} // GTRModel::GTRModel
1447 
1448 //------------------------------------------------------------------------------------
1449 
// Copy constructor.  The buffer pointers start NULL so that
// CopyMembers() can safely free/reallocate; the target and daughter
// scratch buffers are deliberately not copied (they are overwritten
// on every likelihood computation).
GTRModel::GTRModel(const GTRModel& src)
    : NucModel(src), basefreqarray(NULL),daughter(NULL),target(NULL)
{
    CopyMembers(src); // doesn't copy target or daughter buffers!
} // GTRModel copy ctor
1455 
1456 //------------------------------------------------------------------------------------
1457 
// Destructor: release the manually managed likelihood buffers
// (basefreqarray, daughter, target) allocated by AllocateBuffers().
GTRModel::~GTRModel()
{
    EmptyBuffers();
} // GTRModel dtor
1462 
1463 //------------------------------------------------------------------------------------
1464 
Clone() const1465 DataModel* GTRModel::Clone() const
1466 {
1467     DataModel* newmodel = new GTRModel(*this);
1468     return newmodel;
1469 }
1470 
1471 //------------------------------------------------------------------------------------
1472 
// Assignment operator.
// Guards against self-assignment: CopyMembers() frees and reallocates
// this object's buffers before memcpy'ing from src.basefreqarray, so
// assigning an object to itself would otherwise copy from freed memory.
GTRModel& GTRModel::operator=(const GTRModel& src)
{
    if (this != &src)
    {
        NucModel::operator=(src);
        CopyMembers(src);
    }
    return *this;
} // GTRModel::operator=
1479 
1480 //------------------------------------------------------------------------------------
1481 
// Allocate the working buffers used during likelihood computation.
//
// target and basefreqarray are [m_ncategories][BASES] tables, and
// daughter is a [2][m_ncategories][BASES] table; each is backed by a
// single contiguous data block (so a whole table can be memcpy'd in
// one call), with row pointers aimed into that block.  basefreqarray
// is filled with the equilibrium base frequencies replicated across
// categories.  EmptyBuffers() is the matching deallocator.
void GTRModel::AllocateBuffers()
{
    // per-marker, per-category workspace
    DoubleVec1d zero(m_ncategories,0.0);
    catlikes.assign(m_nmarkers,zero);

    // pchange[branch][category] is a BASES x BASES probability matrix,
    // filled in by RescaleLengths()
    DoubleVec1d base1(BASES,0);
    DoubleVec2d base2(BASES,base1);
    DoubleVec3d empty(m_ncategories,base2);
    pchange.assign(2,empty);

    // one pointer block + one contiguous data block per table; both
    // daughter branches share a single slab of 2*m_ncategories*BASES
    daughter         = new double** [2];
    target           = new double* [m_ncategories];
    basefreqarray    = new double* [m_ncategories];
    daughter[0]      = new double* [2*m_ncategories];
    daughter[0][0]   = new double [2*m_ncategories*BASES];
    target[0]        = new double [m_ncategories*BASES];
    basefreqarray[0] = new double [m_ncategories*BASES];
    daughter[1] = daughter[0] + m_ncategories;
    daughter[1][0] = daughter[0][0] + m_ncategories*BASES;
    long cat;
    for (cat = 0; cat < m_ncategories; ++cat)
    {
        // aim the row pointers into the contiguous data blocks
        basefreqarray[cat]   = basefreqarray[0] + cat*BASES;
        daughter[0][cat] = daughter[0][0] + cat*BASES;
        daughter[1][cat] = daughter[1][0] + cat*BASES;
        target[cat] = target[0] + cat*BASES;
        long base;
        for(base = baseA; base <= baseT; ++base)
            basefreqarray[cat][base] = m_basefreqs[base];
    }
} // GTRModel::AllocateBuffers
1513 
1514 //------------------------------------------------------------------------------------
1515 
EmptyBuffers()1516 void GTRModel::EmptyBuffers()
1517 {
1518     if (basefreqarray)
1519     {
1520         delete [] basefreqarray[0];
1521         delete [] basefreqarray;
1522     }
1523     if (daughter)
1524     {
1525         delete [] daughter[0][0];
1526         delete [] daughter[0];
1527         delete [] daughter;
1528     }
1529     if (target)
1530     {
1531         delete [] target[0];
1532         delete [] target;
1533     }
1534 
1535 } // GTRModel::EmptyBuffers
1536 
1537 //------------------------------------------------------------------------------------
1538 
// Copy all GTR-specific state from src.  If src has allocated buffers,
// this object's buffers are rebuilt (freed and reallocated at the
// current sizes) and the base-frequency table is copied over.  The
// target and daughter scratch buffers are intentionally NOT copied:
// they are overwritten on every likelihood computation.
void GTRModel::CopyMembers(const GTRModel& src)
{
    AG = src.AG;
    AC = src.AC;
    AT = src.AT;
    CG = src.CG;
    CT = src.CT;
    TG = src.TG;
    eigvals = src.eigvals;
    eigvecs1 = src.eigvecs1;
    eigvecs2 = src.eigvecs2;
    pchange = src.pchange;
    computed = src.computed;
    scratch = src.scratch;
    m_basefreqs = src.m_basefreqs;

    if (src.basefreqarray)
    {
        // rebuild our buffers at the (possibly new) sizes, then copy
        // the equilibrium frequency table as one contiguous block
        EmptyBuffers();
        AllocateBuffers();

        memcpy(basefreqarray[0],src.basefreqarray[0],
               m_ncategories*BASES*sizeof(double));
    }

    // we don't copy target or daughter!
} // GTRModel::CopyMembers
1566 
1567 //------------------------------------------------------------------------------------
1568 
GTRDotProduct(const DoubleVec2d & first,const DoubleVec2d & second,DoubleVec2d & answer)1569 void GTRModel::GTRDotProduct(const DoubleVec2d& first, const DoubleVec2d& second, DoubleVec2d& answer)
1570 // dot product of first and second put into PRE-EXISTING answer!
1571 // second has been PRE-TRANSPOSED!!
1572 {
1573     // should be square
1574     assert(first.size() == first[0].size());
1575     // and all the same size!
1576     assert(first.size() == second.size());
1577     assert(first.size() == answer.size());
1578     double initial = 0.0;
1579 
1580     long i, j, n=first.size();
1581     for (i = 0; i < n; ++i)
1582     {
1583         for (j = 0; j < n; ++j)
1584         {
1585             answer[i][j] = inner_product(first[i].begin(), first[i].end(), second[j].begin(), initial);
1586         }
1587     }
1588 } // GTRModel::GTRDotProduct
1589 
1590 //------------------------------------------------------------------------------------
1591 
BarfOnBadGTRRates(const DoubleVec1d & rts) const1592 void GTRModel::BarfOnBadGTRRates(const DoubleVec1d& rts) const
1593 {
1594     if (rts.size() != 6)
1595     {
1596         data_error e("Incorrect number of GTR rates:  expected 6, found " + ToString(rts.size()));
1597         throw e;
1598     }
1599 
1600     size_t i;
1601     for (i = 0; i < rts.size(); ++i)
1602     {
1603         if (rts[i] <= 0.0)
1604         {
1605             data_error e("All rates for the GTR model must be greater than 0.");
1606             throw e;
1607         }
1608     }
1609 } // GTRModel::BarfOnBadGTRRates
1610 
1611 //------------------------------------------------------------------------------------
1612 
SetRates(const DoubleVec1d & rts)1613 void GTRModel::SetRates(const DoubleVec1d& rts)
1614 {
1615     BarfOnBadGTRRates(rts);
1616 
1617     AC = rts[0];
1618     AG = rts[1];
1619     AT = rts[2];
1620     CG = rts[3];
1621     CT = rts[4];
1622     TG = rts[5];
1623 } // GTRModel::SetRates
1624 
1625 //------------------------------------------------------------------------------------
1626 
GetRates() const1627 DoubleVec1d GTRModel::GetRates() const
1628 {
1629     DoubleVec1d rts;
1630     rts.push_back(AC);
1631     rts.push_back(AG);
1632     rts.push_back(AT);
1633     rts.push_back(CG);
1634     rts.push_back(CT);
1635     rts.push_back(TG);
1636 
1637     return rts;
1638 
1639 } // GTRModel::GetRates
1640 
1641 //------------------------------------------------------------------------------------
1642 
// Build the working buffers and the eigensystem of the GTR rate
// matrix; RescaleLengths() depends on eigvals/eigvecs1/eigvecs2
// computed here.  Must be called after rates and frequencies are set.
void GTRModel::Finalize()
{
    AllocateBuffers();

    // calculate eigvals and eigvecs1 & 2
    // assemble rate matrix; using scratch
    double* bf = basefreqarray[0];

    // rescale rates to a mean of 1 event per unit branchlength
    double scalefactor = 2.0 * (AC * bf[baseA] * bf[baseC] +
                                AG * bf[baseA] * bf[baseG] +
                                AT * bf[baseA] * bf[baseT] +
                                CG * bf[baseC] * bf[baseG] +
                                CT * bf[baseC] * bf[baseT] +
                                TG * bf[baseG] * bf[baseT]);
    double nAC = AC / scalefactor;
    double nAG = AG / scalefactor;
    double nAT = AT / scalefactor;
    double nCG = CG / scalefactor;
    double nCT = CT / scalefactor;
    double nTG = TG / scalefactor;

    // off-diagonals hold the normalized exchange rates; each diagonal
    // entry makes its row sum to zero under base-frequency weighting
    scratch[0][0] = -(nAC*bf[baseC] + nAG*bf[baseG] + nAT*bf[baseT])/bf[baseA];
    scratch[0][1] = nAC;
    scratch[0][2] = nAG;
    scratch[0][3] = nAT;
    scratch[1][0] = nAC;
    scratch[1][1] = -(nAC*bf[baseA] + nCG*bf[baseG] + nCT*bf[baseT])/bf[baseC];
    scratch[1][2] = nCG;
    scratch[1][3] = nCT;
    scratch[2][0] = nAG;
    scratch[2][1] = nCG;
    scratch[2][2] = -(nAG*bf[baseA] + nCG*bf[baseC] + nTG*bf[baseT])/bf[baseG];
    scratch[2][3] = nTG;
    scratch[3][0] = nAT;
    scratch[3][1] = nCT;
    scratch[3][2] = nTG;
    scratch[3][3] = -(nAT*bf[baseA] + nCT*bf[baseC] + nTG*bf[baseG])/bf[baseT];

    // assemble square root of base frequencies
    DoubleVec1d zero(BASES,0.0);
    DoubleVec2d diag(BASES,zero);
    long i;
    for (i = 0; i < BASES; ++i)
    {
        diag[i][i] = sqrt(bf[i]);
    }

    // answ = diag . scratch . diag -- symmetrized so the eigensolver
    // can be used.  GTRDotProduct expects its second argument
    // pre-transposed, hence Transpose(scratch); that temporary copy
    // also makes writing the result back into scratch safe here.
    GTRDotProduct(diag,Transpose(scratch),scratch);
    DoubleVec2d answ(BASES,zero);
    GTRDotProduct(scratch,diag,answ);

    EigenCalculator eig;
    pair<DoubleVec1d,DoubleVec2d> eigsys(eig.Eigen(answ));
    eigvecs1 = diag;  // needed to satisfy GTRDotProduct size reqs
    // calculate: diag . Transpose[eigsys.second], which would require
    // a call to the transpose of the transpose for GTRDotProduct purposes,
    // so just use the matrix straight.
    GTRDotProduct(diag,eigsys.second,eigvecs1);
    eigvals = eigsys.first;
    // stored pre-transposed, ready for GTRDotProduct in RescaleLengths()
    eigvecs2 = Transpose(Invert(eigvecs1));

} // GTRModel::Finalize
1706 
1707 //------------------------------------------------------------------------------------
1708 // calculate pchange [eigvecs1 . Exp[eigvals*length] . eigvecs2]
1709 
// Precompute pchange[branch][category], the BASES x BASES base-change
// probability matrices for the two daughter branch lengths:
//    pchange = eigvecs1 . diag(exp(eigval * catrate * length)) . eigvecs2
// A branch length of FLAGDOUBLE marks a one-legged coalescence; that
// branch's matrices are skipped and computed[branch] is set false.
void GTRModel::RescaleLengths(double length1, double length2)
{
    computed.assign(2,true);

    // if a length is FLAGDOUBLE we are in a one-legged coalescence
    // and we avoid this whole computation
    if (CloseEnough(length1, FLAGDOUBLE)) computed[0] = false;
    if (CloseEnough(length2, FLAGDOUBLE)) computed[1] = false;

    // diag[branch][category] holds the diagonal exponential matrices
    DoubleVec1d zero(BASES,0.0);
    DoubleVec2d zeros(BASES,zero);
    DoubleVec3d zeroes(m_ncategories,zeros);
    DoubleVec4d diag(2,zeroes);

    long cat;
    double expon;

    // scale by this locus's relative mutation rate
    length1 *= m_relmurate;
    length2 *= m_relmurate;

    for(cat = 0; cat < m_ncategories; ++cat)
    {
        long i;
        for (i = 0; i < BASES; ++i)
        {
            double scalar = m_catrates[cat]*eigvals[i];

            // clamp the exponent: below EXPMIN underflows to zero,
            // above EXPMAX saturates at EXP_OF_EXPMAX
            if (computed[0])
            {
                expon = length1 * scalar;
                if (expon >= EXPMIN)
                {
                    if (expon <= EXPMAX)
                        diag[0][cat][i][i] = exp(length1*scalar);
                    else
                        diag[0][cat][i][i] = EXP_OF_EXPMAX;
                }
                // else it remains zero
            }

            if (computed[1])
            {
                expon = length2 * scalar;
                if (expon >= EXPMIN)
                {
                    if (expon <= EXPMAX)
                        diag[1][cat][i][i] = exp(length2*scalar);
                    else
                        diag[1][cat][i][i] = EXP_OF_EXPMAX;
                }
                // else it remains zero
            }
        }
    }

    // pchange = eigvecs1 . diag . eigvecs2 (eigvecs2 is pre-transposed
    // by Finalize() as GTRDotProduct requires)
    for(cat = 0; cat < m_ncategories; ++cat)
    {
        long br;
        for(br = 0; br < 2; ++br)
        {
            if (!computed[br]) continue;
            GTRDotProduct(eigvecs1,diag[br][cat],scratch);
            GTRDotProduct(scratch,eigvecs2,pchange[br][cat]);
        }
    }

} // GTRModel::RescaleLengths
1777 
1778 //------------------------------------------------------------------------------------
1779 
ComputeSiteDLs(double ** siteDL1,double ** siteDL2)1780 double** GTRModel::ComputeSiteDLs(double** siteDL1, double** siteDL2)
1781 {
1782     long cat, base;
1783 
1784     if (computed[0])
1785     {
1786         for(cat = 0; cat < m_ncategories; ++cat)
1787         {
1788             double* catDLs = siteDL1[cat];
1789             DoubleVec2d& prob = pchange[0][cat];
1790             for(base = baseA; base <= baseT; ++base)
1791             {
1792                 daughter[0][cat][base] = prob[baseA][base] * catDLs[baseA] +
1793                     prob[baseC][base] * catDLs[baseC] +
1794                     prob[baseG][base] * catDLs[baseG] +
1795                     prob[baseT][base] * catDLs[baseT];
1796             }
1797         }
1798     }
1799     else
1800     {
1801         memcpy(daughter[0][0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
1802     }
1803 
1804     if (computed[1])
1805     {
1806         for(cat = 0; cat < m_ncategories; ++cat)
1807         {
1808             double* catDLs = siteDL2[cat];
1809             DoubleVec2d& prob = pchange[1][cat];
1810             for(base = baseA; base <= baseT; ++base)
1811             {
1812                 daughter[1][cat][base] = prob[baseA][base] * catDLs[baseA] +
1813                     prob[baseC][base] * catDLs[baseC] +
1814                     prob[baseG][base] * catDLs[baseG] +
1815                     prob[baseT][base] * catDLs[baseT];
1816             }
1817         }
1818     }
1819     else
1820     {
1821         memcpy(daughter[1][0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
1822     }
1823 
1824     for (cat = 0; cat < m_ncategories; cat++)
1825         for(base = baseA; base <= baseT; ++base)
1826             target[cat][base] = daughter[0][cat][base] *
1827                 daughter[1][cat][base];
1828 
1829     return target;
1830 
1831 } // GTRModel::ComputeSiteDLs
1832 
1833 //------------------------------------------------------------------------------------
1834 
ComputeSubtreeDL(Cell & rootdls,double ** startmarker,double ** endmarker,long posn)1835 double GTRModel::ComputeSubtreeDL(Cell& rootdls, double** startmarker, double** endmarker, long posn)
1836 {
1837     double total=0.0, subtotal;
1838     double** marker;
1839     long cat;
1840     DoubleVec1d prior(m_ncategories);
1841     long firstposn = posn;
1842 
1843     for (marker = startmarker; marker != endmarker;
1844          marker = rootdls.GetNextMarker(marker))
1845     {
1846         subtotal = 0.0;
1847 
1848         for (cat = 0; cat < m_ncategories; cat++)
1849         {
1850             prior[cat] = basefreqarray[cat][baseA]*marker[cat][baseA] +
1851                 basefreqarray[cat][baseC]*marker[cat][baseC] +
1852                 basefreqarray[cat][baseG]*marker[cat][baseG] +
1853                 basefreqarray[cat][baseT]*marker[cat][baseT];
1854 
1855             subtotal += m_catprobs[cat] * prior[cat];
1856         }
1857 
1858         if (!subtotal)
1859         {
1860             DataModel::TryToNormalizeAndThrow(posn, GetModelType());
1861         }
1862 
1863         if (ShouldNormalize())
1864         {
1865             total += (log(subtotal) + rootdls.GetNorms(posn));
1866         }
1867         else
1868         {
1869             total += log(subtotal);
1870         }
1871 
1872         // de-normalization not needed here, since we are only interested in the ratio
1873         if (m_ncategories > 1)
1874         {
1875             for (cat = 0; cat < m_ncategories; cat++)
1876                 catlikes[posn][cat] = prior[cat]/subtotal;
1877         }
1878         ++posn;
1879     }
1880 
1881     if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
1882     return total;
1883 
1884 } // GTRModel::ComputeSubtreeDL
1885 
1886 //------------------------------------------------------------------------------------
1887 
ComputeCatDL(long startmarker,long endmarker)1888 double GTRModel::ComputeCatDL(long startmarker, long endmarker)
1889 {
1890     double subtotal;
1891     long marker, cat;
1892 
1893     DoubleVec1d like = m_catcells;
1894     DoubleVec1d nulike(m_ncategories);
1895 
1896     for (marker = startmarker; marker != endmarker; ++marker)
1897     {
1898         subtotal = 0.0;
1899         for (cat = 0; cat < m_ncategories; cat++)
1900             subtotal += m_catprobs[cat] * like[cat];
1901 
1902         subtotal *= m_acratio;
1903 
1904         for (cat = 0; cat < m_ncategories; cat++)
1905             nulike[cat] = catlikes[marker][cat] *
1906                 (subtotal + m_ucratio * like[cat]);
1907 
1908         // the following puts the nulike values into like.  It
1909         // also puts the like values into nulike, but we will not
1910         // be using values from nulike so we don't care.
1911         like.swap(nulike);
1912     }
1913 
1914     subtotal = 0.0;
1915     for (cat = 0; cat < m_ncategories; cat++)
1916         subtotal += m_catprobs[cat] * like[cat];
1917 
1918     // the following puts the like values into catcells for
1919     // long-term storage.  It also puts the catcells values into
1920     // like, but we don't care.
1921     m_catcells.swap(like);
1922 
1923     return log(subtotal);
1924 
1925 } // GTRModel::ComputeCatDL
1926 
1927 //------------------------------------------------------------------------------------
1928 
CreateDataModelReport() const1929 StringVec1d GTRModel::CreateDataModelReport() const
1930 {
1931     StringVec1d report = NucModel::CreateDataModelReport();
1932 
1933     string line = "Mutation parameters: ";
1934     report.push_back(line);
1935     line = "Between A and (C, G, T):  ";
1936     line += ToString(AC) + ", " + ToString(AG) + ", " + ToString(AT);
1937     report.push_back(line);
1938     line = "Between C and (G, T):  ";
1939     line += ToString(CG) + ", " + ToString(CT);
1940     report.push_back(line);
1941     line = "Between G and (T):  ";
1942     line += ToString(TG);
1943     report.push_back(line);
1944 
1945     return report;
1946 
1947 } // GTRModel::CreateDataModelReport
1948 
1949 //------------------------------------------------------------------------------------
1950 
ToXML(size_t nspaces) const1951 StringVec1d GTRModel::ToXML(size_t nspaces) const
1952 {
1953     StringVec1d xmllines(NucModel::ToXML(nspaces));
1954 
1955     nspaces += INDENT_DEPTH;
1956     string line(MakeIndent(MakeTag(xmlstr::XML_TAG_GTRRATES),nspaces));
1957     line += ToString(GetRates(),6);
1958     line += MakeCloseTag(xmlstr::XML_TAG_GTRRATES);
1959     nspaces -= INDENT_DEPTH;
1960 
1961     StringVec1d::iterator endtag = --xmllines.end();
1962     xmllines.insert(endtag,line);
1963 
1964     return xmllines;
1965 
1966 } // GTRModel::ToXML
1967 
1968 //------------------------------------------------------------------------------------
1969 
// Data simulation is not implemented for the GTR model; always throws.
DoubleVec1d GTRModel::SimulateMarker(double branchlength, long whichmarker,
                                     const DoubleVec1d& state) const
{
    string msg("Cannot simulate data with the GTR model yet.");
    throw implementation_error(msg);
} // SimulateMarker
1975 
1976 //------------------------------------------------------------------------------------
1977 
IsValidDataModel() const1978 bool GTRModel::IsValidDataModel() const
1979 {
1980     if (AC <= 0) return false;
1981     if (AG <= 0) return false;
1982     if (AT <= 0) return false;
1983     if (CG <= 0) return false;
1984     if (CT <= 0) return false;
1985     if (TG <= 0) return false;
1986     return NucModel::IsValidDataModel();
1987 } // GTRModel::IsValidDataModel
1988 
1989 //------------------------------------------------------------------------------------
1990 //------------------------------------------------------------------------------------
1991 
// Constructor for the common base of the allele-based likelihood models
// (see StepwiseModel and BrownianModel below).  All rate-category,
// normalization, and bin settings are forwarded unchanged to DataModel.
AlleleModel::AlleleModel(long nmarkers,
                         long numCategories,
                         DoubleVec1d categoryRates,
                         DoubleVec1d categoryProbabilities,
                         double userAutoCorrelationValue,
                         bool doNormalize,
                         long numBins,
                         double relMuRate)
    : DataModel(nmarkers,
                numCategories,
                categoryRates,
                categoryProbabilities,
                userAutoCorrelationValue,
                doNormalize,
                numBins,
                relMuRate),
      // per-(marker, category) likelihood workspace, zero-initialized
      m_likes(CreateVec2d(m_nmarkers, m_ncategories, 0.0)),
      // starts empty; per-marker bin counts are filled in later
      // (see StepwiseModel::Initialize())
      m_bincounts()
{
    // intentionally blank
} // AlleleModel constructor
2013 
2014 //------------------------------------------------------------------------------------
2015 //------------------------------------------------------------------------------------
2016 
RescaleLength(double length)2017 DoubleVec1d AlleleModel::RescaleLength(double length)
2018 {
2019     length *= m_relmurate;
2020 
2021     long cat;
2022     DoubleVec1d scaled(m_ncategories);
2023     for (cat = 0; cat < m_ncategories; ++cat)
2024     {
2025         scaled[cat] = length * m_catrates[cat];
2026     }
2027     return scaled;
2028 
2029 } // AlleleModel::RescaleLengths
2030 
2031 //------------------------------------------------------------------------------------
2032 
ComputeCatDL(long startmarker,long endmarker)2033 double AlleleModel::ComputeCatDL(long startmarker, long endmarker)
2034 {
2035     double subtotal;
2036     long marker, cat;
2037 
2038     DoubleVec1d previous = m_catcells;
2039     DoubleVec1d current(m_ncategories, 0.0);
2040 
2041     for (marker = startmarker; marker != endmarker; ++marker)
2042     {
2043         subtotal = 0.0;
2044         for (cat = 0; cat < m_ncategories; cat++)
2045             subtotal += m_catprobs[cat] * previous[cat];
2046 
2047         subtotal *= m_acratio;
2048 
2049         for (cat = 0; cat < m_ncategories; cat++)
2050             current[cat] = m_likes[marker][cat] *
2051                 (subtotal + m_ucratio * previous[cat]);
2052 
2053         // This line puts the current values in previous cheaply.  We
2054         // don't care that it puts the previous values in current,
2055         // because current will be overwritten anyway.
2056         previous.swap(current);
2057     }
2058 
2059     subtotal = 0.0;
2060     for (cat = 0; cat < m_ncategories; cat++)
2061         subtotal += m_catprobs[cat] * previous[cat];
2062 
2063     m_catcells.swap(previous);
2064 
2065     return log(subtotal);
2066 
2067 } // AlleleModel::ComputeCatDL
2068 
2069 //------------------------------------------------------------------------------------
2070 
ChooseAncestralState(long marker)2071 DoubleVec1d AlleleModel::ChooseAncestralState(long marker)
2072 {
2073     long size = m_bincounts[marker];
2074     DoubleVec1d result(size, 0.0);
2075     result[registry.GetRandom().Long(size)] = 1.0;
2076     return result;
2077 } // ChooseAncestralState
2078 
2079 //------------------------------------------------------------------------------------
2080 //------------------------------------------------------------------------------------
2081 
// Constructor for the stepwise (microsatellite ladder) mutation model.
// Forwards category/normalization settings to AlleleModel with the
// default bin count, then derives the per-marker bins, offsets and step
// thresholds from the observed alleles via Initialize().
StepwiseModel::StepwiseModel(long nmarkers,
                             const StringVec2d& uniqueAlleles,
                             long numCategories,
                             DoubleVec1d categoryRates,
                             DoubleVec1d categoryProbabilities,
                             double userAutoCorrelationValue,
                             bool doNormalize,
                             double relMuRate)
    :   AlleleModel(nmarkers,
                    numCategories,
                    categoryRates,
                    categoryProbabilities,
                    userAutoCorrelationValue,
                    doNormalize,
                    defaults::bins,
                    relMuRate),
        // padding added beyond the smallest/largest observed allele when
        // laying out bins (used in Initialize())
        m_allowance(defaults::step_allowance)
{
    Initialize(uniqueAlleles);
} // StepwiseModel constructor
2102 
2103 //------------------------------------------------------------------------------------
2104 
Clone() const2105 DataModel* StepwiseModel::Clone() const
2106 {
2107     DataModel* newmodel = new StepwiseModel(*this);
2108     return newmodel;
2109 }
2110 
2111 //------------------------------------------------------------------------------------
2112 
CreateDataModelReport() const2113 StringVec1d StepwiseModel::CreateDataModelReport() const
2114 {
2115     StringVec1d report = DataModel::CreateDataModelReport();
2116     string maxbins = "Maximum number of bins:  " + ToString(m_bincounts[0]);
2117     report.push_back(maxbins);
2118     return report;
2119 } // StepwiseModel::CreateDataModelReport
2120 
2121 //------------------------------------------------------------------------------------
2122 
DataToLikes(const string & datum,long marker) const2123 vector<double> StepwiseModel::DataToLikes(const string& datum, long marker) const
2124 {
2125     if (m_bincounts.empty())
2126     {
2127         string msg = "StepwiseModel::DataToLikes() was called before m_bincounts ";
2128         msg += "was initialized.";
2129         throw implementation_error(msg);
2130     }
2131 
2132     if (datum == "?")       // unknown data fills all bins with 1
2133     {
2134         vector<double> result(m_bincounts[marker], 1.0);
2135         return result;
2136     }
2137     else
2138     {
2139         vector<double> result(m_bincounts[marker], 0.0);
2140         long allele;
2141         FromString(datum,allele);
2142         allele -= m_offsets[marker];
2143         if (allele < 0 || allele >= m_bincounts[marker])
2144         {
2145             string msg = "StepwiseModel::DataToLikes() was called on an ";
2146             msg += "uninitialized (or incorrectly initialized) object.";
2147             throw implementation_error(msg);
2148         }
2149         result[allele] = 1.0;
2150         return result;
2151     }
2152 
2153 } // DataToLikes
2154 
2155 //------------------------------------------------------------------------------------
2156 
// Derives the per-marker bin layout from the observed alleles:
//   m_offsets[marker]    -- allele value that maps to bin 0
//   m_bincounts[marker]  -- bins spanning the observed range, padded by
//                           m_allowance on each side
//   m_threshhold[marker] -- maximum net number of mutation steps that
//                           Probability() will consider
// Also precomputes the log-factorial table via CalculateSteps(), and
// finally makes all markers share the largest bin count (m_nbins).
// Throws implementation_error if uniqueAlleles does not have one entry
// per marker.
void StepwiseModel::Initialize(const StringVec2d& uniqueAlleles)
{
    if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
    {
        string msg = "StepwiseModel::Initialize() encountered m_nmarkers = ";
        msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
        msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
        throw implementation_error(msg);
    }
    long marker;

    // find the biggest and smallest allele for each marker
    // NB We assume that allele sizes are in number of repeats,
    // *not* base pair count or anything else, and that "missing
    // data" is coded as ?.

    for (marker = 0; marker < m_nmarkers; ++marker)
    {
        bool real_allele = false;  // did we ever see a non-? allele
        long smallone = MAXLONG, largeone = 0;
        for (size_t nallele = 0; nallele<uniqueAlleles[marker].size();
             nallele++)
        {
            // convert to number
            // do not count "unknown data" markers
            string allele = uniqueAlleles[marker][nallele];
            if (allele == "?")
            {
                assert(false); //need to catch this earlier
                continue;
            }
            real_allele = true;
            long tipval;
            FromString(allele, tipval);  // convert to long

            if (tipval < smallone) smallone = tipval;
            if (tipval > largeone) largeone = tipval;
        }

        // if no non-? were ever found, use arbitrary values
        if (!real_allele)
        {
            smallone = 10;
            largeone = 10;
        }

        // pad below the smallest observed allele, but never below zero
        long newoffset = max(smallone - m_allowance, 0L);
        m_offsets.push_back(newoffset);
        m_bincounts.push_back(largeone + m_allowance + 1 - newoffset);
        if (real_allele && largeone != smallone)
            // m_threshhold: +1 makes it work for (large-small) both odd/even
            m_threshhold.push_back((largeone - smallone + 1L)/2L);
        else
            m_threshhold.push_back(1L);

        // Pre-calculate table of steps
        CalculateSteps(marker);
    }

    // give every marker the largest bin count so cells are uniform in size
    AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
                                        m_bincounts.end());
    fill(AlleleModel::m_bincounts.begin(),
         AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
} // Initialize
2221 
2222 //------------------------------------------------------------------------------------
2223 // Adaptation of Peter Beerli's microsatellite likelihood nuview_micro routine from Migrate.
2224 
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)2225 void StepwiseModel::ComputeSiteDLs (Cell_ptr child1, Cell_ptr child2,
2226                                     Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
2227                                     const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
2228 {
2229     double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
2230     double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
2231     double normForChild1 = child1->GetNorms(marker);
2232     double normForChild2 = child2->GetNorms(marker);
2233 
2234     if (!pSiteDLsForChild1 && !pSiteDLsForChild2) // they can't both be NULL
2235     {
2236         string msg = "StepwiseModel::ComputeSiteDLs() found no existing ";
2237         msg += "site data-likelihoods for either child.";
2238         throw implementation_error(msg);
2239     }
2240 
2241     // in case of a one-legged coalescence, copy values and return
2242     if (!pSiteDLsForChild1)
2243     {
2244         thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
2245         thiscell->SetNorms(normForChild2, marker);
2246         return;
2247     }
2248 
2249     if (!pSiteDLsForChild2)
2250     {
2251         thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
2252         thiscell->SetNorms(normForChild1, marker);
2253         return;
2254     }
2255 
2256     double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double *)];
2257     jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
2258     for (long cat = 0; cat < m_ncategories; ++cat)
2259         jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
2260 
2261     long bmax = m_bincounts[marker];
2262     long threshold = m_threshhold[marker];
2263     double maxJointProbChild1Child2 = -DBL_MAX;
2264     double mutationProbChild1, mutationProbChild2;
2265 
2266     // Compute the bin contents for each possible microsat allele.
2267     // "b" is the "bin number," i.e., the scaled value of the starting allele
2268     // ("scaled" meaning "shifted so that the smallest allele gets set to zero").
2269     // For each starting allele "b", we sweep through a range of alleles "a" to which
2270     // allele "b" can mutate, from b - threshold to b + threshold.
2271     for (long b = 0; b < bmax; b++)
2272     {
2273         for (long cat = 0; cat < m_ncategories; ++cat)
2274         {
2275             mutationProbChild1 = mutationProbChild2 = 0.0;
2276             for (long a = max(0L, b - threshold); a <= min(b + threshold, bmax-1); a++)
2277             {
2278                 // Note:  The probability of mutating n "steps" downward
2279                 // (e.g., from a microsat allele of 23 repeats to an allele of 20 repeats)
2280                 // equals the prob. of mutating n steps upward
2281                 // (e.g., from 23 repeats to 26 repeats).
2282                 long netNumPositiveSteps = labs(b - a);
2283                 if (pSiteDLsForChild1[cat][a] > 0)
2284                 {
2285                     mutationProbChild1 += Probability(lengthOfBranchToChild1ScaledByRateCat[cat],
2286                                                       netNumPositiveSteps, marker) * pSiteDLsForChild1[cat][a];
2287                 }
2288                 if (pSiteDLsForChild2[cat][a] > 0)
2289                 {
2290                     mutationProbChild2 += Probability(lengthOfBranchToChild2ScaledByRateCat[cat],
2291                                                       netNumPositiveSteps, marker) * pSiteDLsForChild2[cat][a];
2292                 }
2293             }
2294             jointProbChild1Child2[cat][b] = mutationProbChild1*mutationProbChild2;
2295 
2296             if (jointProbChild1Child2[cat][b] > maxJointProbChild1Child2)
2297             {
2298                 maxJointProbChild1Child2 = jointProbChild1Child2[cat][b];
2299             }
2300         }
2301     }
2302 
2303     // normalize to further protect against overflow, if requested
2304     if (ShouldNormalize())
2305     {
2306         if (0.0 == maxJointProbChild1Child2)
2307         {
2308             thiscell->SetNorms(-DBL_MAX,marker);
2309         }
2310         else
2311         {
2312             for (long b = 0; b < bmax; b++)
2313             {
2314                 for (long cat = 0; cat < m_ncategories; ++cat)
2315                 {
2316                     jointProbChild1Child2[cat][b] /= maxJointProbChild1Child2;
2317                 }
2318             }
2319             thiscell->SetNorms(log(maxJointProbChild1Child2) +
2320                                normForChild1 + normForChild2, marker);
2321         }
2322     }
2323 
2324     thiscell->SetSiteDLs(marker, jointProbChild1Child2);
2325 
2326     delete[] jointProbChild1Child2[0];
2327     delete[] jointProbChild1Child2;
2328 } // ComputeSiteDLs
2329 
2330 //------------------------------------------------------------------------------------
2331 
// Computes the total log data-likelihood at the root of a subtree for the
// markers in [startmarker, endmarker), assuming a flat allele-frequency
// prior.  "posn" is the marker index corresponding to startmarker.  When
// more than one rate category is in use, per-category relative likelihoods
// are cached in m_likes for the ComputeCatDL() pass.
double StepwiseModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat, bin;
    long firstposn = posn;   // remembered for the ComputeCatDL() call below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;
        DoubleVec1d buffer(m_ncategories, 0.0);

        for (cat = 0; cat < m_ncategories; ++cat)
        {
            // sum the bin likelihoods within this rate category
            for (bin = 0; bin < m_bincounts[posn]; ++bin)
            {
                // NB:  We assume a flat allele frequency prior here.
                //cout << "cat " << cat << ", bin " << bin << ", prob " << marker[cat][bin] << endl;
                buffer[cat] += marker[cat][bin];
            }
            subtotal += m_catprobs[cat] * buffer[cat];
        }

        // zero likelihood at this marker: try normalizing, then bail out
        if (!subtotal)
        {
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        // de-normalize with the stored log norm for this marker
        total += (log(subtotal) + rootdls.GetNorms(posn));

        assert (total != 0);  // that would be *too* likely

        if (m_ncategories > 1)
        {
            // cache relative per-category likelihoods for ComputeCatDL()
            for (cat = 0; cat < m_ncategories; cat++)
                m_likes[posn][cat] = buffer[cat]/subtotal;
        }
        ++posn;
    }

    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    return total;

} // StepwiseModel::ComputeSubtreeDLs
2377 
2378 //------------------------------------------------------------------------------------
2379 
2380 // This method returns the probability of changing an allele by "diff" net steps
2381 // in time t.  For example, if the data type is microsatellite, we could compute
2382 // the probability that the sequence ACACAC (3 repeats) mutates to ACACACACAC
2383 // (5 repeats, for a "diff" of 2) along a branch whose length is scaled to be t.
2384 // "diff" received by this method is nonnegative, representing a net increase
2385 // in the size of the allele; the probability of decreasing by "diff" net steps
2386 // is defined to be the same as increasing by "diff" net steps.
2387 // All nonvanishingly unlikely net steps are considered; e.g., simply taking 2
2388 // steps along the branch, or taking 2 steps forward plus k steps backward
2389 // plus k steps forward, where k = 0, 1, 2, 3, ..., infinity.
2390 // The formula is Prob(i net steps in time t) =
2391 //    exp(-t)*sum_over_k((t/2)^(i+2k) / ((i+k)!k!)),
2392 // where the sum runs from k = 0 to k = infinity.
2393 // This formula is eq. 15.26 in Joe's "Inferring Phylogenies" book (p. 242).
2394 // The code looks different because the factorials are obtained via a precomputed
2395 // lookup table, and exp/log are used to counteract underflow.
2396 // Contributions to the sum drop off rapidly as k increases.
2397 // (Side note:  An equivalent and more compact version of this formula is
2398 // Prob(i,t) = exp(-t)*BesselI(i,t), where BesselI(n,x) is the
2399 // modified Bessel function of integer order n evaluated at real argument x.)
2400 
Probability(double t,long diff,long marker) const2401 double StepwiseModel::Probability(double t, long diff, long marker) const
2402 {
2403     long threshold = m_threshhold[marker]; // max. number of positive steps
2404     if (diff > threshold)
2405         return 0.0; // approximately infinitely unlikely to mutate that much in time t
2406 
2407     double sum(0.0), oldsum(0.0);
2408     const DoubleVec2d& PrecomputedTerms = m_steps[marker];
2409     double log_tOver2 = log(0.5 * t);
2410 
2411     for (long k = 0; k <= threshold; k++) // num steps = diff + k <= threshold
2412     {
2413         sum += exp(-t + log_tOver2*(diff + 2.0*k) - PrecomputedTerms[diff][k]);
2414 
2415         // quit if the contributions have become trivial
2416         if (fabs (oldsum - sum) < DBL_EPSILON) break;
2417         oldsum = sum;
2418     }
2419 
2420     return sum;
2421 }
2422 
2423 //------------------------------------------------------------------------------------
2424 // Adaptation of Peter Beerli's calculate_steps routine from MIGRATE.
2425 // This routine precomputes values needed by Probability, for speed.
2426 
CalculateSteps(long marker)2427 void StepwiseModel::CalculateSteps(long marker)
2428 {
2429     long k, diff;
2430     DoubleVec1d tempvec;
2431     DoubleVec2d steps;
2432     long threshhold = m_threshhold[marker];
2433 
2434     for (diff = 0; diff <= threshhold; diff++)
2435     {
2436         tempvec.clear();
2437         for (k = 0; k <= threshhold; k++)
2438         {
2439             tempvec.push_back(logfac (diff + k) + logfac (k));
2440         }
2441         steps.push_back(tempvec);
2442     }
2443 
2444     m_steps.push_back(steps); // Note:  This is okay from a speed standpoint,
2445     // but not so good from a design standpoint,
2446     // because this method assumes it's being called
2447     // within a loop over markers.
2448 } // calculate_steps
2449 
2450 //------------------------------------------------------------------------------------
2451 
// Data simulation is not implemented for the stepwise model; always throws.
DoubleVec1d StepwiseModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    string msg("Cannot simulate data with the stepwise model yet.");
    throw implementation_error(msg);
} // SimulateMarker
2456 
2457 //------------------------------------------------------------------------------------
2458 
CellToData(Cell_ptr cell,long marker) const2459 string StepwiseModel::CellToData(Cell_ptr cell, long marker) const
2460 {
2461     LongVec1d ones = cell->GetOnes(marker);
2462     assert(static_cast<size_t>(marker) < m_offsets.size());
2463     if (ones.size() == 1)
2464     {
2465         return ToString(ones[0] + m_offsets[marker]);
2466     }
2467     throw implementation_error
2468         ("Cannot convert stepwise data from the internal format for data not simply a single number.");
2469 }
2470 
2471 //------------------------------------------------------------------------------------
2472 //------------------------------------------------------------------------------------
2473 
// Constructor for the Brownian-motion approximation to the stepwise
// microsatellite model.  Forwards all settings to AlleleModel with
// defaults::brownianBins bins (3: mean, variance, and cumulative total --
// see BrownianModel::DataToLikes()).
BrownianModel::BrownianModel(long nmarkers,
                             long numCategories,
                             DoubleVec1d categoryRates,
                             DoubleVec1d categoryProbabilities,
                             double userAutoCorrelationValue,
                             bool doNormalize,
                             double relMuRate)
    : AlleleModel(nmarkers,
                  numCategories,
                  categoryRates,
                  categoryProbabilities,
                  userAutoCorrelationValue,
                  doNormalize,
                  defaults::brownianBins, // 3, for mean, variance, and cumulative total
                  relMuRate)
{
    // intentionally blank
} // BrownianModel constructor
2492 
2493 //------------------------------------------------------------------------------------
2494 
Clone() const2495 DataModel* BrownianModel::Clone() const
2496 {
2497     DataModel* newmodel = new BrownianModel(*this);
2498     return newmodel;
2499 }
2500 
2501 //------------------------------------------------------------------------------------
2502 
SetNormalize(bool norm)2503 void BrownianModel::SetNormalize(bool norm)
2504 {
2505     if(norm)
2506     {
2507         data_error e("Normalization cannot be set for Brownian Model");
2508         throw e;
2509     }
2510 } // BrownianModel::SetNormalize
2511 
2512 //------------------------------------------------------------------------------------
2513 
DataToLikes(const string & datum,long) const2514 vector<double> BrownianModel::DataToLikes(const string& datum, long) const
2515 {
2516     vector<double> result(m_nbins,0.0);
2517 
2518     if (datum == "?")                   // unknown data
2519     {
2520         result[0] = 5.0;                // this is an arbitrary value
2521         result[1] = DBL_BIG;
2522         result[2] = 0.0;
2523     }
2524     else
2525     {
2526         FromString(datum,result[0]);
2527         result[1] = 0.0;
2528         result[2] = 0.0;
2529     }
2530 
2531     return result;
2532 
2533 } // DataToLikes
2534 
2535 //------------------------------------------------------------------------------------
2536 // Adaptation of Peter Beerli's nuview_brownian() from Migrate-1.2.4
2537 // by Jon Yamato 2002/05/06
2538 
2539 // N[Log[1/Sqrt[2 Pi]], 30]
2540 #define LOG2PIHALF -0.918938533204672741780329736406
2541 
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & vv1,const DoubleVec1d & vv2,long marker)2542 void BrownianModel::ComputeSiteDLs(Cell_ptr child1, Cell_ptr child2,
2543                                    Cell_ptr thiscell, const DoubleVec1d& vv1, const DoubleVec1d& vv2,
2544                                    long marker)
2545 {
2546     double mean1, mean2, xx1, xx2, c12, v1, v2, vtot, f1, f2;
2547     double **c1dls = child1->GetSiteDLs(marker),
2548         **c2dls = child2->GetSiteDLs(marker);
2549 
2550     if (!c1dls && !c2dls)
2551     {
2552         string msg = "BrownianModel::ComputeSiteDLs() failed to find ";
2553         msg += "data likelihoods for either child.";
2554         throw implementation_error(msg);
2555     }
2556 
2557     if (!c1dls)
2558     {
2559         thiscell->SetSiteDLs(marker,c2dls);
2560         return;
2561     }
2562 
2563     if (!c2dls)
2564     {
2565         thiscell->SetSiteDLs(marker,c1dls);
2566         return;
2567     }
2568 
2569     // temporary space needed for interface with dlcell::SetSiteDLs()
2570     double **mydls = new double*[m_ncategories*sizeof(double*)];
2571     mydls[0] = new double[m_ncategories*m_nbins*sizeof(double)];
2572     long cat;
2573     for(cat = 1; cat < m_ncategories; ++cat)
2574         mydls[cat] = mydls[0] + cat*m_nbins;
2575 
2576     for (cat = 0; cat < m_ncategories; ++cat)
2577     {
2578 
2579         mean1 = c1dls[cat][0];
2580         xx1 = c1dls[cat][2];
2581         v1 = vv1[cat] + c1dls[cat][1];
2582 
2583         mean2 = c2dls[cat][0];
2584         xx2 = c2dls[cat][2];
2585         v2 = vv2[cat] + c2dls[cat][1];
2586 
2587         vtot = v1 + v2;
2588 
2589         // the weights are set reciprocally so that the value coming from
2590         // the shorter branch is given more weight.
2591         if (vtot > 0.0) f1 = v2/vtot;
2592         else f1 = 0.5;
2593         f2 = 1.0 - f1;
2594 
2595         mydls[cat][0] = f1*mean1 + f2*mean2;
2596 
2597         mydls[cat][1] = v1*f1;
2598 
2599         mydls[cat][2] = xx1 + xx2;
2600 
2601         c12 = (mean1-mean2)*(mean1-mean2) / vtot;
2602         mydls[cat][2] += min(0.0,-0.5 * (log(vtot)+c12) + LOG2PIHALF);
2603     }
2604 
2605     thiscell->SetSiteDLs(marker,mydls);
2606 
2607     delete [] mydls[0];
2608     delete [] mydls;
2609 
2610 } // BrownianModel::ComputeSiteDLs
2611 
2612 #undef LOG2PIHALF
2613 
2614 //------------------------------------------------------------------------------------
2615 
ComputeSubtreeDLs(Cell & rootdls,double ** startmarker,double ** endmarker,long posn)2616 double BrownianModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
2617 {
2618     // NB:  Brownian likelihoods are stored as logs!
2619 
2620     double total=0.0, subtotal;
2621     double** marker;
2622     long cat;
2623     long firstposn = posn;
2624 
2625     for (marker = startmarker; marker != endmarker;
2626          marker = rootdls.GetNextMarker(marker))
2627     {
2628 
2629         if (m_ncategories > 1)
2630         {
2631             // in order to add up likelihoods over categories, we
2632             // must un-log them.  We normalize them first to avoid
2633             // underflow.
2634 
2635             DoubleVec1d buffer(m_ncategories, 0.0);
2636             subtotal = 0.0;
2637 
2638             double biggest = NEGMAX;
2639 
2640             for (cat = 0; cat < m_ncategories; ++cat)
2641             {
2642                 if (marker[cat][2] > biggest) biggest = marker[cat][2];
2643             }
2644 
2645             for (cat = 0; cat < m_ncategories; ++cat)
2646             {
2647                 buffer[cat] += exp(marker[cat][2] - biggest);
2648                 subtotal += m_catprobs[cat] * buffer[cat];
2649             }
2650 
2651             for (cat = 0; cat < m_ncategories; cat++)
2652                 m_likes[posn][cat] = buffer[cat]/subtotal;
2653 
2654             if (!subtotal)
2655             {
2656                 // we shouldn't be here!  Normalization shouldn't happen
2657                 // for Brownian.
2658                 datalike_error ex("invalid subtree found in Brownian model");
2659                 throw(ex);
2660             }
2661 
2662             total += (log(subtotal) + biggest);
2663 
2664         }
2665         else
2666         {
2667             // If there is only one category, there is no normalization
2668             // and we MUST NOT un-log the likelihood or it will underflow.
2669             total += marker[0][2];
2670         }
2671 
2672         ++posn;
2673     }
2674 
2675     if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
2676 
2677     return total;
2678 
2679 } // BrownianModel::ComputeSubtreeDLs
2680 
2681 //------------------------------------------------------------------------------------
2682 
CreateDataModelReport() const2683 StringVec1d BrownianModel::CreateDataModelReport() const
2684 {
2685     StringVec1d report = DataModel::CreateDataModelReport();
2686     string rptline("(The brownian approximation for microsatellite ");
2687     rptline += "evolution has no extra parameters.)";
2688     report.push_back(rptline);
2689 
2690     return report;
2691 
2692 } // BrownianModel::CreateDataModelReport
2693 
2694 //------------------------------------------------------------------------------------
2695 
// Data simulation is not implemented for the Brownian approximation;
// this always throws.  Parameters are ignored.
DoubleVec1d BrownianModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    throw implementation_error("Cannot simulate data with the Brownian model yet.");
} // SimulateMarker
2700 
2701 //------------------------------------------------------------------------------------
2702 
CellToData(Cell_ptr cell,long marker) const2703 string BrownianModel::CellToData(Cell_ptr cell, long marker) const
2704 {
2705     double** dls = cell->GetSiteDLs(marker);
2706     if (dls != NULL)
2707     {
2708         return ToString(dls[0][0]);
2709     }
2710     return "0";
2711 }
2712 
2713 //------------------------------------------------------------------------------------
2714 //------------------------------------------------------------------------------------
2715 
// Constructs a K-Allele model.  uniqueAlleles[marker] lists the distinct
// allele names observed at each marker; Initialize() turns these into
// name->bin maps and per-marker bin counts.  All other arguments are
// forwarded unchanged to the AlleleModel base, with the bin count fixed
// at defaults::bins.
KAlleleModel::KAlleleModel(
    long nmarkers,
    const StringVec2d& uniqueAlleles,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate
    )
    : AlleleModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        defaults::bins,
        relMuRate
        )
{
    Initialize(uniqueAlleles);
} // KAlleleModel constructor
2739 
2740 //------------------------------------------------------------------------------------
2741 
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)2742 void KAlleleModel::ComputeSiteDLs(Cell_ptr child1, Cell_ptr child2,
2743                                   Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
2744                                   const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
2745 {
2746     double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
2747     double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
2748     double normForChild1 = child1->GetNorms(marker);
2749     double normForChild2 = child2->GetNorms(marker);
2750 
2751     if (!pSiteDLsForChild1 && !pSiteDLsForChild2)
2752     {
2753         string msg = "KAlleleModel::ComputeSiteDLs() found no existing ";
2754         msg += "site data-likelihoods for either child.";
2755         throw implementation_error(msg);
2756     }
2757 
2758     // in case of a one-legged coalescence, copy values and return
2759     if (!pSiteDLsForChild1)
2760     {
2761         thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
2762         thiscell->SetNorms(normForChild2, marker);
2763         return;
2764     }
2765 
2766     if (!pSiteDLsForChild2)
2767     {
2768         thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
2769         thiscell->SetNorms(normForChild1, marker);
2770         return;
2771     }
2772 
2773     long smax = m_bincounts[marker];
2774 
2775     // allocate temporary working space;
2776     // OPTIMIZE should be done more cheaply!
2777     double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double*)];
2778     jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
2779     for (long cat = 0; cat < m_ncategories; ++cat)
2780         jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
2781 
2782     // compute the bin contents for each possible allele size
2783     long s, ss;
2784     double prob1, prob2, a1, a2, b1, b2, temp1, temp2, sum1, sum2;
2785     double maxJointProbChild1Child2(-DBL_MAX);
2786 
2787     for (long cat = 0; cat < m_ncategories; ++cat)
2788     {
2789         prob1 = probMathFunc ( lengthOfBranchToChild1ScaledByRateCat[cat], (smax-1.0)/smax);
2790         prob2 = probMathFunc ( lengthOfBranchToChild2ScaledByRateCat[cat], (smax-1.0)/smax);
2791 
2792         a1 = 1.0 - prob1;
2793         a2 = 1.0 - prob2;
2794         b1 = prob1 / ( smax - 1.0); // smax was set to at least 2 in Initialize()
2795         b2 = prob2 / ( smax - 1.0);
2796         for (s = 0; s < smax; s++)
2797         {
2798             sum1 = 0; sum2 = 0;
2799             for(ss = 0; ss < smax; ss++)
2800             {
2801                 if (s == ss)
2802                 {
2803                     temp1 = a1; temp2 = a2;
2804                 }
2805                 else
2806                 {
2807                     temp1 = b1; temp2 = b2;
2808                 }
2809                 sum1 += pSiteDLsForChild1[cat][ss] * temp1;
2810                 sum2 += pSiteDLsForChild2[cat][ss] * temp2;
2811             }
2812 
2813             jointProbChild1Child2[cat][s] = sum1 * sum2;
2814             if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
2815                 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s];  // overflow protection
2816         }
2817     }
2818 
2819     // normalize to further protect against overflow, if requested
2820     if (ShouldNormalize())
2821     {
2822         if (maxJointProbChild1Child2 == 0.0)
2823         {
2824             thiscell->SetNorms(-DBL_MAX,marker);
2825         }
2826         else
2827         {
2828             for (s = 0; s < smax; s++)
2829             {
2830                 for (long cat = 0; cat < m_ncategories; ++cat)
2831                 {
2832                     jointProbChild1Child2[cat][s] /= maxJointProbChild1Child2;
2833                 }
2834             }
2835             thiscell->SetNorms(log(maxJointProbChild1Child2) + normForChild1 + normForChild2, marker);
2836         }
2837     }
2838 
2839     thiscell->SetSiteDLs(marker, jointProbChild1Child2);
2840 
2841     delete[] jointProbChild1Child2[0];
2842     delete[] jointProbChild1Child2;
2843 } // KAlleleModel::ComputeSiteDLs
2844 
2845 //------------------------------------------------------------------------------------
2846 
Clone() const2847 DataModel* KAlleleModel::Clone() const
2848 {
2849     DataModel* newmodel = new KAlleleModel(*this);
2850     return newmodel;
2851 } // KAllelModel::Clone
2852 
2853 //------------------------------------------------------------------------------------
2854 
probMathFunc(double ut,double coef)2855 double KAlleleModel::probMathFunc( double ut, double coef)
2856 {
2857     double mathValue = coef*(1.0-exp(-1.0/coef*ut));
2858     if (systemSpecificIsnan(mathValue)) // BUGBUG Presumably some other action should be taken?
2859         cout<<"coef:"<<coef<<endl
2860             <<"time:"<<ut<<endl;
2861     return mathValue;
2862 } // KAllelModel::probMathFunc
2863 
2864 //------------------------------------------------------------------------------------
2865 
CreateDataModelReport() const2866 StringVec1d KAlleleModel::CreateDataModelReport() const
2867 {
2868     StringVec1d report = DataModel::CreateDataModelReport();
2869     string maxbins = "Maximum number of bins:  " + ToString(m_bincounts[0]);
2870     report.push_back(maxbins);
2871     return report;
2872 
2873 } // KAllelModel::CreateDataModelReport
2874 
2875 //------------------------------------------------------------------------------------
2876 
// Builds the per-marker allele-name -> bin-index maps (m_allelemaps) and
// the per-marker bin counts (m_bincounts) from the observed unique allele
// names, then forces every marker to share the largest bin count.
// Throws implementation_error if the allele list and marker count disagree.
void KAlleleModel::Initialize(const StringVec2d& uniqueAlleles)
{
    // sanity check: exactly one allele list per marker
    if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
    {
        string msg = "KAlleleModel::Initialize() encountered m_nmarkers = ";
        msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
        msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
        throw implementation_error(msg);
    }

    for (long marker = 0; marker < m_nmarkers; ++marker)
    {
        map <string,long> tmpMap;
        m_allelemaps.push_back(tmpMap);
        size_t nallele = 0;
        for (; nallele<uniqueAlleles[marker].size(); nallele++)
        {
            // each distinct allele name is assigned the next bin index
            string allelename = uniqueAlleles[marker][nallele];
            if (allelename == "?")
            {
                assert(false); //we need to catch this earlier.
                continue;
            }
            // allele names are expected to be unique per marker
            assert(m_allelemaps[marker].find(allelename)==m_allelemaps[marker].end());
            m_allelemaps[marker].insert(make_pair(allelename, nallele));
        }

        if(nallele <= 1) nallele=2; // must be at least this much
        m_bincounts.push_back(nallele);
    }

    // every marker gets the largest observed bin count, so all site-DL
    // arrays share one width
    AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
                                        m_bincounts.end());
    fill(AlleleModel::m_bincounts.begin(),
         AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
} // KAlleleModel::Initialize
2913 
2914 //------------------------------------------------------------------------------------
2915 
DataToLikes(const string & datum,long marker) const2916 vector<double> KAlleleModel::DataToLikes(const string& datum, long marker) const
2917 {
2918     if (m_bincounts.empty())
2919     {
2920         string msg = "KAlleleModel::DataToLikes() was called on an uninitialized ";
2921         msg += "object.";
2922         throw implementation_error(msg);
2923     }
2924 
2925     if (datum == "?")       // unknown data fills all bins with 1
2926     {
2927         vector<double> result(m_bincounts[marker], 1.0);
2928         return result;
2929     }
2930     else
2931     {
2932         vector<double> result(m_bincounts[marker], 0.0);
2933         map<string, long>::const_iterator allele = m_allelemaps[marker].find(datum);
2934         assert(allele != m_allelemaps[marker].end()); //Uncounted allele
2935         result[allele->second] = 1.0;
2936         return result;
2937     }
2938 
2939 } // DataToLikes
2940 
2941 //------------------------------------------------------------------------------------
2942 
ComputeSubtreeDLs(Cell & rootdls,double ** startmarker,double ** endmarker,long posn)2943 double KAlleleModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
2944 {
2945     double total=0.0, subtotal;
2946     double** marker;
2947     long cat, bin;
2948     long firstposn = posn;
2949 
2950     for (marker = startmarker; marker != endmarker;
2951          marker = rootdls.GetNextMarker(marker))
2952     {
2953         subtotal = 0.0;
2954         DoubleVec1d buffer(m_ncategories, 0.0);
2955 
2956         for (cat = 0; cat < m_ncategories; ++cat)
2957         {
2958             for (bin = 0; bin < m_bincounts[posn]; ++bin)
2959             {
2960                 // NB:  We assume a flat allele frequency prior here.
2961                 buffer[cat] += marker[cat][bin]/m_nbins;
2962                 //LS DEBUG MAPPING:  we eventually want unique allele freqs here instead
2963                 // of dividing by m_nbins.
2964             }
2965 
2966             subtotal += m_catprobs[cat] * buffer[cat];
2967             //cout<<m_catprobs[cat]<<endl<<buffer[cat]<<endl;
2968             //assert(subtotal != 0);  // that would not be likely enough
2969             if (!subtotal)
2970             {
2971                 DataModel::TryToNormalizeAndThrow(posn, GetModelType());
2972             }
2973         }
2974 
2975         total += (log(subtotal) + rootdls.GetNorms(posn));
2976 
2977         assert (total != 0);  // that would be *too* likely
2978 
2979         if (m_ncategories > 1)
2980         {
2981             for (cat = 0; cat < m_ncategories; cat++)
2982                 m_likes[posn][cat] = buffer[cat]/subtotal;
2983         }
2984         ++posn;
2985     }
2986 
2987     if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
2988 
2989     if(systemSpecificIsnan(total))
2990     {
2991         cout<<endl<<"ComputeSubtreeDLs";
2992         cout<<endl;
2993     }
2994 
2995     return total;
2996 
2997 }; // KAlleleModel::ComputeSubtreeDLs
2998 
2999 //------------------------------------------------------------------------------------
3000 //------------------------------------------------------------------------------------
3001 
ToXML(size_t nspaces) const3002 StringVec1d KAlleleModel::ToXML(size_t nspaces) const
3003 {
3004     StringVec1d xmllines(DataModel::ToXML(nspaces));
3005 
3006     // Yes, this model has no fields of its own.
3007 
3008     return xmllines;
3009 
3010 } // ToXML
3011 
3012 //------------------------------------------------------------------------------------
3013 // This routine simulates data for a single node under the KAllele
3014 // model.  It assumes that the given branch lengths have already
3015 // been rescaled for the rate category desired.
3016 
// This routine simulates data for a single node under the KAllele
// model.  It assumes that the given branch lengths have already
// been rescaled for the rate category desired.
DoubleVec1d KAlleleModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    double rnd = registry.GetRandom().Float();
    long nstates(m_bincounts[whichmarker]);

    // locate the current allele: the bin holding a 1.0
    long oldstate = 0;
    for (long staten=0; staten<static_cast<long>(state.size()); staten++)
    {
        if (state[staten] == 1.0) oldstate = staten;
    }

    // if something happens
    // BUGFIX: the original computed nstates/(nstates-1) in integer
    // arithmetic, which truncates to 1 for nstates >= 3 and so used the
    // wrong mutation probability.
    double newratio = static_cast<double>(nstates)/(nstates - 1.0);
    if (rnd < (1/newratio) * (1.0 - exp(-newratio * branchlength)))
    {
        // pick uniformly among the nstates-1 alleles other than oldstate
        long chosenstate = registry.GetRandom().Long(nstates-1);
        if (chosenstate >= oldstate) chosenstate++;
        DoubleVec1d answer(state.size(),0.0);
        answer[chosenstate] = 1.0;
        return answer;
    }

    // return nothing happens
    return state;

} // SimulateMarker
3042 
3043 //------------------------------------------------------------------------------------
3044 
CellToData(Cell_ptr cell,long marker) const3045 string KAlleleModel::CellToData(Cell_ptr cell, long marker) const
3046 {
3047     LongVec1d ones = cell->GetOnes(marker);
3048     if (ones.size() == 1)
3049     {
3050         for (map<string, long>::const_iterator key=m_allelemaps[marker].begin();
3051              key != m_allelemaps[marker].end(); key++)
3052         {
3053             if ((*key).second == ones[0])
3054             {
3055                 return (*key).first;
3056             }
3057         }
3058         throw data_error("Cannot find any any alleles for bin " + ToString(ones[0]) + ".");
3059     }
3060     throw implementation_error
3061         ("Cannot convert K-Allele data from the internal format if the data is not a single allele.");
3062 
3063 }
3064 
3065 //------------------------------------------------------------------------------------
3066 //------------------------------------------------------------------------------------
3067 
// Constructs a mixed KAllele/Stepwise model.  alphaVal in [0,1] is the
// weight of the KAllele component (the stepwise component gets 1-alpha);
// doOptimization enables per-chain maximization of alpha.  The remaining
// arguments are forwarded to the AlleleModel base.
MixedKSModel::MixedKSModel(
    long nmarkers,
    const StringVec2d& uniqueAlleles,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate,
    double alphaVal,
    bool doOptimization
    )
    : AlleleModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        defaults::bins,
        relMuRate
        ),
      m_allowance(defaults::mixedks_allowance),
      m_isOpt(doOptimization),
      m_alphaRpt()
{
    SetAlpha(alphaVal); // instead of initializing, because there's
                        // logic to check value of alpha in SetAlpha
    m_origAlpha = m_alpha;  // remember the starting value for reports/reset
    Initialize(uniqueAlleles);
}
3099 
3100 //------------------------------------------------------------------------------------
3101 
Clone() const3102 DataModel* MixedKSModel::Clone() const
3103 {
3104     DataModel* newmodel = new MixedKSModel(*this);
3105     return newmodel;
3106 }
3107 
3108 //------------------------------------------------------------------------------------
3109 
DataToLikes(const string & datum,long marker) const3110 vector<double> MixedKSModel::DataToLikes(const string& datum, long marker) const
3111 {
3112     if (m_bincounts.empty())
3113     {
3114         string msg = "MixedKSModel::DataToLikes() was called on an uninitialized ";
3115         msg += "object.";
3116         throw implementation_error(msg);
3117     }
3118 
3119     if (datum == "?")                   // unknown data fills all bins with 1
3120     {
3121         vector<double> result(m_bincounts[marker], 1.0);
3122         return result;
3123     }
3124     else if (StringType(datum)!=2)
3125     {
3126         throw data_error ("Your data must consist of numbers, not letters or punctuation."
3127                           "  The Mixed KS model is inappropriate for DNA or RNA data.");
3128     }
3129     else
3130     {
3131         vector<double> result(m_bincounts[marker], 0.0);
3132         long allele;
3133         FromString(datum,allele);
3134         allele -= m_offsets[marker];
3135         if (allele < 0 || allele >= m_bincounts[marker])
3136         {
3137             string msg = "MixedKSModel::DataToLikes() was called on an ";
3138             msg += "uninitialized (or incorrectly initialized) object.";
3139             throw implementation_error(msg);
3140         }
3141         result[allele] = 1.0;
3142         return result;
3143     }
3144 
3145 } // DataToLikes
3146 
3147 //------------------------------------------------------------------------------------
3148 
// Scans the observed allele (repeat-count) range of each marker to set up
// per-marker offsets, bin counts (padded by m_allowance on each side),
// stepwise thresholds, and precomputed step tables; then forces all
// markers to share the largest bin count.
void MixedKSModel::Initialize(const StringVec2d& uniqueAlleles)
{
    // sanity check: exactly one allele list per marker
    if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
    {
        string msg = "MixedKSModel::Initialize() encountered m_nmarkers = ";
        msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
        msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
        throw implementation_error(msg);
    }

    // find the biggest and smallest allele for each marker
    // NB We assume that allele sizes are in number of repeats,
    // *not* base pair count or anything else, and that "missing
    // data" is coded as ?.

    long tipval;

    for (long marker = 0; marker < m_nmarkers; ++marker)
    {
        bool real_allele = false;  // did we ever see a non-? allele
        long smallone = MAXLONG, largeone = 0;

        for (size_t nallele = 0; nallele<uniqueAlleles[marker].size();
             nallele++)
        {
            // convert to number if possible;
            // do not count "unknown data" markers
            string allele = uniqueAlleles[marker][nallele];
            if (allele == "?")
            {
                assert(false); //catch this earlier
                continue;
            }
            real_allele = true;
            FromString(allele, tipval);  // convert to long

            if (tipval < smallone) smallone = tipval;
            if (tipval > largeone) largeone = tipval;
        }

        // if no non-? were ever found, use arbitrary values
        if (!real_allele)
        {
            smallone = 10;
            largeone = 10;
        }

        // leave m_allowance extra bins below the smallest observed allele
        // (but never a negative offset) and above the largest
        long newoffset = max(smallone - m_allowance, 0L);
        m_offsets.push_back(newoffset);
        m_bincounts.push_back(largeone + m_allowance + 1 - newoffset);
        if (real_allele && largeone != smallone)
            // m_threshhold: +1 makes it work for (large-small) both odd/even
            m_threshhold.push_back((largeone - smallone + 1L)/2L);
        else
            m_threshhold.push_back(1L);

        // Pre-calculate table of steps
        CalculateSteps(marker);
    }

    // every marker gets the largest observed bin count, so all site-DL
    // arrays share one width
    AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
                                        m_bincounts.end());
    fill(AlleleModel::m_bincounts.begin(),
         AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
} // Initialize
3214 
3215 //------------------------------------------------------------------------------------
3216 
SetAlpha(double alphaVal)3217 void MixedKSModel::SetAlpha(double alphaVal)
3218 {
3219     assert(alphaVal >= 0.0);
3220     assert(alphaVal <= 1.0);
3221     m_alpha = alphaVal;
3222     m_beta = 1 - m_alpha;
3223 }
3224 
3225 //------------------------------------------------------------------------------------
3226 
SetAlpha(double alphaVal,long rep,long chain)3227 void MixedKSModel::SetAlpha(double alphaVal, long rep, long chain)
3228 {
3229     if (m_alphaRpt.size() == static_cast<size_t>(rep))
3230     {
3231         ResetAlpha();
3232     }
3233     assert(m_alphaRpt[rep].size() == static_cast<size_t>(chain));
3234     m_alphaRpt[rep].push_back(alphaVal);
3235     SetAlpha(alphaVal);
3236 }
3237 
3238 //------------------------------------------------------------------------------------
3239 // Adaptation of Peter Beerli's microsatellite likelihood nuview_micro routine from Migrate.
3240 
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)3241 void MixedKSModel::ComputeSiteDLs (Cell_ptr child1, Cell_ptr child2,
3242                                    Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
3243                                    const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
3244 {
3245     double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
3246     double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
3247     double normForChild1 = child1->GetNorms(marker);
3248     double normForChild2 = child2->GetNorms(marker);
3249 
3250     if (!pSiteDLsForChild1 && !pSiteDLsForChild2) // they can't both be NULL
3251     {
3252         string msg = "MixedKSModel::ComputeSiteDLs() found no existing ";
3253         msg += "site data-likelihoods for either child.";
3254         throw implementation_error(msg);
3255     }
3256 
3257     // in case of a one-legged coalescence, copy values and return
3258     if (!pSiteDLsForChild1)
3259     {
3260         thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
3261         thiscell->SetNorms(normForChild2, marker);
3262         return;
3263     }
3264 
3265     if (!pSiteDLsForChild2)
3266     {
3267         thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
3268         thiscell->SetNorms(normForChild1, marker);
3269         return;
3270     }
3271 
3272     long smax = m_bincounts[marker];
3273     long threshold = m_threshhold[marker];
3274 
3275     // allocate temporary working space;
3276     // OPTIMIZE should be done more cheaply!
3277     double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double *)];
3278     jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
3279     for (long cat = 0; cat < m_ncategories; ++cat)
3280         jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
3281     double maxJointProbChild1Child2 = -DBL_MAX;
3282     double mutationProbChild1, mutationProbChild2;
3283 
3284     long s, a, diff;
3285 
3286     //KAllele part
3287     // compute the bin contents for each possible allele size
3288     long ss;
3289     double prob1, prob2, a1, a2, b1, b2, temp1, temp2, sum1, sum2;
3290 
3291     for (long cat = 0; cat < m_ncategories; ++cat)
3292     {
3293         prob1 = probMathFunc (lengthOfBranchToChild1ScaledByRateCat[cat], (smax-1.0)/smax);
3294         prob2 = probMathFunc (lengthOfBranchToChild2ScaledByRateCat[cat], (smax-1.0)/smax);
3295 
3296         a1 = 1.0 - prob1;
3297         a2 = 1.0 - prob2;
3298         b1 = prob1 / ( smax - 1.0); //need to check when smax = 1
3299         b2 = prob2 / ( smax - 1.0);
3300         for (s = 0; s < smax; s++)
3301         {
3302             sum1 = 0; sum2 = 0;
3303             for(ss = 0; ss < smax; ss++)
3304             {
3305                 if (s == ss)
3306                 {
3307                     temp1 = a1; temp2 = a2;
3308                 }
3309                 else
3310                 {
3311                     temp1 = b1; temp2 = b2;
3312                 }
3313                 sum1 += pSiteDLsForChild1[cat][ss] * temp1;
3314                 sum2 += pSiteDLsForChild2[cat][ss] * temp2;
3315             }
3316 
3317             jointProbChild1Child2[cat][s] = sum1 * sum2 * m_alpha;  //alpha
3318             if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
3319                 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s];  // overflow protection
3320         }
3321     }
3322     //LS NOTE:  The following code firing was due to a bug in the past;
3323     // you should probably look carefully to make sure that's not true
3324     // again.
3325     if(maxJointProbChild1Child2 < 0)
3326     {
3327         string msg = "Warning:  maximum probability value of "
3328             + ToString(maxJointProbChild1Child2)
3329             + " in MixedKS.";
3330         registry.GetRunReport().ReportDebug(msg);
3331     }
3332 
3333     //StepWise part
3334     // compute the bin contents for each possible allele size
3335     for (s = 0; s < smax; s++)
3336     {
3337         for (long cat = 0; cat < m_ncategories; ++cat)
3338         {
3339             mutationProbChild1 = mutationProbChild2 = 0.0;
3340             for (a = max (0L, s - threshold); a <= s + threshold && a < smax; a++)
3341             {
3342                 diff = labs(s - a);
3343                 if (pSiteDLsForChild1[cat][a] > 0)
3344                 {
3345                     mutationProbChild1 += Probability(lengthOfBranchToChild1ScaledByRateCat[cat],
3346                                                       diff, marker) * pSiteDLsForChild1[cat][a];
3347                 }
3348                 if (pSiteDLsForChild2[cat][a] > 0)
3349                 {
3350                     mutationProbChild2 += Probability(lengthOfBranchToChild2ScaledByRateCat[cat],
3351                                                       diff, marker) * pSiteDLsForChild2[cat][a];
3352                 }
3353             }
3354             jointProbChild1Child2[cat][s] += mutationProbChild1 * mutationProbChild2 * m_beta;
3355             if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
3356                 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s];  // overflow protection
3357         }
3358     }
3359 
3360     // normalize to further protect against overflow, if requested
3361     if (ShouldNormalize())
3362     {
3363         if (maxJointProbChild1Child2 == 0.0)
3364         {
3365             thiscell->SetNorms(-DBL_MAX,marker);
3366         }
3367         else
3368         {
3369             for (s = 0; s < smax; s++)
3370             {
3371                 for (long cat = 0; cat < m_ncategories; ++cat)
3372                 {
3373                     jointProbChild1Child2[cat][s] /= maxJointProbChild1Child2;
3374                 }
3375             }
3376             thiscell->SetNorms(log(maxJointProbChild1Child2) + normForChild1 + normForChild2,
3377                                marker);
3378         }
3379     }
3380 
3381     thiscell->SetSiteDLs(marker, jointProbChild1Child2);
3382 
3383     delete[] jointProbChild1Child2[0];
3384     delete[] jointProbChild1Child2;
3385 } // ComputeSiteDLs
3386 
3387 //------------------------------------------------------------------------------------
3388 
ComputeSubtreeDLs(Cell & rootdls,double ** startmarker,double ** endmarker,long posn)3389 double MixedKSModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
3390 {
3391     double total=0.0, subtotal;
3392     double** marker;
3393     long cat, bin;
3394     long firstposn = posn;
3395 
3396     for (marker = startmarker; marker != endmarker;
3397          marker = rootdls.GetNextMarker(marker))
3398     {
3399         subtotal = 0.0;
3400         DoubleVec1d buffer(m_ncategories, 0.0);
3401 
3402         for (cat = 0; cat < m_ncategories; ++cat)
3403         {
3404             for (bin = 0; bin < m_bincounts[posn]; ++bin)
3405             {
3406                 // NB:  We assume a flat allele frequency prior here.
3407                 buffer[cat] += marker[cat][bin];
3408             }
3409 
3410             subtotal += m_catprobs[cat] * buffer[cat];
3411         }
3412 
3413         if (!subtotal)
3414         {
3415             DataModel::TryToNormalizeAndThrow(posn, GetModelType());
3416         }
3417 
3418         total += (log(subtotal) + rootdls.GetNorms(posn));
3419 
3420         assert (total != 0);  // that would be *too* likely
3421 
3422         if (m_ncategories > 1)
3423         {
3424             for (cat = 0; cat < m_ncategories; cat++)
3425                 m_likes[posn][cat] = buffer[cat]/subtotal;
3426         }
3427         ++posn;
3428     }
3429 
3430     if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
3431     return total;
3432 
3433 } // MixedKSModel::ComputeSubtreeDLs
3434 
3435 //------------------------------------------------------------------------------------
3436 
ToXML(size_t nspaces) const3437 StringVec1d MixedKSModel::ToXML(size_t nspaces) const
3438 {
3439     StringVec1d xmllines(AlleleModel::ToXML(nspaces));
3440 
3441     nspaces += INDENT_DEPTH;
3442 
3443     string line(MakeIndent(MakeTag(xmlstr::XML_TAG_ISOPT),nspaces));
3444     line += ToStringTF(m_isOpt);
3445     line += MakeCloseTag(xmlstr::XML_TAG_ISOPT);
3446     StringVec1d::iterator endtag = --xmllines.end();
3447     xmllines.insert(endtag,line);
3448 
3449     string line2(MakeIndent(MakeTag(xmlstr::XML_TAG_ALPHA),nspaces));
3450     line2 += ToString(GetAlpha());
3451     line2 += MakeCloseTag(xmlstr::XML_TAG_ALPHA);
3452     endtag = --xmllines.end();
3453     xmllines.insert(endtag,line2);
3454 
3455     nspaces -= INDENT_DEPTH;
3456     return xmllines;
3457 
3458 } // ToXML
3459 
3460 //------------------------------------------------------------------------------------
3461 // Adaptation of Peter Beerli's prob_micro routine from MIGRATE.
3462 // This routine computes the probability of a change of "diff" steps in time "t".
3463 //
3464 //   Mary Kuhner 2002/01/02
3465 
Probability(double t,long diff,long marker) const3466 double MixedKSModel::Probability (double t, long diff, long marker) const
3467 {
3468     long threshold = m_threshhold[marker];
3469     if (diff > threshold)
3470         return 0.0; // approximately infinitely unlikely to mutate that much in time t
3471 
3472     const DoubleVec2d& PrecomputedTerms = m_steps[marker];
3473     double sum(0.0), oldsum(0.0), log_tOver2(log(0.5*t));
3474 
3475     for (long k = 0; k <= threshold; k++) // num steps = diff + k <= threshold
3476     {
3477         sum += exp(-t + log_tOver2*(diff + 2.0*k) - PrecomputedTerms[diff][k]);
3478 
3479         // quit if the contributions have become trivial
3480         if (fabs (oldsum - sum) < DBL_EPSILON) break;
3481         oldsum = sum;
3482     }
3483     return sum;
3484 } // Probability
3485 
3486 //------------------------------------------------------------------------------------
3487 // Adaptation of Peter Beerli's calculate_steps routine from MIGRATE.
3488 // This routine precomputes values needed by Probability, for speed.
3489 
CalculateSteps(long marker)3490 void MixedKSModel::CalculateSteps (long marker)
3491 {
3492     long k, diff;
3493     DoubleVec1d tempvec;
3494     DoubleVec2d steps;
3495     long threshhold = m_threshhold[marker];
3496 
3497     for (diff = 0; diff <= threshhold; diff++)
3498     {
3499         tempvec.clear();
3500         for (k = 0; k <= threshhold; k++)
3501         {
3502             tempvec.push_back(logfac (diff + k) + logfac (k));
3503         }
3504         steps.push_back(tempvec);
3505     }
3506 
3507     m_steps.push_back(steps);
3508     // Note:  This is okay from a speed standpoint,
3509     // but not so good from a design standpoint,
3510     // because this method assumes it's being called
3511     // within a loop over markers.
3512 } // CalculateSteps
3513 
3514 //------------------------------------------------------------------------------------
3515 
probMathFunc(double ut,double coef)3516 double MixedKSModel::probMathFunc( double ut, double coef)
3517 {
3518     double mathValue = coef*(1.0-exp(-1.0/coef*ut));
3519     if (systemSpecificIsnan(mathValue))
3520         cout << "coef:" << coef << endl
3521              << "time:" << ut << endl;
3522     return mathValue;
3523 } // MixedKSModel::probMathFunc
3524 
3525 //------------------------------------------------------------------------------------
3526 
CreateDataModelReport() const3527 StringVec1d MixedKSModel::CreateDataModelReport() const
3528 {
3529     StringVec1d report = DataModel::CreateDataModelReport();
3530 
3531     string line;
3532     if (m_isOpt)
3533     {
3534         line = "Final ";
3535     }
3536     line += "Multistep:single-step ratio: " + ToString(m_alpha);
3537     report.push_back(line);
3538 
3539     string maxbins = "Maximum number of bins:  " + ToString(m_bincounts[0]);
3540     report.push_back(maxbins);
3541     if (m_isOpt)
3542     {
3543         report.push_back(string("Multistep:single-step ratio used for each chain (optimized from the previous):"));
3544         //report alpha for each chain
3545         for(size_t rep=0; rep<m_alphaRpt.size(); rep++)
3546         {
3547             for (size_t chain=0; chain<m_alphaRpt[rep].size(); chain++)
3548             {
3549                 line = "";
3550                 if (m_alphaRpt.size() > 1)
3551                 {
3552                     line = "Replicate " + indexToKey(rep) + ", ";
3553                 }
3554                 line += "Chain " + indexToKey(chain) + " :"
3555                     + ToString(m_alphaRpt[rep][chain]);
3556                 report.push_back( line );
3557             }
3558         }
3559     }
3560 
3561     return report;
3562 
3563 } // MixedKSModel::CreateDataModelReport
3564 
3565 //------------------------------------------------------------------------------------
3566 
OptimizeDataModel(Tree * tree,const Locus & locus)3567 bool MixedKSModel::OptimizeDataModel(Tree* tree, const Locus& locus)
3568 {
3569     //only do optimization when turn on 'm_isOpt', otherwize do nothing
3570     if(!m_isOpt) return false;
3571 
3572     FuncMax fm(*this, tree, locus);
3573 
3574     FMState fms(100,      //max_iters
3575                 0.01,     //increment
3576                 0.0001,   //threshold
3577                 0,        //limits
3578                 1,
3579                 0.5       //initX
3580         );
3581 
3582     fm.setState(fms);
3583     fm.run();
3584     //Note:  The FuncMax object has a pointer to us, and uses it to set our alpha.
3585     //report final alpha for each chain
3586     m_alphaRpt[m_alphaRpt.size()-1].push_back(m_alpha);
3587     return true;
3588 }
3589 
3590 //------------------------------------------------------------------------------------
3591 
ResetAlpha()3592 void MixedKSModel::ResetAlpha()
3593 {
3594     SetAlpha(m_origAlpha);
3595     DoubleVec1d newalphas;
3596     newalphas.push_back(m_origAlpha);
3597     m_alphaRpt.push_back(newalphas);
3598 }
3599 
3600 //------------------------------------------------------------------------------------
3601 
// Appends the optimized alpha for one (locus, replicate, chain) to the
// summary file.  No-op unless alpha optimization is enabled.
// Slot layout of m_alphaRpt[rep]: element 0 is the starting alpha
// (pushed by ResetAlpha); element chain+1 is the alpha produced by
// chain "chain" (pushed by OptimizeDataModel) -- hence the +1 below.
void MixedKSModel::WriteAlpha(ofstream& sumout, long loc, long rep, long chain)
{
    if (!m_isOpt) return;
    assert(static_cast<size_t>(rep)<m_alphaRpt.size());
    // If this chain's optimized alpha hasn't been recorded yet, skip.
    if (static_cast<size_t>(chain+1)>=m_alphaRpt[rep].size()) return;
    //LS NOTE:  We  write out the alpha that we calculated for this chain
    // instead of the alpha we used for this chain, because of timing
    // issues--we don't want to lose the information if sumfile writing stops
    // unexpectedly.
    sumout << "\t"
           << xmlsum::ALPHA_START1 << " "
           << xmlsum::ALPHA_START2 << " " << loc << " "
           << xmlsum::ALPHA_START3 << " " << m_alphaRpt[rep][chain+1]
           << " " << xmlsum::ALPHA_END << endl;

}
3618 
3619 //------------------------------------------------------------------------------------
3620 
// Data simulation is not implemented for the MixedKS model; this override
// exists to satisfy the base-class interface and always throws.
DoubleVec1d MixedKSModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    throw implementation_error("Cannot simulate data with the MixedKS model yet.");
} // SimulateMarker
3625 
3626 //------------------------------------------------------------------------------------
3627 
CellToData(Cell_ptr cell,long marker) const3628 string MixedKSModel::CellToData(Cell_ptr cell, long marker) const
3629 {
3630     LongVec1d ones = cell->GetOnes(marker);
3631     assert(static_cast<size_t>(marker) < m_offsets.size());
3632     if (ones.size() == 1)
3633     {
3634         return ToString(ones[0] + m_offsets[marker]);
3635     }
3636     throw implementation_error
3637         ("Cannot convert stepwise data from the internal format for data not simply a single number.");
3638 }
3639 
3640 //____________________________________________________________________________________
3641