1 // $Id: dlmodel.cpp,v 1.137 2012/05/22 20:25:32 jyamato Exp $
2
3 /*
4 Copyright 2002 Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12 #include <algorithm>
13 #include <cstring>
14 #include <numeric>
15 #include <iostream>
16
17 #include "local_build.h"
18
19 #include "calculators.h"
20 #include "datapack.h"
21 #include "datatype.h"
22 #include "defaults.h" // for defaults::threshhold in StepwiseModel::ctor
23 #include "dlcell.h"
24 #include "dlmodel.h"
25 #include "errhandling.h"
26 #include "funcMax.h"
27 #include "locus.h"
28 #include "mathx.h"
29 #include "registry.h"
30 #include "runreport.h"
31 #include "stringx.h"
32 #include "xml_strings.h" // for ToXML()
33 #include "xmlsum_strings.h" // For WriteAlpha()
34
35 #ifdef DMALLOC_FUNC_CHECK
36 #include "/usr/local/include/dmalloc.h"
37 #endif
38
39 // turns on detailed local variables for debugging
40 // JRM 3/10
41 //#define DEBUG_VARIABLES
42 #ifdef DEBUG_VARIABLES
43 int printdbgvar = 0;
44 #endif
45
46 using namespace std;
47
48 //------------------------------------------------------------------------------------
49 //------------------------------------------------------------------------------------
50
// Construct the shared data-model state: rate categories with their rates
// and probabilities, the rate autocorrelation setup, and bookkeeping
// counts (number of markers, number of likelihood bins, relative
// mutation rate for this segment).
//
// The user supplies the autocorrelation as a correlated run length
// (userAutoCorrelationValue); we store its reciprocal, m_acratio, which
// is used as the per-site chance of drawing a fresh rate category (see
// ChooseRandomRates()).
DataModel::DataModel(long nmarkers,
                     long numCategories,
                     DoubleVec1d categoryRates,
                     DoubleVec1d categoryProbabilities,
                     double userAutoCorrelationValue,
                     bool doNormalize,
                     long numBins,
                     double relmurate)
    :
    m_ncategories(numCategories),
    m_catrates(categoryRates),
    m_catprobs(categoryProbabilities),
    m_acratio(1.0 / userAutoCorrelationValue),
    m_normalize(doNormalize),
    m_nbins(numBins),
    m_nmarkers(nmarkers),
    m_relmurate(relmurate)
{
    m_ucratio = 1.0 - m_acratio;  // complementary probability of keeping the current category
    ScaleCatProbabilities();      // force category probabilities to sum to 1
    ScaleCatRates();              // force the weighted mean rate to 1
    assert(DataModel::IsValidDataModel());

} // DataModel constructor
75
76 //------------------------------------------------------------------------------------
77
GetDataModelName() const78 string DataModel::GetDataModelName() const
79 {
80 // "true" argument gets long version of name
81 return ToString(GetModelType(),true);
82 }
83
84 //------------------------------------------------------------------------------------
85
GetDataModelShortName() const86 string DataModel::GetDataModelShortName() const
87 {
88 // "false" argument gets short version of name
89 return ToString(GetModelType(),false);
90 }
91
92 //------------------------------------------------------------------------------------
93
ChooseRandomRates(long nsites) const94 DoubleVec1d DataModel::ChooseRandomRates(long nsites) const
95 {
96 DoubleVec1d rates;
97 Random& rand = registry.GetRandom();
98 double rate = m_catrates[0];
99 //choose a random rate to start
100 double r = rand.Float();
101 for (long i = 0; i < m_ncategories; ++i)
102 {
103 if (r < m_catprobs[i]) rate = m_catrates[i];
104 else r -= m_catprobs[i];
105 }
106
107 for (long site=0; site<nsites; site++)
108 {
109 if (rand.Float() <= m_acratio)
110 {
111 //choose a new rate.
112 r = rand.Float();
113 for (long i = 0; i < m_ncategories; ++i)
114 {
115 if (r < m_catprobs[i]) rate = m_catrates[i];
116 else r -= m_catprobs[i];
117 }
118 }
119 rates.push_back(rate);
120 }
121 return rates;
122
123 } // ChooseRandomRate
124
125 //------------------------------------------------------------------------------------
126
#if 0 // Vestigial but possibly future code
// Would switch the model to gamma-distributed rates: replaces the category
// rates with gamma quantiles for the stored shape parameter and gives
// every category equal probability.
// NOTE(review): relies on usegamma/gammashape members that are not part of
// the active DataModel -- confirm they exist before re-enabling.
void DataModel::SetGamma(bool gam)
{
    usegamma = gam;
    if (usegamma)
    {
        // DEBUG debug warning WARNING
        // need to error check presence of gammashape and ncats > 1?
        SetCatRates(gamma_rates(gammashape,m_ncategories));
        DoubleVec1d newprobs(m_ncategories,1.0/m_ncategories);
        SetCatProbabilities(newprobs);
    }
}
#endif
141
142 //------------------------------------------------------------------------------------
143
IsValidDataModel() const144 bool DataModel::IsValidDataModel() const
145 {
146 if (m_nbins < 1) return false;
147 if (m_nmarkers < 1) return false;
148
149 if (m_acratio < 0) return false;
150
151 if (m_ncategories < 1) return false;
152 size_t ncats = m_ncategories;
153 if (m_catrates.size() != ncats) return false;
154 if (m_catprobs.size() != ncats) return false;
155
156 size_t i;
157 double totalprob = 0.0;
158 for (i = 0; i < ncats; ++i)
159 {
160 if (m_catrates[i] < 0.0) return false;
161 if (m_catprobs[i] <= 0.0) return false;
162 if (m_catprobs[i] > 1.0) return false;
163 totalprob += m_catprobs[i];
164 }
165
166 if (fabs(1.0 - totalprob) > EPSILON) return false;
167
168 return true;
169 } // DataModel::IsValidDataModel
170
171 //------------------------------------------------------------------------------------
172
CreateDataModelReport() const173 StringVec1d DataModel::CreateDataModelReport() const
174 {
175
176 StringVec1d report;
177 string line;
178
179 if (m_ncategories > 1)
180 {
181 line = ToString(m_ncategories) + " rate categories with correlated length " +
182 ToString(1.0 / m_acratio);
183 report.push_back(line);
184 long cat;
185 for (cat = 0; cat < m_ncategories; ++cat)
186 {
187 line = "Relative rate " + ToString(m_catrates[cat]) + " Frequency " +
188 ToString(m_catprobs[cat]);
189 report.push_back(line);
190 }
191 }
192 if (m_relmurate != 1)
193 {
194 line = "The relative marker mutation rate for this model's segment was ";
195 line += ToString(m_relmurate) + ".";
196 report.push_back(line);
197 }
198
199 return report;
200 } // CreateDataModelReport
201
202 //------------------------------------------------------------------------------------
203
ToXML(size_t nspaces) const204 StringVec1d DataModel::ToXML(size_t nspaces) const
205 {
206 StringVec1d xmllines;
207 string line = MakeIndent(
208 MakeTagWithName(xmlstr::XML_TAG_MODEL,GetDataModelShortName()),
209 nspaces);
210 xmllines.push_back(line);
211
212 nspaces += INDENT_DEPTH;
213 string mytag(MakeTag(xmlstr::XML_TAG_NORMALIZE));
214 line = MakeIndent(mytag,nspaces) + ToStringTF(ShouldNormalize()) +
215 MakeCloseTag(mytag);
216 xmllines.push_back(line);
217 line = MakeIndent(MakeTag(xmlstr::XML_TAG_CATEGORIES),nspaces);
218 xmllines.push_back(line);
219
220 nspaces += INDENT_DEPTH;
221 mytag = MakeTag(xmlstr::XML_TAG_NUM_CATEGORIES);
222 line = MakeIndent(mytag,nspaces) + ToString(GetNcategories()) +
223 MakeCloseTag(mytag);
224 xmllines.push_back(line);
225 mytag = MakeTag(xmlstr::XML_TAG_RATES);
226 line = MakeIndent(mytag,nspaces) + ToString(GetCatRates(),6) +
227 MakeCloseTag(mytag);
228 xmllines.push_back(line);
229 mytag = MakeTag(xmlstr::XML_TAG_PROBABILITIES);
230 line = MakeIndent(mytag,nspaces) + ToString(GetCatProbabilities()) +
231 MakeCloseTag(mytag);
232 xmllines.push_back(line);
233 mytag = MakeTag(xmlstr::XML_TAG_AUTOCORRELATION);
234 line = MakeIndent(mytag,nspaces) + ToString(GetUserAcratio()) +
235 MakeCloseTag(mytag);
236 xmllines.push_back(line);
237 nspaces -= INDENT_DEPTH;
238
239 line = MakeIndent(MakeCloseTag(xmlstr::XML_TAG_CATEGORIES),nspaces);
240 xmllines.push_back(line);
241
242 mytag = MakeTag(xmlstr::XML_TAG_RELATIVE_MURATE);
243 line = MakeIndent(mytag,nspaces) + ToString(m_relmurate) +
244 MakeCloseTag(mytag);
245 xmllines.push_back(line);
246
247 nspaces -= INDENT_DEPTH;
248 line = MakeIndent(MakeCloseTag(xmlstr::XML_TAG_MODEL),nspaces);
249 xmllines.push_back(line);
250
251 return xmllines;
252 } // ToXML
253
254 //------------------------------------------------------------------------------------
255
ResetCatCells()256 void DataModel::ResetCatCells()
257 {
258 m_catcells.assign(m_ncategories,1.0);
259 } // DataModel::ResetCatCells
260
261 //------------------------------------------------------------------------------------
262
ScaleCatProbabilities()263 void DataModel::ScaleCatProbabilities()
264 {
265 long cat;
266 double totalprob = 0.0;
267 m_ncategories = m_catprobs.size();
268 for(cat = 0; cat < m_ncategories; cat++)
269 {
270 totalprob += m_catprobs[cat];
271 }
272 if (fabs(1.0 - totalprob) > EPSILON)
273 {
274 for(cat = 0; cat < m_ncategories; cat++)
275 {
276 m_catprobs[cat] = m_catprobs[cat] / totalprob;
277 }
278 }
279 } // DataModel::ScaleCatProbabilities
280
281 //------------------------------------------------------------------------------------
282
ScaleCatRates()283 void DataModel::ScaleCatRates()
284 {
285 // We renormalize the category rates to a weighted mean of 1.0
286 double meanrate = 0.0;
287 size_t rate;
288
289 if (m_catrates.size() != m_catprobs.size())
290 {
291 string msg = "DataModel::ScaleCatRates() was called before ";
292 msg += "m_catrates and m_catprobs were properly initialized.";
293 throw implementation_error(msg);
294 }
295
296 size_t numrates = m_catrates.size();
297
298 for (rate = 0; rate < numrates; ++rate)
299 {
300 meanrate += m_catrates[rate] * m_catprobs[rate];
301 }
302
303 for (rate = 0; rate < numrates; ++rate)
304 {
305 m_catrates[rate] /= meanrate;
306 }
307
308 } // DataModel::ScaleCatRates
309
310 //------------------------------------------------------------------------------------
311
// Called when a subtree of zero likelihood was detected at site 'posn'.
// If normalization is already on, throws zero_dl_error so the caller can
// reject the offending tree; otherwise switches normalization on and
// throws datalikenorm_error so the caller can retry with normalized
// likelihoods.  NB: the switch below deliberately falls through -- every
// listed model type ends at the same throw; only the assert differs.
void DataModel::TryToNormalizeAndThrow(long posn, model_type mtype)
{
    if (ShouldNormalize())
    {
        string errmsg("Encountered a subtree of zero likelihood at ");
        errmsg += "position " + ToString(posn) + ".";
        switch (mtype)
        {
            case F84:
            case GTR:
            case KAllele:
                assert(false);
                //Encountering a tree of zero data likelihood on a tree with DNA
                // is almost certainly a programming error, though occasionally
                // one gets a tree so large that mispairing tips can be
                // catastrophic.  Likewise, a K-allele model should allow mutations
                // of a single step to have reasonable likelihoods even if the
                // microsats are very disparate.
                //
                // However, a *user* is much more likely to have such a data set,
                // (and hopefully we debug the program thoroughly before release)
                // so we fall through here, throw the 'bad tree' error, and try
                // a different tree.  --Lucian
            case MixedKS:
                // The MixedKS model has been seen to have problems here when it
                // is optimizing.
            case Brownian:
            case Stepwise:
                throw zero_dl_error(errmsg);
        }
    }
    else
    {
        SetNormalize(true);
        throw datalikenorm_error("Datalikelihood normalization turned on.");
    }
}
349
350 //------------------------------------------------------------------------------------
351 //------------------------------------------------------------------------------------
352
// Construct a nucleotide model: forwards the rate-category arguments to
// DataModel (with the bin count fixed at defaults::nucleotideBins) and
// stores the four base frequencies, normalized so that they sum to 1.
// 'calcFreqsFromData' records whether the frequencies came from the data
// (affects ToXML() output); 'perBaseErrorRate' is the per-base sequencing
// error rate folded into tip likelihoods by StaticDataToLikes().
NucModel::NucModel(long nmarkers,
                   long numCategories,
                   DoubleVec1d categoryRates,
                   DoubleVec1d categoryProbabilities,
                   double userAutoCorrelationValue,
                   bool doNormalize,
                   double relMuRate,
                   double freqA,
                   double freqC,
                   double freqG,
                   double freqT,
                   bool calcFreqsFromData,
                   double perBaseErrorRate)
    : DataModel(nmarkers,
                numCategories,
                categoryRates,
                categoryProbabilities,
                userAutoCorrelationValue,
                doNormalize,
                defaults::nucleotideBins,
                relMuRate),
      m_basefreqs(defaults::nucleotideBins),
      m_freqsfromdata(calcFreqsFromData),
      m_perBaseErrorRate(perBaseErrorRate)
{
    m_basefreqs[baseA] = freqA;
    m_basefreqs[baseC] = freqC;
    m_basefreqs[baseG] = freqG;
    m_basefreqs[baseT] = freqT;
    NormalizeBaseFrequencies();
    assert(NucModel::IsValidDataModel());
} // NucModel constructor
385
386 //------------------------------------------------------------------------------------
387
NormalizeBaseFrequencies()388 void NucModel::NormalizeBaseFrequencies()
389 {
390 double sumOfFreqs = accumulate(m_basefreqs.begin(),m_basefreqs.end(),0.0);
391 transform(m_basefreqs.begin(), m_basefreqs.end(),
392 m_basefreqs.begin(),
393 bind2nd(divides<double>(),sumOfFreqs));
394 }
395
396 //------------------------------------------------------------------------------------
397
IsValidDataModel() const398 bool NucModel::IsValidDataModel() const
399 {
400 size_t index;
401 double totalfreqs = 0.0;
402 if(m_basefreqs.size() != 4) return false;
403 for (index = 0; index < m_basefreqs.size(); index++)
404 {
405 double thisFreq = m_basefreqs[index];
406 if(thisFreq <= 0.0) return false;
407 if(thisFreq >= 1.0) return false;
408 totalfreqs += thisFreq;
409 }
410 if (fabs(1.0 - totalfreqs) > EPSILON) return false;
411 return DataModel::IsValidDataModel();
412 } // NucModel::ValidBaseFrequencies
413
414 //------------------------------------------------------------------------------------
415
DataToLikes(const string & datum,long) const416 vector<double> NucModel::DataToLikes(const string& datum, long) const
417 {
418 return NucModel::StaticDataToLikes(datum, GetPerBaseErrorRate());
419 }
420
421 //------------------------------------------------------------------------------------
422 // This function implements the standard ambiguity codes for nucleotide data,
423 // returning a vector of four doubles indicating the likelihood for A, C, G, T in that order.
424
StaticDataToLikes(const string & datum,double perBaseErrorRate)425 vector<double> NucModel::StaticDataToLikes(const string& datum, double perBaseErrorRate)
426 {
427 // We assume this is a single nucleotide base, passed as a string only for generality
428 assert(datum.size() == 1);
429
430 vector<double> likes(BASES, 0.0); // initialize to zero
431 char nucleotide = datum[0];
432
433 // resolve code
434 switch(nucleotide)
435 {
436 case 'A':
437 likes[baseA] = 1.0;
438 break;
439
440 case 'C':
441 likes[baseC] = 1.0;
442 break;
443
444 case 'G':
445 likes[baseG] = 1.0;
446 break;
447
448 case 'T':
449 case 'U':
450 likes[baseT] = 1.0;
451 break;
452
453 case 'M':
454 likes[baseA] = 1.0;
455 likes[baseC] = 1.0;
456 break;
457
458 case 'R':
459 likes[baseA] = 1.0;
460 likes[baseG] = 1.0;
461 break;
462
463 case 'W':
464 likes[baseA] = 1.0;
465 likes[baseT] = 1.0;
466 break;
467
468 case 'S':
469 likes[baseC] = 1.0;
470 likes[baseG] = 1.0;
471 break;
472
473 case 'Y':
474 likes[baseC] = 1.0;
475 likes[baseT] = 1.0;
476 break;
477
478 case 'K':
479 likes[baseG] = 1.0;
480 likes[baseT] = 1.0;
481 break;
482
483 case 'V':
484 likes[baseA] = 1.0;
485 likes[baseC] = 1.0;
486 likes[baseG] = 1.0;
487 break;
488
489 case 'H':
490 likes[baseA] = 1.0;
491 likes[baseC] = 1.0;
492 likes[baseT] = 1.0;
493 break;
494
495 case 'D':
496 likes[baseA] = 1.0;
497 likes[baseG] = 1.0;
498 likes[baseT] = 1.0;
499 break;
500
501 case 'B':
502 likes[baseC] = 1.0;
503 likes[baseG] = 1.0;
504 likes[baseT] = 1.0;
505 break;
506
507 case 'N':
508 case 'O':
509 case 'X':
510 case '?':
511 case '-':
512 likes[baseA] = 1.0;
513 likes[baseC] = 1.0;
514 likes[baseG] = 1.0;
515 likes[baseT] = 1.0;
516 break;
517
518 default:
519 assert(false); // how did an unknown nucleotide get past proofreading?
520 likes[baseA] = 1.0;
521 likes[baseC] = 1.0;
522 likes[baseG] = 1.0;
523 likes[baseT] = 1.0;
524 break;
525 }
526
527 long num_ones = (long)(likes[baseA] + likes[baseC] + likes[baseG] + likes[baseT]);
528 double new_0 = (double)num_ones * perBaseErrorRate / 3.0;
529 double new_1 = 1.0 - (double)(4-num_ones) * perBaseErrorRate / 3.0;
530 likes[baseA] = (likes[baseA] > 0.5) ? new_1 : new_0;
531 likes[baseC] = (likes[baseC] > 0.5) ? new_1 : new_0;
532 likes[baseG] = (likes[baseG] > 0.5) ? new_1 : new_0;
533 likes[baseT] = (likes[baseT] > 0.5) ? new_1 : new_0;
534
535 return(likes);
536
537 } // DataToLikes
538
539 //------------------------------------------------------------------------------------
540
CreateDataModelReport() const541 StringVec1d NucModel::CreateDataModelReport() const
542 {
543 StringVec1d report = DataModel::CreateDataModelReport();
544
545 string line = "Base frequencies: " + ToString(m_basefreqs[baseA]) + ", ";
546 line += ToString(m_basefreqs[baseC]) + ", ";
547 line += ToString(m_basefreqs[baseG]) + ", ";
548 line += ToString(m_basefreqs[baseT]);
549 report.push_back(line);
550
551 return report;
552
553 } // NucModel::CreateDataModelReport
554
555 //------------------------------------------------------------------------------------
556
ToXML(size_t nspaces) const557 StringVec1d NucModel::ToXML(size_t nspaces) const
558 {
559 StringVec1d xmllines(DataModel::ToXML(nspaces));
560
561 nspaces += INDENT_DEPTH;
562
563 string line(MakeIndent(MakeTag(xmlstr::XML_TAG_BASE_FREQS),nspaces));
564 if (m_freqsfromdata)
565 line += " " + xmlstr::XML_TAG_CALCULATED + " ";
566 else
567 line += ToString(m_basefreqs,6);
568 line += MakeCloseTag(xmlstr::XML_TAG_BASE_FREQS);
569 StringVec1d::iterator endtag = --xmllines.end();
570 xmllines.insert(endtag,line);
571
572 string line2(MakeIndent(MakeTag(xmlstr::XML_TAG_PER_BASE_ERROR_RATE),nspaces));
573 line2 += ToString(m_perBaseErrorRate);
574 line2 += MakeCloseTag(xmlstr::XML_TAG_PER_BASE_ERROR_RATE);
575 endtag = --xmllines.end();
576 xmllines.insert(endtag,line2);
577
578 nspaces -= INDENT_DEPTH;
579 return xmllines;
580
581 } // NucModel::ToXML
582
583 //------------------------------------------------------------------------------------
584
ChooseAncestralState(long marker)585 DoubleVec1d NucModel::ChooseAncestralState(long marker)
586 {
587 DoubleVec1d result(BASES, 0.0);
588 double r = registry.GetRandom().Float();
589 long i;
590 for (i = 0; i < BASES; ++i)
591 {
592 if (r < m_basefreqs[i])
593 {
594 result[i] = 1.0;
595 return result;
596 }
597 else
598 {
599 r -= m_basefreqs[i];
600 }
601 }
602
603 // this code could be reached due to rounding errors
604 result[0] = 1.0;
605 return result;
606 } // ChooseAncestralState
607
608 //------------------------------------------------------------------------------------
609
CellToData(Cell_ptr cell,long marker) const610 string NucModel::CellToData(Cell_ptr cell, long marker) const
611 {
612 LongVec1d ones = cell->GetOnes(marker);
613 if (ones.size() == 1)
614 {
615 switch (ones[0])
616 {
617 case 0:
618 return "A";
619 case 1:
620 return "C";
621 case 2:
622 return "G";
623 case 3:
624 return "T";
625 default:
626 throw data_error("Tried to convert " + ToString(ones[0]+1)
627 + " to a nucleotide, but there should only be four bins.");
628 }
629 }
630 throw implementation_error("Cannot convert nucleotide data from the internal format"
631 " that is not simply one of the four bases.");
632 }
633
634 //------------------------------------------------------------------------------------
635 //------------------------------------------------------------------------------------
636
// Construct an F84 model.  Beyond the NucModel state this stores the
// transition/transversion ratio and initializes the likelihood-buffer
// bookkeeping: the raw arrays start NULL (allocated by Finalize() via
// AllocateBuffers()), the within-purine/pyrimidine frequencies start at
// the FLAGDOUBLE sentinel (computed in Finalize()), and computed1/2 start
// true.  Finalize() is called at the end, so the object is immediately
// usable for likelihood computation.
F84Model::F84Model(
    long nmarkers,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate,
    double freqA,
    double freqC,
    double freqG,
    double freqT,
    double ttRatio,
    bool calculateFreqsFromData,
    double perBaseErrorRate)
    : NucModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        relMuRate,
        freqA,
        freqC,
        freqG,
        freqT,
        calculateFreqsFromData,
        perBaseErrorRate),
      m_ttratio(ttRatio),
      // set some potentially useful defaults
      computed1(true), computed2(true),
      freqar(FLAGDOUBLE), freqcy(FLAGDOUBLE),
      freqgr(FLAGDOUBLE), freqty(FLAGDOUBLE),
      basefreqarray(NULL),
      daughter1(NULL), daughter2(NULL), target(NULL)
{
    // The data-likelihood buffer vectors such as expA1 are default-constructed
    // to an empty state, which is fine.
    assert(F84Model::IsValidDataModel());
    Finalize();

} // F84Model::F84Model
681
682 //------------------------------------------------------------------------------------
683
EmptyBuffers()684 void F84Model::EmptyBuffers()
685 {
686 if (basefreqarray)
687 {
688 delete [] basefreqarray[0];
689 delete [] basefreqarray;
690 }
691 if (daughter1)
692 {
693 delete [] daughter1[0];
694 delete [] daughter1;
695 }
696 if (daughter2)
697 {
698 delete [] daughter2[0];
699 delete [] daughter2;
700 }
701 if (target)
702 {
703 delete [] target[0];
704 delete [] target;
705 }
706
707 } // F84Model::EmptyBuffers
708
709 //------------------------------------------------------------------------------------
710
Clone() const711 DataModel* F84Model::Clone() const
712 {
713 DataModel* newmodel = new F84Model(*this);
714 return newmodel;
715 }
716
717 //------------------------------------------------------------------------------------
718
// Allocate the per-category working arrays used in likelihood computation.
// Layout invariant (relied upon by EmptyBuffers() and the memcpy calls in
// CopyBuffers()/ComputeSiteDLs()): each name is an array of m_ncategories
// row pointers, with element [0] owning one contiguous block of
// m_ncategories*BASES doubles and the other rows pointing into it.
// The basefreqarray rows are filled with the equilibrium base frequencies.
void F84Model::AllocateBuffers()
{
    basefreqarray = new double* [m_ncategories];
    daughter1 = new double* [m_ncategories];
    daughter2 = new double* [m_ncategories];
    target = new double* [m_ncategories];
    basefreqarray[0] = new double [m_ncategories*BASES];
    daughter1[0] = new double [m_ncategories*BASES];
    daughter2[0] = new double [m_ncategories*BASES];
    target[0] = new double [m_ncategories*BASES];
    long cat;

    for (cat = 0; cat < m_ncategories; ++cat)
    {
        // Point row 'cat' into the contiguous block, then seed the base
        // frequencies for that category.
        basefreqarray[cat] = basefreqarray[0] + cat*BASES;
        daughter1[cat] = daughter1[0] + cat*BASES;
        daughter2[cat] = daughter2[0] + cat*BASES;
        target[cat] = target[0] + cat*BASES;
        basefreqarray[cat][baseA] = m_basefreqs[baseA];
        basefreqarray[cat][baseC] = m_basefreqs[baseC];
        basefreqarray[cat][baseG] = m_basefreqs[baseG];
        basefreqarray[cat][baseT] = m_basefreqs[baseT];
    }

} // F84Model::AllocateBuffers
744
745 //------------------------------------------------------------------------------------
746
// Destructor: frees the likelihood work arrays.
// If the arrays were never allocated we suppose the object was not
// Finalized; the pointers are then NULL and EmptyBuffers() skips them.
F84Model::~F84Model()
{
    EmptyBuffers();

} // F84Model::~F84Model
754
755 //------------------------------------------------------------------------------------
756
CopyMembers(const F84Model & src)757 void F84Model::CopyMembers(const F84Model& src)
758 {
759 m_ttratio = src.m_ttratio;
760
761 freqar = src.freqar;
762 freqcy = src.freqcy;
763 freqgr = src.freqgr;
764 freqty = src.freqty;
765
766 computed1 = src.computed1;
767 computed2 = src.computed2;
768
769 // don't copy the data likelihood buffers
770 }
771
772 //------------------------------------------------------------------------------------
773
IsValidDataModel() const774 bool F84Model::IsValidDataModel() const
775 {
776 if (m_ttratio <= 0.5) return false;
777 return NucModel::IsValidDataModel();
778
779 } // F84Model::IsValidDataModel
780
781 //------------------------------------------------------------------------------------
782
// Copy the data-likelihood buffers from 'src': the vector-valued caches
// by plain assignment, then a fresh reallocation of this object's raw
// arrays followed by a memcpy of src's contents (basefreqarray is not
// copied -- AllocateBuffers() refills it from m_basefreqs, which the
// caller is expected to have copied already via NucModel assignment).
void F84Model::CopyBuffers(const F84Model& src)
{
    xcatrates = src.xcatrates;
    ycatrates = src.ycatrates;
    expA1 = src.expA1;
    expB1 = src.expB1;
    expC1 = src.expC1;
    expA2 = src.expA2;
    expB2 = src.expB2;
    expC2 = src.expC2;
    catlikes = src.catlikes;

    // Replace our arrays with freshly allocated ones of the right size.
    EmptyBuffers();
    AllocateBuffers();

    // Do not try to copy the buffers if they are unallocated
    if (src.daughter1)
        memcpy(daughter1[0],src.daughter1[0],m_ncategories*BASES*sizeof(double));
    if (src.daughter2)
        memcpy(daughter2[0],src.daughter2[0],m_ncategories*BASES*sizeof(double));
    if (src.target)
        memcpy(target[0],src.target[0],m_ncategories*BASES*sizeof(double));

} // F84Model::CopyBuffers
807
808 //------------------------------------------------------------------------------------
809
// Copy constructor.  The raw buffer pointers are nulled first so that
// CopyBuffers() (which calls EmptyBuffers()) does not try to free
// uninitialized memory before allocating this object's own buffers.
F84Model::F84Model(const F84Model& src)
    : NucModel(src), basefreqarray(NULL), daughter1(NULL), daughter2(NULL),
      target(NULL)
{
    CopyMembers(src);
    CopyBuffers(src);
}
819
820 //------------------------------------------------------------------------------------
821
// Assignment operator: copies the NucModel base state, the scalar F84
// members, and the likelihood buffers.
// FIX: added a self-assignment guard.  CopyBuffers() frees and reallocates
// this object's work arrays; under self-assignment it would then memcpy
// from the object's own freshly reallocated (uninitialized) storage.
F84Model& F84Model::operator=(const F84Model& src)
{
    if (this != &src)
    {
        NucModel::operator=(src);
        CopyMembers(src);
        CopyBuffers(src);
    }
    return *this;
}
829
830 //------------------------------------------------------------------------------------
831
SetTTratio(double tr)832 void F84Model::SetTTratio(double tr)
833 {
834 if (tr <= 0.5)
835 {
836 data_error e("Transition/transversion ratio must be > 0.5");
837 throw e;
838 }
839 m_ttratio = tr;
840 }
841
842 //------------------------------------------------------------------------------------
843
// Precompute the F84 machinery from the current base frequencies and
// ttratio: the conditional within-purine/within-pyrimidine frequencies
// (freqar/freqgr, freqcy/freqty), the x/y rate components rescaled by
// 'fracchange' (algebra adapted from PHYLIP's DNAMLK, as noted below),
// the per-category x/y rates, and the scratch vectors/arrays used in
// likelihood computation.  AllocateBuffers() is invoked here, so the
// object is only ready for likelihood work after Finalize() has run.
void F84Model::Finalize()
{
    double pur, pyr, ag, ct, m, n, x, y;
    long cat;

    double freqa(m_basefreqs[baseA]), freqc(m_basefreqs[baseC]),
        freqg(m_basefreqs[baseG]), freqt(m_basefreqs[baseT]);

    pur = freqa + freqg;   // total purine frequency
    pyr = freqc + freqt;   // total pyrimidine frequency
    freqar = freqa / pur;  // P(A | purine)
    freqcy = freqc / pyr;  // P(C | pyrimidine)
    freqgr = freqg / pur;  // P(G | purine)
    freqty = freqt / pyr;  // P(T | pyrimidine)

    ag = freqa * freqg;
    ct = freqc * freqt;
    m = m_ttratio * pur * pyr - (ag + ct);
    n = ag / pur + ct / pyr;

    // code stolen from DNAMLK here
    y = m / (m + n);
    // if globalfreqs has failed....  (clamp y into [0,1])
    if (y < 0) y = 0;
    else if (y > 1) y = 1;
    x = 1.0 - y;
    // Scale so that branch lengths are in expected substitutions per site;
    // note x and y come out negative (divided by -fracchange), so the
    // exp(length * ...) terms in RescaleLength1/2 decay with length.
    double fracchange = y * (2.0 * freqa * freqgr + 2.0 * freqc * freqty)
        + x * (1.0 - freqa * freqa - freqc * freqc - freqg * freqg - freqt * freqt);
    y /= - fracchange;
    x /= - fracchange;

    // Per-category rates: each category scales x and y by its relative rate.
    for (cat = 0; cat < m_ncategories; ++cat)
    {
        xcatrates.push_back(x*m_catrates[cat]);
        ycatrates.push_back(y*m_catrates[cat]);
    }

    // Allocate additional space for likelihood calculations.
    expA1.insert(expA1.begin(),m_ncategories,0.0);
    expA2.insert(expA2.begin(),m_ncategories,0.0);
    expB1.insert(expB1.begin(),m_ncategories,0.0);
    expB2.insert(expB2.begin(),m_ncategories,0.0);
    expC1.insert(expC1.begin(),m_ncategories,0.0);
    expC2.insert(expC2.begin(),m_ncategories,0.0);

    AllocateBuffers();

    // Per-marker, per-category likelihood scratch space.
    double zero = 0.0;
    catlikes = CreateVec2d(m_nmarkers,m_ncategories,zero);

} // F84Model::Finalize
895
896 //------------------------------------------------------------------------------------
897
// Precompute the per-category exponential terms (expA1/expB1/expC1) for
// daughter branch 1 of raw length 'length1', after scaling the length by
// this segment's relative mutation rate.  A length at or below the
// FLAGDOUBLE sentinel marks the branch as not-to-be-computed: computed1
// is cleared and ComputeSiteDLs() substitutes the equilibrium base
// frequencies for that daughter instead.
void F84Model::RescaleLength1(double length1)
{
    long cat;
    double n;

    if (length1 > FLAGDOUBLE)
    {
#ifdef DEBUG_VARIABLES
        if (printdbgvar > 0)
        {
            cout << "RescaleLength1 raw length1: " << length1 << endl;
            //cout << "   m_relmurate: " << m_relmurate << endl;
        }
#endif
        length1 *= m_relmurate;   // convert to this segment's mutational units
        computed1 = true;

        for(cat = 0; cat < m_ncategories; cat++)
        {
            // xcatrates/ycatrates are negative (see Finalize()), so these
            // exponentials decay as the branch gets longer.
            n = exp(length1 * xcatrates[cat]);
            expA1[cat] = 1.0 - n;
            expB1[cat] = n * exp(length1 * ycatrates[cat]);
            expC1[cat] = n - expB1[cat];
#ifdef DEBUG_VARIABLES
            if (0)
            //if (printdbgvar > 0)
            {
                cout << "  cat: " << cat << endl;
                cout << "    n: " << n << endl;
                cout << "    xcatrates: " << xcatrates[cat] << endl;
                cout << "    ycatrates: " << ycatrates[cat] << endl;
                cout << "    expA1: "<< expA1[cat] << endl;
                cout << "    expB1: "<< expB1[cat] << endl;
                cout << "    expC1: "<< expC1[cat] << endl;
            }
#endif
        }
    }
    else
    {
        // Sentinel length: no terms computed for this daughter.
        computed1 = false;
    }
#ifdef DEBUG_VARIABLES
    if (printdbgvar > 0)
    {
        cout << "  scaled length1: " << length1 << " computed1: "<< computed1 << endl << endl;
    }
#endif

} // F84Model::RescaleLength1
948
949 //------------------------------------------------------------------------------------
950
// Mirror of RescaleLength1() for daughter branch 2: precompute the
// per-category exponential terms (expA2/expB2/expC2) for a branch of raw
// length 'length2' scaled by the relative mutation rate, or clear
// computed2 when the FLAGDOUBLE sentinel indicates the branch is absent
// (ComputeSiteDLs() then substitutes the equilibrium base frequencies).
void F84Model::RescaleLength2(double length2)
{
    long cat;
    double n;

    if (length2 > FLAGDOUBLE)
    {
#ifdef DEBUG_VARIABLES
        if (printdbgvar > 0)
        {
            cout << "RescaleLength2 raw length2: " << length2 << endl;
            //cout << "   m_relmurate: " << m_relmurate << endl;
        }
#endif
        length2 *= m_relmurate;   // convert to this segment's mutational units
        computed2 = true;

        for(cat = 0; cat < m_ncategories; cat++)
        {
            // Same decaying-exponential structure as RescaleLength1().
            n = exp(length2 * xcatrates[cat]);
            expA2[cat] = 1.0 - n;
            expB2[cat] = n * exp(length2 * ycatrates[cat]);
            expC2[cat] = n - expB2[cat];
#ifdef DEBUG_VARIABLES
            if (0)
            //if (printdbgvar > 0)
            {
                cout << "  cat: " << cat << endl;
                cout << "    n: " << n << endl;
                cout << "    xcatrates: " << xcatrates[cat] << endl;
                cout << "    ycatrates: " << ycatrates[cat] << endl;
                cout << "    expA2: "<< expA2[cat] << endl;
                cout << "    expB2: "<< expB2[cat] << endl;
                cout << "    expC2: "<< expC2[cat] << endl;
            }
#endif
        }
    }
    else
    {
        // Sentinel length: no terms computed for this daughter.
        computed2 = false;
    }
#ifdef DEBUG_VARIABLES
    if (printdbgvar > 0)
    {
        cout << "  scaled length2: " << length2 << " computed2: "<< computed2 << endl << endl;
    }
#endif
}
1000
1001 //------------------------------------------------------------------------------------
1002
// Convenience wrapper: precompute the transition-probability terms for
// both daughter branch lengths at once (see RescaleLength1/2 for the
// FLAGDOUBLE sentinel semantics).
void F84Model::RescaleLengths(double length1, double length2)
{
    RescaleLength1(length1);
    RescaleLength2(length2);

} // F84Model::RescaleLengths
1009
1010 //------------------------------------------------------------------------------------
1011
// Combine the per-category, per-base conditional likelihoods ("site
// DLs") of two daughter branches into those of their parent node under
// the F84 model.
//
// The exponential branch-length terms expA1/expB1/expC1 (branch 1) and
// expA2/expB2/expC2 (branch 2), and the computed1/computed2 flags, are
// assumed to have been set up from the branch lengths beforehand --
// presumably by the F84 length-rescaling routine; confirm against the
// caller.  A daughter whose flag is false (one-legged coalescence)
// contributes only the equilibrium base frequencies.
//
// Returns the member buffer 'target' ([category][base]); the buffer is
// overwritten on every call.
double** F84Model::ComputeSiteDLs(double** siteDLs1, double** siteDLs2)
{
    double sumAll, sumPur, sumPyr;

    long cat;
#ifdef DEBUG_VARIABLES
    if (0)
    //if (printdbgvar > 0)
    {
        cout << "basefreqA: "<< m_basefreqs[baseA] << endl;
        cout << "basefreqC: "<< m_basefreqs[baseC] << endl;
        cout << "basefreqG: "<< m_basefreqs[baseG] << endl;
        cout << "basefreqT: "<< m_basefreqs[baseT] << endl;
        if (computed1)
            cout << "computed1: true"<< endl;
        else
            cout << "computed1: false"<< endl;
        if (computed2)
            cout << "computed2: true"<< endl;
        else
            cout << "computed2: false"<< endl;
    }
#endif

    if (computed1)
    {
        for (cat = 0; cat < m_ncategories; ++cat)
        {
            double* catDLs1 = siteDLs1[cat];
            // general term: change to any base, weighted by frequency
            sumAll = expA1[cat] *
                (m_basefreqs[baseA]*catDLs1[baseA] +
                 m_basefreqs[baseC]*catDLs1[baseC] +
                 m_basefreqs[baseG]*catDLs1[baseG] +
                 m_basefreqs[baseT]*catDLs1[baseT]);

            // within-purine and within-pyrimidine (transition) terms;
            // freqar/freqgr/freqcy/freqty are assumed to be the base
            // frequencies normalized within their purine/pyrimidine
            // class -- TODO confirm where they are computed
            sumPur = freqar*catDLs1[baseA] + freqgr*catDLs1[baseG];
            sumPyr = freqcy*catDLs1[baseC] + freqty*catDLs1[baseT];

#ifdef DEBUG_VARIABLES
            if (printdbgvar > 0)
            {
                //cout<< "cat: " << cat << endl;
                //cout<< "sumA1: " << sumAll << endl;
                //cout<< "freqar: " << freqar << endl;
                //cout<< "freqcy: " << freqcy << endl;
                //cout<< "freqgr: " << freqgr << endl;
                //cout<< "freqty: " << freqty << endl;
                //cout<< "sumPur1: " << sumPur << endl;
                //cout<< "sumPyr1: " << sumPyr << endl;
                cout<< "dls1: " << catDLs1[baseA] << " "<< catDLs1[baseC] << " "<< catDLs1[baseG] << " "<< catDLs1[baseT] << endl;
                //cout<< "dls1A: " << catDLs1[baseA] << endl;
                //cout<< "dls1C: " << catDLs1[baseC] << endl;
                //cout<< "dls1G: " << catDLs1[baseG] << endl;
                //cout<< "dls1T: " << catDLs1[baseT] << endl;
                //cout<< "expA1: " << expA1[cat] <<endl;
                //cout<< "expB1: " << expB1[cat] <<endl;
                //cout<< "expC1: " << expC1[cat] <<endl<<endl;
            }
#endif
            double expC1cat = expC1[cat];  // hoist the common factor
            daughter1[cat][baseA] = sumAll + expB1[cat]*catDLs1[baseA] +
                expC1cat*sumPur;
            daughter1[cat][baseC] = sumAll + expB1[cat]*catDLs1[baseC] +
                expC1cat*sumPyr;
            daughter1[cat][baseG] = sumAll + expB1[cat]*catDLs1[baseG] +
                expC1cat*sumPur;
            daughter1[cat][baseT] = sumAll + expB1[cat]*catDLs1[baseT] +
                expC1cat*sumPyr;
        }
    }
    else
    {
        // branch not computed: fill with equilibrium base frequencies
        memcpy(daughter1[0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    // same computation for the second daughter branch
    if (computed2)
    {
        for (cat = 0; cat < m_ncategories; cat++)
        {
            double* catDLs2 = siteDLs2[cat];
            sumAll = expA2[cat] *
                (m_basefreqs[baseA]*catDLs2[baseA] +
                 m_basefreqs[baseC]*catDLs2[baseC] +
                 m_basefreqs[baseG]*catDLs2[baseG] +
                 m_basefreqs[baseT]*catDLs2[baseT]);

            sumPur = freqar*catDLs2[baseA] + freqgr*catDLs2[baseG];
            sumPyr = freqcy*catDLs2[baseC] + freqty*catDLs2[baseT];

#ifdef DEBUG_VARIABLES
            if (printdbgvar > 0)
            {
                //cout<< "cat: " << cat << endl;
                //cout<< "sumA2: " << sumAll << endl;// JRM debug
                //cout<< "sumPur2: " << sumPur << endl;
                //cout<< "sumPyr2: " << sumPyr << endl;
                //cout<< "dls2A: " << catDLs2[baseA] << endl;
                //cout<< "dls2C: " << catDLs2[baseC] << endl;
                //cout<< "dls2G: " << catDLs2[baseG] << endl;
                //cout<< "dls2T: " << catDLs2[baseT] << endl;
                //cout<< "expA2: " << expA2[cat] <<endl;
                //cout<< "expB2: " << expB2[cat] <<endl;
                //cout<< "expC2: " << expC2[cat] <<endl<<endl;
                cout<< "dls2: " << catDLs2[baseA] << " "<< catDLs2[baseC] << " "<< catDLs2[baseG] << " "<< catDLs2[baseT] << endl;
            }
#endif

            double expC2cat = expC2[cat];
            daughter2[cat][baseA] = sumAll + expB2[cat]*catDLs2[baseA] +
                expC2cat*sumPur;
            daughter2[cat][baseC] = sumAll + expB2[cat]*catDLs2[baseC] +
                expC2cat*sumPyr;
            daughter2[cat][baseG] = sumAll + expB2[cat]*catDLs2[baseG] +
                expC2cat*sumPur;
            daughter2[cat][baseT] = sumAll + expB2[cat]*catDLs2[baseT] +
                expC2cat*sumPyr;
        }
    }
    else
    {
        memcpy(daughter2[0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    // parent likelihood is the product of the two daughter contributions
    for (cat = 0; cat < m_ncategories; cat++)
    {
        target[cat][baseA] = daughter1[cat][baseA] *
            daughter2[cat][baseA];
        target[cat][baseC] = daughter1[cat][baseC] *
            daughter2[cat][baseC];
        target[cat][baseG] = daughter1[cat][baseG] *
            daughter2[cat][baseG];
        target[cat][baseT] = daughter1[cat][baseT] *
            daughter2[cat][baseT];
#ifdef DEBUG_VARIABLES
        if (printdbgvar > 0)
        {
            //cout<< "cat: " << cat << endl;
            //cout<< "targetA: " << target[cat][baseA] <<endl;
            //cout<< "targetC: " << target[cat][baseC] <<endl;
            //cout<< "targetG: " << target[cat][baseG] <<endl;
            //cout<< "targetT: " << target[cat][baseT] <<endl<<endl;
            cout<< "target: " << target[cat][baseA] << " "<< target[cat][baseC] << " "<< target[cat][baseG] << " "<< target[cat][baseT] << endl;
        }
#endif
    }

    return target;

} // F84Model::ComputeSiteDLs
1161
1162 //------------------------------------------------------------------------------------
1163
// Compute the log data likelihood of a subtree for the markers in
// [startmarker, endmarker), whose root conditional likelihoods live in
// rootdls.  posn is the site position of startmarker and is used both
// to look up per-site normalizers and to fill the per-site category
// likelihood cache (catlikes) consumed by ComputeCatDL().
double F84Model::ComputeSubtreeDL(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat;
    DoubleVec1d prior(m_ncategories);
    long firstposn = posn;
    int im = 0;  // marker counter; used only by the disabled debug dump

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;

        for (cat = 0; cat < m_ncategories; cat++)
        {
            // likelihood of this marker in this category, weighted by
            // the equilibrium base frequencies
            prior[cat] = m_basefreqs[baseA]*marker[cat][baseA] +
                m_basefreqs[baseC]*marker[cat][baseC] +
                m_basefreqs[baseG]*marker[cat][baseG] +
                m_basefreqs[baseT]*marker[cat][baseT];

            subtotal += m_catprobs[cat] * prior[cat];
#if 0
            cout << "marker " << im
                 << " cat " << cat
                 << " probs " << marker[cat][baseA]
                 << " " << marker[cat][baseC]
                 << " " << marker[cat][baseG]
                 << " " << marker[cat][baseT]
                 << " prior " << prior[cat]
                 << endl;
#endif
        }
        im++;

        // a zero subtotal means underflow or bad data; delegate to the
        // shared DataModel handler
        if (!subtotal)
        {
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        if (ShouldNormalize())
        {
            // add back the per-site normalizer removed lower in the tree
            total += (log(subtotal) + rootdls.GetNorms(posn));
        }
        else
        {
            total += log(subtotal);
        }

        // de-normalization not needed here, since we are only interested in
        // the ratio
        if (m_ncategories > 1)
        {
            for (cat = 0; cat < m_ncategories; cat++)
                catlikes[posn][cat] = prior[cat]/subtotal;
        }
        ++posn;
    }

    // fold in the rate-category autocorrelation term
    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    // cout << "total: " << total << endl;
    return total;

} // F84Model::ComputeSubtreeDL
1228
1229 //------------------------------------------------------------------------------------
1230
ComputeCatDL(long startmarker,long endmarker)1231 double F84Model::ComputeCatDL(long startmarker, long endmarker)
1232 {
1233 double subtotal;
1234 long marker, cat;
1235
1236 DoubleVec1d like = m_catcells;
1237 DoubleVec1d nulike(m_ncategories);
1238
1239 for (marker = startmarker; marker != endmarker; ++marker)
1240 {
1241 subtotal = 0.0;
1242 for (cat = 0; cat < m_ncategories; cat++)
1243 subtotal += m_catprobs[cat] * like[cat];
1244
1245 subtotal *= m_acratio;
1246
1247 for (cat = 0; cat < m_ncategories; cat++)
1248 nulike[cat] = catlikes[marker][cat] *
1249 (subtotal + m_ucratio * like[cat]);
1250
1251 // the following puts the nulike values into like. It
1252 // also puts the like values into nulike, but we will not
1253 // be using values from nulike so we don't care.
1254 like.swap(nulike);
1255 }
1256
1257 subtotal = 0.0;
1258 for (cat = 0; cat < m_ncategories; cat++)
1259 subtotal += m_catprobs[cat] * like[cat];
1260
1261 // the following puts the like values into catcells for
1262 // long-term storage. It also puts the catcells values into
1263 // like, but we don't care.
1264 m_catcells.swap(like);
1265
1266 return log(subtotal);
1267
1268 } // F84Model::ComputeCatDL
1269
1270 //------------------------------------------------------------------------------------
1271
CreateDataModelReport() const1272 StringVec1d F84Model::CreateDataModelReport() const
1273 {
1274 StringVec1d report = NucModel::CreateDataModelReport();
1275
1276 string line = "Transition/transversion ratio: " + ToString(m_ttratio);
1277 report.push_back(line);
1278
1279 return report;
1280
1281 } // F84Model::CreateDataModelReport
1282
1283 //------------------------------------------------------------------------------------
1284
ToXML(size_t nspaces) const1285 StringVec1d F84Model::ToXML(size_t nspaces) const
1286 {
1287 StringVec1d xmllines(NucModel::ToXML(nspaces));
1288
1289 nspaces += INDENT_DEPTH;
1290 string line(MakeIndent(MakeTag(xmlstr::XML_TAG_TTRATIO),nspaces));
1291 line += ToString(GetTTratio());
1292 line += MakeCloseTag(xmlstr::XML_TAG_TTRATIO);
1293 nspaces -= INDENT_DEPTH;
1294
1295 StringVec1d::iterator endtag = --xmllines.end();
1296 xmllines.insert(endtag,line);
1297
1298 return xmllines;
1299
1300 } // F84Model::ToXML
1301
1302 //------------------------------------------------------------------------------------
1303
1304 // This routine simulates data for a single node under the F84
1305 // model. It assumes that the given branch lengths have already
1306 // been rescaled for the rate category desired.
1307 // Nomenclature follows _Inferring Phylogenies_ pp. 202-203,
1308 // printing date 2004 (earlier printings have different page numberings)
1309
1310 // The second argument is unused since nucleotides don't have differing
1311 // number of states by marker position
// Simulate one marker's state at the end of a branch of the given
// (already rate-rescaled) length, starting from 'state' (a one-hot
// vector over the four bases), under the F84 model.  Returns a one-hot
// vector; returns 'state' unchanged when no event occurs.
// whichmarker is unused: nucleotides have the same state count at
// every marker position.
DoubleVec1d F84Model::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    // equilibrium base frequencies
    double freqa(m_basefreqs[baseA]), freqc(m_basefreqs[baseC]), freqg(m_basefreqs[baseG]), freqt(m_basefreqs[baseT]);
    double pur = freqa + freqg;   // total purine frequency
    double pyr = freqc + freqt;   // total pyrimidine frequency
    // F84 rate terms (see the Felsenstein reference cited above)
    double beta = 1.0 / (2.0 * pur * pyr * (1.0 + m_ttratio));
    double alpha = (pur * pyr * m_ttratio - freqa * freqg - freqc * freqt) /
        (2.0 * (1.0 + m_ttratio) * (pyr * freqa * freqg + pur * freqc * freqt));

    // cumulative distribution over A, C, G for the general-pool draw
    // (T is the fall-through case)
    DoubleVec1d cumprob;
    cumprob.push_back(freqa);
    cumprob.push_back(freqa + freqc);
    cumprob.push_back(freqa + freqc + freqg);

    double expB = exp(-beta * branchlength);

    DoubleVec1d answer(BASES, 0.0);  // one-hot result vector

    double general = 1.0 - expB;     // probability of a general event
    double r = registry.GetRandom().Float();

    // We compute a chance of drawing from a pool of all four nucleotides
    // in proportion to their frequency; if that doesn't happen, we
    // compute a chance of drawing from a pool of only purines or only
    // pyrimidines; if that doesn't happen either there is no change
    // and we return the initial state.

    if (r < general)
    {
        // perform a draw from the general pool
        double r2 = registry.GetRandom().Float();
        long i;
        for (i = 0; i < BASES-1; ++i)
        {
            if (r2 < cumprob[i])
            {
                answer[i] = 1.0;
                return answer;
            }
        }
        // flowthrough if previous bases not picked
        answer[BASES-1] = 1.0;
        return answer;
    }
    else
    {
        r -= general;
        // probability of a within-class (transition) event
        double transition = expB * (1.0 - exp(-alpha * branchlength));
        if (r < transition)
        {
            // perform a draw from the transition pool: stay within the
            // starting state's purine/pyrimidine class
            double r2 = registry.GetRandom().Float();
            if (state[baseA] == 1.0 || state[baseG] == 1.0) // purine
            {
                if (r2 < freqa / (freqa + freqg))
                {
                    answer[baseA] = 1.0;
                }
                else
                {
                    answer[baseG] = 1.0;
                }
            }
            else // pyrimidine
            {
                if (r2 < freqc / (freqc + freqt))
                {
                    answer[baseC] = 1.0;
                }
                else
                {
                    answer[baseT] = 1.0;
                }
            }
            return answer;
        }
    }

    // otherwise, no event happens
    return state;

} // F84Model::SimulateMarker
1394
1395 //------------------------------------------------------------------------------------
1396 //------------------------------------------------------------------------------------
1397
// GTRModel constructor.  Forwards the shared nucleotide-model settings
// to NucModel (the supplied base frequencies are used as-is) and stores
// the six GTR exchange rates; Finalize() then allocates the work
// buffers and builds the eigensystem used in likelihood computation.
GTRModel::GTRModel(long nmarkers,
                   long numCategories,
                   DoubleVec1d categoryRates,
                   DoubleVec1d categoryProbabilities,
                   double userAutoCorrelationValue,
                   bool doNormalize,
                   double relMuRate,
                   double freqA,
                   double freqC,
                   double freqG,
                   double freqT,
                   double freqAC,
                   double freqAG,
                   double freqAT,
                   double freqCG,
                   double freqCT,
                   double freqTG,
                   double perBaseErrorRate)
    : NucModel(nmarkers,
               numCategories,
               categoryRates,
               categoryProbabilities,
               userAutoCorrelationValue,
               doNormalize,
               relMuRate,
               freqA,
               freqC,
               freqG,
               freqT,
               false, // calculate freqs from data
               perBaseErrorRate),
      AC(freqAC),
      AG(freqAG),
      AT(freqAT),
      CG(freqCG),
      CT(freqCT),
      TG(freqTG),
      // set some potentially useful defaults
      basefreqarray(NULL),
      daughter(NULL),target(NULL)
{
    // reset the names from the base values
    computed.assign(2,true);  // both branches assumed computable until
                              // RescaleLengths says otherwise
    DoubleVec1d empty(BASES,0.0);
    scratch.assign(BASES,empty);  // BASES x BASES matrix workspace
    Finalize();

    assert(GTRModel::IsValidDataModel());
} // GTRModel::GTRModel
1447
1448 //------------------------------------------------------------------------------------
1449
// Copy constructor.  Pointers are nulled first so CopyMembers() can
// safely call EmptyBuffers()/AllocateBuffers(); the contents of the
// 'daughter' and 'target' scratch buffers are deliberately not copied.
GTRModel::GTRModel(const GTRModel& src)
    : NucModel(src), basefreqarray(NULL),daughter(NULL),target(NULL)
{
    CopyMembers(src); // doesn't copy target or daughter buffers!
} // GTRModel copy ctor
1455
1456 //------------------------------------------------------------------------------------
1457
// Destructor:  releases the manually-managed likelihood buffers.
GTRModel::~GTRModel()
{
    EmptyBuffers();
} // GTRModel dtor
1462
1463 //------------------------------------------------------------------------------------
1464
Clone() const1465 DataModel* GTRModel::Clone() const
1466 {
1467 DataModel* newmodel = new GTRModel(*this);
1468 return newmodel;
1469 }
1470
1471 //------------------------------------------------------------------------------------
1472
// Assignment operator.
// Guards against self-assignment:  CopyMembers() frees and reallocates
// this object's buffers before memcpy-ing from src, so `a = a` would
// needlessly destroy and rebuild a's own storage.  For distinct
// objects the behavior is unchanged.
GTRModel& GTRModel::operator=(const GTRModel& src)
{
    if (this != &src)
    {
        NucModel::operator=(src);
        CopyMembers(src);
    }
    return *this;
}
1479
1480 //------------------------------------------------------------------------------------
1481
// Allocate the work buffers used by ComputeSiteDLs() and friends.
// basefreqarray, target, and each daughter[br] are [m_ncategories][BASES]
// tables; each table's rows are carved out of one contiguous allocation
// (hence the pointer arithmetic below), and the two daughter tables
// share a single contiguous slab.  EmptyBuffers() is the matching
// deallocator.
void GTRModel::AllocateBuffers()
{
    // per-marker, per-category likelihood cache for ComputeCatDL()
    DoubleVec1d zero(m_ncategories,0.0);
    catlikes.assign(m_nmarkers,zero);

    // pchange[branch][category] is a BASES x BASES matrix, zeroed here
    DoubleVec1d base1(BASES,0);
    DoubleVec2d base2(BASES,base1);
    DoubleVec3d empty(m_ncategories,base2);
    pchange.assign(2,empty);

    daughter = new double** [2];
    target = new double* [m_ncategories];
    basefreqarray = new double* [m_ncategories];
    daughter[0] = new double* [2*m_ncategories];        // row pointers, both branches
    daughter[0][0] = new double [2*m_ncategories*BASES]; // one slab for both branches
    target[0] = new double [m_ncategories*BASES];
    basefreqarray[0] = new double [m_ncategories*BASES];
    // branch 1 views the second half of the row-pointer and data slabs
    daughter[1] = daughter[0] + m_ncategories;
    daughter[1][0] = daughter[0][0] + m_ncategories*BASES;
    long cat;
    for (cat = 0; cat < m_ncategories; ++cat)
    {
        // point each category row into its slab ...
        basefreqarray[cat] = basefreqarray[0] + cat*BASES;
        daughter[0][cat] = daughter[0][0] + cat*BASES;
        daughter[1][cat] = daughter[1][0] + cat*BASES;
        target[cat] = target[0] + cat*BASES;
        long base;
        // ... and seed basefreqarray with the equilibrium frequencies
        for(base = baseA; base <= baseT; ++base)
            basefreqarray[cat][base] = m_basefreqs[base];
    }
} // GTRModel::AllocateBuffers
1513
1514 //------------------------------------------------------------------------------------
1515
EmptyBuffers()1516 void GTRModel::EmptyBuffers()
1517 {
1518 if (basefreqarray)
1519 {
1520 delete [] basefreqarray[0];
1521 delete [] basefreqarray;
1522 }
1523 if (daughter)
1524 {
1525 delete [] daughter[0][0];
1526 delete [] daughter[0];
1527 delete [] daughter;
1528 }
1529 if (target)
1530 {
1531 delete [] target[0];
1532 delete [] target;
1533 }
1534
1535 } // GTRModel::EmptyBuffers
1536
1537 //------------------------------------------------------------------------------------
1538
// Copy GTR-specific state from src:  exchange rates, eigensystem,
// transition matrices, flags, and workspace.  Buffers are rebuilt (not
// shared) and only basefreqarray's contents are copied; the 'daughter'
// and 'target' scratch buffers are reallocated but left unfilled, since
// ComputeSiteDLs() overwrites them before use.
void GTRModel::CopyMembers(const GTRModel& src)
{
    AG = src.AG;
    AC = src.AC;
    AT = src.AT;
    CG = src.CG;
    CT = src.CT;
    TG = src.TG;
    eigvals = src.eigvals;
    eigvecs1 = src.eigvecs1;
    eigvecs2 = src.eigvecs2;
    pchange = src.pchange;
    computed = src.computed;
    scratch = src.scratch;
    m_basefreqs = src.m_basefreqs;

    // src has live buffers:  rebuild ours to matching shape before the
    // raw copy (AllocateBuffers reads m_ncategories/m_basefreqs, which
    // were just copied above)
    if (src.basefreqarray)
    {
        EmptyBuffers();
        AllocateBuffers();

        memcpy(basefreqarray[0],src.basefreqarray[0],
               m_ncategories*BASES*sizeof(double));
    }

    // we don't copy target or daughter!
} // GTRModel::CopyMembers
1566
1567 //------------------------------------------------------------------------------------
1568
GTRDotProduct(const DoubleVec2d & first,const DoubleVec2d & second,DoubleVec2d & answer)1569 void GTRModel::GTRDotProduct(const DoubleVec2d& first, const DoubleVec2d& second, DoubleVec2d& answer)
1570 // dot product of first and second put into PRE-EXISTING answer!
1571 // second has been PRE-TRANSPOSED!!
1572 {
1573 // should be square
1574 assert(first.size() == first[0].size());
1575 // and all the same size!
1576 assert(first.size() == second.size());
1577 assert(first.size() == answer.size());
1578 double initial = 0.0;
1579
1580 long i, j, n=first.size();
1581 for (i = 0; i < n; ++i)
1582 {
1583 for (j = 0; j < n; ++j)
1584 {
1585 answer[i][j] = inner_product(first[i].begin(), first[i].end(), second[j].begin(), initial);
1586 }
1587 }
1588 } // GTRModel::GTRDotProduct
1589
1590 //------------------------------------------------------------------------------------
1591
BarfOnBadGTRRates(const DoubleVec1d & rts) const1592 void GTRModel::BarfOnBadGTRRates(const DoubleVec1d& rts) const
1593 {
1594 if (rts.size() != 6)
1595 {
1596 data_error e("Incorrect number of GTR rates: expected 6, found " + ToString(rts.size()));
1597 throw e;
1598 }
1599
1600 size_t i;
1601 for (i = 0; i < rts.size(); ++i)
1602 {
1603 if (rts[i] <= 0.0)
1604 {
1605 data_error e("All rates for the GTR model must be greater than 0.");
1606 throw e;
1607 }
1608 }
1609 } // GTRModel::BarfOnBadGTRRates
1610
1611 //------------------------------------------------------------------------------------
1612
SetRates(const DoubleVec1d & rts)1613 void GTRModel::SetRates(const DoubleVec1d& rts)
1614 {
1615 BarfOnBadGTRRates(rts);
1616
1617 AC = rts[0];
1618 AG = rts[1];
1619 AT = rts[2];
1620 CG = rts[3];
1621 CT = rts[4];
1622 TG = rts[5];
1623 } // GTRModel::SetRates
1624
1625 //------------------------------------------------------------------------------------
1626
GetRates() const1627 DoubleVec1d GTRModel::GetRates() const
1628 {
1629 DoubleVec1d rts;
1630 rts.push_back(AC);
1631 rts.push_back(AG);
1632 rts.push_back(AT);
1633 rts.push_back(CG);
1634 rts.push_back(CT);
1635 rts.push_back(TG);
1636
1637 return rts;
1638
1639 } // GTRModel::GetRates
1640
1641 //------------------------------------------------------------------------------------
1642
// Finish construction:  allocate the work buffers, assemble the GTR
// rate matrix, and compute the eigensystem (eigvals, eigvecs1,
// eigvecs2) that RescaleLengths() later uses to build transition
// probability matrices.
void GTRModel::Finalize()
{
    AllocateBuffers();

    // calculate eigvals and eigvecs1 & 2
    // assemble rate matrix; using scratch
    double* bf = basefreqarray[0];

    // rescale rates to a mean of 1 event per unit branchlength
    double scalefactor = 2.0 * (AC * bf[baseA] * bf[baseC] +
                                AG * bf[baseA] * bf[baseG] +
                                AT * bf[baseA] * bf[baseT] +
                                CG * bf[baseC] * bf[baseG] +
                                CT * bf[baseC] * bf[baseT] +
                                TG * bf[baseG] * bf[baseT]);
    double nAC = AC / scalefactor;
    double nAG = AG / scalefactor;
    double nAT = AT / scalefactor;
    double nCG = CG / scalefactor;
    double nCT = CT / scalefactor;
    double nTG = TG / scalefactor;

    // off-diagonal entries are the normalized exchange rates; each
    // diagonal entry is chosen so the frequency-weighted row sums to zero
    scratch[0][0] = -(nAC*bf[baseC] + nAG*bf[baseG] + nAT*bf[baseT])/bf[baseA];
    scratch[0][1] = nAC;
    scratch[0][2] = nAG;
    scratch[0][3] = nAT;
    scratch[1][0] = nAC;
    scratch[1][1] = -(nAC*bf[baseA] + nCG*bf[baseG] + nCT*bf[baseT])/bf[baseC];
    scratch[1][2] = nCG;
    scratch[1][3] = nCT;
    scratch[2][0] = nAG;
    scratch[2][1] = nCG;
    scratch[2][2] = -(nAG*bf[baseA] + nCG*bf[baseC] + nTG*bf[baseT])/bf[baseG];
    scratch[2][3] = nTG;
    scratch[3][0] = nAT;
    scratch[3][1] = nCT;
    scratch[3][2] = nTG;
    scratch[3][3] = -(nAT*bf[baseA] + nCT*bf[baseC] + nTG*bf[baseG])/bf[baseT];

    // assemble square root of base frequencies
    DoubleVec1d zero(BASES,0.0);
    DoubleVec2d diag(BASES,zero);
    long i;
    for (i = 0; i < BASES; ++i)
    {
        diag[i][i] = sqrt(bf[i]);
    }

    // form diag . scratch . diag; GTRDotProduct wants its second
    // argument pre-transposed, hence the explicit Transpose on the
    // first call (diag itself is diagonal, so no transpose needed)
    GTRDotProduct(diag,Transpose(scratch),scratch);
    DoubleVec2d answ(BASES,zero);
    GTRDotProduct(scratch,diag,answ);

    EigenCalculator eig;
    pair<DoubleVec1d,DoubleVec2d> eigsys(eig.Eigen(answ));
    eigvecs1 = diag; // needed to satisfy GTRDotProduct size reqs
    // calculate: diag . Transpose[eigsys.second], which would require
    // a call to the transpose of the transpose for GTRDotProduct purposes,
    // so just use the matrix straight.
    GTRDotProduct(diag,eigsys.second,eigvecs1);
    eigvals = eigsys.first;
    eigvecs2 = Transpose(Invert(eigvecs1));

} // GTRModel::Finalize
1706
1707 //------------------------------------------------------------------------------------
1708 // calculate pchange [eigvecs1 . Exp[eigvals*length] . eigvecs2]
1709
// Build the per-branch, per-category transition probability matrices
// pchange[branch][cat] = eigvecs1 . Exp[eigvals * rate * length] . eigvecs2
// for the two daughter branch lengths, also setting the computed[]
// flags consumed by ComputeSiteDLs().
void GTRModel::RescaleLengths(double length1, double length2)
{
    computed.assign(2,true);

    // if a length is FLAGDOUBLE we are in a one-legged coalescence
    // and we avoid this whole computation
    if (CloseEnough(length1, FLAGDOUBLE)) computed[0] = false;
    if (CloseEnough(length2, FLAGDOUBLE)) computed[1] = false;

    // diag[branch][cat] holds the diagonal matrix Exp[eigvals*length]
    DoubleVec1d zero(BASES,0.0);
    DoubleVec2d zeros(BASES,zero);
    DoubleVec3d zeroes(m_ncategories,zeros);
    DoubleVec4d diag(2,zeroes);

    long cat;
    double expon;

    // fold the relative mutation rate into the branch lengths
    length1 *= m_relmurate;
    length2 *= m_relmurate;

    for(cat = 0; cat < m_ncategories; ++cat)
    {
        long i;
        for (i = 0; i < BASES; ++i)
        {
            double scalar = m_catrates[cat]*eigvals[i];

            if (computed[0])
            {
                // clamp the exponent: below EXPMIN the entry stays 0,
                // above EXPMAX it saturates at EXP_OF_EXPMAX
                expon = length1 * scalar;
                if (expon >= EXPMIN)
                {
                    if (expon <= EXPMAX)
                        diag[0][cat][i][i] = exp(length1*scalar);
                    else
                        diag[0][cat][i][i] = EXP_OF_EXPMAX;
                }
                // else it remains zero
            }

            if (computed[1])
            {
                expon = length2 * scalar;
                if (expon >= EXPMIN)
                {
                    if (expon <= EXPMAX)
                        diag[1][cat][i][i] = exp(length2*scalar);
                    else
                        diag[1][cat][i][i] = EXP_OF_EXPMAX;
                }
                // else it remains zero
            }
        }
    }

    // reassemble: pchange = eigvecs1 . diag . eigvecs2
    for(cat = 0; cat < m_ncategories; ++cat)
    {
        long br;
        for(br = 0; br < 2; ++br)
        {
            if (!computed[br]) continue;
            GTRDotProduct(eigvecs1,diag[br][cat],scratch);
            GTRDotProduct(scratch,eigvecs2,pchange[br][cat]);
        }
    }

} // GTRModel::RescaleLengths
1777
1778 //------------------------------------------------------------------------------------
1779
// Combine the conditional likelihoods of two daughter branches into
// those of their parent under the GTR model, using the transition
// probability matrices pchange[branch][cat] prepared by
// RescaleLengths().  Returns the member buffer 'target'
// ([category][base]), overwritten on each call.
double** GTRModel::ComputeSiteDLs(double** siteDL1, double** siteDL2)
{
    long cat, base;

    // computed[br] is false for the dummy branch of a one-legged
    // coalescence; such a branch contributes only the equilibrium
    // base frequencies
    if (computed[0])
    {
        for(cat = 0; cat < m_ncategories; ++cat)
        {
            double* catDLs = siteDL1[cat];
            DoubleVec2d& prob = pchange[0][cat];
            // daughter entry = sum over ancestral bases of
            // P(ancestral -> base) * DL(ancestral)
            for(base = baseA; base <= baseT; ++base)
            {
                daughter[0][cat][base] = prob[baseA][base] * catDLs[baseA] +
                    prob[baseC][base] * catDLs[baseC] +
                    prob[baseG][base] * catDLs[baseG] +
                    prob[baseT][base] * catDLs[baseT];
            }
        }
    }
    else
    {
        memcpy(daughter[0][0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    if (computed[1])
    {
        for(cat = 0; cat < m_ncategories; ++cat)
        {
            double* catDLs = siteDL2[cat];
            DoubleVec2d& prob = pchange[1][cat];
            for(base = baseA; base <= baseT; ++base)
            {
                daughter[1][cat][base] = prob[baseA][base] * catDLs[baseA] +
                    prob[baseC][base] * catDLs[baseC] +
                    prob[baseG][base] * catDLs[baseG] +
                    prob[baseT][base] * catDLs[baseT];
            }
        }
    }
    else
    {
        memcpy(daughter[1][0],basefreqarray[0],m_ncategories*BASES*sizeof(double));
    }

    // parent likelihood is the product of the daughter contributions
    for (cat = 0; cat < m_ncategories; cat++)
        for(base = baseA; base <= baseT; ++base)
            target[cat][base] = daughter[0][cat][base] *
                daughter[1][cat][base];

    return target;

} // GTRModel::ComputeSiteDLs
1832
1833 //------------------------------------------------------------------------------------
1834
// Compute the log data likelihood of a subtree for the markers in
// [startmarker, endmarker), whose root conditional likelihoods are in
// rootdls.  posn is the site position of startmarker; it indexes the
// per-site normalizers and the catlikes cache used by ComputeCatDL().
double GTRModel::ComputeSubtreeDL(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat;
    DoubleVec1d prior(m_ncategories);
    long firstposn = posn;

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;

        for (cat = 0; cat < m_ncategories; cat++)
        {
            // likelihood of this marker in this category, weighted by
            // the equilibrium base frequencies
            prior[cat] = basefreqarray[cat][baseA]*marker[cat][baseA] +
                basefreqarray[cat][baseC]*marker[cat][baseC] +
                basefreqarray[cat][baseG]*marker[cat][baseG] +
                basefreqarray[cat][baseT]*marker[cat][baseT];

            subtotal += m_catprobs[cat] * prior[cat];
        }

        // a zero subtotal means underflow or bad data; delegate to the
        // shared DataModel handler
        if (!subtotal)
        {
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        if (ShouldNormalize())
        {
            // add back the per-site normalizer removed lower in the tree
            total += (log(subtotal) + rootdls.GetNorms(posn));
        }
        else
        {
            total += log(subtotal);
        }

        // de-normalization not needed here, since we are only interested in the ratio
        if (m_ncategories > 1)
        {
            for (cat = 0; cat < m_ncategories; cat++)
                catlikes[posn][cat] = prior[cat]/subtotal;
        }
        ++posn;
    }

    // fold in the rate-category autocorrelation term
    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    return total;

} // GTRModel::ComputeSubtreeDL
1885
1886 //------------------------------------------------------------------------------------
1887
ComputeCatDL(long startmarker,long endmarker)1888 double GTRModel::ComputeCatDL(long startmarker, long endmarker)
1889 {
1890 double subtotal;
1891 long marker, cat;
1892
1893 DoubleVec1d like = m_catcells;
1894 DoubleVec1d nulike(m_ncategories);
1895
1896 for (marker = startmarker; marker != endmarker; ++marker)
1897 {
1898 subtotal = 0.0;
1899 for (cat = 0; cat < m_ncategories; cat++)
1900 subtotal += m_catprobs[cat] * like[cat];
1901
1902 subtotal *= m_acratio;
1903
1904 for (cat = 0; cat < m_ncategories; cat++)
1905 nulike[cat] = catlikes[marker][cat] *
1906 (subtotal + m_ucratio * like[cat]);
1907
1908 // the following puts the nulike values into like. It
1909 // also puts the like values into nulike, but we will not
1910 // be using values from nulike so we don't care.
1911 like.swap(nulike);
1912 }
1913
1914 subtotal = 0.0;
1915 for (cat = 0; cat < m_ncategories; cat++)
1916 subtotal += m_catprobs[cat] * like[cat];
1917
1918 // the following puts the like values into catcells for
1919 // long-term storage. It also puts the catcells values into
1920 // like, but we don't care.
1921 m_catcells.swap(like);
1922
1923 return log(subtotal);
1924
1925 } // GTRModel::ComputeCatDL
1926
1927 //------------------------------------------------------------------------------------
1928
CreateDataModelReport() const1929 StringVec1d GTRModel::CreateDataModelReport() const
1930 {
1931 StringVec1d report = NucModel::CreateDataModelReport();
1932
1933 string line = "Mutation parameters: ";
1934 report.push_back(line);
1935 line = "Between A and (C, G, T): ";
1936 line += ToString(AC) + ", " + ToString(AG) + ", " + ToString(AT);
1937 report.push_back(line);
1938 line = "Between C and (G, T): ";
1939 line += ToString(CG) + ", " + ToString(CT);
1940 report.push_back(line);
1941 line = "Between G and (T): ";
1942 line += ToString(TG);
1943 report.push_back(line);
1944
1945 return report;
1946
1947 } // GTRModel::CreateDataModelReport
1948
1949 //------------------------------------------------------------------------------------
1950
ToXML(size_t nspaces) const1951 StringVec1d GTRModel::ToXML(size_t nspaces) const
1952 {
1953 StringVec1d xmllines(NucModel::ToXML(nspaces));
1954
1955 nspaces += INDENT_DEPTH;
1956 string line(MakeIndent(MakeTag(xmlstr::XML_TAG_GTRRATES),nspaces));
1957 line += ToString(GetRates(),6);
1958 line += MakeCloseTag(xmlstr::XML_TAG_GTRRATES);
1959 nspaces -= INDENT_DEPTH;
1960
1961 StringVec1d::iterator endtag = --xmllines.end();
1962 xmllines.insert(endtag,line);
1963
1964 return xmllines;
1965
1966 } // GTRModel::ToXML
1967
1968 //------------------------------------------------------------------------------------
1969
// Data simulation is not implemented for the GTR model;
// unconditionally throws implementation_error.
DoubleVec1d GTRModel::SimulateMarker(double branchlength, long whichmarker,
                                     const DoubleVec1d& state) const
{
    throw implementation_error("Cannot simulate data with the GTR model yet.");
} // SimulateMarker
1975
1976 //------------------------------------------------------------------------------------
1977
IsValidDataModel() const1978 bool GTRModel::IsValidDataModel() const
1979 {
1980 if (AC <= 0) return false;
1981 if (AG <= 0) return false;
1982 if (AT <= 0) return false;
1983 if (CG <= 0) return false;
1984 if (CT <= 0) return false;
1985 if (TG <= 0) return false;
1986 return NucModel::IsValidDataModel();
1987 } // GTRModel::IsValidDataModel
1988
1989 //------------------------------------------------------------------------------------
1990 //------------------------------------------------------------------------------------
1991
// AlleleModel constructor.  Forwards the shared settings to DataModel
// and zero-fills the per-marker, per-category likelihood workspace
// m_likes.  m_bincounts starts empty; it is expected to be filled
// later (StepwiseModel::DataToLikes guards against it still being
// empty).
AlleleModel::AlleleModel(long nmarkers,
                         long numCategories,
                         DoubleVec1d categoryRates,
                         DoubleVec1d categoryProbabilities,
                         double userAutoCorrelationValue,
                         bool doNormalize,
                         long numBins,
                         double relMuRate)
    : DataModel(nmarkers,
                numCategories,
                categoryRates,
                categoryProbabilities,
                userAutoCorrelationValue,
                doNormalize,
                numBins,
                relMuRate),
      m_likes(CreateVec2d(m_nmarkers, m_ncategories, 0.0)),
      m_bincounts()
{
    // intentionally blank
} // AlleleModel constructor
2013
2014 //------------------------------------------------------------------------------------
2015 //------------------------------------------------------------------------------------
2016
RescaleLength(double length)2017 DoubleVec1d AlleleModel::RescaleLength(double length)
2018 {
2019 length *= m_relmurate;
2020
2021 long cat;
2022 DoubleVec1d scaled(m_ncategories);
2023 for (cat = 0; cat < m_ncategories; ++cat)
2024 {
2025 scaled[cat] = length * m_catrates[cat];
2026 }
2027 return scaled;
2028
2029 } // AlleleModel::RescaleLengths
2030
2031 //------------------------------------------------------------------------------------
2032
// Fold the per-marker, per-category likelihoods (m_likes) for markers
// [startmarker, endmarker) into one log-likelihood term using the
// autocorrelation recursion (m_acratio = change term, m_ucratio = stay
// term).  m_catcells carries the recursion state between calls.
double AlleleModel::ComputeCatDL(long startmarker, long endmarker)
{
    double subtotal;
    long marker, cat;

    DoubleVec1d previous = m_catcells;
    DoubleVec1d current(m_ncategories, 0.0);

    for (marker = startmarker; marker != endmarker; ++marker)
    {
        // probability mass available for switching category
        subtotal = 0.0;
        for (cat = 0; cat < m_ncategories; cat++)
            subtotal += m_catprobs[cat] * previous[cat];

        subtotal *= m_acratio;

        for (cat = 0; cat < m_ncategories; cat++)
            current[cat] = m_likes[marker][cat] *
                (subtotal + m_ucratio * previous[cat]);

        // This line puts the current values in previous cheaply. We
        // don't care that it puts the previous values in current,
        // because current will be overwritten anyway.
        previous.swap(current);
    }

    // final weighted sum over categories
    subtotal = 0.0;
    for (cat = 0; cat < m_ncategories; cat++)
        subtotal += m_catprobs[cat] * previous[cat];

    // store the terminal vector for the next call
    m_catcells.swap(previous);

    return log(subtotal);

} // AlleleModel::ComputeCatDL
2068
2069 //------------------------------------------------------------------------------------
2070
// Draw a uniformly random one-hot ancestral state for the given
// marker, over that marker's m_bincounts[marker] possible bins.
DoubleVec1d AlleleModel::ChooseAncestralState(long marker)
{
    long size = m_bincounts[marker];
    DoubleVec1d result(size, 0.0);
    result[registry.GetRandom().Long(size)] = 1.0;  // pick one bin at random
    return result;
} // ChooseAncestralState
2078
2079 //------------------------------------------------------------------------------------
2080 //------------------------------------------------------------------------------------
2081
// StepwiseModel constructor.  Forwards shared settings to AlleleModel
// (using the default bin count) and then sizes the model's bins and
// offsets from the observed alleles via Initialize().
StepwiseModel::StepwiseModel(long nmarkers,
                             const StringVec2d& uniqueAlleles,
                             long numCategories,
                             DoubleVec1d categoryRates,
                             DoubleVec1d categoryProbabilities,
                             double userAutoCorrelationValue,
                             bool doNormalize,
                             double relMuRate)
    : AlleleModel(nmarkers,
                  numCategories,
                  categoryRates,
                  categoryProbabilities,
                  userAutoCorrelationValue,
                  doNormalize,
                  defaults::bins,
                  relMuRate),
      m_allowance(defaults::step_allowance)
{
    Initialize(uniqueAlleles);
} // StepwiseModel constructor
2102
2103 //------------------------------------------------------------------------------------
2104
Clone() const2105 DataModel* StepwiseModel::Clone() const
2106 {
2107 DataModel* newmodel = new StepwiseModel(*this);
2108 return newmodel;
2109 }
2110
2111 //------------------------------------------------------------------------------------
2112
CreateDataModelReport() const2113 StringVec1d StepwiseModel::CreateDataModelReport() const
2114 {
2115 StringVec1d report = DataModel::CreateDataModelReport();
2116 string maxbins = "Maximum number of bins: " + ToString(m_bincounts[0]);
2117 report.push_back(maxbins);
2118 return report;
2119 } // StepwiseModel::CreateDataModelReport
2120
2121 //------------------------------------------------------------------------------------
2122
DataToLikes(const string & datum,long marker) const2123 vector<double> StepwiseModel::DataToLikes(const string& datum, long marker) const
2124 {
2125 if (m_bincounts.empty())
2126 {
2127 string msg = "StepwiseModel::DataToLikes() was called before m_bincounts ";
2128 msg += "was initialized.";
2129 throw implementation_error(msg);
2130 }
2131
2132 if (datum == "?") // unknown data fills all bins with 1
2133 {
2134 vector<double> result(m_bincounts[marker], 1.0);
2135 return result;
2136 }
2137 else
2138 {
2139 vector<double> result(m_bincounts[marker], 0.0);
2140 long allele;
2141 FromString(datum,allele);
2142 allele -= m_offsets[marker];
2143 if (allele < 0 || allele >= m_bincounts[marker])
2144 {
2145 string msg = "StepwiseModel::DataToLikes() was called on an ";
2146 msg += "uninitialized (or incorrectly initialized) object.";
2147 throw implementation_error(msg);
2148 }
2149 result[allele] = 1.0;
2150 return result;
2151 }
2152
2153 } // DataToLikes
2154
2155 //------------------------------------------------------------------------------------
2156
Initialize(const StringVec2d & uniqueAlleles)2157 void StepwiseModel::Initialize(const StringVec2d& uniqueAlleles)
2158 {
2159 if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
2160 {
2161 string msg = "StepwiseModel::Initialize() encountered m_nmarkers = ";
2162 msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
2163 msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
2164 throw implementation_error(msg);
2165 }
2166 long marker;
2167
2168 // find the biggest and smallest allele for each marker
2169 // NB We assume that allele sizes are in number of repeats,
2170 // *not* base pair count or anything else, and that "missing
2171 // data" is coded as ?.
2172
2173 for (marker = 0; marker < m_nmarkers; ++marker)
2174 {
2175 bool real_allele = false; // did we ever see a non-? allele
2176 long smallone = MAXLONG, largeone = 0;
2177 for (size_t nallele = 0; nallele<uniqueAlleles[marker].size();
2178 nallele++)
2179 {
2180 // convert to number
2181 // do not count "unknown data" markers
2182 string allele = uniqueAlleles[marker][nallele];
2183 if (allele == "?")
2184 {
2185 assert(false); //need to catch this earlier
2186 continue;
2187 }
2188 real_allele = true;
2189 long tipval;
2190 FromString(allele, tipval); // convert to long
2191
2192 if (tipval < smallone) smallone = tipval;
2193 if (tipval > largeone) largeone = tipval;
2194 }
2195
2196 // if no non-? were ever found, use arbitrary values
2197 if (!real_allele)
2198 {
2199 smallone = 10;
2200 largeone = 10;
2201 }
2202
2203 long newoffset = max(smallone - m_allowance, 0L);
2204 m_offsets.push_back(newoffset);
2205 m_bincounts.push_back(largeone + m_allowance + 1 - newoffset);
2206 if (real_allele && largeone != smallone)
2207 // m_threshhold: +1 makes it work for (large-small) both odd/even
2208 m_threshhold.push_back((largeone - smallone + 1L)/2L);
2209 else
2210 m_threshhold.push_back(1L);
2211
2212 // Pre-calculate table of steps
2213 CalculateSteps(marker);
2214 }
2215
2216 AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
2217 m_bincounts.end());
2218 fill(AlleleModel::m_bincounts.begin(),
2219 AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
2220 } // Initialize
2221
2222 //------------------------------------------------------------------------------------
2223 // Adaptation of Peter Beerli's microsatellite likelihood nuview_micro routine from Migrate.
2224
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)2225 void StepwiseModel::ComputeSiteDLs (Cell_ptr child1, Cell_ptr child2,
2226 Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
2227 const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
2228 {
2229 double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
2230 double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
2231 double normForChild1 = child1->GetNorms(marker);
2232 double normForChild2 = child2->GetNorms(marker);
2233
2234 if (!pSiteDLsForChild1 && !pSiteDLsForChild2) // they can't both be NULL
2235 {
2236 string msg = "StepwiseModel::ComputeSiteDLs() found no existing ";
2237 msg += "site data-likelihoods for either child.";
2238 throw implementation_error(msg);
2239 }
2240
2241 // in case of a one-legged coalescence, copy values and return
2242 if (!pSiteDLsForChild1)
2243 {
2244 thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
2245 thiscell->SetNorms(normForChild2, marker);
2246 return;
2247 }
2248
2249 if (!pSiteDLsForChild2)
2250 {
2251 thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
2252 thiscell->SetNorms(normForChild1, marker);
2253 return;
2254 }
2255
2256 double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double *)];
2257 jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
2258 for (long cat = 0; cat < m_ncategories; ++cat)
2259 jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
2260
2261 long bmax = m_bincounts[marker];
2262 long threshold = m_threshhold[marker];
2263 double maxJointProbChild1Child2 = -DBL_MAX;
2264 double mutationProbChild1, mutationProbChild2;
2265
2266 // Compute the bin contents for each possible microsat allele.
2267 // "b" is the "bin number," i.e., the scaled value of the starting allele
2268 // ("scaled" meaning "shifted so that the smallest allele gets set to zero").
2269 // For each starting allele "b", we sweep through a range of alleles "a" to which
2270 // allele "b" can mutate, from b - threshold to b + threshold.
2271 for (long b = 0; b < bmax; b++)
2272 {
2273 for (long cat = 0; cat < m_ncategories; ++cat)
2274 {
2275 mutationProbChild1 = mutationProbChild2 = 0.0;
2276 for (long a = max(0L, b - threshold); a <= min(b + threshold, bmax-1); a++)
2277 {
2278 // Note: The probability of mutating n "steps" downward
2279 // (e.g., from a microsat allele of 23 repeats to an allele of 20 repeats)
2280 // equals the prob. of mutating n steps upward
2281 // (e.g., from 23 repeats to 26 repeats).
2282 long netNumPositiveSteps = labs(b - a);
2283 if (pSiteDLsForChild1[cat][a] > 0)
2284 {
2285 mutationProbChild1 += Probability(lengthOfBranchToChild1ScaledByRateCat[cat],
2286 netNumPositiveSteps, marker) * pSiteDLsForChild1[cat][a];
2287 }
2288 if (pSiteDLsForChild2[cat][a] > 0)
2289 {
2290 mutationProbChild2 += Probability(lengthOfBranchToChild2ScaledByRateCat[cat],
2291 netNumPositiveSteps, marker) * pSiteDLsForChild2[cat][a];
2292 }
2293 }
2294 jointProbChild1Child2[cat][b] = mutationProbChild1*mutationProbChild2;
2295
2296 if (jointProbChild1Child2[cat][b] > maxJointProbChild1Child2)
2297 {
2298 maxJointProbChild1Child2 = jointProbChild1Child2[cat][b];
2299 }
2300 }
2301 }
2302
2303 // normalize to further protect against overflow, if requested
2304 if (ShouldNormalize())
2305 {
2306 if (0.0 == maxJointProbChild1Child2)
2307 {
2308 thiscell->SetNorms(-DBL_MAX,marker);
2309 }
2310 else
2311 {
2312 for (long b = 0; b < bmax; b++)
2313 {
2314 for (long cat = 0; cat < m_ncategories; ++cat)
2315 {
2316 jointProbChild1Child2[cat][b] /= maxJointProbChild1Child2;
2317 }
2318 }
2319 thiscell->SetNorms(log(maxJointProbChild1Child2) +
2320 normForChild1 + normForChild2, marker);
2321 }
2322 }
2323
2324 thiscell->SetSiteDLs(marker, jointProbChild1Child2);
2325
2326 delete[] jointProbChild1Child2[0];
2327 delete[] jointProbChild1Child2;
2328 } // ComputeSiteDLs
2329
2330 //------------------------------------------------------------------------------------
2331
// Sum the root cell's per-category site likelihoods over bins (with a
// flat allele-frequency prior) for the markers [startmarker, endmarker),
// accumulating the total log-likelihood.  "posn" is the marker index of
// startmarker.  When there are multiple rate categories, per-category
// relative likelihoods are cached in m_likes and folded in at the end
// via ComputeCatDL().
double StepwiseModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat, bin;
    long firstposn = posn;   // remembered for the ComputeCatDL pass below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;
        DoubleVec1d buffer(m_ncategories, 0.0);

        for (cat = 0; cat < m_ncategories; ++cat)
        {
            for (bin = 0; bin < m_bincounts[posn]; ++bin)
            {
                // NB: We assume a flat allele frequency prior here.
                buffer[cat] += marker[cat][bin];
            }
            subtotal += m_catprobs[cat] * buffer[cat];
        }

        // A zero total means underflow or an impossible tree; hand off
        // to TryToNormalizeAndThrow (which may enable normalization).
        if (!subtotal)
        {
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        // Add back the log-norm factored out during normalization.
        total += (log(subtotal) + rootdls.GetNorms(posn));

        assert (total != 0); // that would be *too* likely

        if (m_ncategories > 1)
        {
            // Cache relative per-category likelihoods for ComputeCatDL.
            for (cat = 0; cat < m_ncategories; cat++)
                m_likes[posn][cat] = buffer[cat]/subtotal;
        }
        ++posn;
    }

    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    return total;

} // StepwiseModel::ComputeSubtreeDLs
2377
2378 //------------------------------------------------------------------------------------
2379
2380 // This method returns the probability of changing an allele by "diff" net steps
2381 // in time t. For example, if the data type is microsatellite, we could compute
2382 // the probability that the sequence ACACAC (3 repeats) mutates to ACACACACAC
2383 // (5 repeats, for a "diff" of 2) along a branch whose length is scaled to be t.
2384 // "diff" received by this method is nonnegative, representing a net increase
2385 // in the size of the allele; the probability of decreasing by "diff" net steps
2386 // is defined to be the same as increasing by "diff" net steps.
2387 // All nonvanishingly unlikely net steps are considered; e.g., simply taking 2
2388 // steps along the branch, or taking 2 steps forward plus k steps backward
2389 // plus k steps forward, where k = 0, 1, 2, 3, ..., infinity.
2390 // The formula is Prob(i net steps in time t) =
2391 // exp(-t)*sum_over_k((t/2)^(i+2k) / ((i+k)!k!)),
2392 // where the sum runs from k = 0 to k = infinity.
2393 // This formula is eq. 15.26 in Joe's "Inferring Phylogenies" book (p. 242).
2394 // The code looks different because the factorials are obtained via a precomputed
2395 // lookup table, and exp/log are used to counteract underflow.
2396 // Contributions to the sum drop off rapidly as k increases.
2397 // (Side note: An equivalent and more compact version of this formula is
2398 // Prob(i,t) = exp(-t)*BesselI(i,t), where BesselI(n,x) is the
2399 // modified Bessel function of integer order n evaluated at real argument x.)
2400
Probability(double t,long diff,long marker) const2401 double StepwiseModel::Probability(double t, long diff, long marker) const
2402 {
2403 long threshold = m_threshhold[marker]; // max. number of positive steps
2404 if (diff > threshold)
2405 return 0.0; // approximately infinitely unlikely to mutate that much in time t
2406
2407 double sum(0.0), oldsum(0.0);
2408 const DoubleVec2d& PrecomputedTerms = m_steps[marker];
2409 double log_tOver2 = log(0.5 * t);
2410
2411 for (long k = 0; k <= threshold; k++) // num steps = diff + k <= threshold
2412 {
2413 sum += exp(-t + log_tOver2*(diff + 2.0*k) - PrecomputedTerms[diff][k]);
2414
2415 // quit if the contributions have become trivial
2416 if (fabs (oldsum - sum) < DBL_EPSILON) break;
2417 oldsum = sum;
2418 }
2419
2420 return sum;
2421 }
2422
2423 //------------------------------------------------------------------------------------
2424 // Adaptation of Peter Beerli's calculate_steps routine from MIGRATE.
2425 // This routine precomputes values needed by Probability, for speed.
2426
CalculateSteps(long marker)2427 void StepwiseModel::CalculateSteps(long marker)
2428 {
2429 long k, diff;
2430 DoubleVec1d tempvec;
2431 DoubleVec2d steps;
2432 long threshhold = m_threshhold[marker];
2433
2434 for (diff = 0; diff <= threshhold; diff++)
2435 {
2436 tempvec.clear();
2437 for (k = 0; k <= threshhold; k++)
2438 {
2439 tempvec.push_back(logfac (diff + k) + logfac (k));
2440 }
2441 steps.push_back(tempvec);
2442 }
2443
2444 m_steps.push_back(steps); // Note: This is okay from a speed standpoint,
2445 // but not so good from a design standpoint,
2446 // because this method assumes it's being called
2447 // within a loop over markers.
2448 } // calculate_steps
2449
2450 //------------------------------------------------------------------------------------
2451
// Data simulation is not implemented for the stepwise model; this stub
// exists only to satisfy the DataModel interface and always throws.
DoubleVec1d StepwiseModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    throw implementation_error("Cannot simulate data with the stepwise model yet.");
} // SimulateMarker
2456
2457 //------------------------------------------------------------------------------------
2458
CellToData(Cell_ptr cell,long marker) const2459 string StepwiseModel::CellToData(Cell_ptr cell, long marker) const
2460 {
2461 LongVec1d ones = cell->GetOnes(marker);
2462 assert(static_cast<size_t>(marker) < m_offsets.size());
2463 if (ones.size() == 1)
2464 {
2465 return ToString(ones[0] + m_offsets[marker]);
2466 }
2467 throw implementation_error
2468 ("Cannot convert stepwise data from the internal format for data not simply a single number.");
2469 }
2470
2471 //------------------------------------------------------------------------------------
2472 //------------------------------------------------------------------------------------
2473
// Construct a Brownian-motion approximation to microsatellite evolution.
// All parameters are forwarded to the AlleleModel base; the bin count is
// fixed at defaults::brownianBins because each "bin" triple stores the
// running (mean, variance, cumulative) values rather than allele counts.
BrownianModel::BrownianModel(long nmarkers,
                             long numCategories,
                             DoubleVec1d categoryRates,
                             DoubleVec1d categoryProbabilities,
                             double userAutoCorrelationValue,
                             bool doNormalize,
                             double relMuRate)
    : AlleleModel(nmarkers,
                  numCategories,
                  categoryRates,
                  categoryProbabilities,
                  userAutoCorrelationValue,
                  doNormalize,
                  defaults::brownianBins, // 3, for mean, variance, and cumulative total
                  relMuRate)
{
    // intentionally blank
} // BrownianModel constructor
2492
2493 //------------------------------------------------------------------------------------
2494
Clone() const2495 DataModel* BrownianModel::Clone() const
2496 {
2497 DataModel* newmodel = new BrownianModel(*this);
2498 return newmodel;
2499 }
2500
2501 //------------------------------------------------------------------------------------
2502
SetNormalize(bool norm)2503 void BrownianModel::SetNormalize(bool norm)
2504 {
2505 if(norm)
2506 {
2507 data_error e("Normalization cannot be set for Brownian Model");
2508 throw e;
2509 }
2510 } // BrownianModel::SetNormalize
2511
2512 //------------------------------------------------------------------------------------
2513
DataToLikes(const string & datum,long) const2514 vector<double> BrownianModel::DataToLikes(const string& datum, long) const
2515 {
2516 vector<double> result(m_nbins,0.0);
2517
2518 if (datum == "?") // unknown data
2519 {
2520 result[0] = 5.0; // this is an arbitrary value
2521 result[1] = DBL_BIG;
2522 result[2] = 0.0;
2523 }
2524 else
2525 {
2526 FromString(datum,result[0]);
2527 result[1] = 0.0;
2528 result[2] = 0.0;
2529 }
2530
2531 return result;
2532
2533 } // DataToLikes
2534
2535 //------------------------------------------------------------------------------------
2536 // Adaptation of Peter Beerli's nuview_brownian() from Migrate-1.2.4
2537 // by Jon Yamato 2002/05/06
2538
2539 // N[Log[1/Sqrt[2 Pi]], 30]
2540 #define LOG2PIHALF -0.918938533204672741780329736406
2541
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & vv1,const DoubleVec1d & vv2,long marker)2542 void BrownianModel::ComputeSiteDLs(Cell_ptr child1, Cell_ptr child2,
2543 Cell_ptr thiscell, const DoubleVec1d& vv1, const DoubleVec1d& vv2,
2544 long marker)
2545 {
2546 double mean1, mean2, xx1, xx2, c12, v1, v2, vtot, f1, f2;
2547 double **c1dls = child1->GetSiteDLs(marker),
2548 **c2dls = child2->GetSiteDLs(marker);
2549
2550 if (!c1dls && !c2dls)
2551 {
2552 string msg = "BrownianModel::ComputeSiteDLs() failed to find ";
2553 msg += "data likelihoods for either child.";
2554 throw implementation_error(msg);
2555 }
2556
2557 if (!c1dls)
2558 {
2559 thiscell->SetSiteDLs(marker,c2dls);
2560 return;
2561 }
2562
2563 if (!c2dls)
2564 {
2565 thiscell->SetSiteDLs(marker,c1dls);
2566 return;
2567 }
2568
2569 // temporary space needed for interface with dlcell::SetSiteDLs()
2570 double **mydls = new double*[m_ncategories*sizeof(double*)];
2571 mydls[0] = new double[m_ncategories*m_nbins*sizeof(double)];
2572 long cat;
2573 for(cat = 1; cat < m_ncategories; ++cat)
2574 mydls[cat] = mydls[0] + cat*m_nbins;
2575
2576 for (cat = 0; cat < m_ncategories; ++cat)
2577 {
2578
2579 mean1 = c1dls[cat][0];
2580 xx1 = c1dls[cat][2];
2581 v1 = vv1[cat] + c1dls[cat][1];
2582
2583 mean2 = c2dls[cat][0];
2584 xx2 = c2dls[cat][2];
2585 v2 = vv2[cat] + c2dls[cat][1];
2586
2587 vtot = v1 + v2;
2588
2589 // the weights are set reciprocally so that the value coming from
2590 // the shorter branch is given more weight.
2591 if (vtot > 0.0) f1 = v2/vtot;
2592 else f1 = 0.5;
2593 f2 = 1.0 - f1;
2594
2595 mydls[cat][0] = f1*mean1 + f2*mean2;
2596
2597 mydls[cat][1] = v1*f1;
2598
2599 mydls[cat][2] = xx1 + xx2;
2600
2601 c12 = (mean1-mean2)*(mean1-mean2) / vtot;
2602 mydls[cat][2] += min(0.0,-0.5 * (log(vtot)+c12) + LOG2PIHALF);
2603 }
2604
2605 thiscell->SetSiteDLs(marker,mydls);
2606
2607 delete [] mydls[0];
2608 delete [] mydls;
2609
2610 } // BrownianModel::ComputeSiteDLs
2611
2612 #undef LOG2PIHALF
2613
2614 //------------------------------------------------------------------------------------
2615
// Accumulate the total log-likelihood over markers [startmarker,
// endmarker) for the Brownian model.  "posn" is the marker index of
// startmarker.
// NB: Brownian likelihoods are stored as logs!
double BrownianModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat;
    long firstposn = posn;   // remembered for the ComputeCatDL pass below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {

        if (m_ncategories > 1)
        {
            // in order to add up likelihoods over categories, we
            // must un-log them. We normalize them first to avoid
            // underflow.

            DoubleVec1d buffer(m_ncategories, 0.0);
            subtotal = 0.0;

            // Find the largest per-category log-likelihood (slot [2]
            // of the Brownian triple) to normalize against.
            double biggest = NEGMAX;

            for (cat = 0; cat < m_ncategories; ++cat)
            {
                if (marker[cat][2] > biggest) biggest = marker[cat][2];
            }

            for (cat = 0; cat < m_ncategories; ++cat)
            {
                buffer[cat] += exp(marker[cat][2] - biggest);
                subtotal += m_catprobs[cat] * buffer[cat];
            }

            // Cache relative per-category likelihoods for ComputeCatDL.
            for (cat = 0; cat < m_ncategories; cat++)
                m_likes[posn][cat] = buffer[cat]/subtotal;

            if (!subtotal)
            {
                // we shouldn't be here! Normalization shouldn't happen
                // for Brownian.
                datalike_error ex("invalid subtree found in Brownian model");
                throw(ex);
            }

            // Re-apply the normalization constant in log space.
            total += (log(subtotal) + biggest);

        }
        else
        {
            // If there is only one category, there is no normalization
            // and we MUST NOT un-log the likelihood or it will underflow.
            total += marker[0][2];
        }

        ++posn;
    }

    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);

    return total;

} // BrownianModel::ComputeSubtreeDLs
2680
2681 //------------------------------------------------------------------------------------
2682
CreateDataModelReport() const2683 StringVec1d BrownianModel::CreateDataModelReport() const
2684 {
2685 StringVec1d report = DataModel::CreateDataModelReport();
2686 string rptline("(The brownian approximation for microsatellite ");
2687 rptline += "evolution has no extra parameters.)";
2688 report.push_back(rptline);
2689
2690 return report;
2691
2692 } // BrownianModel::CreateDataModelReport
2693
2694 //------------------------------------------------------------------------------------
2695
// Data simulation is not implemented for the Brownian model; this stub
// exists only to satisfy the DataModel interface and always throws.
DoubleVec1d BrownianModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    throw implementation_error("Cannot simulate data with the Brownian model yet.");
} // SimulateMarker
2700
2701 //------------------------------------------------------------------------------------
2702
CellToData(Cell_ptr cell,long marker) const2703 string BrownianModel::CellToData(Cell_ptr cell, long marker) const
2704 {
2705 double** dls = cell->GetSiteDLs(marker);
2706 if (dls != NULL)
2707 {
2708 return ToString(dls[0][0]);
2709 }
2710 return "0";
2711 }
2712
2713 //------------------------------------------------------------------------------------
2714 //------------------------------------------------------------------------------------
2715
// Construct a k-allele mutation model.
//
// nmarkers       -- number of markers handled by this model
// uniqueAlleles  -- per-marker lists of distinct observed allele strings;
//                   Initialize() maps each name to a bin index
// numCategories / categoryRates / categoryProbabilities /
// userAutoCorrelationValue -- rate-heterogeneity settings, forwarded
//                   to the AlleleModel base
// doNormalize    -- whether site DLs are renormalized against underflow
// relMuRate      -- relative mutation rate, forwarded to the base
KAlleleModel::KAlleleModel(
    long nmarkers,
    const StringVec2d& uniqueAlleles,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate
    )
    : AlleleModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        defaults::bins,
        relMuRate
        )
{
    // Builds the allele-name-to-bin maps and sizes the bins.
    Initialize(uniqueAlleles);
} // KAlleleModel constructor
2739
2740 //------------------------------------------------------------------------------------
2741
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)2742 void KAlleleModel::ComputeSiteDLs(Cell_ptr child1, Cell_ptr child2,
2743 Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
2744 const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
2745 {
2746 double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
2747 double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
2748 double normForChild1 = child1->GetNorms(marker);
2749 double normForChild2 = child2->GetNorms(marker);
2750
2751 if (!pSiteDLsForChild1 && !pSiteDLsForChild2)
2752 {
2753 string msg = "KAlleleModel::ComputeSiteDLs() found no existing ";
2754 msg += "site data-likelihoods for either child.";
2755 throw implementation_error(msg);
2756 }
2757
2758 // in case of a one-legged coalescence, copy values and return
2759 if (!pSiteDLsForChild1)
2760 {
2761 thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
2762 thiscell->SetNorms(normForChild2, marker);
2763 return;
2764 }
2765
2766 if (!pSiteDLsForChild2)
2767 {
2768 thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
2769 thiscell->SetNorms(normForChild1, marker);
2770 return;
2771 }
2772
2773 long smax = m_bincounts[marker];
2774
2775 // allocate temporary working space;
2776 // OPTIMIZE should be done more cheaply!
2777 double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double*)];
2778 jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
2779 for (long cat = 0; cat < m_ncategories; ++cat)
2780 jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
2781
2782 // compute the bin contents for each possible allele size
2783 long s, ss;
2784 double prob1, prob2, a1, a2, b1, b2, temp1, temp2, sum1, sum2;
2785 double maxJointProbChild1Child2(-DBL_MAX);
2786
2787 for (long cat = 0; cat < m_ncategories; ++cat)
2788 {
2789 prob1 = probMathFunc ( lengthOfBranchToChild1ScaledByRateCat[cat], (smax-1.0)/smax);
2790 prob2 = probMathFunc ( lengthOfBranchToChild2ScaledByRateCat[cat], (smax-1.0)/smax);
2791
2792 a1 = 1.0 - prob1;
2793 a2 = 1.0 - prob2;
2794 b1 = prob1 / ( smax - 1.0); // smax was set to at least 2 in Initialize()
2795 b2 = prob2 / ( smax - 1.0);
2796 for (s = 0; s < smax; s++)
2797 {
2798 sum1 = 0; sum2 = 0;
2799 for(ss = 0; ss < smax; ss++)
2800 {
2801 if (s == ss)
2802 {
2803 temp1 = a1; temp2 = a2;
2804 }
2805 else
2806 {
2807 temp1 = b1; temp2 = b2;
2808 }
2809 sum1 += pSiteDLsForChild1[cat][ss] * temp1;
2810 sum2 += pSiteDLsForChild2[cat][ss] * temp2;
2811 }
2812
2813 jointProbChild1Child2[cat][s] = sum1 * sum2;
2814 if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
2815 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s]; // overflow protection
2816 }
2817 }
2818
2819 // normalize to further protect against overflow, if requested
2820 if (ShouldNormalize())
2821 {
2822 if (maxJointProbChild1Child2 == 0.0)
2823 {
2824 thiscell->SetNorms(-DBL_MAX,marker);
2825 }
2826 else
2827 {
2828 for (s = 0; s < smax; s++)
2829 {
2830 for (long cat = 0; cat < m_ncategories; ++cat)
2831 {
2832 jointProbChild1Child2[cat][s] /= maxJointProbChild1Child2;
2833 }
2834 }
2835 thiscell->SetNorms(log(maxJointProbChild1Child2) + normForChild1 + normForChild2, marker);
2836 }
2837 }
2838
2839 thiscell->SetSiteDLs(marker, jointProbChild1Child2);
2840
2841 delete[] jointProbChild1Child2[0];
2842 delete[] jointProbChild1Child2;
2843 } // KAlleleModel::ComputeSiteDLs
2844
2845 //------------------------------------------------------------------------------------
2846
Clone() const2847 DataModel* KAlleleModel::Clone() const
2848 {
2849 DataModel* newmodel = new KAlleleModel(*this);
2850 return newmodel;
2851 } // KAllelModel::Clone
2852
2853 //------------------------------------------------------------------------------------
2854
probMathFunc(double ut,double coef)2855 double KAlleleModel::probMathFunc( double ut, double coef)
2856 {
2857 double mathValue = coef*(1.0-exp(-1.0/coef*ut));
2858 if (systemSpecificIsnan(mathValue)) // BUGBUG Presumably some other action should be taken?
2859 cout<<"coef:"<<coef<<endl
2860 <<"time:"<<ut<<endl;
2861 return mathValue;
2862 } // KAllelModel::probMathFunc
2863
2864 //------------------------------------------------------------------------------------
2865
CreateDataModelReport() const2866 StringVec1d KAlleleModel::CreateDataModelReport() const
2867 {
2868 StringVec1d report = DataModel::CreateDataModelReport();
2869 string maxbins = "Maximum number of bins: " + ToString(m_bincounts[0]);
2870 report.push_back(maxbins);
2871 return report;
2872
2873 } // KAllelModel::CreateDataModelReport
2874
2875 //------------------------------------------------------------------------------------
2876
Initialize(const StringVec2d & uniqueAlleles)2877 void KAlleleModel::Initialize(const StringVec2d& uniqueAlleles)
2878 {
2879 if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
2880 {
2881 string msg = "KAlleleModel::Initialize() encountered m_nmarkers = ";
2882 msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
2883 msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
2884 throw implementation_error(msg);
2885 }
2886
2887 for (long marker = 0; marker < m_nmarkers; ++marker)
2888 {
2889 map <string,long> tmpMap;
2890 m_allelemaps.push_back(tmpMap);
2891 size_t nallele = 0;
2892 for (; nallele<uniqueAlleles[marker].size(); nallele++)
2893 {
2894 string allelename = uniqueAlleles[marker][nallele];
2895 if (allelename == "?")
2896 {
2897 assert(false); //we need to catch this earlier.
2898 continue;
2899 }
2900 assert(m_allelemaps[marker].find(allelename)==m_allelemaps[marker].end());
2901 m_allelemaps[marker].insert(make_pair(allelename, nallele));
2902 }
2903
2904 if(nallele <= 1) nallele=2; // must be at least this much
2905 m_bincounts.push_back(nallele);
2906 }
2907
2908 AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
2909 m_bincounts.end());
2910 fill(AlleleModel::m_bincounts.begin(),
2911 AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
2912 } // KAlleleModel::Initialize
2913
2914 //------------------------------------------------------------------------------------
2915
DataToLikes(const string & datum,long marker) const2916 vector<double> KAlleleModel::DataToLikes(const string& datum, long marker) const
2917 {
2918 if (m_bincounts.empty())
2919 {
2920 string msg = "KAlleleModel::DataToLikes() was called on an uninitialized ";
2921 msg += "object.";
2922 throw implementation_error(msg);
2923 }
2924
2925 if (datum == "?") // unknown data fills all bins with 1
2926 {
2927 vector<double> result(m_bincounts[marker], 1.0);
2928 return result;
2929 }
2930 else
2931 {
2932 vector<double> result(m_bincounts[marker], 0.0);
2933 map<string, long>::const_iterator allele = m_allelemaps[marker].find(datum);
2934 assert(allele != m_allelemaps[marker].end()); //Uncounted allele
2935 result[allele->second] = 1.0;
2936 return result;
2937 }
2938
2939 } // DataToLikes
2940
2941 //------------------------------------------------------------------------------------
2942
// Sum the root-cell site data-likelihoods over bins and rate categories for
// each marker in [startmarker, endmarker), returning the total log data
// likelihood for that span.  "posn" is the index of the first marker in the
// span and advances with the loop; per-position normalization constants
// stored in "rootdls" are added back in log space.
double KAlleleModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat, bin;
    long firstposn = posn;  // remembered for the category-correlation pass below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;
        DoubleVec1d buffer(m_ncategories, 0.0);  // per-category likelihood for this marker

        for (cat = 0; cat < m_ncategories; ++cat)
        {
            for (bin = 0; bin < m_bincounts[posn]; ++bin)
            {
                // NB: We assume a flat allele frequency prior here.
                buffer[cat] += marker[cat][bin]/m_nbins;
                //LS DEBUG MAPPING: we eventually want unique allele freqs here instead
                // of dividing by m_nbins.
            }

            subtotal += m_catprobs[cat] * buffer[cat];
            //cout<<m_catprobs[cat]<<endl<<buffer[cat]<<endl;
            //assert(subtotal != 0); // that would not be likely enough
            if (!subtotal)
            {
                // Zero likelihood here is almost certainly underflow; try
                // normalizing and throw so the caller can recover.
                DataModel::TryToNormalizeAndThrow(posn, GetModelType());
            }
        }

        // Add this marker's log-likelihood, restoring the normalization
        // constant accumulated during the upward pass.
        total += (log(subtotal) + rootdls.GetNorms(posn));

        assert (total != 0); // that would be *too* likely

        if (m_ncategories > 1)
        {
            // Cache each category's relative contribution; ComputeCatDL()
            // consumes m_likes to handle rate-category autocorrelation.
            for (cat = 0; cat < m_ncategories; cat++)
                m_likes[posn][cat] = buffer[cat]/subtotal;
        }
        ++posn;
    }

    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);

    // Debugging aid: report a NaN total to stdout (result is still returned).
    if(systemSpecificIsnan(total))
    {
        cout<<endl<<"ComputeSubtreeDLs";
        cout<<endl;
    }

    return total;

}; // KAlleleModel::ComputeSubtreeDLs
2998
2999 //------------------------------------------------------------------------------------
3000 //------------------------------------------------------------------------------------
3001
ToXML(size_t nspaces) const3002 StringVec1d KAlleleModel::ToXML(size_t nspaces) const
3003 {
3004 StringVec1d xmllines(DataModel::ToXML(nspaces));
3005
3006 // Yes, this model has no fields of its own.
3007
3008 return xmllines;
3009
3010 } // ToXML
3011
3012 //------------------------------------------------------------------------------------
3013 // This routine simulates data for a single node under the KAllele
3014 // model. It assumes that the given branch lengths have already
3015 // been rescaled for the rate category desired.
3016
// Simulate one marker's allele for a node under the KAllele model, given the
// parent "state" (a one-hot vector over alleles) and a branch length already
// rescaled for the desired rate category.  With the model's probability of
// change, one of the other nstates-1 alleles is chosen uniformly; otherwise
// the parent state is returned unchanged.
DoubleVec1d KAlleleModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    double rnd = registry.GetRandom().Float();
    long nstates(m_bincounts[whichmarker]);
    long oldstate = 0;
    for (long staten=0; staten<static_cast<long>(state.size()); staten++)
    {
        if (state[staten] == 1.0) oldstate = staten;
    }

    // if something happens
    // BUGFIX: this ratio was previously computed with integer division
    // (nstates/(nstates-1) truncates to 1 for all nstates >= 3), which
    // badly distorted the mutation probability; force floating point.
    double newratio = static_cast<double>(nstates)/(nstates-1.0);
    if (rnd < (1.0/newratio) * (1.0 - exp(-newratio * branchlength)))
    {
        // Mutate: pick uniformly among the nstates-1 *other* alleles by
        // drawing from [0, nstates-2] and skipping over the current one.
        long chosenstate = registry.GetRandom().Long(nstates-1);
        if (chosenstate >= oldstate) chosenstate++;
        DoubleVec1d answer(state.size(),0.0);
        answer[chosenstate] = 1.0;
        return answer;
    }

    // return nothing happens
    return state;

} // SimulateMarker
3042
3043 //------------------------------------------------------------------------------------
3044
CellToData(Cell_ptr cell,long marker) const3045 string KAlleleModel::CellToData(Cell_ptr cell, long marker) const
3046 {
3047 LongVec1d ones = cell->GetOnes(marker);
3048 if (ones.size() == 1)
3049 {
3050 for (map<string, long>::const_iterator key=m_allelemaps[marker].begin();
3051 key != m_allelemaps[marker].end(); key++)
3052 {
3053 if ((*key).second == ones[0])
3054 {
3055 return (*key).first;
3056 }
3057 }
3058 throw data_error("Cannot find any any alleles for bin " + ToString(ones[0]) + ".");
3059 }
3060 throw implementation_error
3061 ("Cannot convert K-Allele data from the internal format if the data is not a single allele.");
3062
3063 }
3064
3065 //------------------------------------------------------------------------------------
3066 //------------------------------------------------------------------------------------
3067
// Constructor for the mixed KAllele/Stepwise ("MixedKS") microsatellite
// model.  "alphaVal" is the initial weight of the KAllele (multistep)
// component of the mixture (SetAlpha derives beta = 1 - alpha for the
// stepwise component); "doOptimization" requests per-chain maximization of
// alpha.  "uniqueAlleles" lists, per marker, the distinct allele strings
// seen in the data; Initialize() derives bin counts, offsets, thresholds
// and the precomputed step tables from it.
MixedKSModel::MixedKSModel(
    long nmarkers,
    const StringVec2d& uniqueAlleles,
    long numCategories,
    DoubleVec1d categoryRates,
    DoubleVec1d categoryProbabilities,
    double userAutoCorrelationValue,
    bool doNormalize,
    double relMuRate,
    double alphaVal,
    bool doOptimization
    )
    : AlleleModel(
        nmarkers,
        numCategories,
        categoryRates,
        categoryProbabilities,
        userAutoCorrelationValue,
        doNormalize,
        defaults::bins,  // provisional; Initialize() resets m_nbins from the data
        relMuRate
        ),
      m_allowance(defaults::mixedks_allowance),
      m_isOpt(doOptimization),
      m_alphaRpt()
{
    SetAlpha(alphaVal); // instead of initializing, because there's
    // logic to check value of alpha in SetAlpha
    m_origAlpha = m_alpha;  // saved so ResetAlpha() can restore it each replicate
    Initialize(uniqueAlleles);
}
3099
3100 //------------------------------------------------------------------------------------
3101
Clone() const3102 DataModel* MixedKSModel::Clone() const
3103 {
3104 DataModel* newmodel = new MixedKSModel(*this);
3105 return newmodel;
3106 }
3107
3108 //------------------------------------------------------------------------------------
3109
DataToLikes(const string & datum,long marker) const3110 vector<double> MixedKSModel::DataToLikes(const string& datum, long marker) const
3111 {
3112 if (m_bincounts.empty())
3113 {
3114 string msg = "MixedKSModel::DataToLikes() was called on an uninitialized ";
3115 msg += "object.";
3116 throw implementation_error(msg);
3117 }
3118
3119 if (datum == "?") // unknown data fills all bins with 1
3120 {
3121 vector<double> result(m_bincounts[marker], 1.0);
3122 return result;
3123 }
3124 else if (StringType(datum)!=2)
3125 {
3126 throw data_error ("Your data must consist of numbers, not letters or punctuation."
3127 " The Mixed KS model is inappropriate for DNA or RNA data.");
3128 }
3129 else
3130 {
3131 vector<double> result(m_bincounts[marker], 0.0);
3132 long allele;
3133 FromString(datum,allele);
3134 allele -= m_offsets[marker];
3135 if (allele < 0 || allele >= m_bincounts[marker])
3136 {
3137 string msg = "MixedKSModel::DataToLikes() was called on an ";
3138 msg += "uninitialized (or incorrectly initialized) object.";
3139 throw implementation_error(msg);
3140 }
3141 result[allele] = 1.0;
3142 return result;
3143 }
3144
3145 } // DataToLikes
3146
3147 //------------------------------------------------------------------------------------
3148
// Scan the observed alleles of each marker and set up, per marker: the
// offset (smallest allele minus m_allowance, floored at zero), the bin
// count, and the mutation-step threshold; then precompute the step tables
// used by Probability().  Finally, force every marker to share the largest
// bin count.  Throws implementation_error if the allele table's size
// disagrees with m_nmarkers.
void MixedKSModel::Initialize(const StringVec2d& uniqueAlleles)
{
    if (static_cast<unsigned long>(m_nmarkers) != uniqueAlleles.size())
    {
        string msg = "MixedKSModel::Initialize() encountered m_nmarkers = ";
        msg += ToString(m_nmarkers) + " and uniqueAlleles.size() = ";
        msg += ToString(uniqueAlleles.size()) + "; these numbers should be equal.";
        throw implementation_error(msg);
    }

    // find the biggest and smallest allele for each marker
    // NB We assume that allele sizes are in number of repeats,
    // *not* base pair count or anything else, and that "missing
    // data" is coded as ?.

    long tipval;

    for (long marker = 0; marker < m_nmarkers; ++marker)
    {
        bool real_allele = false; // did we ever see a non-? allele
        long smallone = MAXLONG, largeone = 0;

        for (size_t nallele = 0; nallele<uniqueAlleles[marker].size();
             nallele++)
        {
            // convert to number if possible;
            // do not count "unknown data" markers
            string allele = uniqueAlleles[marker][nallele];
            if (allele == "?")
            {
                assert(false); //catch this earlier
                continue;
            }
            real_allele = true;
            FromString(allele, tipval); // convert to long

            if (tipval < smallone) smallone = tipval;
            if (tipval > largeone) largeone = tipval;
        }

        // if no non-? were ever found, use arbitrary values
        if (!real_allele)
        {
            smallone = 10;
            largeone = 10;
        }

        // leave m_allowance bins of slack on each side of the observed range
        long newoffset = max(smallone - m_allowance, 0L);
        m_offsets.push_back(newoffset);
        m_bincounts.push_back(largeone + m_allowance + 1 - newoffset);
        if (real_allele && largeone != smallone)
            // m_threshhold: +1 makes it work for (large-small) both odd/even
            m_threshhold.push_back((largeone - smallone + 1L)/2L);
        else
            m_threshhold.push_back(1L);

        // Pre-calculate table of steps
        CalculateSteps(marker);
    }

    // All markers must share one bin count; use the largest so every
    // marker's alleles fit.
    AlleleModel::m_nbins = *max_element(m_bincounts.begin(),
                                        m_bincounts.end());
    fill(AlleleModel::m_bincounts.begin(),
         AlleleModel::m_bincounts.end(), AlleleModel::m_nbins);
} // Initialize
3214
3215 //------------------------------------------------------------------------------------
3216
SetAlpha(double alphaVal)3217 void MixedKSModel::SetAlpha(double alphaVal)
3218 {
3219 assert(alphaVal >= 0.0);
3220 assert(alphaVal <= 1.0);
3221 m_alpha = alphaVal;
3222 m_beta = 1 - m_alpha;
3223 }
3224
3225 //------------------------------------------------------------------------------------
3226
SetAlpha(double alphaVal,long rep,long chain)3227 void MixedKSModel::SetAlpha(double alphaVal, long rep, long chain)
3228 {
3229 if (m_alphaRpt.size() == static_cast<size_t>(rep))
3230 {
3231 ResetAlpha();
3232 }
3233 assert(m_alphaRpt[rep].size() == static_cast<size_t>(chain));
3234 m_alphaRpt[rep].push_back(alphaVal);
3235 SetAlpha(alphaVal);
3236 }
3237
3238 //------------------------------------------------------------------------------------
3239 // Adaptation of Peter Beerli's microsatellite likelihood nuview_micro routine from Migrate.
3240
ComputeSiteDLs(Cell_ptr child1,Cell_ptr child2,Cell_ptr thiscell,const DoubleVec1d & lengthOfBranchToChild1ScaledByRateCat,const DoubleVec1d & lengthOfBranchToChild2ScaledByRateCat,long marker)3241 void MixedKSModel::ComputeSiteDLs (Cell_ptr child1, Cell_ptr child2,
3242 Cell_ptr thiscell, const DoubleVec1d& lengthOfBranchToChild1ScaledByRateCat,
3243 const DoubleVec1d& lengthOfBranchToChild2ScaledByRateCat, long marker)
3244 {
3245 double **pSiteDLsForChild1 = child1->GetSiteDLs(marker);
3246 double **pSiteDLsForChild2 = child2->GetSiteDLs(marker);
3247 double normForChild1 = child1->GetNorms(marker);
3248 double normForChild2 = child2->GetNorms(marker);
3249
3250 if (!pSiteDLsForChild1 && !pSiteDLsForChild2) // they can't both be NULL
3251 {
3252 string msg = "MixedKSModel::ComputeSiteDLs() found no existing ";
3253 msg += "site data-likelihoods for either child.";
3254 throw implementation_error(msg);
3255 }
3256
3257 // in case of a one-legged coalescence, copy values and return
3258 if (!pSiteDLsForChild1)
3259 {
3260 thiscell->SetSiteDLs(marker, pSiteDLsForChild2);
3261 thiscell->SetNorms(normForChild2, marker);
3262 return;
3263 }
3264
3265 if (!pSiteDLsForChild2)
3266 {
3267 thiscell->SetSiteDLs(marker, pSiteDLsForChild1);
3268 thiscell->SetNorms(normForChild1, marker);
3269 return;
3270 }
3271
3272 long smax = m_bincounts[marker];
3273 long threshold = m_threshhold[marker];
3274
3275 // allocate temporary working space;
3276 // OPTIMIZE should be done more cheaply!
3277 double **jointProbChild1Child2 = new double*[m_ncategories * sizeof(double *)];
3278 jointProbChild1Child2[0] = new double[m_ncategories * m_bincounts[marker] * sizeof(double)];
3279 for (long cat = 0; cat < m_ncategories; ++cat)
3280 jointProbChild1Child2[cat] = jointProbChild1Child2[0] + cat * m_bincounts[marker];
3281 double maxJointProbChild1Child2 = -DBL_MAX;
3282 double mutationProbChild1, mutationProbChild2;
3283
3284 long s, a, diff;
3285
3286 //KAllele part
3287 // compute the bin contents for each possible allele size
3288 long ss;
3289 double prob1, prob2, a1, a2, b1, b2, temp1, temp2, sum1, sum2;
3290
3291 for (long cat = 0; cat < m_ncategories; ++cat)
3292 {
3293 prob1 = probMathFunc (lengthOfBranchToChild1ScaledByRateCat[cat], (smax-1.0)/smax);
3294 prob2 = probMathFunc (lengthOfBranchToChild2ScaledByRateCat[cat], (smax-1.0)/smax);
3295
3296 a1 = 1.0 - prob1;
3297 a2 = 1.0 - prob2;
3298 b1 = prob1 / ( smax - 1.0); //need to check when smax = 1
3299 b2 = prob2 / ( smax - 1.0);
3300 for (s = 0; s < smax; s++)
3301 {
3302 sum1 = 0; sum2 = 0;
3303 for(ss = 0; ss < smax; ss++)
3304 {
3305 if (s == ss)
3306 {
3307 temp1 = a1; temp2 = a2;
3308 }
3309 else
3310 {
3311 temp1 = b1; temp2 = b2;
3312 }
3313 sum1 += pSiteDLsForChild1[cat][ss] * temp1;
3314 sum2 += pSiteDLsForChild2[cat][ss] * temp2;
3315 }
3316
3317 jointProbChild1Child2[cat][s] = sum1 * sum2 * m_alpha; //alpha
3318 if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
3319 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s]; // overflow protection
3320 }
3321 }
3322 //LS NOTE: The following code firing was due to a bug in the past;
3323 // you should probably look carefully to make sure that's not true
3324 // again.
3325 if(maxJointProbChild1Child2 < 0)
3326 {
3327 string msg = "Warning: maximum probability value of "
3328 + ToString(maxJointProbChild1Child2)
3329 + " in MixedKS.";
3330 registry.GetRunReport().ReportDebug(msg);
3331 }
3332
3333 //StepWise part
3334 // compute the bin contents for each possible allele size
3335 for (s = 0; s < smax; s++)
3336 {
3337 for (long cat = 0; cat < m_ncategories; ++cat)
3338 {
3339 mutationProbChild1 = mutationProbChild2 = 0.0;
3340 for (a = max (0L, s - threshold); a <= s + threshold && a < smax; a++)
3341 {
3342 diff = labs(s - a);
3343 if (pSiteDLsForChild1[cat][a] > 0)
3344 {
3345 mutationProbChild1 += Probability(lengthOfBranchToChild1ScaledByRateCat[cat],
3346 diff, marker) * pSiteDLsForChild1[cat][a];
3347 }
3348 if (pSiteDLsForChild2[cat][a] > 0)
3349 {
3350 mutationProbChild2 += Probability(lengthOfBranchToChild2ScaledByRateCat[cat],
3351 diff, marker) * pSiteDLsForChild2[cat][a];
3352 }
3353 }
3354 jointProbChild1Child2[cat][s] += mutationProbChild1 * mutationProbChild2 * m_beta;
3355 if (jointProbChild1Child2[cat][s] > maxJointProbChild1Child2)
3356 maxJointProbChild1Child2 = jointProbChild1Child2[cat][s]; // overflow protection
3357 }
3358 }
3359
3360 // normalize to further protect against overflow, if requested
3361 if (ShouldNormalize())
3362 {
3363 if (maxJointProbChild1Child2 == 0.0)
3364 {
3365 thiscell->SetNorms(-DBL_MAX,marker);
3366 }
3367 else
3368 {
3369 for (s = 0; s < smax; s++)
3370 {
3371 for (long cat = 0; cat < m_ncategories; ++cat)
3372 {
3373 jointProbChild1Child2[cat][s] /= maxJointProbChild1Child2;
3374 }
3375 }
3376 thiscell->SetNorms(log(maxJointProbChild1Child2) + normForChild1 + normForChild2,
3377 marker);
3378 }
3379 }
3380
3381 thiscell->SetSiteDLs(marker, jointProbChild1Child2);
3382
3383 delete[] jointProbChild1Child2[0];
3384 delete[] jointProbChild1Child2;
3385 } // ComputeSiteDLs
3386
3387 //------------------------------------------------------------------------------------
3388
// Sum the root-cell site data-likelihoods over bins and rate categories for
// each marker in [startmarker, endmarker), returning the total log data
// likelihood.  "posn" indexes the first marker of the span and advances
// with the loop; stored per-position norms are added back in log space.
double MixedKSModel::ComputeSubtreeDLs(Cell& rootdls, double** startmarker, double** endmarker, long posn)
{
    double total=0.0, subtotal;
    double** marker;
    long cat, bin;
    long firstposn = posn;  // remembered for the category-correlation pass below

    for (marker = startmarker; marker != endmarker;
         marker = rootdls.GetNextMarker(marker))
    {
        subtotal = 0.0;
        DoubleVec1d buffer(m_ncategories, 0.0);  // per-category likelihood for this marker

        for (cat = 0; cat < m_ncategories; ++cat)
        {
            for (bin = 0; bin < m_bincounts[posn]; ++bin)
            {
                // NB: We assume a flat allele frequency prior here.
                // NOTE(review): unlike KAlleleModel::ComputeSubtreeDLs, this
                // sum is not divided by m_nbins -- confirm the weighting is
                // intended to differ between the two models.
                buffer[cat] += marker[cat][bin];
            }

            subtotal += m_catprobs[cat] * buffer[cat];
        }

        if (!subtotal)
        {
            // Zero likelihood here is almost certainly underflow; try
            // normalizing and throw so the caller can recover.
            DataModel::TryToNormalizeAndThrow(posn, GetModelType());
        }

        // Add this marker's log-likelihood, restoring the normalization
        // constant accumulated during the upward pass.
        total += (log(subtotal) + rootdls.GetNorms(posn));

        assert (total != 0); // that would be *too* likely

        if (m_ncategories > 1)
        {
            // Cache each category's relative contribution; ComputeCatDL()
            // consumes m_likes to handle rate-category autocorrelation.
            for (cat = 0; cat < m_ncategories; cat++)
                m_likes[posn][cat] = buffer[cat]/subtotal;
        }
        ++posn;
    }

    if (m_ncategories > 1) total += ComputeCatDL(firstposn, posn);
    return total;

} // MixedKSModel::ComputeSubtreeDLs
3434
3435 //------------------------------------------------------------------------------------
3436
ToXML(size_t nspaces) const3437 StringVec1d MixedKSModel::ToXML(size_t nspaces) const
3438 {
3439 StringVec1d xmllines(AlleleModel::ToXML(nspaces));
3440
3441 nspaces += INDENT_DEPTH;
3442
3443 string line(MakeIndent(MakeTag(xmlstr::XML_TAG_ISOPT),nspaces));
3444 line += ToStringTF(m_isOpt);
3445 line += MakeCloseTag(xmlstr::XML_TAG_ISOPT);
3446 StringVec1d::iterator endtag = --xmllines.end();
3447 xmllines.insert(endtag,line);
3448
3449 string line2(MakeIndent(MakeTag(xmlstr::XML_TAG_ALPHA),nspaces));
3450 line2 += ToString(GetAlpha());
3451 line2 += MakeCloseTag(xmlstr::XML_TAG_ALPHA);
3452 endtag = --xmllines.end();
3453 xmllines.insert(endtag,line2);
3454
3455 nspaces -= INDENT_DEPTH;
3456 return xmllines;
3457
3458 } // ToXML
3459
3460 //------------------------------------------------------------------------------------
3461 // Adaptation of Peter Beerli's prob_micro routine from MIGRATE.
3462 // This routine computes the probability of a change of "diff" steps in time "t".
3463 //
3464 // Mary Kuhner 2002/01/02
3465
Probability(double t,long diff,long marker) const3466 double MixedKSModel::Probability (double t, long diff, long marker) const
3467 {
3468 long threshold = m_threshhold[marker];
3469 if (diff > threshold)
3470 return 0.0; // approximately infinitely unlikely to mutate that much in time t
3471
3472 const DoubleVec2d& PrecomputedTerms = m_steps[marker];
3473 double sum(0.0), oldsum(0.0), log_tOver2(log(0.5*t));
3474
3475 for (long k = 0; k <= threshold; k++) // num steps = diff + k <= threshold
3476 {
3477 sum += exp(-t + log_tOver2*(diff + 2.0*k) - PrecomputedTerms[diff][k]);
3478
3479 // quit if the contributions have become trivial
3480 if (fabs (oldsum - sum) < DBL_EPSILON) break;
3481 oldsum = sum;
3482 }
3483 return sum;
3484 } // Probability
3485
3486 //------------------------------------------------------------------------------------
3487 // Adaptation of Peter Beerli's calculate_steps routine from MIGRATE.
3488 // This routine precomputes values needed by Probability, for speed.
3489
CalculateSteps(long marker)3490 void MixedKSModel::CalculateSteps (long marker)
3491 {
3492 long k, diff;
3493 DoubleVec1d tempvec;
3494 DoubleVec2d steps;
3495 long threshhold = m_threshhold[marker];
3496
3497 for (diff = 0; diff <= threshhold; diff++)
3498 {
3499 tempvec.clear();
3500 for (k = 0; k <= threshhold; k++)
3501 {
3502 tempvec.push_back(logfac (diff + k) + logfac (k));
3503 }
3504 steps.push_back(tempvec);
3505 }
3506
3507 m_steps.push_back(steps);
3508 // Note: This is okay from a speed standpoint,
3509 // but not so good from a design standpoint,
3510 // because this method assumes it's being called
3511 // within a loop over markers.
3512 } // CalculateSteps
3513
3514 //------------------------------------------------------------------------------------
3515
probMathFunc(double ut,double coef)3516 double MixedKSModel::probMathFunc( double ut, double coef)
3517 {
3518 double mathValue = coef*(1.0-exp(-1.0/coef*ut));
3519 if (systemSpecificIsnan(mathValue))
3520 cout << "coef:" << coef << endl
3521 << "time:" << ut << endl;
3522 return mathValue;
3523 } // MixedKSModel::probMathFunc
3524
3525 //------------------------------------------------------------------------------------
3526
CreateDataModelReport() const3527 StringVec1d MixedKSModel::CreateDataModelReport() const
3528 {
3529 StringVec1d report = DataModel::CreateDataModelReport();
3530
3531 string line;
3532 if (m_isOpt)
3533 {
3534 line = "Final ";
3535 }
3536 line += "Multistep:single-step ratio: " + ToString(m_alpha);
3537 report.push_back(line);
3538
3539 string maxbins = "Maximum number of bins: " + ToString(m_bincounts[0]);
3540 report.push_back(maxbins);
3541 if (m_isOpt)
3542 {
3543 report.push_back(string("Multistep:single-step ratio used for each chain (optimized from the previous):"));
3544 //report alpha for each chain
3545 for(size_t rep=0; rep<m_alphaRpt.size(); rep++)
3546 {
3547 for (size_t chain=0; chain<m_alphaRpt[rep].size(); chain++)
3548 {
3549 line = "";
3550 if (m_alphaRpt.size() > 1)
3551 {
3552 line = "Replicate " + indexToKey(rep) + ", ";
3553 }
3554 line += "Chain " + indexToKey(chain) + " :"
3555 + ToString(m_alphaRpt[rep][chain]);
3556 report.push_back( line );
3557 }
3558 }
3559 }
3560
3561 return report;
3562
3563 } // MixedKSModel::CreateDataModelReport
3564
3565 //------------------------------------------------------------------------------------
3566
// Maximize alpha for the current chain against the given tree and locus
// using the FuncMax one-dimensional maximizer (searching alpha over [0,1]
// from a starting point of 0.5), then record the optimized alpha in the
// current replicate's history for reporting.  Returns true iff
// optimization is enabled and ran.
bool MixedKSModel::OptimizeDataModel(Tree* tree, const Locus& locus)
{
    // only do optimization when 'm_isOpt' is turned on, otherwise do nothing
    if(!m_isOpt) return false;

    FuncMax fm(*this, tree, locus);

    FMState fms(100, //max_iters
                0.01, //increment
                0.0001, //threshold
                0, //limits
                1,
                0.5 //initX
        );

    fm.setState(fms);
    fm.run();
    //Note: The FuncMax object has a pointer to us, and uses it to set our alpha.
    //report final alpha for each chain
    m_alphaRpt[m_alphaRpt.size()-1].push_back(m_alpha);
    return true;
}
3589
3590 //------------------------------------------------------------------------------------
3591
ResetAlpha()3592 void MixedKSModel::ResetAlpha()
3593 {
3594 SetAlpha(m_origAlpha);
3595 DoubleVec1d newalphas;
3596 newalphas.push_back(m_origAlpha);
3597 m_alphaRpt.push_back(newalphas);
3598 }
3599
3600 //------------------------------------------------------------------------------------
3601
WriteAlpha(ofstream & sumout,long loc,long rep,long chain)3602 void MixedKSModel::WriteAlpha(ofstream& sumout, long loc, long rep, long chain)
3603 {
3604 if (!m_isOpt) return;
3605 assert(static_cast<size_t>(rep)<m_alphaRpt.size());
3606 if (static_cast<size_t>(chain+1)>=m_alphaRpt[rep].size()) return;
3607 //LS NOTE: We write out the alpha that we calculated for this chain
3608 // instead of the alpha we used for this chain, because of timing
3609 // issues--we don't want to lose the information if sumfile writing stops
3610 // unexpectedly.
3611 sumout << "\t"
3612 << xmlsum::ALPHA_START1 << " "
3613 << xmlsum::ALPHA_START2 << " " << loc << " "
3614 << xmlsum::ALPHA_START3 << " " << m_alphaRpt[rep][chain+1]
3615 << " " << xmlsum::ALPHA_END << endl;
3616
3617 }
3618
3619 //------------------------------------------------------------------------------------
3620
// Data simulation is not implemented for the MixedKS model; always throws.
DoubleVec1d MixedKSModel::SimulateMarker(double branchlength, long whichmarker, const DoubleVec1d& state) const
{
    string msg("Cannot simulate data with the MixedKS model yet.");
    throw implementation_error(msg);
} // SimulateMarker
3625
3626 //------------------------------------------------------------------------------------
3627
CellToData(Cell_ptr cell,long marker) const3628 string MixedKSModel::CellToData(Cell_ptr cell, long marker) const
3629 {
3630 LongVec1d ones = cell->GetOnes(marker);
3631 assert(static_cast<size_t>(marker) < m_offsets.size());
3632 if (ones.size() == 1)
3633 {
3634 return ToString(ones[0] + m_offsets[marker]);
3635 }
3636 throw implementation_error
3637 ("Cannot convert stepwise data from the internal format for data not simply a single number.");
3638 }
3639
3640 //____________________________________________________________________________________
3641