1 // $Id: calculators.cpp,v 1.26 2011/03/12 20:02:52 bobgian Exp $
2
3 /*
4 Copyright 2002 Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12 #include <functional>
13 #include <numeric>
14 #include <map>
15
16 #include "calculators.h"
17 #include "datapack.h"
18 #include "dlmodel.h"
19 #include "locus.h"
20 #include "mathx.h" //for ScaleToSumToOne
21 #include "region.h"
22 #include "registry.h"
23
24 using std::map;
25
26 //------------------------------------------------------------------------------------
27
FrequenciesFromData(long int regionId,long int locusId,model_type modelType)28 DoubleVec1d FrequenciesFromData(long int regionId, long int locusId, model_type modelType)
29 {
30 const Region& thisRegion = registry.GetDataPack().GetRegion(regionId);
31 const Locus& thisLocus = thisRegion.GetLocus(locusId);
32 return FrequenciesFromData(thisLocus,modelType);
33 }
34
35 //------------------------------------------------------------------------------------
36
FrequenciesFromData(const Locus & locus,model_type modelType)37 DoubleVec1d FrequenciesFromData(const Locus& locus, model_type modelType)
38 {
39
40 assert(modelType == F84 || modelType == GTR);
41 DoubleVec1d basefreqs(BASES,0.0);
42 double totalmarkers = 0.0;
43 const vector<TipData>& tips = locus.GetTipData();
44
45 vector<TipData>::const_iterator tit;
46 for (tit = tips.begin(); tit != tips.end(); ++tit)
47 {
48 StringVec1d data = tit->data;
49 StringVec1d::const_iterator sit;
50
51 for (sit = data.begin(); sit != data.end(); ++sit)
52 {
53 double errorRate= locus.GetPerBaseErrorRate();
54 DoubleVec1d site = NucModel::StaticDataToLikes(*sit,errorRate);
55 double total = site[baseA]+site[baseC]+site[baseG]+site[baseT];
56 basefreqs[baseA] += site[baseA]/total;
57 basefreqs[baseC] += site[baseC]/total;
58 basefreqs[baseG] += site[baseG]/total;
59 basefreqs[baseT] += site[baseT]/total;
60 }
61 totalmarkers += locus.GetNmarkers();
62 }
63
64 basefreqs[baseA] /= totalmarkers;
65 basefreqs[baseC] /= totalmarkers;
66 basefreqs[baseG] /= totalmarkers;
67 basefreqs[baseT] /= totalmarkers;
68 ScaleToSumToOne(basefreqs);
69
70 // check for any zero freqs, try global freqs, then use DBL_EPSILON set
71 double zero(0.0);
72 if (std::count(basefreqs.begin(),basefreqs.end(),zero))
73 {
74 basefreqs = registry.GetDataPack().CountOverallBaseFreqs();
75 ScaleToSumToOne(basefreqs);
76 if (std::count(basefreqs.begin(),basefreqs.end(),zero))
77 {
78 DoubleVec1d::iterator baseit;
79 for (baseit = basefreqs.begin(); baseit != basefreqs.end(); ++baseit)
80 {
81 if (*baseit == 0.0) *baseit = defaults::minLegalFrequency;
82 }
83 }
84 }
85
86 return basefreqs;
87 }
88
89 //------------------------------------------------------------------------------------
90
91 // MDEBUG LOCUS Function not currently used but being saved for
92 // multi-locus code
93 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets
94 // the estimate to a default value when we don't want to. need
95 // to see if we can avoid that and instead post-process unsuitable
96 // values to correct them when appropriate. for example, if using
97 // this function to get an initial estimate, we may want to default
98 // to a reasonable value, but if reporting the value we do not.
99
MigFSTLocus(const DataPack & dpack,const Locus & loc,long int numMigs,DoubleVec1d & estimate,std::deque<bool> & isCalculated)100 void MigFSTLocus(const DataPack& dpack, const Locus& loc, long int numMigs,
101 DoubleVec1d& estimate, std::deque<bool>& isCalculated)
102 {
103 estimate.clear();
104 estimate = DoubleVec1d(numMigs,0.0);
105 assert(isCalculated.size() == estimate.size());
106 DoubleVec1d fw = loc.GetDataTypePtr()->CalcFw(loc);
107 DoubleVec1d fb = loc.GetDataTypePtr()->CalcFb(loc);
108
109 long int index1, index2;
110 long int pop1, pop2, npops = fw.size();
111 assert(numMigs == (npops * npops));
112
113 for(pop1 = 0; pop1 < npops; ++pop1)
114 {
115 for(pop2 = pop1+1; pop2 < npops; ++pop2)
116 {
117 index1 = pop1 * npops + pop2; // position in linear vector
118 double fb12 = fb[index1];
119 double denom = -2.0 * fb12 + fw[pop1] + fw[pop2];
120 if (denom == 0.0)
121 {
122 isCalculated[index1] = false;
123 estimate[index1] = defaults::migration;
124 }
125 else estimate[index1] = 2.0 * fb12 / denom;
126 if (estimate[index1] < 0.0)
127 {
128 isCalculated[index1] = false;
129 estimate[index1] = defaults::migration;
130 }
131 index2 = pop2 * npops + pop1; // lower triangle
132 estimate[index2] = estimate[index1];
133 }
134 }
135 } // MigFSTLocus
136
137 //------------------------------------------------------------------------------------
138
MigFSTMultiregion(const DataPack & dpack,const DoubleVec2d & regionalmuratios,DoubleVec1d & perRegionMigsAccumulator,std::deque<bool> & isCalculated)139 void MigFSTMultiregion(const DataPack & dpack, const DoubleVec2d & regionalmuratios,
140 DoubleVec1d & perRegionMigsAccumulator, std::deque<bool> & isCalculated)
141 {
142 // MDEBUG this is disabled for force_DIVMIG currently. It should be enabled
143 // eventually, contemporary populations only.
144
145 // get MIG npop and DIVMIG npop
146 long int migpopCount = dpack.GetNPartitionsByForceType(force_MIG);
147 long int divmigpopCount = dpack.GetNPartitionsByForceType(force_DIVMIG);
148
149 if (migpopCount == 0 && divmigpopCount <= 1)
150 throw data_error("Data error--zero populations? There should be at least one.");
151
152 if (migpopCount <= 1) return;
153
154 long int popCount = migpopCount;
155
156 isCalculated.clear();
157 perRegionMigsAccumulator.clear();
158 long int numMigs = popCount * popCount;
159 perRegionMigsAccumulator = DoubleVec1d(numMigs,0.0);
160 isCalculated = std::deque<bool>(numMigs,true);
161
162 const long int numRegions = dpack.GetNRegions();
163
164 for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
165 {
166 const Region& region = dpack.GetRegion(regionIndex);
167 long int numLoci = region.GetNloci();
168 DoubleVec1d perLociMigsAccumulator(numMigs,0.0);
169
170 for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
171 {
172 const Locus& locus = region.GetLocus(locusIndex);
173 DoubleVec1d migsForOneLocus;
174 MigFSTLocus(dpack,locus,numMigs,migsForOneLocus,isCalculated);
175
176 // rescale for relative mutation rate
177 std::transform(migsForOneLocus.begin(),
178 migsForOneLocus.end(),
179 migsForOneLocus.begin(),
180 std::bind2nd(std::multiplies<double>(),
181 regionalmuratios[regionIndex][locusIndex]));
182
183 // and add to the totals by region
184 std::transform( migsForOneLocus.begin(),
185 migsForOneLocus.end(),
186 perLociMigsAccumulator.begin(),
187 perLociMigsAccumulator.begin(),
188 std::plus<double>());
189 }
190 // average over the per-locus data
191 std::transform( perLociMigsAccumulator.begin(),
192 perLociMigsAccumulator.end(),
193 perLociMigsAccumulator.begin(),
194 std::bind2nd(std::divides<double>(),static_cast<double>(numLoci)));
195
196 // and add to the totals by region
197 std::transform( perLociMigsAccumulator.begin(),
198 perLociMigsAccumulator.end(),
199 perRegionMigsAccumulator.begin(),
200 perRegionMigsAccumulator.begin(),
201 std::plus<double>());
202 }
203 // finally, average over the per-region data
204 std::transform( perRegionMigsAccumulator.begin(),
205 perRegionMigsAccumulator.end(),
206 perRegionMigsAccumulator.begin(),
207 std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
208 }
209
210 //------------------------------------------------------------------------------------
211
212 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets
213 // the estimate to a default value when we don't want to. need
214 // to see if we can avoid that and instead post-process unsuitable
215 // values to correct them when appropriate. for example, if using
216 // this function to get an initial estimate, we may want to default
217 // to a reasonable value, but if reporting the value we do not.
218
ThetaFSTLocus(const DataPack & dpack,const Locus & loc,const long int numThetas,vector<map<force_type,string>> tipids,DoubleVec1d & estimates,std::deque<bool> & isCalculated)219 void ThetaFSTLocus(const DataPack& dpack, const Locus& loc, const long int numThetas,
220 vector<map<force_type,string> > tipids, DoubleVec1d& estimates,
221 std::deque<bool>& isCalculated)
222 {
223 assert(numThetas > 1);
224
225 StringVec3d data;
226 for(long int xpart = 0; xpart < numThetas; xpart++)
227 {
228 data.push_back(loc.GetCrossPartitionGeneticData(tipids[xpart]));
229 }
230
231 DoubleVec1d fw = loc.GetDataTypePtr()->CalcXFw(loc,data);
232 DoubleVec1d fb = loc.GetDataTypePtr()->CalcXFb(loc,data);
233
234 long int index1;
235 assert(fw.size() == static_cast<unsigned long int>(numThetas));
236
237 DoubleVec1d thetaEstimates(numThetas,defaults::theta);
238
239 for(long int xpart = 0; xpart < numThetas; xpart++)
240 {
241 double estimate = 0.0;
242 for(long int otherpart = 0; otherpart < numThetas; ++otherpart)
243 {
244 if (otherpart == xpart) continue;
245 index1 = xpart * numThetas + otherpart; // position in linear vector
246 double fb12 = fb[index1];
247 double numer = -2.0 * fb12 + fw[xpart] + fw[otherpart];
248 double denom = -2.0 * fb12 *fb12 + fw[xpart] + fw[otherpart] + fw[xpart] * fw[xpart];
249
250 if (denom != 0.0)
251 {
252 estimate += (numer * (1.0 - fw[xpart])) / denom;
253 }
254 }
255
256 if (estimate > 0.0)
257 {
258 thetaEstimates[xpart] = estimate / numThetas;
259 }
260 else
261 {
262 isCalculated[xpart] = false;
263 }
264
265 }
266 estimates = thetaEstimates;
267 } // ThetaFSTLocus
268
269 //------------------------------------------------------------------------------------
270
ThetaFSTMultiregion(const DataPack & dpack,const DoubleVec1d & regionalthetascalars,const DoubleVec2d & regionalmuratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)271 void ThetaFSTMultiregion(const DataPack& dpack, const DoubleVec1d & regionalthetascalars,
272 const DoubleVec2d & regionalmuratios, DoubleVec1d & estimates,
273 std::deque<bool> & isCalculated)
274 {
275 const long int numRegions = dpack.GetNRegions();
276 const long int numThetas = dpack.GetNCrossPartitions();
277 vector<map<force_type,string> > tipids = dpack.GetCrossPartitionIds(true);
278 isCalculated.clear();
279 isCalculated = std::deque<bool>(numThetas,true);
280
281 DoubleVec1d OverallThetasAccumulator(numThetas,0.0);
282
283 for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
284 {
285 const Region& region = dpack.GetRegion(regionIndex);
286 long int numLoci = region.GetNloci();
287 DoubleVec1d RegionalThetasAccumulator(numThetas,0.0);
288
289 for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
290 {
291 const Locus& locus = region.GetLocus(locusIndex);
292 DoubleVec1d thetasForOneLocus;
293 ThetaFSTLocus(dpack,locus,numThetas,tipids,thetasForOneLocus,isCalculated);
294 // correct for varying mutation rates
295 std::transform(thetasForOneLocus.begin(),
296 thetasForOneLocus.end(),
297 thetasForOneLocus.begin(),
298 std::bind2nd(std::divides<double>(),
299 regionalmuratios[regionIndex][locusIndex]));
300
301 // and add to the totals by locus
302 std::transform( thetasForOneLocus.begin(),
303 thetasForOneLocus.end(),
304 RegionalThetasAccumulator.begin(),
305 RegionalThetasAccumulator.begin(),
306 std::plus<double>());
307 }
308
309 // average over the per-locus data; while scaling appropiately
310 // for the current region
311 std::transform( RegionalThetasAccumulator.begin(),
312 RegionalThetasAccumulator.end(),
313 RegionalThetasAccumulator.begin(),
314 std::bind2nd(std::divides<double>(),regionalthetascalars[regionIndex]*numLoci));
315
316 // and add to the totals by region
317 std::transform( RegionalThetasAccumulator.begin(),
318 RegionalThetasAccumulator.end(),
319 OverallThetasAccumulator.begin(),
320 OverallThetasAccumulator.begin(),
321 std::plus<double>());
322 }
323 // finally, average over the per-region data
324 std::transform( OverallThetasAccumulator.begin(),
325 OverallThetasAccumulator.end(),
326 OverallThetasAccumulator.begin(),
327 std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
328 estimates = OverallThetasAccumulator;
329
330 } // ThetaFSTMultiregion
331
332 //------------------------------------------------------------------------------------
333
334 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets the estimate to a default value
335 // when we don't want to. Need to see if we can avoid that and instead post-process unsuitable values
336 // to correct them when appropriate. for example, if using this function to get an initial estimate,
337 // we may want to default to a reasonable value, but if reporting the value we do not.
338
ThetaWattersonLocus(const DataPack & dpack,const Locus & locus,const long int numThetas,const vector<map<force_type,string>> tipids,DoubleVec1d & estimates,std::deque<bool> & isCalculated)339 void ThetaWattersonLocus(const DataPack& dpack, const Locus& locus, const long int numThetas,
340 const vector<map<force_type,string> > tipids, DoubleVec1d& estimates,
341 std::deque<bool>& isCalculated)
342 {
343 const long int nsites = locus.GetNsites();
344 DoubleVec1d thetaEstimates(numThetas,defaults::theta);
345
346 for(long int thetaIndex = 0; thetaIndex < numThetas; thetaIndex++)
347 {
348 long int nvarmarkers = 0;
349 const StringVec2d data = locus.GetCrossPartitionGeneticData(tipids[thetaIndex]);
350 if (!data.empty())
351 {
352 nvarmarkers = locus.GetDataTypePtr()->CalcNVarMarkers(data);
353 }
354
355 const long int nseqs = locus.GetNTips(tipids[thetaIndex]);
356
357 if(nseqs > 1)
358 {
359 double harmonic = 0.0;
360 for(long int seq = 1; seq < nseqs; seq++)
361 {
362 harmonic += 1.0/seq;
363 }
364 thetaEstimates[thetaIndex] = nvarmarkers / (nsites * harmonic);
365 }
366 else
367 {
368 isCalculated[thetaIndex] = false;
369 }
370 }
371 estimates = thetaEstimates;
372 } // ThetaWattersonLocus
373
374 //------------------------------------------------------------------------------------
375
ThetaWattersonMultiregion(const DataPack & dpack,const DoubleVec1d & regionalthetascalars,const DoubleVec2d & regionalmuratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)376 void ThetaWattersonMultiregion(const DataPack& dpack,
377 const DoubleVec1d& regionalthetascalars,
378 const DoubleVec2d& regionalmuratios,
379 DoubleVec1d& estimates,
380 std::deque<bool>& isCalculated)
381 {
382 const long int numRegions = dpack.GetNRegions();
383 const long int numThetas = dpack.GetNCrossPartitions();
384 isCalculated.assign(numThetas,true);
385
386 DoubleVec1d OverallThetasAccumulator(numThetas,0.0);
387
388 for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
389 {
390 const Region& region = dpack.GetRegion(regionIndex);
391 DoubleVec1d RegionalThetasAccumulator;
392 ThetaWattersonRegion(dpack,region,regionalthetascalars[regionIndex],
393 regionalmuratios[regionIndex],RegionalThetasAccumulator,isCalculated);
394
395 // and add to the totals by region
396 std::transform( RegionalThetasAccumulator.begin(),
397 RegionalThetasAccumulator.end(),
398 OverallThetasAccumulator.begin(),
399 OverallThetasAccumulator.begin(),
400 std::plus<double>());
401 }
402 // finally, average over the per-region data
403 std::transform( OverallThetasAccumulator.begin(),
404 OverallThetasAccumulator.end(),
405 OverallThetasAccumulator.begin(),
406 std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
407 estimates = OverallThetasAccumulator;
408
409 } // ThetaWattersonMultiregion
410
411 //------------------------------------------------------------------------------------
412
ThetaWattersonRegion(const DataPack & dpack,const Region & region,double thetascalar,const DoubleVec1d & muratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)413 void ThetaWattersonRegion(const DataPack& dpack,
414 const Region& region,
415 double thetascalar,
416 const DoubleVec1d& muratios,
417 DoubleVec1d& estimates,
418 std::deque<bool>& isCalculated)
419 {
420 long int numLoci = region.GetNloci();
421 long int numThetas = dpack.GetNCrossPartitions();
422 DoubleVec1d ThetasAccumulator(numThetas,0.0);
423 vector<map<force_type,string> > tipids = dpack.GetCrossPartitionIds(true);
424
425 for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
426 {
427 const Locus& locus = region.GetLocus(locusIndex);
428 DoubleVec1d thetasForOneLocus;
429 ThetaWattersonLocus(dpack,locus,numThetas,tipids,thetasForOneLocus,isCalculated);
430
431 // rescale the locus values for their relative mutation rates
432 std::transform(thetasForOneLocus.begin(),
433 thetasForOneLocus.end(),
434 thetasForOneLocus.begin(),
435 std::bind2nd(std::divides<double>(),muratios[locusIndex]));
436
437 // and add to the totals by region
438 std::transform( thetasForOneLocus.begin(),
439 thetasForOneLocus.end(),
440 ThetasAccumulator.begin(),
441 ThetasAccumulator.begin(),
442 std::plus<double>());
443 }
444 // average over the per-locus data; while scaling appropriately
445 // for the current region
446 std::transform( ThetasAccumulator.begin(),
447 ThetasAccumulator.end(),
448 ThetasAccumulator.begin(),
449 std::bind2nd(std::divides<double>(),thetascalar*numLoci));
450
451 estimates = ThetasAccumulator;
452 } // ThetaWattersonRegion
453
454 //____________________________________________________________________________________
455