1 // $Id: calculators.cpp,v 1.26 2011/03/12 20:02:52 bobgian Exp $
2 
3 /*
4   Copyright 2002  Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 #include <functional>
13 #include <numeric>
14 #include <map>
15 
16 #include "calculators.h"
17 #include "datapack.h"
18 #include "dlmodel.h"
19 #include "locus.h"
20 #include "mathx.h" //for ScaleToSumToOne
21 #include "region.h"
22 #include "registry.h"
23 
24 using std::map;
25 
26 //------------------------------------------------------------------------------------
27 
FrequenciesFromData(long int regionId,long int locusId,model_type modelType)28 DoubleVec1d FrequenciesFromData(long int regionId, long int locusId, model_type modelType)
29 {
30     const Region& thisRegion = registry.GetDataPack().GetRegion(regionId);
31     const Locus& thisLocus = thisRegion.GetLocus(locusId);
32     return FrequenciesFromData(thisLocus,modelType);
33 }
34 
35 //------------------------------------------------------------------------------------
36 
FrequenciesFromData(const Locus & locus,model_type modelType)37 DoubleVec1d FrequenciesFromData(const Locus& locus, model_type modelType)
38 {
39 
40     assert(modelType == F84 || modelType == GTR);
41     DoubleVec1d basefreqs(BASES,0.0);
42     double totalmarkers = 0.0;
43     const vector<TipData>& tips = locus.GetTipData();
44 
45     vector<TipData>::const_iterator tit;
46     for (tit = tips.begin(); tit != tips.end(); ++tit)
47     {
48         StringVec1d data = tit->data;
49         StringVec1d::const_iterator sit;
50 
51         for (sit = data.begin(); sit != data.end(); ++sit)
52         {
53             double errorRate= locus.GetPerBaseErrorRate();
54             DoubleVec1d site = NucModel::StaticDataToLikes(*sit,errorRate);
55             double total = site[baseA]+site[baseC]+site[baseG]+site[baseT];
56             basefreqs[baseA] += site[baseA]/total;
57             basefreqs[baseC] += site[baseC]/total;
58             basefreqs[baseG] += site[baseG]/total;
59             basefreqs[baseT] += site[baseT]/total;
60         }
61         totalmarkers += locus.GetNmarkers();
62     }
63 
64     basefreqs[baseA] /= totalmarkers;
65     basefreqs[baseC] /= totalmarkers;
66     basefreqs[baseG] /= totalmarkers;
67     basefreqs[baseT] /= totalmarkers;
68     ScaleToSumToOne(basefreqs);
69 
70     // check for any zero freqs, try global freqs, then use DBL_EPSILON set
71     double zero(0.0);
72     if (std::count(basefreqs.begin(),basefreqs.end(),zero))
73     {
74         basefreqs = registry.GetDataPack().CountOverallBaseFreqs();
75         ScaleToSumToOne(basefreqs);
76         if (std::count(basefreqs.begin(),basefreqs.end(),zero))
77         {
78             DoubleVec1d::iterator baseit;
79             for (baseit = basefreqs.begin(); baseit != basefreqs.end(); ++baseit)
80             {
81                 if (*baseit == 0.0) *baseit = defaults::minLegalFrequency;
82             }
83         }
84     }
85 
86     return basefreqs;
87 }
88 
89 //------------------------------------------------------------------------------------
90 
91 // MDEBUG LOCUS Function not currently used but being saved for
92 // multi-locus code
93 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets
94 // the estimate to a default value when we don't want to. need
95 // to see if we can avoid that and instead post-process unsuitable
96 // values to correct them when appropriate. for example, if using
97 // this function to get an initial estimate, we may want to default
98 // to a reasonable value, but if reporting the value we do not.
99 
MigFSTLocus(const DataPack & dpack,const Locus & loc,long int numMigs,DoubleVec1d & estimate,std::deque<bool> & isCalculated)100 void MigFSTLocus(const DataPack& dpack, const Locus& loc, long int numMigs,
101                  DoubleVec1d& estimate, std::deque<bool>& isCalculated)
102 {
103     estimate.clear();
104     estimate = DoubleVec1d(numMigs,0.0);
105     assert(isCalculated.size() == estimate.size());
106     DoubleVec1d fw = loc.GetDataTypePtr()->CalcFw(loc);
107     DoubleVec1d fb = loc.GetDataTypePtr()->CalcFb(loc);
108 
109     long int index1, index2;
110     long int pop1, pop2, npops = fw.size();
111     assert(numMigs ==  (npops * npops));
112 
113     for(pop1 = 0; pop1 < npops; ++pop1)
114     {
115         for(pop2 = pop1+1; pop2 < npops; ++pop2)
116         {
117             index1 = pop1 * npops + pop2;  // position in linear vector
118             double fb12 = fb[index1];
119             double denom = -2.0 * fb12 + fw[pop1] + fw[pop2];
120             if (denom == 0.0)
121             {
122                 isCalculated[index1] = false;
123                 estimate[index1] = defaults::migration;
124             }
125             else estimate[index1] = 2.0 * fb12 / denom;
126             if (estimate[index1] < 0.0)
127             {
128                 isCalculated[index1] = false;
129                 estimate[index1] = defaults::migration;
130             }
131             index2 = pop2 * npops + pop1;  // lower triangle
132             estimate[index2] = estimate[index1];
133         }
134     }
135 } // MigFSTLocus
136 
137 //------------------------------------------------------------------------------------
138 
MigFSTMultiregion(const DataPack & dpack,const DoubleVec2d & regionalmuratios,DoubleVec1d & perRegionMigsAccumulator,std::deque<bool> & isCalculated)139 void MigFSTMultiregion(const DataPack & dpack, const DoubleVec2d & regionalmuratios,
140                        DoubleVec1d & perRegionMigsAccumulator, std::deque<bool> & isCalculated)
141 {
142     // MDEBUG this is disabled for force_DIVMIG currently.  It should be enabled
143     // eventually, contemporary populations only.
144 
145     // get MIG npop and DIVMIG npop
146     long int migpopCount  = dpack.GetNPartitionsByForceType(force_MIG);
147     long int divmigpopCount  = dpack.GetNPartitionsByForceType(force_DIVMIG);
148 
149     if (migpopCount == 0 && divmigpopCount <= 1)
150         throw data_error("Data error--zero populations?  There should be at least one.");
151 
152     if (migpopCount <= 1) return;
153 
154     long int popCount = migpopCount;
155 
156     isCalculated.clear();
157     perRegionMigsAccumulator.clear();
158     long int numMigs = popCount * popCount;
159     perRegionMigsAccumulator = DoubleVec1d(numMigs,0.0);
160     isCalculated = std::deque<bool>(numMigs,true);
161 
162     const long int numRegions = dpack.GetNRegions();
163 
164     for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
165     {
166         const Region& region = dpack.GetRegion(regionIndex);
167         long int numLoci = region.GetNloci();
168         DoubleVec1d perLociMigsAccumulator(numMigs,0.0);
169 
170         for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
171         {
172             const Locus& locus = region.GetLocus(locusIndex);
173             DoubleVec1d migsForOneLocus;
174             MigFSTLocus(dpack,locus,numMigs,migsForOneLocus,isCalculated);
175 
176             // rescale for relative mutation rate
177             std::transform(migsForOneLocus.begin(),
178                            migsForOneLocus.end(),
179                            migsForOneLocus.begin(),
180                            std::bind2nd(std::multiplies<double>(),
181                                         regionalmuratios[regionIndex][locusIndex]));
182 
183             // and add to the totals by region
184             std::transform(  migsForOneLocus.begin(),
185                              migsForOneLocus.end(),
186                              perLociMigsAccumulator.begin(),
187                              perLociMigsAccumulator.begin(),
188                              std::plus<double>());
189         }
190         // average over the per-locus data
191         std::transform(  perLociMigsAccumulator.begin(),
192                          perLociMigsAccumulator.end(),
193                          perLociMigsAccumulator.begin(),
194                          std::bind2nd(std::divides<double>(),static_cast<double>(numLoci)));
195 
196         // and add to the totals by region
197         std::transform(  perLociMigsAccumulator.begin(),
198                          perLociMigsAccumulator.end(),
199                          perRegionMigsAccumulator.begin(),
200                          perRegionMigsAccumulator.begin(),
201                          std::plus<double>());
202     }
203     // finally, average over the per-region data
204     std::transform(  perRegionMigsAccumulator.begin(),
205                      perRegionMigsAccumulator.end(),
206                      perRegionMigsAccumulator.begin(),
207                      std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
208 }
209 
210 //------------------------------------------------------------------------------------
211 
212 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets
213 // the estimate to a default value when we don't want to. need
214 // to see if we can avoid that and instead post-process unsuitable
215 // values to correct them when appropriate. for example, if using
216 // this function to get an initial estimate, we may want to default
217 // to a reasonable value, but if reporting the value we do not.
218 
ThetaFSTLocus(const DataPack & dpack,const Locus & loc,const long int numThetas,vector<map<force_type,string>> tipids,DoubleVec1d & estimates,std::deque<bool> & isCalculated)219 void ThetaFSTLocus(const DataPack& dpack, const Locus& loc, const long int numThetas,
220                    vector<map<force_type,string> > tipids, DoubleVec1d& estimates,
221                    std::deque<bool>& isCalculated)
222 {
223     assert(numThetas > 1);
224 
225     StringVec3d data;
226     for(long int xpart = 0; xpart < numThetas; xpart++)
227     {
228         data.push_back(loc.GetCrossPartitionGeneticData(tipids[xpart]));
229     }
230 
231     DoubleVec1d fw = loc.GetDataTypePtr()->CalcXFw(loc,data);
232     DoubleVec1d fb = loc.GetDataTypePtr()->CalcXFb(loc,data);
233 
234     long int index1;
235     assert(fw.size() == static_cast<unsigned long int>(numThetas));
236 
237     DoubleVec1d thetaEstimates(numThetas,defaults::theta);
238 
239     for(long int xpart = 0; xpart < numThetas; xpart++)
240     {
241         double estimate = 0.0;
242         for(long int otherpart = 0; otherpart < numThetas; ++otherpart)
243         {
244             if (otherpart == xpart) continue;
245             index1 = xpart * numThetas + otherpart;  // position in linear vector
246             double fb12 = fb[index1];
247             double numer = -2.0 * fb12 + fw[xpart] + fw[otherpart];
248             double denom = -2.0 * fb12 *fb12 + fw[xpart] + fw[otherpart] + fw[xpart] * fw[xpart];
249 
250             if (denom != 0.0)
251             {
252                 estimate += (numer * (1.0 - fw[xpart])) / denom;
253             }
254         }
255 
256         if (estimate > 0.0)
257         {
258             thetaEstimates[xpart] = estimate / numThetas;
259         }
260         else
261         {
262             isCalculated[xpart] = false;
263         }
264 
265     }
266     estimates = thetaEstimates;
267 } // ThetaFSTLocus
268 
269 //------------------------------------------------------------------------------------
270 
ThetaFSTMultiregion(const DataPack & dpack,const DoubleVec1d & regionalthetascalars,const DoubleVec2d & regionalmuratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)271 void ThetaFSTMultiregion(const DataPack& dpack, const DoubleVec1d & regionalthetascalars,
272                          const DoubleVec2d & regionalmuratios, DoubleVec1d & estimates,
273                          std::deque<bool> & isCalculated)
274 {
275     const long int numRegions = dpack.GetNRegions();
276     const long int numThetas  = dpack.GetNCrossPartitions();
277     vector<map<force_type,string> > tipids = dpack.GetCrossPartitionIds(true);
278     isCalculated.clear();
279     isCalculated = std::deque<bool>(numThetas,true);
280 
281     DoubleVec1d OverallThetasAccumulator(numThetas,0.0);
282 
283     for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
284     {
285         const Region& region = dpack.GetRegion(regionIndex);
286         long int numLoci = region.GetNloci();
287         DoubleVec1d RegionalThetasAccumulator(numThetas,0.0);
288 
289         for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
290         {
291             const Locus& locus = region.GetLocus(locusIndex);
292             DoubleVec1d thetasForOneLocus;
293             ThetaFSTLocus(dpack,locus,numThetas,tipids,thetasForOneLocus,isCalculated);
294             // correct for varying mutation rates
295             std::transform(thetasForOneLocus.begin(),
296                            thetasForOneLocus.end(),
297                            thetasForOneLocus.begin(),
298                            std::bind2nd(std::divides<double>(),
299                                         regionalmuratios[regionIndex][locusIndex]));
300 
301             // and add to the totals by locus
302             std::transform(  thetasForOneLocus.begin(),
303                              thetasForOneLocus.end(),
304                              RegionalThetasAccumulator.begin(),
305                              RegionalThetasAccumulator.begin(),
306                              std::plus<double>());
307         }
308 
309         // average over the per-locus data; while scaling appropiately
310         // for the current region
311         std::transform(  RegionalThetasAccumulator.begin(),
312                          RegionalThetasAccumulator.end(),
313                          RegionalThetasAccumulator.begin(),
314                          std::bind2nd(std::divides<double>(),regionalthetascalars[regionIndex]*numLoci));
315 
316         // and add to the totals by region
317         std::transform(  RegionalThetasAccumulator.begin(),
318                          RegionalThetasAccumulator.end(),
319                          OverallThetasAccumulator.begin(),
320                          OverallThetasAccumulator.begin(),
321                          std::plus<double>());
322     }
323     // finally, average over the per-region data
324     std::transform(  OverallThetasAccumulator.begin(),
325                      OverallThetasAccumulator.end(),
326                      OverallThetasAccumulator.begin(),
327                      std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
328     estimates = OverallThetasAccumulator;
329 
330 } // ThetaFSTMultiregion
331 
332 //------------------------------------------------------------------------------------
333 
334 // EWFIX.P3 TEST -- IMPORTANT -- this function may contain code that sets the estimate to a default value
335 // when we don't want to.  Need to see if we can avoid that and instead post-process unsuitable values
336 // to correct them when appropriate. for example, if using this function to get an initial estimate,
337 // we may want to default to a reasonable value, but if reporting the value we do not.
338 
ThetaWattersonLocus(const DataPack & dpack,const Locus & locus,const long int numThetas,const vector<map<force_type,string>> tipids,DoubleVec1d & estimates,std::deque<bool> & isCalculated)339 void ThetaWattersonLocus(const DataPack& dpack, const Locus& locus, const long int numThetas,
340                          const vector<map<force_type,string> > tipids, DoubleVec1d& estimates,
341     std::deque<bool>& isCalculated)
342 {
343     const long int nsites = locus.GetNsites();
344     DoubleVec1d thetaEstimates(numThetas,defaults::theta);
345 
346     for(long int thetaIndex = 0; thetaIndex < numThetas; thetaIndex++)
347     {
348         long int nvarmarkers = 0;
349         const StringVec2d data = locus.GetCrossPartitionGeneticData(tipids[thetaIndex]);
350         if (!data.empty())
351         {
352             nvarmarkers = locus.GetDataTypePtr()->CalcNVarMarkers(data);
353         }
354 
355         const long int nseqs = locus.GetNTips(tipids[thetaIndex]);
356 
357         if(nseqs > 1)
358         {
359             double harmonic = 0.0;
360             for(long int seq = 1; seq < nseqs; seq++)
361             {
362                 harmonic += 1.0/seq;
363             }
364             thetaEstimates[thetaIndex] = nvarmarkers / (nsites * harmonic);
365         }
366         else
367         {
368             isCalculated[thetaIndex] = false;
369         }
370     }
371     estimates = thetaEstimates;
372 } // ThetaWattersonLocus
373 
374 //------------------------------------------------------------------------------------
375 
ThetaWattersonMultiregion(const DataPack & dpack,const DoubleVec1d & regionalthetascalars,const DoubleVec2d & regionalmuratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)376 void ThetaWattersonMultiregion(const DataPack& dpack,
377                                const DoubleVec1d& regionalthetascalars,
378                                const DoubleVec2d& regionalmuratios,
379                                DoubleVec1d& estimates,
380                                std::deque<bool>& isCalculated)
381 {
382     const long int numRegions = dpack.GetNRegions();
383     const long int numThetas  = dpack.GetNCrossPartitions();
384     isCalculated.assign(numThetas,true);
385 
386     DoubleVec1d OverallThetasAccumulator(numThetas,0.0);
387 
388     for(long int regionIndex = 0; regionIndex < numRegions; regionIndex++)
389     {
390         const Region& region = dpack.GetRegion(regionIndex);
391         DoubleVec1d RegionalThetasAccumulator;
392         ThetaWattersonRegion(dpack,region,regionalthetascalars[regionIndex],
393                              regionalmuratios[regionIndex],RegionalThetasAccumulator,isCalculated);
394 
395         // and add to the totals by region
396         std::transform(  RegionalThetasAccumulator.begin(),
397                          RegionalThetasAccumulator.end(),
398                          OverallThetasAccumulator.begin(),
399                          OverallThetasAccumulator.begin(),
400                          std::plus<double>());
401     }
402     // finally, average over the per-region data
403     std::transform(  OverallThetasAccumulator.begin(),
404                      OverallThetasAccumulator.end(),
405                      OverallThetasAccumulator.begin(),
406                      std::bind2nd(std::divides<double>(),static_cast<double>(numRegions)));
407     estimates = OverallThetasAccumulator;
408 
409 } // ThetaWattersonMultiregion
410 
411 //------------------------------------------------------------------------------------
412 
ThetaWattersonRegion(const DataPack & dpack,const Region & region,double thetascalar,const DoubleVec1d & muratios,DoubleVec1d & estimates,std::deque<bool> & isCalculated)413 void ThetaWattersonRegion(const DataPack& dpack,
414                           const Region& region,
415                           double thetascalar,
416                           const DoubleVec1d& muratios,
417                           DoubleVec1d& estimates,
418                           std::deque<bool>& isCalculated)
419 {
420     long int numLoci = region.GetNloci();
421     long int numThetas = dpack.GetNCrossPartitions();
422     DoubleVec1d ThetasAccumulator(numThetas,0.0);
423     vector<map<force_type,string> > tipids = dpack.GetCrossPartitionIds(true);
424 
425     for(long int locusIndex = 0; locusIndex < numLoci; locusIndex++)
426     {
427         const Locus& locus = region.GetLocus(locusIndex);
428         DoubleVec1d thetasForOneLocus;
429         ThetaWattersonLocus(dpack,locus,numThetas,tipids,thetasForOneLocus,isCalculated);
430 
431         // rescale the locus values for their relative mutation rates
432         std::transform(thetasForOneLocus.begin(),
433                        thetasForOneLocus.end(),
434                        thetasForOneLocus.begin(),
435                        std::bind2nd(std::divides<double>(),muratios[locusIndex]));
436 
437         // and add to the totals by region
438         std::transform(  thetasForOneLocus.begin(),
439                          thetasForOneLocus.end(),
440                          ThetasAccumulator.begin(),
441                          ThetasAccumulator.begin(),
442                          std::plus<double>());
443     }
444     // average over the per-locus data; while scaling appropriately
445     // for the current region
446     std::transform(  ThetasAccumulator.begin(),
447                      ThetasAccumulator.end(),
448                      ThetasAccumulator.begin(),
449                      std::bind2nd(std::divides<double>(),thetascalar*numLoci));
450 
451     estimates = ThetasAccumulator;
452 } // ThetaWattersonRegion
453 
454 //____________________________________________________________________________________
455