1 // $Id: locus.cpp,v 1.81 2012/01/05 01:28:42 ewalkup Exp $
2 
3 /*
4   Copyright 2003  Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 #include <functional>                   // for Locus::CountNNucleotides()
13 #include <fstream>
14 
15 #include "local_build.h"
16 
17 #include "constants.h"
18 #include "dlcalc.h"
19 #include "dlmodel.h"
20 #include "force.h"                      // for TipData::GetBranchPartitions()
21 #include "individual.h"                 // for use of Individuals in setting up Locus objects
22 #include "locus.h"
23 #include "mathx.h"                      // for IsEven
24 #include "rangex.h"                     // LS DEBUG SIM
25 #include "registry.h"
26 #include "runreport.h"
27 #include "stringx.h"
28 #include "xml_strings.h"
29 
30 using namespace std;
31 
32 //------------------------------------------------------------------------------------
33 //------------------------------------------------------------------------------------
34 
Locus(long int ind,bool movable,string name)35 Locus::Locus(long int ind, bool movable, string name)
36     : m_index(ind),
37       m_name(name),
38       m_nmarkers(FLAGLONG),
39       m_nsites(FLAGLONG),
40       m_regionalmapposition(0),
41       m_globalmapposition(0),
42       m_offset(0),
43       m_movable(movable),
44       m_type(mloc_data),
45       m_defaultlocations(true),
46       m_positions(),
47       m_pDatatype(),
48       m_pDatamodel(),
49       m_tipdata(),
50       m_protoCell(),
51       m_pDLCalculator(),
52       m_tipcells(),
53       m_allowedrange(),
54       m_variablerange(),
55       m_unknownrange(),
56       m_map(),
57       m_simulate(false),
58       m_truesite(FLAGLONG),
59       m_variability(),
60       m_phenotypes(name)
61 {
62     if (movable)
63     {
64         //this is a different sort of locus:
65         SetNmarkers(1); //This requires alleles of 1 marker
66         SetNsites(1);
67         SetPositions();
68     }
69 } // Locus constructor
70 
71 //------------------------------------------------------------------------------------
72 
GetName() const73 string Locus::GetName() const
74 {
75     if (!m_name.empty()) return m_name;
76     string tempname("#");
77     tempname += ToString(GetIndex()+1);
78     return tempname;
79 } // GetName
80 
81 //------------------------------------------------------------------------------------
82 
GetNsites() const83 long int Locus::GetNsites() const
84 {
85     // if it has never been set, we assume it's m_nmarkers
86     if (m_nsites == FLAGLONG) return m_nmarkers;
87     return m_nsites;
88 
89 } // GetNsites
90 
91 //------------------------------------------------------------------------------------
92 
GetOffset() const93 long int Locus::GetOffset() const
94 {
95     return m_offset;
96 
97 } // GetOffset
98 
99 //------------------------------------------------------------------------------------
100 
GetUserMarkerLocations() const101 LongVec1d Locus::GetUserMarkerLocations() const
102 {
103     LongVec1d userpos(m_positions);
104     std::transform(userpos.begin(),userpos.end(),userpos.begin(),
105                    std::bind2nd(std::plus<long int>(),m_offset-m_regionalmapposition));
106     return userpos;
107 } // GetUserMarkerLocations
108 
109 //------------------------------------------------------------------------------------
110 // Once everything is ready, make this Locus into a fully functional one containing likelihood cells for its tips.
111 
Setup(const IndVec & individuals)112 void Locus::Setup(const IndVec& individuals)
113 {
114     m_tipcells.clear();
115 
116     m_protoCell = m_pDatatype->CreateDLCell(*this);
117     unsigned long int tip;
118     for (tip = 0; tip < m_tipdata.size(); ++tip)
119     {
120         if (m_tipdata[tip].m_nodata)
121         {
122             //The data is stored in the haplotypes, not this tip.
123             vector<LocusCell> cellsbymarkers;
124             //Find which individual codes for this tip.
125             for (unsigned long int ind = 0; ind < individuals.size(); ind++)
126             {
127                 if (m_tipdata[tip].individual == individuals[ind].GetId())
128                 {
129                     for (long int marker = 0; marker < GetNmarkers(); marker++)
130                     {
131                         vector<LocusCell> cells = individuals[ind].GetLocusCellsFor(GetName(), marker);
132                         assert(static_cast<long int>(cells.size()) > m_tipdata[tip].m_hap);
133                         cellsbymarkers.push_back(cells[m_tipdata[tip].m_hap]);
134                     }
135                     continue;
136                 }
137             }
138             LocusCell onecell(cellsbymarkers);
139             m_tipcells.push_back(onecell);
140         }
141         else
142         {
143             m_tipcells.push_back(m_pDatatype->CreateInitializedDLCell(*this, m_tipdata[tip].data));
144         }
145     }
146     m_pDLCalculator = DLCalc_ptr(m_pDatatype->CreateDLCalculator(*this));
147 
148 } // Setup
149 
150 //------------------------------------------------------------------------------------
151 // took away clone of src since it should be uniquely generated
152 // for each locus by Registry::InstallDataModels
153 
SetDataModelOnce(DataModel_ptr src)154 void Locus::SetDataModelOnce(DataModel_ptr src)
155 {
156     if (src.get() != NULL) m_pDatamodel = src;
157     else m_pDatamodel.reset();
158 
159 } // SetDataModel
160 
161 //------------------------------------------------------------------------------------
162 
SetAnalysisType(mloc_type type)163 void Locus::SetAnalysisType(mloc_type type)
164 {
165     m_type = type;
166     switch(type)
167     {
168         case mloc_mapjump:
169         case mloc_mapfloat:
170             m_movable = true;
171             break;
172         case mloc_data:
173             m_movable = false;
174             break;
175         case mloc_partition:
176             assert(false);
177             throw implementation_error("You shouldn't be able to set the analysis type to 'partition' yet.");
178             break;
179             //LS DEBUG MAPPING:  We need to throw it away here or something.
180     }
181 }
182 
183 //------------------------------------------------------------------------------------
184 
SetAllowedRange(rangeset rs,long int regoffset)185 void Locus::SetAllowedRange(rangeset rs, long int regoffset)
186 {
187     //rs comes in here on the global scale.
188     rangeset offsetrs;
189     for (rangesetiter rpair = rs.begin(); rpair != rs.end(); rpair++)
190     {
191         long int low = rpair->first - regoffset;
192         long int high = rpair->second - regoffset;
193         offsetrs.insert(make_pair(low, high));
194     }
195 
196     m_allowedrange = offsetrs;
197 }
198 
199 //------------------------------------------------------------------------------------
200 
SetEmptyTipData(vector<TipData> td)201 void Locus::SetEmptyTipData(vector<TipData> td)
202 {
203     for (unsigned long int tip = 0; tip < td.size(); tip++)
204     {
205         td[tip].data.clear();
206         td[tip].m_nodata = true;
207     }
208     m_tipdata = td;
209 }
210 
211 //------------------------------------------------------------------------------------
212 
SetVariableRange(rangeset rs)213 void Locus::SetVariableRange(rangeset rs)
214 {
215     m_variablerange = rs;
216 }
217 
218 //------------------------------------------------------------------------------------
219 
SetDataType(DataType_ptr src)220 void Locus::SetDataType(DataType_ptr src)
221 {
222     m_pDatatype = src; // shallow copy!
223 } // SetDataType
224 
225 //------------------------------------------------------------------------------------
226 
SetNmarkers(long int n)227 void Locus::SetNmarkers(long int n)
228 {
229     if (m_nmarkers == FLAGLONG)
230     {
231         m_nmarkers = n;
232     }
233     else
234     {
235         if (m_nmarkers != n)
236         {
237             data_error e("Inconsistent number of markers");
238             throw e;
239         }
240     }
241 } // SetNmarkers
242 
243 //------------------------------------------------------------------------------------
244 
SetGlobalMapposition(long int site)245 void Locus::SetGlobalMapposition(long int site)
246 {
247 #if 0  // Potentially DEAD CODE (bobgian, Feb 2010)
248     if (site == 0)
249     {
250         throw data_error("Assuming the biologist's convention of the nonexistence of site zero,"
251                          " we assume that the position left of site 1 is site -1."
252                          "  As such, you may not set the map position of any segment to '0'.");
253     }
254 #endif
255     m_globalmapposition = site;
256 } // Setglobalmapposition
257 
258 //------------------------------------------------------------------------------------
259 
SetRegionalMapposition(long int site)260 void Locus::SetRegionalMapposition(long int site)
261 {
262     transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
263               std::bind2nd(std::minus<long int>(),m_regionalmapposition));
264 
265     if(IsMoving())
266     {
267         long int movement = site - m_regionalmapposition;
268         SetGlobalMapposition(m_globalmapposition + movement);
269     }
270 
271     m_regionalmapposition = site;
272     std::transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
273                    std::bind2nd(std::plus<long int>(),m_regionalmapposition));
274 
275 } // Setregionalmapposition
276 
277 //------------------------------------------------------------------------------------
278 
SetOffset(long int val)279 void Locus::SetOffset(long int val)
280 {
281     m_offset = val;
282 } // SetOffset
283 
284 //------------------------------------------------------------------------------------
285 
SetPositions()286 void Locus::SetPositions()
287 {
288     m_defaultlocations = true;
289     m_positions.clear();
290     m_positions.reserve(m_nmarkers);  // for speed
291     long int i;
292     for (i = 0; i < m_nmarkers; ++i)
293     {
294         m_positions.push_back(i + m_regionalmapposition);
295     }
296 
297 } // SetPositions
298 
299 //------------------------------------------------------------------------------------
300 
SetPositions(const LongVec1d & pos)301 void Locus::SetPositions(const LongVec1d& pos)
302 {
303     m_defaultlocations = false;
304     m_positions = pos;
305     std::transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
306                    std::bind2nd(std::plus<long int>(),m_regionalmapposition));
307 
308 } // SetPositions
309 
310 //------------------------------------------------------------------------------------
311 
CalcNVariableMarkers() const312 LongVec1d Locus::CalcNVariableMarkers() const
313 {
314     LongVec1d nvarmarkers;
315 
316     const DataPack& dpack = registry.GetDataPack();
317     long int xpart, nxparts = dpack.GetNCrossPartitions();
318 
319     for (xpart = 0; xpart < nxparts; ++xpart)
320     {
321         nvarmarkers.push_back(CalcNVariableMarkers(dpack.GetTipId(xpart)));
322     }
323 
324     return nvarmarkers;
325 } // CalcNVariableMarkers()
326 
327 //------------------------------------------------------------------------------------
328 
CalcNVariableMarkers(tipidtype xpart) const329 long int Locus::CalcNVariableMarkers(tipidtype xpart) const
330 {
331     long int nvarmarkers = 0;
332 
333     const StringVec2d data = GetCrossPartitionGeneticData(xpart);
334     if (!data.empty()) nvarmarkers = GetDataTypePtr()->CalcNVarMarkers(data);
335 
336     return nvarmarkers;
337 
338 } // CalcNVariableMarkers(tipidtype xpart)
339 
340 //------------------------------------------------------------------------------------
341 
GetPopulationTipData(const string & popname) const342 vector<TipData> Locus::GetPopulationTipData(const string& popname) const
343 {
344     vector<TipData> popdata;
345     vector<TipData>::const_iterator tip;
346     for(tip = m_tipdata.begin(); tip != m_tipdata.end(); ++tip)
347         if (tip->IsInPopulation(popname)) popdata.push_back(*tip);
348 
349     return popdata;
350 
351 } // GetPopulationTipData
352 
353 //------------------------------------------------------------------------------------
354 
GetCrossPartitionGeneticData(tipidtype xpart) const355 StringVec2d Locus::GetCrossPartitionGeneticData(tipidtype xpart) const
356 {
357     StringVec2d data;
358     vector<TipData>::const_iterator tip = GetTipData().begin();
359     for( ; tip != GetTipData().end(); ++tip)
360         if (tip->IsInCrossPartition(xpart)) data.push_back(tip->data);
361 
362     return data;
363 
364 } // GetCrossPartitionGeneticData
365 
366 //------------------------------------------------------------------------------------
367 
GetPartitionGeneticData(force_type partname) const368 StringVec3d Locus::GetPartitionGeneticData(force_type partname) const
369 {
370     assert(partname == force_MIG || partname == force_DISEASE || partname == force_DIVMIG);
371 
372     StringVec3d data(registry.GetDataPack().GetNPartitionsByForceType(partname));
373     vector<TipData>::const_iterator tip = GetTipData().begin();
374     for( ; tip != GetTipData().end(); ++tip)
375         data[tip->GetPartition(partname)].push_back(tip->data);
376 
377     return data;
378 
379 } // GetPartitionGeneticData
380 
381 //------------------------------------------------------------------------------------
382 
GetMarkerDataWithLabels(const IndVec & individuals) const383 StringVec1d Locus::GetMarkerDataWithLabels(const IndVec& individuals) const
384 {
385     long int width = std::max(static_cast<long int>(GetName().size()), GetNmarkers()+5);
386     StringVec1d labeleddata(1,MakeCentered(GetName(),width));
387     vector<TipData>::const_iterator tip = GetTipData().begin();
388     if (tip->m_nodata)
389     {
390         //We need to iterate over the individuals instead
391         //LS DEBUG:  this is a pretty fragile check for this situation.
392         for (long int marker = 0; marker < m_nmarkers; marker++)
393         {
394             for (unsigned long int ind = 0; ind < individuals.size(); ind++)
395             {
396                 string label = MakeJustified(individuals[ind].GetName(), -9);
397                 string data = individuals[ind].GetMarkerDataFor(GetName(), marker);
398                 labeleddata.push_back(label + " " + data);
399             }
400             if (m_nmarkers > 1)
401             {
402                 labeleddata.push_back("");
403             }
404         }
405     }
406     else
407     {
408         for( ; tip != GetTipData().end(); ++tip)
409         {
410             string label = MakeJustified(tip->label,-9);
411             string data = tip->GetFormattedData(m_pDatatype->GetDelimiter());
412             labeleddata.push_back(label+ " " + data);
413         }
414     }
415     return labeleddata;
416 
417 } // GetMarkerDataWithLabels
418 
419 //------------------------------------------------------------------------------------
420 
CountNNucleotides() const421 DoubleVec1d Locus::CountNNucleotides() const
422 {
423     DoubleVec1d count(BASES,0L);
424 
425     if (!m_pDatatype->IsNucleotideData()) return count;
426 
427     vector<TipData>::const_iterator tip = GetTipData().begin();
428     for( ; tip != GetTipData().end(); ++tip)
429     {
430         StringVec1d data = tip->data;
431 
432         StringVec1d::const_iterator sit;
433         for (sit = data.begin(); sit != data.end(); ++sit)
434         {
435             // we can't use locus' inherent datamodel here because this code may
436             // be called in the menu, where the locus may not have a datamodel yet!
437             DoubleVec1d site = NucModel::StaticDataToLikes(*sit,GetPerBaseErrorRate());
438             double zero(0.0);
439             double total(accumulate(site.begin(),site.end(),zero));
440             transform(site.begin(),site.end(),site.begin(),
441                       bind2nd(divides<double>(),total));
442 
443             assert(site.size() == count.size());
444 
445             transform(count.begin(),count.end(),site.begin(),
446                       count.begin(),plus<double>());
447         }
448     }
449     return count;
450 } // CountNNucleotides
451 
452 //------------------------------------------------------------------------------------
453 
MultiSampleIndividuals() const454 bool Locus::MultiSampleIndividuals() const
455 {
456     set<long int> individuals; //a unique list
457     for (unsigned long int ind = 0; ind < m_tipdata.size(); ind++)
458     {
459         long int individual = m_tipdata[ind].individual;
460         if (individuals.find(individual) != individuals.end()) return true;
461         individuals.insert(individual);
462     }
463     return false;
464 }
465 
466 //------------------------------------------------------------------------------------
467 
GetNTips(tipidtype xpart) const468 long int Locus::GetNTips(tipidtype xpart) const
469 {
470     long int count = 0;
471     vector<TipData>::const_iterator tip = GetTipData().begin();
472     for( ; tip != GetTipData().end(); ++tip)
473         if (tip->IsInCrossPartition(xpart)) ++count;
474 
475     return count;
476 } // GetNTips(tipidtype xpart)
477 
478 //------------------------------------------------------------------------------------
479 
CalculateDataLikelihood(Tree & tree,bool moving) const480 double Locus::CalculateDataLikelihood(Tree& tree, bool moving) const
481 {
482     return m_pDLCalculator->Calculate(tree, *this, moving);
483 } // CalculateDataLikelihood
484 
485 //------------------------------------------------------------------------------------
486 
AddUniqueNamesTo(set<string> & popnames) const487 void Locus::AddUniqueNamesTo(set<string>& popnames) const
488 {
489     vector<TipData>::const_iterator tip = GetTipData().begin();
490     for( ; tip != GetTipData().end(); ++tip)
491         popnames.insert(tip->m_popname);  // only new names added because popnames is a std::set
492 
493 } // AddUniqueNamesTo
494 
495 //------------------------------------------------------------------------------------
496 
SetNewMappositionIfMoving()497 void Locus::SetNewMappositionIfMoving()
498 {
499     assert (m_simulate);
500     if (IsMoving())
501     {
502         //Pick a site to actually live
503         long int nsites = CountSites(m_allowedrange);
504         do {
505             m_truesite = registry.GetRandom().Long(nsites);
506         } while (m_allowedrange != AddPairToRange(make_pair(m_truesite, m_truesite+1), m_allowedrange));
507         SetRegionalMapposition(m_truesite);
508     }
509 }
510 
511 //------------------------------------------------------------------------------------
512 
SimulateData(Tree & tree,long int nsites)513 void Locus::SimulateData(Tree& tree, long int nsites)
514 {
515     assert(m_simulate);
516     ClearVariability();
517     long int ntries = 0;
518     while (IsNighInvariant() && ++ntries<5000)
519     {
520         m_pDLCalculator->SimulateData(tree, *this);
521     }
522     if (ntries >= 5000)
523     {
524         registry.GetRunReport().ReportNormal("Gave up trying to simulate non-invariant data for segment "
525                                              + GetName() + ".");
526     }
527 } // SimulateData
528 
529 //------------------------------------------------------------------------------------
530 
CopyDataFrom(Locus & original,Tree & tree)531 void Locus::CopyDataFrom(Locus& original, Tree& tree)
532 {
533     m_pDLCalculator->CopyDataFrom(*this, original, tree);
534 }
535 
536 //------------------------------------------------------------------------------------
537 
MakePhenotypesFor(IndVec & individuals)538 void Locus::MakePhenotypesFor(IndVec& individuals)
539 {
540     for (size_t ind = 0; ind<individuals.size(); ind++)
541     {
542         for (long int marker = 0; marker<m_nmarkers; marker++)
543         {
544             StringVec1d alleles = individuals[ind].GetAllelesFromDLs(m_index, marker, IsMoving(), m_pDatamodel);
545             //LS DEBUG
546             //cout << "Original haplotypes:  " << ToString(alleles) << endl;
547             if (m_phenotypes.AnyDefinedPhenotypes())
548             {
549                 individuals[ind].SetHaplotypes(GetName(), marker, m_phenotypes.ChooseHaplotypes(alleles));
550             }
551             else
552             {
553                 Haplotypes haps(individuals[ind].GetHaplotypesFor(GetName(),marker), true);
554                 haps.AddHaplotype(alleles, 1.0);
555                 individuals[ind].SetHaplotypes(GetName(), marker, haps);
556                 //LS DEBUG
557                 //cout << "Final haplotypes:  " << ToString(haps.GetAlleles()) << endl;
558                 // individuals[ind].PrintHaplotypesFor(GetName(), marker);
559             }
560         }
561     }
562 }
563 
564 //------------------------------------------------------------------------------------
565 
SaveState(DoubleVec1d & state,long int marker,string label)566 void Locus::SaveState(DoubleVec1d& state, long int marker, string label)
567 {
568     //First, copy over our own dlcell:
569     for (size_t tip = 0; tip < m_tipdata.size(); tip++)
570     {
571         if (m_tipdata[tip].label == label)
572         {
573             m_tipcells[tip].SetAllCategoriesTo(state, marker);
574         }
575     }
576 
577     //Now add it to the 'variability' list.
578     map<long int, DoubleVec1d>::iterator freqs = m_variability.find(marker);
579     if (freqs == m_variability.end())
580     {
581         m_variability.insert(make_pair(marker, state));
582         return;
583     }
584     //Not new, so add it to the old
585     std::transform(  state.begin(),
586                      state.end(),
587                      freqs->second.begin(),
588                      freqs->second.begin(),
589                      std::plus<double>());
590 }
591 
592 //------------------------------------------------------------------------------------
593 
RandomizeHalf(Tree & tree,bool swath)594 void Locus::RandomizeHalf(Tree& tree, bool swath)
595 {
596     m_unknownrange.clear();
597     //The idea here is that we take a random quarter-sized swath out of the first
598     // half, and and another quarter-sized swath out of the second half.
599 
600     if (swath)
601     {
602         //Pick a swath out of the first half, and another swath out of the
603         // second half.
604         long int quarter = static_cast<long int>(m_nsites/4);
605         long int left = registry.GetRandom().Long(quarter);
606         m_unknownrange.insert(make_pair(left, left+quarter+1));
607         left = registry.GetRandom().Long(quarter);
608         m_unknownrange.insert(make_pair(left+(2*quarter), left + (3*quarter) + 1));
609     }
610     else
611     {
612         //The 'every other site' version:
613         for (long int left = 0; left < m_nsites; left = left+2)
614         {
615             m_unknownrange.insert(make_pair(left, left+1));
616         }
617     }
618     m_pDLCalculator->Randomize(*this, m_unknownrange, tree);
619 
620     //Now we need to clear the variable sites out of the unknown range
621     m_variablerange = RemoveRangeFromRange(m_unknownrange, m_variablerange);
622 }
623 
624 //------------------------------------------------------------------------------------
625 
SiteInLocus(long int site) const626 bool Locus::SiteInLocus(long int site) const
627 {
628     // is the given site in this locus?
629     return (m_regionalmapposition <= site &&
630             site < m_regionalmapposition + m_nsites);
631 } // SiteInLocus
632 
633 //------------------------------------------------------------------------------------
634 
SiteToMarker(long int site) const635 long int Locus::SiteToMarker(long int site) const
636 {
637     assert(SiteInLocus(site));
638     for (long int i = 0; i < m_nmarkers; i++)
639     {
640         if (m_positions[i] == site)
641         {
642             return i;
643         }
644     }
645     return FLAGLONG;
646 }
647 
648 //------------------------------------------------------------------------------------
649 
GetSiteSpan() const650 pair<long int, long int> Locus::GetSiteSpan() const
651 {
652     return make_pair(m_regionalmapposition, m_regionalmapposition + m_nsites);
653 } // GetSiteSpan
654 
655 //------------------------------------------------------------------------------------
656 
GetGlobalScaleSiteSpan() const657 pair<long int, long int> Locus::GetGlobalScaleSiteSpan() const
658 {
659     return make_pair(m_globalmapposition, m_globalmapposition + m_nsites);
660 } // GetGlobalScaleSiteSpan
661 
662 //------------------------------------------------------------------------------------
663 
IsMovable() const664 bool Locus::IsMovable() const
665 {
666     //We might want to change this if split into phase 1/phase 2
667     return m_movable;
668 }
669 
670 //------------------------------------------------------------------------------------
671 
IsMoving() const672 bool Locus::IsMoving() const
673 {
674     switch (GetAnalysisType())
675     {
676         case mloc_mapjump:
677         case mloc_mapfloat:
678             return true;
679         case mloc_data:
680             return false;
681         case mloc_partition:
682             assert(false);
683             throw implementation_error("Shouldn't be checking this for a partition segment.");
684     }
685     throw implementation_error("Uncaught analysis type for segment.");
686 }
687 
688 //------------------------------------------------------------------------------------
689 
GetPerBaseErrorRate() const690 double Locus::GetPerBaseErrorRate() const
691 // EWFIX.CODESMELL -- horrible, horrible code smell
692 {
693     const DataModel_ptr p = GetDataModel();
694     if(p == NULL) return defaults::per_base_error_rate;
695 
696     DataModel * q = p->Clone();
697     NucModel * nm = dynamic_cast<NucModel*>(q);
698     if(nm != NULL) return nm->GetPerBaseErrorRate();
699 
700     return defaults::per_base_error_rate;
701 }
702 
703 //------------------------------------------------------------------------------------
704 
RemovePartitionFromTipDatas(force_type forcename)705 void Locus::RemovePartitionFromTipDatas(force_type forcename)
706 {
707     vector<TipData>::iterator tip = GetTipData().begin();
708     for( ; tip != GetTipData().end(); ++tip)
709         tip->RemovePartition(forcename);
710 
711 } // RemovePartitionFromTipDatas
712 
713 //------------------------------------------------------------------------------------
714 
IsValidLocus(string & errorString) const715 bool Locus::IsValidLocus(string& errorString) const
716 {
717     unsigned long int nmark = GetNmarkers();  // to avoid comparison warning
718     if (nmark == 0)
719     {
720         errorString = "No data in segment " + GetName();
721         return false;
722     }
723 
724     if (nmark != GetMarkerLocations().size())
725     {
726         errorString = "The number of markers doesn't match their positions in segment " + GetName();
727         return false;
728     }
729 
730     if (m_nsites < static_cast<long int>(nmark))
731     {
732         errorString = "Number of markers exceeds sequence length in segment " + GetName();
733         return false;
734     }
735 
736     vector<long int> sortedpositions = GetMarkerLocations();
737     std::sort(sortedpositions.begin(), sortedpositions.end());
738     if (sortedpositions != GetMarkerLocations())
739     {
740         errorString = "Positions out of order in segment " + GetName();
741         return false;
742     }
743 
744 #if 0 // Dead code
745     // There should no longer be a need to validate the data
746     // model since they are constructed to be valid and the
747     // participating members don't change.
748     if (m_pDatamodel.get() != NULL && !m_pDatamodel->IsValidDataModel())
749     {
750         errorString = "Invalid datamodel in segment " + m_name;
751         return false;
752     }
753 #endif // Dead code
754 
755     // We needn't validate datatype as it has no state.
756     return true;
757 
758 } // IsValidLocus
759 
760 //------------------------------------------------------------------------------------
761 
MakeOrderedSites(long int regoffset) const762 set<pair<double, long int> > Locus::MakeOrderedSites(long int regoffset) const
763 {
764     assert(m_map.size() > 0);
765 
766     set<pair<double, long int> > orderedsites;
767     for (unsigned long int site = 0; site < m_map.size(); site++)
768     {
769         long int place = site + regoffset;
770         orderedsites.insert(make_pair(m_map[site], place));
771     }
772     return orderedsites;
773 }
774 
775 //------------------------------------------------------------------------------------
776 
GetBestSites(set<pair<double,long int>> orderedsites) const777 rangeset Locus::GetBestSites(set<pair<double, long int> > orderedsites) const
778 {
779     rangeset bestsites;
780 
781     for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
782          sitepair != orderedsites.rend(); sitepair++)
783     {
784         rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
785         if (sitepair->first == orderedsites.rbegin()->first)
786         {
787             bestsites = AddPairToRange(thissite, bestsites);
788         }
789     }
790 
791     return bestsites;
792 }
793 
794 //------------------------------------------------------------------------------------
795 
GetTopSites(set<pair<double,long int>> orderedsites,double percLimit) const796 rangeset Locus::GetTopSites(set<pair<double, long int> > orderedsites, double percLimit) const
797 {
798     rangeset topsites;
799 
800     double total = 0;
801     for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
802          sitepair != orderedsites.rend(); sitepair++)
803     {
804         rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
805         if (total < percLimit)
806         {
807             topsites = AddPairToRange(thissite, topsites);
808         }
809         total += sitepair->first;
810         //We add afterwards so that the site that pushes us over the edge still gets
811         // included in the appropriate range.
812     }
813 
814     return topsites;
815 }
816 
817 //------------------------------------------------------------------------------------
818 
ReportMappingInfo(long int regoffset,bool isshort) const819 StringVec1d Locus::ReportMappingInfo(long int regoffset, bool isshort) const
820 {
821     set<pair<double, long int> > orderedsites = MakeOrderedSites(regoffset);
822 
823 
824     rangeset bestsites = GetBestSites(orderedsites);
825     rangeset topfivepercent = GetTopSites(orderedsites,0.05);
826     rangeset topfiftypercent = GetTopSites(orderedsites,0.5);
827     rangeset topninetyfivepercent = GetTopSites(orderedsites,0.95);
828 
829     // EWFIX -- 2010-09-02 -- don't remove until checked
830 #if 0
831     double total = 0;
832     for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
833          sitepair != orderedsites.rend(); sitepair++)
834     {
835         rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
836         if (sitepair->first == orderedsites.rbegin()->first)
837         {
838             bestsites = AddPairToRange(thissite, bestsites);
839         }
840         if (total < .05)
841         {
842             topfivepercent = AddPairToRange(thissite, topfivepercent);
843         }
844         if (total < .5)
845         {
846             topfiftypercent = AddPairToRange(thissite, topfiftypercent);
847         }
848         if (total < .95)
849         {
850             topninetyfivepercent = AddPairToRange(thissite, topninetyfivepercent);
851         }
852         total += sitepair->first;
853         //We add afterwards so that the site that pushes us over the edge still gets
854         // included in the appropriate range.
855     }
856 #endif
857 
858     StringVec1d report;
859     string msg = "Most likely site(s) for " + GetName() + ":  "
860         + ToString(bestsites) + ".  Relative data likelihood = "
861         + ToString(orderedsites.rbegin()->first);
862     report.push_back(msg);
863     if (isshort)
864     {
865         return report;
866     }
867     msg = "The top 5% of all sites in this region:  " + ToString(topfivepercent);
868     report.push_back(msg);
869     msg = "The top 50% of all sites in this region:  " + ToString(topfiftypercent);
870     report.push_back(msg);
871     msg = "The top 95% of all sites in this region:  " + ToString(topninetyfivepercent);
872     report.push_back(msg);
873     msg = "You have a total of " + ToString(CountSites(topninetyfivepercent)) +
874         " sites in your 95% range.";
875     report.push_back(msg);
876     if (m_truesite != FLAGLONG)
877     {
878         rangepair truesite = make_pair(m_truesite, m_truesite+1);
879         msg = "The true site (" + ToString(truesite) + ") ";
880         if (topninetyfivepercent == AddPairToRange(truesite, topninetyfivepercent))
881         {
882             msg += "was";
883         }
884         else
885         {
886             msg += "was not";
887         }
888         msg += " included in the top 95%.";
889         report.push_back(msg);
890 
891 #if 0  // LS DEBUG SIM  Hard-coded information output
892         rangeset unknownrange = registry.GetDataPack().GetRegion(0).GetLocus(0).GetUnknownRange();
893         msg = "This site was in the ";
894         if (unknownrange == AddPairToRange(truesite, unknownrange))
895         {
896             msg += "known";
897         }
898         else
899         {
900             msg += "unknown";
901         }
902         msg += " part of the segment.";
903         report.push_back(msg);
904 #endif
905     }
906     if (m_variability.size() > 0 && IsMoving())
907     {
908         report.push_back("Data variability:");
909         for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
910              marker != m_variability.end(); marker++)
911         {
912             for (unsigned long int allele = 0; allele < marker->second.size(); allele++)
913             {
914                 msg = "Position " + ToString(marker->first + 1) + ", "
915                     + "Allele " + ToString(allele+1) + ":  "
916                     + ToString(marker->second[allele]);
917                 report.push_back(msg);
918             }
919         }
920     }
921     return report;
922 }
923 
924 //------------------------------------------------------------------------------------
925 
IsDuplicateTipName(const string & newname) const926 bool Locus::IsDuplicateTipName(const string& newname) const
927 {
928     vector<TipData>::const_iterator tip;
929     for(tip = m_tipdata.begin(); tip != m_tipdata.end(); ++tip)
930         if (tip->label == newname) return true;
931 
932     return false;
933 }
934 
935 //------------------------------------------------------------------------------------
936 
MakeTraitXML(long int nspaces,long int regoffset) const937 StringVec1d Locus::MakeTraitXML(long int nspaces, long int regoffset) const
938 {
939     StringVec1d xmllines;
940     string line = MakeIndent(MakeTag(xmlstr::XML_TAG_TRAIT), nspaces);
941     xmllines.push_back(line);
942     nspaces += INDENT_DEPTH;
943 
944     line = MakeTag(xmlstr::XML_TAG_NAME) + " " + GetName() + " " +
945         MakeCloseTag(xmlstr::XML_TAG_NAME);
946     xmllines.push_back(MakeIndent(line, nspaces));
947 
948     line = MakeTag(xmlstr::XML_TAG_ANALYSIS) + " "
949         + ToXMLString(GetAnalysisType())
950         + " " + MakeCloseTag(xmlstr::XML_TAG_ANALYSIS);
951     xmllines.push_back(MakeIndent(line, nspaces));
952 
953     line = MakeTag(xmlstr::XML_TAG_POSSIBLE_LOCATIONS);
954     xmllines.push_back(MakeIndent(line, nspaces));
955 
956     nspaces += INDENT_DEPTH;
957     for (rangeset::iterator range = m_allowedrange.begin();
958          range != m_allowedrange.end(); range++)
959     {
960         long int start = (*range).first + regoffset;
961         long int end   = (*range).second + regoffset - 1;
962 
963         line = MakeTag(xmlstr::XML_TAG_RANGE);
964         xmllines.push_back(MakeIndent(line, nspaces));
965 
966         nspaces += INDENT_DEPTH;
967         line = MakeTag(xmlstr::XML_TAG_START) + " " + ToString(start)
968             + " " + MakeCloseTag(xmlstr::XML_TAG_START);
969         xmllines.push_back(MakeIndent(line, nspaces));
970         line = MakeTag(xmlstr::XML_TAG_END) + " " + ToString(end)
971             + " " + MakeCloseTag(xmlstr::XML_TAG_END);
972         xmllines.push_back(MakeIndent(line, nspaces));
973         nspaces -= INDENT_DEPTH;
974 
975         line = MakeCloseTag(xmlstr::XML_TAG_RANGE);
976         xmllines.push_back(MakeIndent(line, nspaces));
977     }
978     nspaces -= INDENT_DEPTH;
979     line = MakeCloseTag(xmlstr::XML_TAG_POSSIBLE_LOCATIONS);
980     xmllines.push_back(MakeIndent(line, nspaces));
981 
982     StringVec1d dlmodelxml(GetDataModel()->ToXML(nspaces));
983     xmllines.insert(xmllines.end(),dlmodelxml.begin(),dlmodelxml.end());
984 
985     StringVec1d phenotypexml(m_phenotypes.GetPhenotypesXML(nspaces));
986     xmllines.insert(xmllines.end(),phenotypexml.begin(),phenotypexml.end());
987 
988     nspaces -= INDENT_DEPTH;
989     line = MakeCloseTag(xmlstr::XML_TAG_TRAIT);
990     xmllines.push_back(MakeIndent(line, nspaces));
991 
992     return xmllines;
993 }
994 
995 //------------------------------------------------------------------------------------
996 //The general rule for IsNighInvariant is that if any marker is *not* 'nigh
997 // invariant' (false), the locus as a whole is also not.  If all markers are
998 // indeed 'nigh invariant' (true), so is the locus as a whole.
999 
IsNighInvariant() const1000 bool Locus::IsNighInvariant() const
1001 {
1002     if (m_variability.size() == 0)
1003     {
1004         return true;
1005     }
1006     for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
1007          marker != m_variability.end(); marker++)
1008     {
1009         if (!(IsNighInvariant(marker->first))) return false;
1010     }
1011     return true;
1012 }
1013 
1014 //------------------------------------------------------------------------------------
1015 // The rule for IsNighInvariant (well, in its current 11-28-05 form) is that
1016 //  if all of the data at this marker is one particular allele but two, the
1017 //  data is 'nigh invariant' (true).  If there are at least three alleles which
1018 //  are members of the non-majority allele, the data is not 'nigh invariant'
1019 //  (false).
1020 
IsNighInvariant(long int marker) const1021 bool Locus::IsNighInvariant(long int marker) const
1022 {
1023     unsigned long int ntips = m_tipcells.size();
1024 
1025     map<long int, DoubleVec1d>::const_iterator freqs = m_variability.find(marker);
1026     if (freqs == m_variability.end()) return false;
1027     double maxfreq = 0;
1028     for (unsigned long int allele = 0; allele < freqs->second.size(); allele++)
1029     {
1030         maxfreq = max(maxfreq, freqs->second[allele]);
1031     }
1032     long int mindiff = std::min(3L, static_cast<long int>(ntips) - 1);
1033     if (maxfreq <= ntips - mindiff)
1034     {
1035         return false;
1036     }
1037     return true;
1038 }
1039 
1040 //------------------------------------------------------------------------------------
1041 
IsCompletelyInvariant(long int marker) const1042 bool Locus::IsCompletelyInvariant(long int marker) const
1043 {
1044     unsigned long int ntips = m_tipcells.size();
1045 
1046     map<long int, DoubleVec1d>::const_iterator freqs = m_variability.find(marker);
1047     if (freqs == m_variability.end()) return false;
1048     for (unsigned long int allele = 0; allele < freqs->second.size(); allele++)
1049     {
1050         if (0 < freqs->second[allele] && freqs->second[allele] < ntips)
1051         {
1052             return false;
1053         }
1054     }
1055     return true;
1056 }
1057 
1058 //------------------------------------------------------------------------------------
1059 
ChooseVariableSiteFrom(rangeset rset)1060 long int Locus::ChooseVariableSiteFrom(rangeset rset)
1061 {
1062     LongVec1d variables;
1063     for (rangeset::iterator rit = rset.begin(); rit != rset.end(); rit++)
1064     {
1065         for (long int site = rit->first; site<rit->second; site++)
1066         {
1067             if (SiteInLocus(site))
1068             {
1069                 if (!IsNighInvariant(SiteToMarker(site)))
1070                 {
1071                     variables.push_back(site);
1072                 }
1073             }
1074         }
1075     }
1076     if (variables.size() == 0)
1077     {
1078         //No variable sites!
1079         return FLAGLONG;
1080     }
1081     return variables[registry.GetRandom().Long(variables.size())];
1082 }
1083 
1084 //------------------------------------------------------------------------------------
1085 
GetVariableRange()1086 rangeset Locus::GetVariableRange()
1087 {
1088     if (m_variablerange.size() == 0)
1089     {
1090         m_variablerange = CalculateVariableRange();
1091     }
1092     return m_variablerange;
1093 }
1094 
1095 //------------------------------------------------------------------------------------
1096 
CalculateVariableRange() const1097 rangeset Locus::CalculateVariableRange() const
1098 {
1099     rangeset variables;
1100     for (long int site = 0; site < m_nsites; site++)
1101     {
1102         if (!IsNighInvariant(SiteToMarker(site)))
1103         {
1104             variables = AddPairToRange(make_pair(site, site+1), variables);
1105         }
1106     }
1107     return variables;
1108 
1109 }
1110 
1111 //------------------------------------------------------------------------------------
1112 
CalculateCompleteVariableRange() const1113 rangeset Locus::CalculateCompleteVariableRange() const
1114 {
1115     rangeset variables;
1116     for (long int site = 0; site < m_nsites; site++)
1117     {
1118         if (!IsCompletelyInvariant(SiteToMarker(site)))
1119         {
1120             variables = AddPairToRange(make_pair(site, site+1), variables);
1121         }
1122     }
1123     return variables;
1124 }
1125 
1126 //------------------------------------------------------------------------------------
1127 
GetVariableAndUnknownRange() const1128 rangeset Locus::GetVariableAndUnknownRange() const
1129 {
1130     return Union(m_variablerange, m_unknownrange);
1131 }
1132 
1133 //------------------------------------------------------------------------------------
1134 
GetVariabilityOfUnknownRange() const1135 long int Locus::GetVariabilityOfUnknownRange() const
1136 {
1137     long int numvariable = 0;
1138     for (rangeset::iterator range = m_unknownrange.begin(); range != m_unknownrange.end(); range++)
1139     {
1140         for (long int site = range->first; site < range->second; site++)
1141         {
1142             if (!IsNighInvariant(SiteToMarker(site)))
1143             {
1144                 numvariable++;
1145             }
1146         }
1147     }
1148     return numvariable;
1149 }
1150 
1151 //------------------------------------------------------------------------------------
1152 
GetVariabilityOfKnownRange() const1153 long int Locus::GetVariabilityOfKnownRange() const
1154 {
1155     return CountSites(m_variablerange);
1156 }
1157 
1158 //------------------------------------------------------------------------------------
1159 
GetKnownRange() const1160 rangeset Locus::GetKnownRange() const
1161 {
1162     rangeset retset;
1163     retset.insert(make_pair(0, m_nsites));
1164     retset = RemoveRangeFromRange(m_unknownrange, retset);
1165     return retset;
1166 }
1167 
1168 //------------------------------------------------------------------------------------
1169 
CreateDataModelReport(string regionname) const1170 StringVec1d Locus::CreateDataModelReport(string regionname) const
1171 {
1172     StringVec1d out = m_pDatamodel->CreateDataModelReport();
1173     StringVec1d head;
1174     head.push_back("Parameters of a " + m_pDatamodel->GetDataModelName()
1175                    + " model for the " + GetName() + " segment of the "
1176                    + regionname + " region");
1177     head.insert(head.end(), out.begin(), out.end());
1178     return head;
1179 }
1180 
1181 //------------------------------------------------------------------------------------
1182 
WriteMapping(string regname,long int regoffset) const1183 void Locus::WriteMapping(string regname, long int regoffset) const
1184 {
1185 
1186     ofstream mapfile;
1187     mapfile.precision(10);
1188     UserParameters& userparams = registry.GetUserParameters();
1189     string fname = userparams.GetMapFilePrefix() + "_" + GetName() + ".txt";
1190     mapfile.open(fname.c_str(), ios::out );
1191     userparams.AddMapFileName(fname);
1192 
1193     mapfile << "Mapping results for " + GetName() + " from the region \"" + regname + "\".\n";
1194 
1195     switch (GetAnalysisType())
1196     {
1197         case mloc_mapjump:
1198             mapfile << "The analysis for this trait was performed by allowing the location of "
1199                 "the trait marker to move from place to place as trees were created.\n";
1200             break;
1201         case mloc_mapfloat:
1202             mapfile << "This analysis for this trait was performed by collecting trees, then calculating the "
1203                 "data likelihood of the trait marker at all allowed sites on those trees, and then averaging.\n";
1204             break;
1205         case mloc_data:
1206         case mloc_partition:
1207             assert(false); //These loci should not be in the moving locus vector.
1208             return;
1209     }
1210 
1211     StringVec1d mapinfo = ReportMappingInfo(regoffset);
1212     for (size_t i = 0; i < mapinfo.size(); i++)
1213     {
1214         mapfile << mapinfo[i] + "\n";
1215     }
1216     mapfile << "\nSite\tData likelihood\tRaw likelihood\n";
1217 
1218     DoubleVec1d results = GetMappingInfo();
1219     DoubleVec1d rawresults = GetRawMappingInfo();
1220     StringVec1d resultsStrings(results.size(), "-");
1221     StringVec1d rawresultsStrings(rawresults.size(), "-");
1222     rangeset validrange = GetAllowedRange();
1223     for (rangeset::iterator range=validrange.begin();
1224          range != validrange.end(); range++)
1225     {
1226         for (long int site = range->first; site<range->second; site++)
1227         {
1228             resultsStrings[site] = Pretty(results[site], 10);
1229             rawresultsStrings[site] = Pretty(rawresults[site], 10);
1230         }
1231     }
1232     for (size_t site = 0; site < results.size(); site += 1)
1233     {
1234         long int place = ToNoZeroesIfNeeded(static_cast<long int>(site) + regoffset);
1235         mapfile << ToString(place) + "\t" + resultsStrings[site] + "\t" + rawresultsStrings[site] + "\n";
1236     }
1237 }
1238 
1239 //------------------------------------------------------------------------------------
1240 // Debugging function.
1241 
PrintVariability()1242 void Locus::PrintVariability()
1243 {
1244     for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
1245          marker != m_variability.end();
1246          marker++)
1247     {
1248         string msg = "Position " + ToString(marker->first + 1) + ": ";
1249         for (unsigned long int allele = 0; allele < marker->second.size(); allele++)
1250         {
1251             msg += ToString(marker->second[allele]) + ", ";
1252         }
1253         cout << msg << endl;
1254     }
1255     cout << "Overall, the variable sites are:  " << ToString(GetVariableRange()) << endl;
1256 }
1257 
1258 //------------------------------------------------------------------------------------
1259 // Debugging function.
1260 
PrintOnesAndZeroesForVariableSites()1261 void Locus::PrintOnesAndZeroesForVariableSites()
1262 {
1263     rangeset variability = GetVariableRange();
1264     long int newl = 249;
1265     cout << "Variability: ";
1266     for (long int site = 0; site < m_nsites; site++)
1267     {
1268         if (variability == AddPairToRange(make_pair(site, site + 1), variability))
1269         {
1270             cout << "1 ";
1271         }
1272         else
1273         {
1274             cout << "0 ";
1275         }
1276         if (site == newl)
1277         {
1278             cout << endl << "Variability: ";
1279             newl += 250;
1280         }
1281     }
1282     cout << endl;
1283 }
1284 
1285 //____________________________________________________________________________________
1286