1 // $Id: locus.cpp,v 1.81 2012/01/05 01:28:42 ewalkup Exp $
2
3 /*
4 Copyright 2003 Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12 #include <functional> // for Locus::CountNNucleotides()
13 #include <fstream>
14
15 #include "local_build.h"
16
17 #include "constants.h"
18 #include "dlcalc.h"
19 #include "dlmodel.h"
20 #include "force.h" // for TipData::GetBranchPartitions()
21 #include "individual.h" // for use of Individuals in setting up Locus objects
22 #include "locus.h"
23 #include "mathx.h" // for IsEven
24 #include "rangex.h" // LS DEBUG SIM
25 #include "registry.h"
26 #include "runreport.h"
27 #include "stringx.h"
28 #include "xml_strings.h"
29
30 using namespace std;
31
32 //------------------------------------------------------------------------------------
33 //------------------------------------------------------------------------------------
34
Locus(long int ind,bool movable,string name)35 Locus::Locus(long int ind, bool movable, string name)
36 : m_index(ind),
37 m_name(name),
38 m_nmarkers(FLAGLONG),
39 m_nsites(FLAGLONG),
40 m_regionalmapposition(0),
41 m_globalmapposition(0),
42 m_offset(0),
43 m_movable(movable),
44 m_type(mloc_data),
45 m_defaultlocations(true),
46 m_positions(),
47 m_pDatatype(),
48 m_pDatamodel(),
49 m_tipdata(),
50 m_protoCell(),
51 m_pDLCalculator(),
52 m_tipcells(),
53 m_allowedrange(),
54 m_variablerange(),
55 m_unknownrange(),
56 m_map(),
57 m_simulate(false),
58 m_truesite(FLAGLONG),
59 m_variability(),
60 m_phenotypes(name)
61 {
62 if (movable)
63 {
64 //this is a different sort of locus:
65 SetNmarkers(1); //This requires alleles of 1 marker
66 SetNsites(1);
67 SetPositions();
68 }
69 } // Locus constructor
70
71 //------------------------------------------------------------------------------------
72
GetName() const73 string Locus::GetName() const
74 {
75 if (!m_name.empty()) return m_name;
76 string tempname("#");
77 tempname += ToString(GetIndex()+1);
78 return tempname;
79 } // GetName
80
81 //------------------------------------------------------------------------------------
82
GetNsites() const83 long int Locus::GetNsites() const
84 {
85 // if it has never been set, we assume it's m_nmarkers
86 if (m_nsites == FLAGLONG) return m_nmarkers;
87 return m_nsites;
88
89 } // GetNsites
90
91 //------------------------------------------------------------------------------------
92
GetOffset() const93 long int Locus::GetOffset() const
94 {
95 return m_offset;
96
97 } // GetOffset
98
99 //------------------------------------------------------------------------------------
100
GetUserMarkerLocations() const101 LongVec1d Locus::GetUserMarkerLocations() const
102 {
103 LongVec1d userpos(m_positions);
104 std::transform(userpos.begin(),userpos.end(),userpos.begin(),
105 std::bind2nd(std::plus<long int>(),m_offset-m_regionalmapposition));
106 return userpos;
107 } // GetUserMarkerLocations
108
109 //------------------------------------------------------------------------------------
110 // Once everything is ready, make this Locus into a fully functional one containing likelihood cells for its tips.
111
Setup(const IndVec & individuals)112 void Locus::Setup(const IndVec& individuals)
113 {
114 m_tipcells.clear();
115
116 m_protoCell = m_pDatatype->CreateDLCell(*this);
117 unsigned long int tip;
118 for (tip = 0; tip < m_tipdata.size(); ++tip)
119 {
120 if (m_tipdata[tip].m_nodata)
121 {
122 //The data is stored in the haplotypes, not this tip.
123 vector<LocusCell> cellsbymarkers;
124 //Find which individual codes for this tip.
125 for (unsigned long int ind = 0; ind < individuals.size(); ind++)
126 {
127 if (m_tipdata[tip].individual == individuals[ind].GetId())
128 {
129 for (long int marker = 0; marker < GetNmarkers(); marker++)
130 {
131 vector<LocusCell> cells = individuals[ind].GetLocusCellsFor(GetName(), marker);
132 assert(static_cast<long int>(cells.size()) > m_tipdata[tip].m_hap);
133 cellsbymarkers.push_back(cells[m_tipdata[tip].m_hap]);
134 }
135 continue;
136 }
137 }
138 LocusCell onecell(cellsbymarkers);
139 m_tipcells.push_back(onecell);
140 }
141 else
142 {
143 m_tipcells.push_back(m_pDatatype->CreateInitializedDLCell(*this, m_tipdata[tip].data));
144 }
145 }
146 m_pDLCalculator = DLCalc_ptr(m_pDatatype->CreateDLCalculator(*this));
147
148 } // Setup
149
150 //------------------------------------------------------------------------------------
151 // took away clone of src since it should be uniquely generated
152 // for each locus by Registry::InstallDataModels
153
SetDataModelOnce(DataModel_ptr src)154 void Locus::SetDataModelOnce(DataModel_ptr src)
155 {
156 if (src.get() != NULL) m_pDatamodel = src;
157 else m_pDatamodel.reset();
158
159 } // SetDataModel
160
161 //------------------------------------------------------------------------------------
162
SetAnalysisType(mloc_type type)163 void Locus::SetAnalysisType(mloc_type type)
164 {
165 m_type = type;
166 switch(type)
167 {
168 case mloc_mapjump:
169 case mloc_mapfloat:
170 m_movable = true;
171 break;
172 case mloc_data:
173 m_movable = false;
174 break;
175 case mloc_partition:
176 assert(false);
177 throw implementation_error("You shouldn't be able to set the analysis type to 'partition' yet.");
178 break;
179 //LS DEBUG MAPPING: We need to throw it away here or something.
180 }
181 }
182
183 //------------------------------------------------------------------------------------
184
SetAllowedRange(rangeset rs,long int regoffset)185 void Locus::SetAllowedRange(rangeset rs, long int regoffset)
186 {
187 //rs comes in here on the global scale.
188 rangeset offsetrs;
189 for (rangesetiter rpair = rs.begin(); rpair != rs.end(); rpair++)
190 {
191 long int low = rpair->first - regoffset;
192 long int high = rpair->second - regoffset;
193 offsetrs.insert(make_pair(low, high));
194 }
195
196 m_allowedrange = offsetrs;
197 }
198
199 //------------------------------------------------------------------------------------
200
SetEmptyTipData(vector<TipData> td)201 void Locus::SetEmptyTipData(vector<TipData> td)
202 {
203 for (unsigned long int tip = 0; tip < td.size(); tip++)
204 {
205 td[tip].data.clear();
206 td[tip].m_nodata = true;
207 }
208 m_tipdata = td;
209 }
210
211 //------------------------------------------------------------------------------------
212
SetVariableRange(rangeset rs)213 void Locus::SetVariableRange(rangeset rs)
214 {
215 m_variablerange = rs;
216 }
217
218 //------------------------------------------------------------------------------------
219
SetDataType(DataType_ptr src)220 void Locus::SetDataType(DataType_ptr src)
221 {
222 m_pDatatype = src; // shallow copy!
223 } // SetDataType
224
225 //------------------------------------------------------------------------------------
226
SetNmarkers(long int n)227 void Locus::SetNmarkers(long int n)
228 {
229 if (m_nmarkers == FLAGLONG)
230 {
231 m_nmarkers = n;
232 }
233 else
234 {
235 if (m_nmarkers != n)
236 {
237 data_error e("Inconsistent number of markers");
238 throw e;
239 }
240 }
241 } // SetNmarkers
242
243 //------------------------------------------------------------------------------------
244
SetGlobalMapposition(long int site)245 void Locus::SetGlobalMapposition(long int site)
246 {
247 #if 0 // Potentially DEAD CODE (bobgian, Feb 2010)
248 if (site == 0)
249 {
250 throw data_error("Assuming the biologist's convention of the nonexistence of site zero,"
251 " we assume that the position left of site 1 is site -1."
252 " As such, you may not set the map position of any segment to '0'.");
253 }
254 #endif
255 m_globalmapposition = site;
256 } // Setglobalmapposition
257
258 //------------------------------------------------------------------------------------
259
SetRegionalMapposition(long int site)260 void Locus::SetRegionalMapposition(long int site)
261 {
262 transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
263 std::bind2nd(std::minus<long int>(),m_regionalmapposition));
264
265 if(IsMoving())
266 {
267 long int movement = site - m_regionalmapposition;
268 SetGlobalMapposition(m_globalmapposition + movement);
269 }
270
271 m_regionalmapposition = site;
272 std::transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
273 std::bind2nd(std::plus<long int>(),m_regionalmapposition));
274
275 } // Setregionalmapposition
276
277 //------------------------------------------------------------------------------------
278
SetOffset(long int val)279 void Locus::SetOffset(long int val)
280 {
281 m_offset = val;
282 } // SetOffset
283
284 //------------------------------------------------------------------------------------
285
SetPositions()286 void Locus::SetPositions()
287 {
288 m_defaultlocations = true;
289 m_positions.clear();
290 m_positions.reserve(m_nmarkers); // for speed
291 long int i;
292 for (i = 0; i < m_nmarkers; ++i)
293 {
294 m_positions.push_back(i + m_regionalmapposition);
295 }
296
297 } // SetPositions
298
299 //------------------------------------------------------------------------------------
300
SetPositions(const LongVec1d & pos)301 void Locus::SetPositions(const LongVec1d& pos)
302 {
303 m_defaultlocations = false;
304 m_positions = pos;
305 std::transform(m_positions.begin(),m_positions.end(),m_positions.begin(),
306 std::bind2nd(std::plus<long int>(),m_regionalmapposition));
307
308 } // SetPositions
309
310 //------------------------------------------------------------------------------------
311
CalcNVariableMarkers() const312 LongVec1d Locus::CalcNVariableMarkers() const
313 {
314 LongVec1d nvarmarkers;
315
316 const DataPack& dpack = registry.GetDataPack();
317 long int xpart, nxparts = dpack.GetNCrossPartitions();
318
319 for (xpart = 0; xpart < nxparts; ++xpart)
320 {
321 nvarmarkers.push_back(CalcNVariableMarkers(dpack.GetTipId(xpart)));
322 }
323
324 return nvarmarkers;
325 } // CalcNVariableMarkers()
326
327 //------------------------------------------------------------------------------------
328
CalcNVariableMarkers(tipidtype xpart) const329 long int Locus::CalcNVariableMarkers(tipidtype xpart) const
330 {
331 long int nvarmarkers = 0;
332
333 const StringVec2d data = GetCrossPartitionGeneticData(xpart);
334 if (!data.empty()) nvarmarkers = GetDataTypePtr()->CalcNVarMarkers(data);
335
336 return nvarmarkers;
337
338 } // CalcNVariableMarkers(tipidtype xpart)
339
340 //------------------------------------------------------------------------------------
341
GetPopulationTipData(const string & popname) const342 vector<TipData> Locus::GetPopulationTipData(const string& popname) const
343 {
344 vector<TipData> popdata;
345 vector<TipData>::const_iterator tip;
346 for(tip = m_tipdata.begin(); tip != m_tipdata.end(); ++tip)
347 if (tip->IsInPopulation(popname)) popdata.push_back(*tip);
348
349 return popdata;
350
351 } // GetPopulationTipData
352
353 //------------------------------------------------------------------------------------
354
GetCrossPartitionGeneticData(tipidtype xpart) const355 StringVec2d Locus::GetCrossPartitionGeneticData(tipidtype xpart) const
356 {
357 StringVec2d data;
358 vector<TipData>::const_iterator tip = GetTipData().begin();
359 for( ; tip != GetTipData().end(); ++tip)
360 if (tip->IsInCrossPartition(xpart)) data.push_back(tip->data);
361
362 return data;
363
364 } // GetCrossPartitionGeneticData
365
366 //------------------------------------------------------------------------------------
367
GetPartitionGeneticData(force_type partname) const368 StringVec3d Locus::GetPartitionGeneticData(force_type partname) const
369 {
370 assert(partname == force_MIG || partname == force_DISEASE || partname == force_DIVMIG);
371
372 StringVec3d data(registry.GetDataPack().GetNPartitionsByForceType(partname));
373 vector<TipData>::const_iterator tip = GetTipData().begin();
374 for( ; tip != GetTipData().end(); ++tip)
375 data[tip->GetPartition(partname)].push_back(tip->data);
376
377 return data;
378
379 } // GetPartitionGeneticData
380
381 //------------------------------------------------------------------------------------
382
GetMarkerDataWithLabels(const IndVec & individuals) const383 StringVec1d Locus::GetMarkerDataWithLabels(const IndVec& individuals) const
384 {
385 long int width = std::max(static_cast<long int>(GetName().size()), GetNmarkers()+5);
386 StringVec1d labeleddata(1,MakeCentered(GetName(),width));
387 vector<TipData>::const_iterator tip = GetTipData().begin();
388 if (tip->m_nodata)
389 {
390 //We need to iterate over the individuals instead
391 //LS DEBUG: this is a pretty fragile check for this situation.
392 for (long int marker = 0; marker < m_nmarkers; marker++)
393 {
394 for (unsigned long int ind = 0; ind < individuals.size(); ind++)
395 {
396 string label = MakeJustified(individuals[ind].GetName(), -9);
397 string data = individuals[ind].GetMarkerDataFor(GetName(), marker);
398 labeleddata.push_back(label + " " + data);
399 }
400 if (m_nmarkers > 1)
401 {
402 labeleddata.push_back("");
403 }
404 }
405 }
406 else
407 {
408 for( ; tip != GetTipData().end(); ++tip)
409 {
410 string label = MakeJustified(tip->label,-9);
411 string data = tip->GetFormattedData(m_pDatatype->GetDelimiter());
412 labeleddata.push_back(label+ " " + data);
413 }
414 }
415 return labeleddata;
416
417 } // GetMarkerDataWithLabels
418
419 //------------------------------------------------------------------------------------
420
CountNNucleotides() const421 DoubleVec1d Locus::CountNNucleotides() const
422 {
423 DoubleVec1d count(BASES,0L);
424
425 if (!m_pDatatype->IsNucleotideData()) return count;
426
427 vector<TipData>::const_iterator tip = GetTipData().begin();
428 for( ; tip != GetTipData().end(); ++tip)
429 {
430 StringVec1d data = tip->data;
431
432 StringVec1d::const_iterator sit;
433 for (sit = data.begin(); sit != data.end(); ++sit)
434 {
435 // we can't use locus' inherent datamodel here because this code may
436 // be called in the menu, where the locus may not have a datamodel yet!
437 DoubleVec1d site = NucModel::StaticDataToLikes(*sit,GetPerBaseErrorRate());
438 double zero(0.0);
439 double total(accumulate(site.begin(),site.end(),zero));
440 transform(site.begin(),site.end(),site.begin(),
441 bind2nd(divides<double>(),total));
442
443 assert(site.size() == count.size());
444
445 transform(count.begin(),count.end(),site.begin(),
446 count.begin(),plus<double>());
447 }
448 }
449 return count;
450 } // CountNNucleotides
451
452 //------------------------------------------------------------------------------------
453
MultiSampleIndividuals() const454 bool Locus::MultiSampleIndividuals() const
455 {
456 set<long int> individuals; //a unique list
457 for (unsigned long int ind = 0; ind < m_tipdata.size(); ind++)
458 {
459 long int individual = m_tipdata[ind].individual;
460 if (individuals.find(individual) != individuals.end()) return true;
461 individuals.insert(individual);
462 }
463 return false;
464 }
465
466 //------------------------------------------------------------------------------------
467
GetNTips(tipidtype xpart) const468 long int Locus::GetNTips(tipidtype xpart) const
469 {
470 long int count = 0;
471 vector<TipData>::const_iterator tip = GetTipData().begin();
472 for( ; tip != GetTipData().end(); ++tip)
473 if (tip->IsInCrossPartition(xpart)) ++count;
474
475 return count;
476 } // GetNTips(tipidtype xpart)
477
478 //------------------------------------------------------------------------------------
479
CalculateDataLikelihood(Tree & tree,bool moving) const480 double Locus::CalculateDataLikelihood(Tree& tree, bool moving) const
481 {
482 return m_pDLCalculator->Calculate(tree, *this, moving);
483 } // CalculateDataLikelihood
484
485 //------------------------------------------------------------------------------------
486
AddUniqueNamesTo(set<string> & popnames) const487 void Locus::AddUniqueNamesTo(set<string>& popnames) const
488 {
489 vector<TipData>::const_iterator tip = GetTipData().begin();
490 for( ; tip != GetTipData().end(); ++tip)
491 popnames.insert(tip->m_popname); // only new names added because popnames is a std::set
492
493 } // AddUniqueNamesTo
494
495 //------------------------------------------------------------------------------------
496
SetNewMappositionIfMoving()497 void Locus::SetNewMappositionIfMoving()
498 {
499 assert (m_simulate);
500 if (IsMoving())
501 {
502 //Pick a site to actually live
503 long int nsites = CountSites(m_allowedrange);
504 do {
505 m_truesite = registry.GetRandom().Long(nsites);
506 } while (m_allowedrange != AddPairToRange(make_pair(m_truesite, m_truesite+1), m_allowedrange));
507 SetRegionalMapposition(m_truesite);
508 }
509 }
510
511 //------------------------------------------------------------------------------------
512
SimulateData(Tree & tree,long int nsites)513 void Locus::SimulateData(Tree& tree, long int nsites)
514 {
515 assert(m_simulate);
516 ClearVariability();
517 long int ntries = 0;
518 while (IsNighInvariant() && ++ntries<5000)
519 {
520 m_pDLCalculator->SimulateData(tree, *this);
521 }
522 if (ntries >= 5000)
523 {
524 registry.GetRunReport().ReportNormal("Gave up trying to simulate non-invariant data for segment "
525 + GetName() + ".");
526 }
527 } // SimulateData
528
529 //------------------------------------------------------------------------------------
530
CopyDataFrom(Locus & original,Tree & tree)531 void Locus::CopyDataFrom(Locus& original, Tree& tree)
532 {
533 m_pDLCalculator->CopyDataFrom(*this, original, tree);
534 }
535
536 //------------------------------------------------------------------------------------
537
MakePhenotypesFor(IndVec & individuals)538 void Locus::MakePhenotypesFor(IndVec& individuals)
539 {
540 for (size_t ind = 0; ind<individuals.size(); ind++)
541 {
542 for (long int marker = 0; marker<m_nmarkers; marker++)
543 {
544 StringVec1d alleles = individuals[ind].GetAllelesFromDLs(m_index, marker, IsMoving(), m_pDatamodel);
545 //LS DEBUG
546 //cout << "Original haplotypes: " << ToString(alleles) << endl;
547 if (m_phenotypes.AnyDefinedPhenotypes())
548 {
549 individuals[ind].SetHaplotypes(GetName(), marker, m_phenotypes.ChooseHaplotypes(alleles));
550 }
551 else
552 {
553 Haplotypes haps(individuals[ind].GetHaplotypesFor(GetName(),marker), true);
554 haps.AddHaplotype(alleles, 1.0);
555 individuals[ind].SetHaplotypes(GetName(), marker, haps);
556 //LS DEBUG
557 //cout << "Final haplotypes: " << ToString(haps.GetAlleles()) << endl;
558 // individuals[ind].PrintHaplotypesFor(GetName(), marker);
559 }
560 }
561 }
562 }
563
564 //------------------------------------------------------------------------------------
565
SaveState(DoubleVec1d & state,long int marker,string label)566 void Locus::SaveState(DoubleVec1d& state, long int marker, string label)
567 {
568 //First, copy over our own dlcell:
569 for (size_t tip = 0; tip < m_tipdata.size(); tip++)
570 {
571 if (m_tipdata[tip].label == label)
572 {
573 m_tipcells[tip].SetAllCategoriesTo(state, marker);
574 }
575 }
576
577 //Now add it to the 'variability' list.
578 map<long int, DoubleVec1d>::iterator freqs = m_variability.find(marker);
579 if (freqs == m_variability.end())
580 {
581 m_variability.insert(make_pair(marker, state));
582 return;
583 }
584 //Not new, so add it to the old
585 std::transform( state.begin(),
586 state.end(),
587 freqs->second.begin(),
588 freqs->second.begin(),
589 std::plus<double>());
590 }
591
592 //------------------------------------------------------------------------------------
593
RandomizeHalf(Tree & tree,bool swath)594 void Locus::RandomizeHalf(Tree& tree, bool swath)
595 {
596 m_unknownrange.clear();
597 //The idea here is that we take a random quarter-sized swath out of the first
598 // half, and and another quarter-sized swath out of the second half.
599
600 if (swath)
601 {
602 //Pick a swath out of the first half, and another swath out of the
603 // second half.
604 long int quarter = static_cast<long int>(m_nsites/4);
605 long int left = registry.GetRandom().Long(quarter);
606 m_unknownrange.insert(make_pair(left, left+quarter+1));
607 left = registry.GetRandom().Long(quarter);
608 m_unknownrange.insert(make_pair(left+(2*quarter), left + (3*quarter) + 1));
609 }
610 else
611 {
612 //The 'every other site' version:
613 for (long int left = 0; left < m_nsites; left = left+2)
614 {
615 m_unknownrange.insert(make_pair(left, left+1));
616 }
617 }
618 m_pDLCalculator->Randomize(*this, m_unknownrange, tree);
619
620 //Now we need to clear the variable sites out of the unknown range
621 m_variablerange = RemoveRangeFromRange(m_unknownrange, m_variablerange);
622 }
623
624 //------------------------------------------------------------------------------------
625
SiteInLocus(long int site) const626 bool Locus::SiteInLocus(long int site) const
627 {
628 // is the given site in this locus?
629 return (m_regionalmapposition <= site &&
630 site < m_regionalmapposition + m_nsites);
631 } // SiteInLocus
632
633 //------------------------------------------------------------------------------------
634
SiteToMarker(long int site) const635 long int Locus::SiteToMarker(long int site) const
636 {
637 assert(SiteInLocus(site));
638 for (long int i = 0; i < m_nmarkers; i++)
639 {
640 if (m_positions[i] == site)
641 {
642 return i;
643 }
644 }
645 return FLAGLONG;
646 }
647
648 //------------------------------------------------------------------------------------
649
GetSiteSpan() const650 pair<long int, long int> Locus::GetSiteSpan() const
651 {
652 return make_pair(m_regionalmapposition, m_regionalmapposition + m_nsites);
653 } // GetSiteSpan
654
655 //------------------------------------------------------------------------------------
656
GetGlobalScaleSiteSpan() const657 pair<long int, long int> Locus::GetGlobalScaleSiteSpan() const
658 {
659 return make_pair(m_globalmapposition, m_globalmapposition + m_nsites);
660 } // GetGlobalScaleSiteSpan
661
662 //------------------------------------------------------------------------------------
663
IsMovable() const664 bool Locus::IsMovable() const
665 {
666 //We might want to change this if split into phase 1/phase 2
667 return m_movable;
668 }
669
670 //------------------------------------------------------------------------------------
671
IsMoving() const672 bool Locus::IsMoving() const
673 {
674 switch (GetAnalysisType())
675 {
676 case mloc_mapjump:
677 case mloc_mapfloat:
678 return true;
679 case mloc_data:
680 return false;
681 case mloc_partition:
682 assert(false);
683 throw implementation_error("Shouldn't be checking this for a partition segment.");
684 }
685 throw implementation_error("Uncaught analysis type for segment.");
686 }
687
688 //------------------------------------------------------------------------------------
689
GetPerBaseErrorRate() const690 double Locus::GetPerBaseErrorRate() const
691 // EWFIX.CODESMELL -- horrible, horrible code smell
692 {
693 const DataModel_ptr p = GetDataModel();
694 if(p == NULL) return defaults::per_base_error_rate;
695
696 DataModel * q = p->Clone();
697 NucModel * nm = dynamic_cast<NucModel*>(q);
698 if(nm != NULL) return nm->GetPerBaseErrorRate();
699
700 return defaults::per_base_error_rate;
701 }
702
703 //------------------------------------------------------------------------------------
704
RemovePartitionFromTipDatas(force_type forcename)705 void Locus::RemovePartitionFromTipDatas(force_type forcename)
706 {
707 vector<TipData>::iterator tip = GetTipData().begin();
708 for( ; tip != GetTipData().end(); ++tip)
709 tip->RemovePartition(forcename);
710
711 } // RemovePartitionFromTipDatas
712
713 //------------------------------------------------------------------------------------
714
IsValidLocus(string & errorString) const715 bool Locus::IsValidLocus(string& errorString) const
716 {
717 unsigned long int nmark = GetNmarkers(); // to avoid comparison warning
718 if (nmark == 0)
719 {
720 errorString = "No data in segment " + GetName();
721 return false;
722 }
723
724 if (nmark != GetMarkerLocations().size())
725 {
726 errorString = "The number of markers doesn't match their positions in segment " + GetName();
727 return false;
728 }
729
730 if (m_nsites < static_cast<long int>(nmark))
731 {
732 errorString = "Number of markers exceeds sequence length in segment " + GetName();
733 return false;
734 }
735
736 vector<long int> sortedpositions = GetMarkerLocations();
737 std::sort(sortedpositions.begin(), sortedpositions.end());
738 if (sortedpositions != GetMarkerLocations())
739 {
740 errorString = "Positions out of order in segment " + GetName();
741 return false;
742 }
743
744 #if 0 // Dead code
745 // There should no longer be a need to validate the data
746 // model since they are constructed to be valid and the
747 // participating members don't change.
748 if (m_pDatamodel.get() != NULL && !m_pDatamodel->IsValidDataModel())
749 {
750 errorString = "Invalid datamodel in segment " + m_name;
751 return false;
752 }
753 #endif // Dead code
754
755 // We needn't validate datatype as it has no state.
756 return true;
757
758 } // IsValidLocus
759
760 //------------------------------------------------------------------------------------
761
MakeOrderedSites(long int regoffset) const762 set<pair<double, long int> > Locus::MakeOrderedSites(long int regoffset) const
763 {
764 assert(m_map.size() > 0);
765
766 set<pair<double, long int> > orderedsites;
767 for (unsigned long int site = 0; site < m_map.size(); site++)
768 {
769 long int place = site + regoffset;
770 orderedsites.insert(make_pair(m_map[site], place));
771 }
772 return orderedsites;
773 }
774
775 //------------------------------------------------------------------------------------
776
GetBestSites(set<pair<double,long int>> orderedsites) const777 rangeset Locus::GetBestSites(set<pair<double, long int> > orderedsites) const
778 {
779 rangeset bestsites;
780
781 for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
782 sitepair != orderedsites.rend(); sitepair++)
783 {
784 rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
785 if (sitepair->first == orderedsites.rbegin()->first)
786 {
787 bestsites = AddPairToRange(thissite, bestsites);
788 }
789 }
790
791 return bestsites;
792 }
793
794 //------------------------------------------------------------------------------------
795
GetTopSites(set<pair<double,long int>> orderedsites,double percLimit) const796 rangeset Locus::GetTopSites(set<pair<double, long int> > orderedsites, double percLimit) const
797 {
798 rangeset topsites;
799
800 double total = 0;
801 for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
802 sitepair != orderedsites.rend(); sitepair++)
803 {
804 rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
805 if (total < percLimit)
806 {
807 topsites = AddPairToRange(thissite, topsites);
808 }
809 total += sitepair->first;
810 //We add afterwards so that the site that pushes us over the edge still gets
811 // included in the appropriate range.
812 }
813
814 return topsites;
815 }
816
817 //------------------------------------------------------------------------------------
818
ReportMappingInfo(long int regoffset,bool isshort) const819 StringVec1d Locus::ReportMappingInfo(long int regoffset, bool isshort) const
820 {
821 set<pair<double, long int> > orderedsites = MakeOrderedSites(regoffset);
822
823
824 rangeset bestsites = GetBestSites(orderedsites);
825 rangeset topfivepercent = GetTopSites(orderedsites,0.05);
826 rangeset topfiftypercent = GetTopSites(orderedsites,0.5);
827 rangeset topninetyfivepercent = GetTopSites(orderedsites,0.95);
828
829 // EWFIX -- 2010-09-02 -- don't remove until checked
830 #if 0
831 double total = 0;
832 for (set<pair<double, long int> >::reverse_iterator sitepair=orderedsites.rbegin();
833 sitepair != orderedsites.rend(); sitepair++)
834 {
835 rangepair thissite = make_pair(sitepair->second, sitepair->second+1);
836 if (sitepair->first == orderedsites.rbegin()->first)
837 {
838 bestsites = AddPairToRange(thissite, bestsites);
839 }
840 if (total < .05)
841 {
842 topfivepercent = AddPairToRange(thissite, topfivepercent);
843 }
844 if (total < .5)
845 {
846 topfiftypercent = AddPairToRange(thissite, topfiftypercent);
847 }
848 if (total < .95)
849 {
850 topninetyfivepercent = AddPairToRange(thissite, topninetyfivepercent);
851 }
852 total += sitepair->first;
853 //We add afterwards so that the site that pushes us over the edge still gets
854 // included in the appropriate range.
855 }
856 #endif
857
858 StringVec1d report;
859 string msg = "Most likely site(s) for " + GetName() + ": "
860 + ToString(bestsites) + ". Relative data likelihood = "
861 + ToString(orderedsites.rbegin()->first);
862 report.push_back(msg);
863 if (isshort)
864 {
865 return report;
866 }
867 msg = "The top 5% of all sites in this region: " + ToString(topfivepercent);
868 report.push_back(msg);
869 msg = "The top 50% of all sites in this region: " + ToString(topfiftypercent);
870 report.push_back(msg);
871 msg = "The top 95% of all sites in this region: " + ToString(topninetyfivepercent);
872 report.push_back(msg);
873 msg = "You have a total of " + ToString(CountSites(topninetyfivepercent)) +
874 " sites in your 95% range.";
875 report.push_back(msg);
876 if (m_truesite != FLAGLONG)
877 {
878 rangepair truesite = make_pair(m_truesite, m_truesite+1);
879 msg = "The true site (" + ToString(truesite) + ") ";
880 if (topninetyfivepercent == AddPairToRange(truesite, topninetyfivepercent))
881 {
882 msg += "was";
883 }
884 else
885 {
886 msg += "was not";
887 }
888 msg += " included in the top 95%.";
889 report.push_back(msg);
890
891 #if 0 // LS DEBUG SIM Hard-coded information output
892 rangeset unknownrange = registry.GetDataPack().GetRegion(0).GetLocus(0).GetUnknownRange();
893 msg = "This site was in the ";
894 if (unknownrange == AddPairToRange(truesite, unknownrange))
895 {
896 msg += "known";
897 }
898 else
899 {
900 msg += "unknown";
901 }
902 msg += " part of the segment.";
903 report.push_back(msg);
904 #endif
905 }
906 if (m_variability.size() > 0 && IsMoving())
907 {
908 report.push_back("Data variability:");
909 for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
910 marker != m_variability.end(); marker++)
911 {
912 for (unsigned long int allele = 0; allele < marker->second.size(); allele++)
913 {
914 msg = "Position " + ToString(marker->first + 1) + ", "
915 + "Allele " + ToString(allele+1) + ": "
916 + ToString(marker->second[allele]);
917 report.push_back(msg);
918 }
919 }
920 }
921 return report;
922 }
923
924 //------------------------------------------------------------------------------------
925
IsDuplicateTipName(const string & newname) const926 bool Locus::IsDuplicateTipName(const string& newname) const
927 {
928 vector<TipData>::const_iterator tip;
929 for(tip = m_tipdata.begin(); tip != m_tipdata.end(); ++tip)
930 if (tip->label == newname) return true;
931
932 return false;
933 }
934
935 //------------------------------------------------------------------------------------
936
MakeTraitXML(long int nspaces,long int regoffset) const937 StringVec1d Locus::MakeTraitXML(long int nspaces, long int regoffset) const
938 {
939 StringVec1d xmllines;
940 string line = MakeIndent(MakeTag(xmlstr::XML_TAG_TRAIT), nspaces);
941 xmllines.push_back(line);
942 nspaces += INDENT_DEPTH;
943
944 line = MakeTag(xmlstr::XML_TAG_NAME) + " " + GetName() + " " +
945 MakeCloseTag(xmlstr::XML_TAG_NAME);
946 xmllines.push_back(MakeIndent(line, nspaces));
947
948 line = MakeTag(xmlstr::XML_TAG_ANALYSIS) + " "
949 + ToXMLString(GetAnalysisType())
950 + " " + MakeCloseTag(xmlstr::XML_TAG_ANALYSIS);
951 xmllines.push_back(MakeIndent(line, nspaces));
952
953 line = MakeTag(xmlstr::XML_TAG_POSSIBLE_LOCATIONS);
954 xmllines.push_back(MakeIndent(line, nspaces));
955
956 nspaces += INDENT_DEPTH;
957 for (rangeset::iterator range = m_allowedrange.begin();
958 range != m_allowedrange.end(); range++)
959 {
960 long int start = (*range).first + regoffset;
961 long int end = (*range).second + regoffset - 1;
962
963 line = MakeTag(xmlstr::XML_TAG_RANGE);
964 xmllines.push_back(MakeIndent(line, nspaces));
965
966 nspaces += INDENT_DEPTH;
967 line = MakeTag(xmlstr::XML_TAG_START) + " " + ToString(start)
968 + " " + MakeCloseTag(xmlstr::XML_TAG_START);
969 xmllines.push_back(MakeIndent(line, nspaces));
970 line = MakeTag(xmlstr::XML_TAG_END) + " " + ToString(end)
971 + " " + MakeCloseTag(xmlstr::XML_TAG_END);
972 xmllines.push_back(MakeIndent(line, nspaces));
973 nspaces -= INDENT_DEPTH;
974
975 line = MakeCloseTag(xmlstr::XML_TAG_RANGE);
976 xmllines.push_back(MakeIndent(line, nspaces));
977 }
978 nspaces -= INDENT_DEPTH;
979 line = MakeCloseTag(xmlstr::XML_TAG_POSSIBLE_LOCATIONS);
980 xmllines.push_back(MakeIndent(line, nspaces));
981
982 StringVec1d dlmodelxml(GetDataModel()->ToXML(nspaces));
983 xmllines.insert(xmllines.end(),dlmodelxml.begin(),dlmodelxml.end());
984
985 StringVec1d phenotypexml(m_phenotypes.GetPhenotypesXML(nspaces));
986 xmllines.insert(xmllines.end(),phenotypexml.begin(),phenotypexml.end());
987
988 nspaces -= INDENT_DEPTH;
989 line = MakeCloseTag(xmlstr::XML_TAG_TRAIT);
990 xmllines.push_back(MakeIndent(line, nspaces));
991
992 return xmllines;
993 }
994
995 //------------------------------------------------------------------------------------
996 //The general rule for IsNighInvariant is that if any marker is *not* 'nigh
997 // invariant' (false), the locus as a whole is also not. If all markers are
998 // indeed 'nigh invariant' (true), so is the locus as a whole.
999
IsNighInvariant() const1000 bool Locus::IsNighInvariant() const
1001 {
1002 if (m_variability.size() == 0)
1003 {
1004 return true;
1005 }
1006 for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
1007 marker != m_variability.end(); marker++)
1008 {
1009 if (!(IsNighInvariant(marker->first))) return false;
1010 }
1011 return true;
1012 }
1013
1014 //------------------------------------------------------------------------------------
1015 // The rule for IsNighInvariant (well, in its current 11-28-05 form) is that
1016 // if all of the data at this marker is one particular allele but two, the
1017 // data is 'nigh invariant' (true). If there are at least three alleles which
1018 // are members of the non-majority allele, the data is not 'nigh invariant'
1019 // (false).
1020
IsNighInvariant(long int marker) const1021 bool Locus::IsNighInvariant(long int marker) const
1022 {
1023 unsigned long int ntips = m_tipcells.size();
1024
1025 map<long int, DoubleVec1d>::const_iterator freqs = m_variability.find(marker);
1026 if (freqs == m_variability.end()) return false;
1027 double maxfreq = 0;
1028 for (unsigned long int allele = 0; allele < freqs->second.size(); allele++)
1029 {
1030 maxfreq = max(maxfreq, freqs->second[allele]);
1031 }
1032 long int mindiff = std::min(3L, static_cast<long int>(ntips) - 1);
1033 if (maxfreq <= ntips - mindiff)
1034 {
1035 return false;
1036 }
1037 return true;
1038 }
1039
1040 //------------------------------------------------------------------------------------
1041
IsCompletelyInvariant(long int marker) const1042 bool Locus::IsCompletelyInvariant(long int marker) const
1043 {
1044 unsigned long int ntips = m_tipcells.size();
1045
1046 map<long int, DoubleVec1d>::const_iterator freqs = m_variability.find(marker);
1047 if (freqs == m_variability.end()) return false;
1048 for (unsigned long int allele = 0; allele < freqs->second.size(); allele++)
1049 {
1050 if (0 < freqs->second[allele] && freqs->second[allele] < ntips)
1051 {
1052 return false;
1053 }
1054 }
1055 return true;
1056 }
1057
1058 //------------------------------------------------------------------------------------
1059
ChooseVariableSiteFrom(rangeset rset)1060 long int Locus::ChooseVariableSiteFrom(rangeset rset)
1061 {
1062 LongVec1d variables;
1063 for (rangeset::iterator rit = rset.begin(); rit != rset.end(); rit++)
1064 {
1065 for (long int site = rit->first; site<rit->second; site++)
1066 {
1067 if (SiteInLocus(site))
1068 {
1069 if (!IsNighInvariant(SiteToMarker(site)))
1070 {
1071 variables.push_back(site);
1072 }
1073 }
1074 }
1075 }
1076 if (variables.size() == 0)
1077 {
1078 //No variable sites!
1079 return FLAGLONG;
1080 }
1081 return variables[registry.GetRandom().Long(variables.size())];
1082 }
1083
1084 //------------------------------------------------------------------------------------
1085
GetVariableRange()1086 rangeset Locus::GetVariableRange()
1087 {
1088 if (m_variablerange.size() == 0)
1089 {
1090 m_variablerange = CalculateVariableRange();
1091 }
1092 return m_variablerange;
1093 }
1094
1095 //------------------------------------------------------------------------------------
1096
CalculateVariableRange() const1097 rangeset Locus::CalculateVariableRange() const
1098 {
1099 rangeset variables;
1100 for (long int site = 0; site < m_nsites; site++)
1101 {
1102 if (!IsNighInvariant(SiteToMarker(site)))
1103 {
1104 variables = AddPairToRange(make_pair(site, site+1), variables);
1105 }
1106 }
1107 return variables;
1108
1109 }
1110
1111 //------------------------------------------------------------------------------------
1112
CalculateCompleteVariableRange() const1113 rangeset Locus::CalculateCompleteVariableRange() const
1114 {
1115 rangeset variables;
1116 for (long int site = 0; site < m_nsites; site++)
1117 {
1118 if (!IsCompletelyInvariant(SiteToMarker(site)))
1119 {
1120 variables = AddPairToRange(make_pair(site, site+1), variables);
1121 }
1122 }
1123 return variables;
1124 }
1125
1126 //------------------------------------------------------------------------------------
1127
GetVariableAndUnknownRange() const1128 rangeset Locus::GetVariableAndUnknownRange() const
1129 {
1130 return Union(m_variablerange, m_unknownrange);
1131 }
1132
1133 //------------------------------------------------------------------------------------
1134
GetVariabilityOfUnknownRange() const1135 long int Locus::GetVariabilityOfUnknownRange() const
1136 {
1137 long int numvariable = 0;
1138 for (rangeset::iterator range = m_unknownrange.begin(); range != m_unknownrange.end(); range++)
1139 {
1140 for (long int site = range->first; site < range->second; site++)
1141 {
1142 if (!IsNighInvariant(SiteToMarker(site)))
1143 {
1144 numvariable++;
1145 }
1146 }
1147 }
1148 return numvariable;
1149 }
1150
1151 //------------------------------------------------------------------------------------
1152
GetVariabilityOfKnownRange() const1153 long int Locus::GetVariabilityOfKnownRange() const
1154 {
1155 return CountSites(m_variablerange);
1156 }
1157
1158 //------------------------------------------------------------------------------------
1159
GetKnownRange() const1160 rangeset Locus::GetKnownRange() const
1161 {
1162 rangeset retset;
1163 retset.insert(make_pair(0, m_nsites));
1164 retset = RemoveRangeFromRange(m_unknownrange, retset);
1165 return retset;
1166 }
1167
1168 //------------------------------------------------------------------------------------
1169
CreateDataModelReport(string regionname) const1170 StringVec1d Locus::CreateDataModelReport(string regionname) const
1171 {
1172 StringVec1d out = m_pDatamodel->CreateDataModelReport();
1173 StringVec1d head;
1174 head.push_back("Parameters of a " + m_pDatamodel->GetDataModelName()
1175 + " model for the " + GetName() + " segment of the "
1176 + regionname + " region");
1177 head.insert(head.end(), out.begin(), out.end());
1178 return head;
1179 }
1180
1181 //------------------------------------------------------------------------------------
1182
WriteMapping(string regname,long int regoffset) const1183 void Locus::WriteMapping(string regname, long int regoffset) const
1184 {
1185
1186 ofstream mapfile;
1187 mapfile.precision(10);
1188 UserParameters& userparams = registry.GetUserParameters();
1189 string fname = userparams.GetMapFilePrefix() + "_" + GetName() + ".txt";
1190 mapfile.open(fname.c_str(), ios::out );
1191 userparams.AddMapFileName(fname);
1192
1193 mapfile << "Mapping results for " + GetName() + " from the region \"" + regname + "\".\n";
1194
1195 switch (GetAnalysisType())
1196 {
1197 case mloc_mapjump:
1198 mapfile << "The analysis for this trait was performed by allowing the location of "
1199 "the trait marker to move from place to place as trees were created.\n";
1200 break;
1201 case mloc_mapfloat:
1202 mapfile << "This analysis for this trait was performed by collecting trees, then calculating the "
1203 "data likelihood of the trait marker at all allowed sites on those trees, and then averaging.\n";
1204 break;
1205 case mloc_data:
1206 case mloc_partition:
1207 assert(false); //These loci should not be in the moving locus vector.
1208 return;
1209 }
1210
1211 StringVec1d mapinfo = ReportMappingInfo(regoffset);
1212 for (size_t i = 0; i < mapinfo.size(); i++)
1213 {
1214 mapfile << mapinfo[i] + "\n";
1215 }
1216 mapfile << "\nSite\tData likelihood\tRaw likelihood\n";
1217
1218 DoubleVec1d results = GetMappingInfo();
1219 DoubleVec1d rawresults = GetRawMappingInfo();
1220 StringVec1d resultsStrings(results.size(), "-");
1221 StringVec1d rawresultsStrings(rawresults.size(), "-");
1222 rangeset validrange = GetAllowedRange();
1223 for (rangeset::iterator range=validrange.begin();
1224 range != validrange.end(); range++)
1225 {
1226 for (long int site = range->first; site<range->second; site++)
1227 {
1228 resultsStrings[site] = Pretty(results[site], 10);
1229 rawresultsStrings[site] = Pretty(rawresults[site], 10);
1230 }
1231 }
1232 for (size_t site = 0; site < results.size(); site += 1)
1233 {
1234 long int place = ToNoZeroesIfNeeded(static_cast<long int>(site) + regoffset);
1235 mapfile << ToString(place) + "\t" + resultsStrings[site] + "\t" + rawresultsStrings[site] + "\n";
1236 }
1237 }
1238
1239 //------------------------------------------------------------------------------------
1240 // Debugging function.
1241
PrintVariability()1242 void Locus::PrintVariability()
1243 {
1244 for (map<long int, DoubleVec1d>::const_iterator marker = m_variability.begin();
1245 marker != m_variability.end();
1246 marker++)
1247 {
1248 string msg = "Position " + ToString(marker->first + 1) + ": ";
1249 for (unsigned long int allele = 0; allele < marker->second.size(); allele++)
1250 {
1251 msg += ToString(marker->second[allele]) + ", ";
1252 }
1253 cout << msg << endl;
1254 }
1255 cout << "Overall, the variable sites are: " << ToString(GetVariableRange()) << endl;
1256 }
1257
1258 //------------------------------------------------------------------------------------
1259 // Debugging function.
1260
PrintOnesAndZeroesForVariableSites()1261 void Locus::PrintOnesAndZeroesForVariableSites()
1262 {
1263 rangeset variability = GetVariableRange();
1264 long int newl = 249;
1265 cout << "Variability: ";
1266 for (long int site = 0; site < m_nsites; site++)
1267 {
1268 if (variability == AddPairToRange(make_pair(site, site + 1), variability))
1269 {
1270 cout << "1 ";
1271 }
1272 else
1273 {
1274 cout << "0 ";
1275 }
1276 if (site == newl)
1277 {
1278 cout << endl << "Variability: ";
1279 newl += 250;
1280 }
1281 }
1282 cout << endl;
1283 }
1284
1285 //____________________________________________________________________________________
1286