1 /*
2  * scrm is an implementation of the Sequential-Coalescent-with-Recombination Model.
3  *
4  * Copyright (C) 2013, 2014 Paul R. Staab, Sha (Joe) Zhu, Dirk Metzler and Gerton Lunter
5  *
6  * This file is part of scrm.
7  *
8  * scrm is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21 */
22 
23 /*!
24  * \file forest.h
25  * \brief Contains the class Forest, which is the central class in scrm
26  *
27  * The central data structure of scrm is a forest, which is a collection of
28  * trees. This class on the one hand contains a NodeContainer object with all
29  * nodes building the trees, and on the other hand functions to manipulate to
30  * forest.
31  *
32  * Most functions are defined in forest.cc with exception of pure debugging
33  * functions, with are in forest-debug.cc.
34  */
35 
36 #ifndef scrm_src_forest
37 #define scrm_src_forest
38 
39 #include <string>
40 
41 #include "macros.h" // Needs to be before cassert
42 
43 #include <vector>
44 #include <unordered_set>
45 #include <stdexcept>
46 #include <cassert>
47 #include <iostream> // ostreams
48 #include <iomanip>  // Used for debug output
49 #include <sstream>  // Used for debug output
50 
51 #include "contemporaries_container.h"
52 #include "event.h"
53 #include "model.h"
54 #include "macros.h"
55 #include "node.h"
56 #include "node_container.h"
57 #include "time_interval.h"
58 #include "tree_point.h"
59 #include "random/random_generator.h"
60 #include "summary_statistics/summary_statistic.h"
61 
62 class TimeInterval;
63 class TimeIntervalIterator;
64 enum eventCode { NOEVENT, EVENT, INIT_NULL};
65 
66 class Forest
67 {
68  public:
69   Node* readNewickNode( std::string &in_str, std::string::iterator &current_it, size_t parenthesis_balance = 0, Node* parent = NULL );
70   void readNewick(std::string &in_str);
contemporaries()71   ContemporariesContainer* contemporaries()  {return &this->contemporaries_;};
72 
73 #ifdef UNITTEST
74   friend class TestForest;
75   friend class TestNode;
76   friend class TestTimeInterval;
77   friend class TestModel;
78   friend class TestNodeContainer;
79 #endif
80   friend class TimeInterval;
81   friend class TimeIntervalIterator;
82 
83   Forest(Model *model, RandomGenerator *random_generator);
84   Forest(const Forest &current_forest);
~Forest()85   virtual ~Forest() {};
86 
87   //Getters & Setters
model()88   const Model &model() const { return *model_; }
set_model(Model * model)89   void set_model(Model* model) { this->model_ = model; }
writable_model()90   Model* writable_model() { return this->model_; };
91 
local_root()92   Node* local_root() const { return local_root_; }
set_local_root(Node * local_root)93   void set_local_root(Node* local_root) { local_root_ = local_root; };
94 
primary_root()95   Node* primary_root() const { return primary_root_; }
set_primary_root(Node * primary_root)96   void set_primary_root(Node* primary_root) { primary_root_ = primary_root; };
97 
sample_size()98   size_t sample_size() const {
99     if (sample_size_ == 0) return model().sample_size();
100     return this->sample_size_;
101   }
set_sample_size(const size_t size)102   void set_sample_size(const size_t size ) { sample_size_ = size; }
103 
segment_count()104   size_t segment_count() const { return current_rec_; }
105 
106   void sampleNextBase();
107 
108   /**
109    * @brief Returns the length of the sequence for with the current tree is
110    * valid
111    *
112    * @param finite_sites If 'true', the length is measured in number of bases
113    * (e.g. integer sequence positions) for which the tree is valid. Otherwise,
114    * the length is measured real valued number on a continuous chromosome.
115    *
116    * @return The length of the current segment (see above for its unit)
117    */
calcSegmentLength()118   double calcSegmentLength() const {
119     if (model().getSequenceScaling() == relative) {
120       return (next_base() - current_base()) / model().loci_length();
121     } else {
122       return ceil(next_base()) - ceil(current_base());
123     }
124   }
125 
set_random_generator(RandomGenerator * rg)126   void set_random_generator(RandomGenerator *rg) {
127     this->random_generator_ = rg;
128   }
random_generator()129   RandomGenerator* random_generator() const { return this->random_generator_; }
130 
getNodes()131   NodeContainer const *getNodes() const { return &nodes_; };
132 
133   // Central functions
134   void buildInitialTree();
135   void sampleNextGenealogy();
136   TreePoint samplePoint(Node* node = NULL, double length_left = -1) const;
137 
138   void clear();
139 
140   //Debugging Tools
141   void addNodeToTree(Node *node, Node *parent, Node *first_child, Node *second_child);
142   void createExampleTree();
143   void createScaledExampleTree();
144   bool checkLeafsOnLocalTree(Node const* node=NULL) const;
145   bool checkTree(Node const* root = NULL) const;
146   double calcTreeLength() const;
147   bool checkTreeLength() const;
148   bool checkInvariants(Node const* node = NULL) const;
149   bool checkNodeProperties() const;
150   bool checkContemporaries(const double time) const;
151   bool printNodes() const;
152   bool checkForNodeAtHeight(const double height) const;
153   bool checkRootIsRegistered(Node const* node) const;
154   bool checkRoots() const;
155 
156   //Debug Tree Printing
157   int countLinesLeft(Node const* node) const;
158   int countLinesRight(Node const* node) const;
159   int countBelowLinesLeft(Node const* node) const;
160   int countBelowLinesRight(Node const* node) const;
161   bool printTree() const;
162   std::vector<Node const*> determinePositions() const;
163   void printPositions(const std::vector<Node const*> &positions) const;
164 
nodes()165   NodeContainer *nodes() { return &(this->nodes_); }
166 
167   double getTMRCA(const bool &scaled = false) const {
168     if (scaled) return local_root()->height() / (4 * this->model_->default_pop_size());
169     else return local_root()->height();
170   }
171 
172   double getLocalTreeLength(const bool &scaled = false) const {
173     if (scaled) return local_root()->length_below() / (4 * this->model_->default_pop_size());
174     else return local_root()->length_below();
175   }
176 
177   //derived class from Forest
record_Recombevent(size_t pop_i,double opportunity,eventCode event_code)178   virtual void record_Recombevent(size_t pop_i,
179     double opportunity,
180     eventCode event_code){
181     (void)pop_i;
182     (void)opportunity;
183     (void)event_code;
184   }
record_all_event(TimeInterval const & ti)185   virtual void record_all_event( TimeInterval const &ti){
186     (void) ti;
187   }
188 
189   // Calc & Print Summary Statistics
190   void calcSegmentSumStats();
191   void clearSumStats();
192   void printLocusSumStats(std::ostream &output) const;
193   void printSegmentSumStats(std::ostream &output) const;
194 
get_rec_base(const size_t idx)195   double get_rec_base(const size_t idx) const {
196     return rec_bases_.at(idx);
197   }
198 
current_base()199   double current_base() const { return get_rec_base(current_rec_); }
next_base()200   double next_base() const { return get_rec_base(current_rec_ + 1); }
set_current_base(double const base)201   void set_current_base(double const base) { rec_bases_[current_rec_] = base; };
set_next_base(double const base)202   void set_next_base(double const base) { rec_bases_.push_back(base); };
203 
current_rec()204   size_t current_rec() const { return current_rec_; };
205 
coalescence_finished()206   bool coalescence_finished() const { return this->coalescence_finished_; }
207 
208  private:
Forest()209   Forest() { this->initialize(); }
210 
211   //Operations on the Tree
212   Node* cut(const TreePoint &cut_point);
213 
214   void updateAbove(Node* node,
215                    bool above_local_root = false,
216                    const bool &recursive = true,
217                    const bool &invariants_only = false);
218 
219   // Tools for doing coalescence & recombination
220   void sampleCoalescences(Node *start_node);
221   size_t getNodeState(Node const *node, const double current_time) const;
222   Node* updateBranchBelowEvent(Node* node, const TreePoint &event_point);
223   Node* possiblyMoveUpwards(Node* node, const TimeInterval &event);
224 
225   // Implementation of the different events
226   void implementNoEvent(const TimeInterval &ti, bool &coalescence_finished);
227   void implementCoalescence(const Event &event, TimeIntervalIterator &tii);
228   void implementPwCoalescence(Node* root_1, Node* root_2, const double time);
229   void implementRecombination(const Event &event, TimeIntervalIterator &tii);
230   void implementMigration(const Event &event, const bool &recalculate, TimeIntervalIterator &tii);
231   void implementFixedTimeEvent(TimeIntervalIterator &ti);
232 
233   // Pruning
nodeIsOld(Node const * node)234   bool nodeIsOld(Node const* node) const {
235     if ( node->local() ) return false;
236     if ( node->is_root() ) return false;
237     if ( model().has_window_rec() &&
238          segment_count() - node->last_update() > model().window_length_rec()) {
239       return true;
240     }
241     if ( model().has_window_seq() &&
242          current_base() - get_rec_base(node->last_update()) > model().window_length_seq()) {
243       return true;
244     }
245     return false;
246   }
247 
nodeIsActive(Node const * node)248   bool nodeIsActive(Node const* node) const {
249     return (node == active_node(0) || node == active_node(1));
250   }
251 
252   bool pruneNodeIfNeeded(Node* node, const bool prune_orphans = true);
253 
254   // Calculation of Rates
255   double calcCoalescenceRate(const size_t pop, const TimeInterval &ti) const;
256   double calcPwCoalescenceRate(const size_t pop, const TimeInterval &ti) const;
257   double calcRecombinationRate(Node const* node) const;
258   void calcRates(const TimeInterval &ti);
259 
260   void sampleEvent(const TimeInterval &ti, double &event_time, Event &return_event) const;
261 
262   void sampleEventType(const double time, const size_t time_line,
263                        const TimeInterval &ti, Event &return_event) const;
264 
265   void selectFirstTime(const double new_time, const size_t time_line,
266                        double &current_time, size_t &current_time_line) const;
267 
getTimeLineGrowth(const size_t time_line)268   double getTimeLineGrowth(const size_t time_line) const {
269     if (time_line == 0) return 0.0;
270     else if (time_line == 1) return model().growth_rate(active_node(0)->population());
271     else if (time_line == 2) return model().growth_rate(active_node(1)->population());
272     else throw std::out_of_range("Trying to get growthrate of unknown time line.");
273   }
274 
275 
276   // Private Members
277   NodeContainer nodes_;    // The nodes of the Tree/Forest
278 
279   // We have 3 different kind of roots that are important:
280   // local root: root of the smallest subtree containing all local sequences
281   Node* local_root_;
282 
283   // primary root: root of the tree that contains all local sequences
284   Node* primary_root_;
285 
286   // secondary roots: roots of trees that contain only non-local nodes
287   // std::unordered_set<Node*> secondary_roots_;
288 
289   size_t sample_size_;      // The number of sampled nodes (changes while building the initial tree)
290   size_t current_rec_;      // A counter for recombinations
291 
292   std::vector<double> rec_bases_; // Genetic positions of the recombinations
293 
294   Model* model_;
295   RandomGenerator* random_generator_;
296 
297   void initialize(Model *model = new Model(),
298                   RandomGenerator *rg = NULL);
299 
300   void createSampleNodes();
301 
302   // Temporarily used when doing the coalescence
303 
304   // Rates:
305   double rates_[3];
306 
307   // States: Each (branch above an) active node can either be in state
308   // - 0 = off (the other coalescence has not reached it yet) or
309   // - 1 = potentially coalescing in a time interval or
310   // - 2 = potentially recombining in a time interval
311   size_t states_[2];
312   Node* active_nodes_[2];
313   Event  tmp_event_;
314   double tmp_event_time_;
315   ContemporariesContainer contemporaries_;
316 
317   // These are pointers to the up to two active nodes during a coalescence
318   size_t active_nodes_timelines_[2];
active_node(size_t nr)319   Node* active_node(size_t nr) const { return active_nodes_[nr]; };
set_active_node(size_t nr,Node * node)320   void set_active_node(size_t nr, Node* node) { active_nodes_[nr] = node; };
321 
getEventNode()322   Node* getEventNode() const { return active_node(tmp_event_.active_node_nr()); }
getEventNodesState()323   size_t getEventNodesState() const { return states_[tmp_event_.active_node_nr()]; };
getOtherNode()324   Node* getOtherNode() const { return active_node(1-tmp_event_.active_node_nr()); };
getOtherNodesState()325   size_t getOtherNodesState() const { return states_[1-tmp_event_.active_node_nr()]; };
326 
327   bool coalescence_finished_;
328 };
329 
330 #endif
331