1 /*
2  * phylotesting.h
3  *
4  *  Created on: Aug 23, 2013
5  *      Author: minh
6  */
7 
8 #ifndef PHYLOTESTING_H_
9 #define PHYLOTESTING_H_
10 
11 #include "utils/tools.h"
12 #include "utils/checkpoint.h"
13 #include "nclextra/modelsblock.h"
14 #include "alignment/superalignment.h"
15 
16 class PhyloTree;
17 class IQTree;
18 class ModelCheckpoint;
19 
20 const int MF_SAMPLE_SIZE_TRIPLE = 1;
21 const int MF_IGNORED            = 2;
22 const int MF_RUNNING            = 4;
23 const int MF_WAITING            = 8;
24 const int MF_DONE               = 16;
25 
26 /**
27     Candidate model under testing
28  */
29 class CandidateModel {
30 
31 public:
32 
33     /** constructor */
34     CandidateModel(int flag = 0) {
35         logl = 0.0;
36         df = 0;
37         tree_len = 0.0;
38         aln = NULL;
39         AIC_score = DBL_MAX;
40         AICc_score = DBL_MAX;
41         BIC_score = DBL_MAX;
42         this->flag = flag;
43     }
44 
CandidateModel(flag)45     CandidateModel(string subst_name, string rate_name, Alignment *aln, int flag = 0) : CandidateModel(flag) {
46         this->subst_name = orig_subst_name = subst_name;
47         this->rate_name = orig_rate_name = rate_name;
48         this->aln = aln;
49     }
50 
CandidateModel(flag)51     CandidateModel(Alignment *aln, int flag = 0) : CandidateModel(flag) {
52         this->aln = aln;
53         getUsualModel(aln);
54     }
55 
getName()56     string getName() {
57         return subst_name + rate_name;
58     }
59 
60     /**
61      get usual model for a given alignment
62      @param aln input alignment
63      @return length of the alignment
64      */
65     size_t getUsualModel(Alignment *aln);
66 
67     /**
68      evaluate this model
69      @param params program parameters
70      @param in_aln input alignment
71      @param[in] in_model_info input checkpointing information
72      @param[out] out_model_info output checkpointing information
73      @param models_block models block
74      @param num_thread number of threads
75      @param brlen_type BRLEN_OPTIMIZE | BRLEN_FIX | BRLEN_SCALE | TOPO_UNLINKED
76      @return tree string
77      */
78     string evaluate(Params &params,
79                     ModelCheckpoint &in_model_info, ModelCheckpoint &out_model_info,
80                     ModelsBlock *models_block, int &num_threads, int brlen_type);
81 
82     /**
83      evaluate concatenated alignment
84      */
85     string evaluateConcatenation(Params &params, SuperAlignment *super_aln,
86                                  ModelCheckpoint &model_info, ModelsBlock *models_block, int num_threads);
87 
88     /**
89      compute information criterion scores (AIC, AICc, BIC)
90      */
91     void computeICScores(size_t sample_size);
92     void computeICScores();
93 
94     /**
95      compute information criterion scores (AIC, AICc, BIC)
96      */
97     double computeICScore(size_t sample_size);
98 
99     /** @return model score */
100     double getScore();
101 
102     /** @return model score */
103     double getScore(ModelTestCriterion mtc);
104 
105     /**
106      save model into checkpoint
107      */
saveCheckpoint(Checkpoint * ckp)108     void saveCheckpoint(Checkpoint *ckp) {
109         stringstream ostr;
110         ostr.precision(10);
111         ostr << logl << " " << df << " " << tree_len;
112         if (!tree.empty())
113             ostr << " " << tree;
114         ckp->put(getName(), ostr.str());
115     }
116 
117     /**
118      restore model from checkpoint
119      */
restoreCheckpoint(Checkpoint * ckp)120     bool restoreCheckpoint(Checkpoint *ckp) {
121         string val;
122         if (ckp->getString(getName(), val)) {
123             stringstream str(val);
124             str >> logl >> df >> tree_len;
125             return true;
126         }
127         return false;
128     }
129 
130     /**
131      restore model from checkpoint
132      */
restoreCheckpointRminus1(Checkpoint * ckp,CandidateModel * model)133     bool restoreCheckpointRminus1(Checkpoint *ckp, CandidateModel *model) {
134         size_t posR;
135         const char *rates[] = {"+R", "*R", "+H", "*H"};
136         for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) {
137             if ((posR = model->rate_name.find(rates[i])) != string::npos) {
138                 int cat = convert_int(model->rate_name.substr(posR+2).c_str());
139                 subst_name = model->subst_name;
140                 rate_name = model->rate_name.substr(0, posR+2) + convertIntToString(cat-1);
141                 return restoreCheckpoint(ckp);
142             }
143         }
144         return false;
145     }
146 
147     /** turn on some flag with OR operator */
setFlag(int flag)148     void setFlag(int flag) {
149         this->flag |= flag;
150     }
151 
hasFlag(int flag)152     bool hasFlag(int flag) {
153         return (this->flag & flag) != 0;
154     }
155 
156     string set_name; // subset name
157     string subst_name; // substitution matrix name
158     string orig_subst_name; // original substitution name
159     string rate_name; // rate heterogeneity name
160     string orig_rate_name; // original rate heterogeneity name
161     double logl; // tree log likelihood
162     int df;      // #parameters
163     double tree_len; // tree length, added 2015-06-24 for rcluster algorithm
164     string tree; // added 2015-04-28: tree string
165     double AIC_score, AICc_score, BIC_score;    // scores
166     double AIC_weight, AICc_weight, BIC_weight; // weights
167     bool AIC_conf, AICc_conf, BIC_conf;         // in confidence set?
168 
169     Alignment *aln; // associated alignment
170 
171 protected:
172 
173     /** flag */
174     int flag;
175 };
176 
177 /**
178  set of candidate models
179  */
180 class CandidateModelSet : public vector<CandidateModel> {
181 public:
182 
CandidateModelSet()183     CandidateModelSet() : vector<CandidateModel>() {
184         current_model = -1;
185     }
186 
187     /** get ID of the best model */
188     int getBestModelID(ModelTestCriterion mtc);
189 
190     /**
191      * get the list of model
192      * @param params program parameters
193      * @param aln alignment
194      * param separate_rate true to separate rates from models
195      * @param merge_phase true to consider models for merging phase
196      * @return maximum number of rate categories
197      */
198     int generate(Params &params, Alignment *aln, bool separate_rate, bool merge_phase);
199 
200     /**
201      Filter out all "non-promissing" rate models
202      */
203     void filterRates(int finished_model);
204 
205     /**
206      Filter out all "non-promissing" substitution models
207      */
208     void filterSubst(int finished_model);
209 
210     /**
211      testing the best-fit model
212      return in params.freq_type and params.rate_type
213      @param params global program parameters
214      @param in_tree phylogenetic tree
215      @param model_info (IN/OUT) information for all models considered
216      @param models_block global model definition
217      @param num_threads number of threads
218      @param brlen_type BRLEN_OPTIMIZE | BRLEN_FIX | BRLEN_SCALE | TOPO_UNLINK
219      @param set_name for partition model selection
220      @param in_model_name a specific model name if testing one model
221      @param adjust model adjustment for modelomatic
222      @param merge_phase true to consider models for merging phase
223      @return name of best-fit-model
224      */
225     CandidateModel test(Params &params, PhyloTree* in_tree, ModelCheckpoint &model_info,
226                 ModelsBlock *models_block, int num_threads, int brlen_type,
227                 string set_name = "", string in_model_name = "",
228                 bool merge_phase = false);
229 
230     /**
231      for a rate model XXX+R[k], return XXX+R[k-j] that finished
232      @return the index of fewer category +R model that finished
233      */
getLowerKModel(int model)234     int getLowerKModel(int model) {
235         size_t posR;
236         const char *rates[] = {"+R", "*R", "+H", "*H"};
237         for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) {
238             if ((posR = at(model).rate_name.find(rates[i])) == string::npos)
239                 continue;
240             int cat = convert_int(at(model).rate_name.substr(posR+2).c_str());
241             for (int prev_model = model-1; prev_model >= 0; prev_model--, cat--) {
242                 string name = at(model).rate_name.substr(0, posR+2) + convertIntToString(cat-1);
243                 if (at(prev_model).rate_name != name)
244                     break;
245                 if (!at(prev_model).hasFlag(MF_DONE))
246                     continue;
247                 return prev_model;
248             }
249         }
250         return -1;
251     }
252 
getHigherKModel(int model)253     int getHigherKModel(int model) {
254         size_t posR;
255         const char *rates[] = {"+R", "*R", "+H", "*H"};
256         for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) {
257             if ((posR = at(model).rate_name.find(rates[i])) == string::npos)
258                 continue;
259             size_t this_posR = at(model).rate_name.find(rates[i]);
260             ASSERT(this_posR != string::npos);
261             int cat = convert_int(at(model).rate_name.substr(this_posR+2).c_str());
262             for (int next_model = model+1; next_model < size(); next_model++, cat++) {
263 //                if (at(next_model).name.substr(0, posR) != orig_name.substr(0, posR))
264 //                    break;
265                 string rate_name = at(model).rate_name.substr(posR, 2) + convertIntToString(cat+1);
266                 if (at(next_model).rate_name.find(rate_name) == string::npos)
267                     break;
268                 return next_model;
269             }
270         }
271         return -1;
272     }
273 
274     /** get the next model to evaluate in parallel */
275     int64_t getNextModel();
276 
277     /**
278      evaluate all models in parallel
279      */
280     CandidateModel evaluateAll(Params &params, PhyloTree* in_tree, ModelCheckpoint &model_info,
281                      ModelsBlock *models_block, int num_threads, int brlen_type,
282                      string in_model_name = "", bool merge_phase = false, bool write_info = true);
283 
284 private:
285 
286     /** current model */
287     int64_t current_model;
288 };
289 
290 //typedef vector<ModelInfo> ModelCheckpoint;
291 
292 class ModelCheckpoint : public Checkpoint {
293 
294 public:
295 
296     /*
297         get the best model
298         @param[out] best_model name of the best model
299         @return TRUE if best model found, FALSE otherwise (unfinished job)
300     */
301     bool getBestModel(string &best_model);
302 
303     /*
304      get the best model list
305      @param[out] best_model_list list of the best model
306      @return TRUE if best model found, FALSE otherwise (unfinished job)
307      */
308     bool getBestModelList(string &best_model_list);
309 
310     /*
311      put the best model list
312      @param best_model_list list of the best model
313      @return TRUE if best model found, FALSE otherwise (unfinished job)
314      */
315     void putBestModelList(string &best_model_list);
316 
317     /*
318         get the ordered model list according to AIC, AICc or BIC
319         @param tree associated tree
320         @param[out] ordered_models list of models ordered by specified criterion
321         @return TRUE if ordered_models found, FALSE otherwise (unfinished job)
322     */
323     bool getOrderedModels(PhyloTree *tree, CandidateModelSet &ordered_models);
324 
325     /*
326         get the best tree
327         @param[out] best_tree NEWICK string of the best tree
328         @return TRUE if best tree found, FALSE otherwise (unfinished job)
329     */
330     bool getBestTree(string &best_tree);
331 
332 };
333 
334 /**
335  * computing AIC, AICc, and BIC scores
336  */
337 void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC);
338 
339 double computeInformationScore(double tree_lh, int df, int ssize, ModelTestCriterion mtc);
340 
341 string criterionName(ModelTestCriterion mtc);
342 
343 /**
344  perform ModelFinder to find the best-fit model
345  @param params program parameters
346  @param iqtree phylogenetic tree
347  @param model_info (IN/OUT) information for all models considered
348  */
349 void runModelFinder(Params &params, IQTree &iqtree, ModelCheckpoint &model_info);
350 
351 /**
352  testing the best-fit model
353  return in params.freq_type and params.rate_type
354  @param set_name for partitioned analysis
355  @param in_tree phylogenetic tree
356  @param model_info (IN/OUT) information for all models considered
357  @param set_name for partition model selection
358  @param print_mem_usage true to print RAM memory used (default: false)
359  @return name of best-fit-model
360  */
361 //string testModel(Params &params, PhyloTree* in_tree, ModelCheckpoint &model_info,
362 //		ModelsBlock *models_block, int num_threads, int brlen_type,
363 //        string set_name = "", bool print_mem_usage = false, string in_model_name = "");
364 
365 
366 /**
367     get sequence type for a model name
368     @param model_name model name string
369     @param seq_type (OUT) sequence type, SEQ_UNKNOWN if is not determined
370     @return 1 for parametric model, 2 for empirical model
371 */
372 int detectSeqType(const char *model_name, SeqType &seq_type);
373 
374 string detectSeqTypeName(string model_name);
375 
376 
377 #endif /* PHYLOTESTING_H_ */
378