1 /* 2 * phylotesting.h 3 * 4 * Created on: Aug 23, 2013 5 * Author: minh 6 */ 7 8 #ifndef PHYLOTESTING_H_ 9 #define PHYLOTESTING_H_ 10 11 #include "utils/tools.h" 12 #include "utils/checkpoint.h" 13 #include "nclextra/modelsblock.h" 14 #include "alignment/superalignment.h" 15 16 class PhyloTree; 17 class IQTree; 18 class ModelCheckpoint; 19 20 const int MF_SAMPLE_SIZE_TRIPLE = 1; 21 const int MF_IGNORED = 2; 22 const int MF_RUNNING = 4; 23 const int MF_WAITING = 8; 24 const int MF_DONE = 16; 25 26 /** 27 Candidate model under testing 28 */ 29 class CandidateModel { 30 31 public: 32 33 /** constructor */ 34 CandidateModel(int flag = 0) { 35 logl = 0.0; 36 df = 0; 37 tree_len = 0.0; 38 aln = NULL; 39 AIC_score = DBL_MAX; 40 AICc_score = DBL_MAX; 41 BIC_score = DBL_MAX; 42 this->flag = flag; 43 } 44 CandidateModel(flag)45 CandidateModel(string subst_name, string rate_name, Alignment *aln, int flag = 0) : CandidateModel(flag) { 46 this->subst_name = orig_subst_name = subst_name; 47 this->rate_name = orig_rate_name = rate_name; 48 this->aln = aln; 49 } 50 CandidateModel(flag)51 CandidateModel(Alignment *aln, int flag = 0) : CandidateModel(flag) { 52 this->aln = aln; 53 getUsualModel(aln); 54 } 55 getName()56 string getName() { 57 return subst_name + rate_name; 58 } 59 60 /** 61 get usual model for a given alignment 62 @param aln input alignment 63 @return length of the alignment 64 */ 65 size_t getUsualModel(Alignment *aln); 66 67 /** 68 evaluate this model 69 @param params program parameters 70 @param in_aln input alignment 71 @param[in] in_model_info input checkpointing information 72 @param[out] out_model_info output checkpointing information 73 @param models_block models block 74 @param num_thread number of threads 75 @param brlen_type BRLEN_OPTIMIZE | BRLEN_FIX | BRLEN_SCALE | TOPO_UNLINKED 76 @return tree string 77 */ 78 string evaluate(Params ¶ms, 79 ModelCheckpoint &in_model_info, ModelCheckpoint &out_model_info, 80 ModelsBlock *models_block, int &num_threads, int brlen_type); 81 82 /** 83 evaluate concatenated alignment 84 */ 85 string evaluateConcatenation(Params ¶ms, SuperAlignment *super_aln, 86 ModelCheckpoint &model_info, ModelsBlock *models_block, int num_threads); 87 88 /** 89 compute information criterion scores (AIC, AICc, BIC) 90 */ 91 void computeICScores(size_t sample_size); 92 void computeICScores(); 93 94 /** 95 compute information criterion scores (AIC, AICc, BIC) 96 */ 97 double computeICScore(size_t sample_size); 98 99 /** @return model score */ 100 double getScore(); 101 102 /** @return model score */ 103 double getScore(ModelTestCriterion mtc); 104 105 /** 106 save model into checkpoint 107 */ saveCheckpoint(Checkpoint * ckp)108 void saveCheckpoint(Checkpoint *ckp) { 109 stringstream ostr; 110 ostr.precision(10); 111 ostr << logl << " " << df << " " << tree_len; 112 if (!tree.empty()) 113 ostr << " " << tree; 114 ckp->put(getName(), ostr.str()); 115 } 116 117 /** 118 restore model from checkpoint 119 */ restoreCheckpoint(Checkpoint * ckp)120 bool restoreCheckpoint(Checkpoint *ckp) { 121 string val; 122 if (ckp->getString(getName(), val)) { 123 stringstream str(val); 124 str >> logl >> df >> tree_len; 125 return true; 126 } 127 return false; 128 } 129 130 /** 131 restore model from checkpoint 132 */ restoreCheckpointRminus1(Checkpoint * ckp,CandidateModel * model)133 bool restoreCheckpointRminus1(Checkpoint *ckp, CandidateModel *model) { 134 size_t posR; 135 const char *rates[] = {"+R", "*R", "+H", "*H"}; 136 for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) { 137 if ((posR = model->rate_name.find(rates[i])) != string::npos) { 138 int cat = convert_int(model->rate_name.substr(posR+2).c_str()); 139 subst_name = model->subst_name; 140 rate_name = model->rate_name.substr(0, posR+2) + convertIntToString(cat-1); 141 return restoreCheckpoint(ckp); 142 } 143 } 144 return false; 145 } 146 147 /** turn on some flag with OR operator */ setFlag(int flag)148 void setFlag(int flag) { 149 this->flag |= flag; 150 } 151 hasFlag(int flag)152 bool hasFlag(int flag) { 153 return (this->flag & flag) != 0; 154 } 155 156 string set_name; // subset name 157 string subst_name; // substitution matrix name 158 string orig_subst_name; // original substitution name 159 string rate_name; // rate heterogeneity name 160 string orig_rate_name; // original rate heterogeneity name 161 double logl; // tree log likelihood 162 int df; // #parameters 163 double tree_len; // tree length, added 2015-06-24 for rcluster algorithm 164 string tree; // added 2015-04-28: tree string 165 double AIC_score, AICc_score, BIC_score; // scores 166 double AIC_weight, AICc_weight, BIC_weight; // weights 167 bool AIC_conf, AICc_conf, BIC_conf; // in confidence set? 168 169 Alignment *aln; // associated alignment 170 171 protected: 172 173 /** flag */ 174 int flag; 175 }; 176 177 /** 178 set of candidate models 179 */ 180 class CandidateModelSet : public vector<CandidateModel> { 181 public: 182 CandidateModelSet()183 CandidateModelSet() : vector<CandidateModel>() { 184 current_model = -1; 185 } 186 187 /** get ID of the best model */ 188 int getBestModelID(ModelTestCriterion mtc); 189 190 /** 191 * get the list of model 192 * @param params program parameters 193 * @param aln alignment 194 * param separate_rate true to separate rates from models 195 * @param merge_phase true to consider models for merging phase 196 * @return maximum number of rate categories 197 */ 198 int generate(Params ¶ms, Alignment *aln, bool separate_rate, bool merge_phase); 199 200 /** 201 Filter out all "non-promissing" rate models 202 */ 203 void filterRates(int finished_model); 204 205 /** 206 Filter out all "non-promissing" substitution models 207 */ 208 void filterSubst(int finished_model); 209 210 /** 211 testing the best-fit model 212 return in params.freq_type and params.rate_type 213 @param params global program parameters 214 @param in_tree phylogenetic tree 215 @param model_info (IN/OUT) information for all models considered 216 @param models_block global model definition 217 @param num_threads number of threads 218 @param brlen_type BRLEN_OPTIMIZE | BRLEN_FIX | BRLEN_SCALE | TOPO_UNLINK 219 @param set_name for partition model selection 220 @param in_model_name a specific model name if testing one model 221 @param adjust model adjustment for modelomatic 222 @param merge_phase true to consider models for merging phase 223 @return name of best-fit-model 224 */ 225 CandidateModel test(Params ¶ms, PhyloTree* in_tree, ModelCheckpoint &model_info, 226 ModelsBlock *models_block, int num_threads, int brlen_type, 227 string set_name = "", string in_model_name = "", 228 bool merge_phase = false); 229 230 /** 231 for a rate model XXX+R[k], return XXX+R[k-j] that finished 232 @return the index of fewer category +R model that finished 233 */ getLowerKModel(int model)234 int getLowerKModel(int model) { 235 size_t posR; 236 const char *rates[] = {"+R", "*R", "+H", "*H"}; 237 for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) { 238 if ((posR = at(model).rate_name.find(rates[i])) == string::npos) 239 continue; 240 int cat = convert_int(at(model).rate_name.substr(posR+2).c_str()); 241 for (int prev_model = model-1; prev_model >= 0; prev_model--, cat--) { 242 string name = at(model).rate_name.substr(0, posR+2) + convertIntToString(cat-1); 243 if (at(prev_model).rate_name != name) 244 break; 245 if (!at(prev_model).hasFlag(MF_DONE)) 246 continue; 247 return prev_model; 248 } 249 } 250 return -1; 251 } 252 getHigherKModel(int model)253 int getHigherKModel(int model) { 254 size_t posR; 255 const char *rates[] = {"+R", "*R", "+H", "*H"}; 256 for (int i = 0; i < sizeof(rates)/sizeof(char*); i++) { 257 if ((posR = at(model).rate_name.find(rates[i])) == string::npos) 258 continue; 259 size_t this_posR = at(model).rate_name.find(rates[i]); 260 ASSERT(this_posR != string::npos); 261 int cat = convert_int(at(model).rate_name.substr(this_posR+2).c_str()); 262 for (int next_model = model+1; next_model < size(); next_model++, cat++) { 263 // if (at(next_model).name.substr(0, posR) != orig_name.substr(0, posR)) 264 // break; 265 string rate_name = at(model).rate_name.substr(posR, 2) + convertIntToString(cat+1); 266 if (at(next_model).rate_name.find(rate_name) == string::npos) 267 break; 268 return next_model; 269 } 270 } 271 return -1; 272 } 273 274 /** get the next model to evaluate in parallel */ 275 int64_t getNextModel(); 276 277 /** 278 evaluate all models in parallel 279 */ 280 CandidateModel evaluateAll(Params ¶ms, PhyloTree* in_tree, ModelCheckpoint &model_info, 281 ModelsBlock *models_block, int num_threads, int brlen_type, 282 string in_model_name = "", bool merge_phase = false, bool write_info = true); 283 284 private: 285 286 /** current model */ 287 int64_t current_model; 288 }; 289 290 //typedef vector<ModelInfo> ModelCheckpoint; 291 292 class ModelCheckpoint : public Checkpoint { 293 294 public: 295 296 /* 297 get the best model 298 @param[out] best_model name of the best model 299 @return TRUE if best model found, FALSE otherwise (unfinished job) 300 */ 301 bool getBestModel(string &best_model); 302 303 /* 304 get the best model list 305 @param[out] best_model_list list of the best model 306 @return TRUE if best model found, FALSE otherwise (unfinished job) 307 */ 308 bool getBestModelList(string &best_model_list); 309 310 /* 311 put the best model list 312 @param best_model_list list of the best model 313 @return TRUE if best model found, FALSE otherwise (unfinished job) 314 */ 315 void putBestModelList(string &best_model_list); 316 317 /* 318 get the ordered model list according to AIC, AICc or BIC 319 @param tree associated tree 320 @param[out] ordered_models list of models ordered by specified criterion 321 @return TRUE if ordered_models found, FALSE otherwise (unfinished job) 322 */ 323 bool getOrderedModels(PhyloTree *tree, CandidateModelSet &ordered_models); 324 325 /* 326 get the best tree 327 @param[out] best_tree NEWICK string of the best tree 328 @return TRUE if best tree found, FALSE otherwise (unfinished job) 329 */ 330 bool getBestTree(string &best_tree); 331 332 }; 333 334 /** 335 * computing AIC, AICc, and BIC scores 336 */ 337 void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC); 338 339 double computeInformationScore(double tree_lh, int df, int ssize, ModelTestCriterion mtc); 340 341 string criterionName(ModelTestCriterion mtc); 342 343 /** 344 perform ModelFinder to find the best-fit model 345 @param params program parameters 346 @param iqtree phylogenetic tree 347 @param model_info (IN/OUT) information for all models considered 348 */ 349 void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info); 350 351 /** 352 testing the best-fit model 353 return in params.freq_type and params.rate_type 354 @param set_name for partitioned analysis 355 @param in_tree phylogenetic tree 356 @param model_info (IN/OUT) information for all models considered 357 @param set_name for partition model selection 358 @param print_mem_usage true to print RAM memory used (default: false) 359 @return name of best-fit-model 360 */ 361 //string testModel(Params ¶ms, PhyloTree* in_tree, ModelCheckpoint &model_info, 362 // ModelsBlock *models_block, int num_threads, int brlen_type, 363 // string set_name = "", bool print_mem_usage = false, string in_model_name = ""); 364 365 366 /** 367 get sequence type for a model name 368 @param model_name model name string 369 @param seq_type (OUT) sequence type, SEQ_UNKNOWN if is not determined 370 @return 1 for parametric model, 2 for empirical model 371 */ 372 int detectSeqType(const char *model_name, SeqType &seq_type); 373 374 string detectSeqTypeName(string model_name); 375 376 377 #endif /* PHYLOTESTING_H_ */ 378