1 /*
2 * phylotesting.cpp
3 * implementation of ModelFinder and PartitionFinder
4 * Created on: Aug 23, 2013
5 * Author: minh
6 */
7
8
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 #include <iqtree_config.h>
14 #include <numeric>
15 #include "tree/phylotree.h"
16 #include "tree/iqtree.h"
17 #include "tree/phylosupertree.h"
18 #include "tree/phylotreemixlen.h"
19 #include "phylotesting.h"
20
21 #include "model/modelmarkov.h"
22 #include "model/modeldna.h"
23 #include "nclextra/myreader.h"
24 #include "model/rateheterogeneity.h"
25 #include "model/rategamma.h"
26 #include "model/rateinvar.h"
27 #include "model/rategammainvar.h"
28 #include "model/ratefree.h"
29 #include "model/ratefreeinvar.h"
30 //#include "modeltest_wrapper.h"
31 #include "model/modelprotein.h"
32 #include "model/modelbin.h"
33 #include "model/modelcodon.h"
34 #include "model/modelmorphology.h"
35 #include "model/modelmixture.h"
36 #include "model/modelliemarkov.h"
37 #include "model/modelpomo.h"
38 #include "utils/timeutil.h"
39 #include "model/modelfactorymixlen.h"
40 #include "tree/phylosupertreeplen.h"
41 #include "tree/phylosupertreeunlinked.h"
42
43 #include "phyloanalysis.h"
44 #include "gsl/mygsl.h"
45 #include "utils/MPIHelper.h"
46 //#include "vectorclass/vectorclass.h"
47
48
49 /******* Binary model set ******/
50 const char* bin_model_names[] = {"GTR2", "JC2"};
51
52
53 /******* Morphological model set ******/
54 // 2018-08-20: don't test ORDERED model due to lots of numerical issues
55 //const char* morph_model_names[] = {"MK", "ORDERED"};
56 const char* morph_model_names[] = {"MK"};
57
58
59 /******* DNA model set ******/
60 const char* dna_model_names[] = {"GTR", "SYM", "TVM", "TVMe", "TIM3",
61 "TIM3e", "TIM2", "TIM2e", "TIM", "TIMe", "TPM3u", "TPM3",
62 "TPM2u", "TPM2", "K81u", "K81", "TN", "TNe", "HKY", "K80", "F81", "JC"};
63
64 /* DNA models supported by PhyML/PartitionFinder */
65 const char* dna_model_names_old[] ={"GTR", "SYM", "TVM", "TVMe", "TIM", "TIMe",
66 "K81u", "K81", "TN", "TNe", "HKY", "K80", "F81", "JC"};
67
68 /* DNA model supported by RAxML */
69 const char* dna_model_names_rax[] ={"GTR"};
70
71 /* DNA model supported by MrBayes */
72 const char *dna_model_names_mrbayes[] = {"GTR", "SYM", "HKY", "K80", "F81", "JC"};
73
74 /* DNA model supported by ModelOMatic */
75 const char *dna_model_names_modelomatic[] = {"GTR", "HKY", "K80", "F81", "JC"};
76
77 //const char* dna_freq_names[] = {"+FO"};
78
79 // Lie-Markov models without an RY, WS or MK prefix
80 const char *dna_model_names_lie_markov_fullsym[] =
81 {"1.1", "3.3a", "4.4a", "6.7a", "9.20b", "12.12"};
82 // Lie-Markov models with RY symmetry/distinguished pairing
83 const char *dna_model_names_lie_markov_ry[] = {
84 "RY2.2b", "RY3.3b", "RY3.3c", "RY3.4", "RY4.4b",
85 "RY4.5a", "RY4.5b", "RY5.6a", "RY5.6b", "RY5.7a",
86 "RY5.7b", "RY5.7c", "RY5.11a", "RY5.11b", "RY5.11c",
87 "RY5.16", "RY6.6", "RY6.7b", "RY6.8a", "RY6.8b",
88 "RY6.17a", "RY6.17b","RY8.8", "RY8.10a", "RY8.10b",
89 "RY8.16", "RY8.17", "RY8.18", "RY9.20a", "RY10.12",
90 "RY10.34"
91 };
92 // Lie-Markov models with WS symmetry/distinguished pairing
93 const char *dna_model_names_lie_markov_ws[] = {
94 "WS2.2b", "WS3.3b", "WS3.3c", "WS3.4", "WS4.4b",
95 "WS4.5a", "WS4.5b", "WS5.6a", "WS5.6b", "WS5.7a",
96 "WS5.7b", "WS5.7c", "WS5.11a", "WS5.11b", "WS5.11c",
97 "WS5.16", "WS6.6", "WS6.7b", "WS6.8a", "WS6.8b",
98 "WS6.17a", "WS6.17b","WS8.8", "WS8.10a", "WS8.10b",
99 "WS8.16", "WS8.17", "WS8.18", "WS9.20a", "WS10.12",
100 "WS10.34"
101 };
102 // Lie-Markov models with MK symmetry/distinguished pairing
103 const char *dna_model_names_lie_markov_mk[] = {
104 "MK2.2b", "MK3.3b", "MK3.3c", "MK3.4", "MK4.4b",
105 "MK4.5a", "MK4.5b", "MK5.6a", "MK5.6b", "MK5.7a",
106 "MK5.7b", "MK5.7c", "MK5.11a", "MK5.11b", "MK5.11c",
107 "MK5.16", "MK6.6", "MK6.7b", "MK6.8a", "MK6.8b",
108 "MK6.17a", "MK6.17b","MK8.8", "MK8.10a", "MK8.10b",
109 "MK8.16", "MK8.17", "MK8.18", "MK9.20a", "MK10.12",
110 "MK10.34"
111 };
112 // Lie-Markov models which are strand symmetric
113 const char *dna_model_names_lie_markov_strsym[] = {
114 "1.1", "WS2.2b", "3.3a", "WS3.3b", "WS3.3c", "WS3.4",
115 "WS4.4b", "WS4.5a", "WS4.5b", "WS5.6a", "WS6.6"
116 };
117
118
119 /****** Protein model set ******/
120 const char* aa_model_names[] = {"LG", "WAG", "JTT", "JTTDCMut", "DCMut", "VT", "PMB", "Blosum62", "Dayhoff",
121 "mtREV", "mtART", "mtZOA", "mtMet" , "mtVer" , "mtInv", "mtMAM",
122 "HIVb", "HIVw", "FLU", "rtREV", "cpREV"};
123
124 /* Protein models supported by PhyML/PartitionFinder */
125 const char *aa_model_names_phyml[] = {"LG", "WAG", "JTT", "DCMut", "VT", "Blosum62", "Dayhoff",
126 "mtREV", "mtART", "mtMAM",
127 "HIVb", "HIVw", "rtREV", "cpREV"};
128
129 /* Protein models supported by RAxML */
130 const char *aa_model_names_rax[] = {"LG", "WAG", "JTT", "JTTDCMut", "DCMut", "VT", "PMB", "Blosum62", "Dayhoff",
131 "mtREV", "mtART", "mtZOA", "mtMAM",
132 "HIVb", "HIVw", "FLU", "rtREV", "cpREV"};
133
134 const char* aa_model_names_mrbayes[] = {"WAG", "JTT", "VT", "Blosum62", "Dayhoff",
135 "mtREV", "mtMAM",
136 "rtREV", "cpREV"};
137
138 const char* aa_model_names_modelomatic[] = {"LG", "WAG", "JTT", "VT", "Blosum62", "Dayhoff",
139 "mtART", "mtMAM", "mtREV",
140 "HIVb", "HIVw", "rtREV", "cpREV"};
141
142 const char *aa_model_names_nuclear[] = {"LG", "WAG", "JTT", "JTTDCMut","DCMut", "VT", "PMB", "Blosum62", "Dayhoff"};
143
144 const char *aa_model_names_mitochondrial[] = {"mtREV", "mtART", "mtZOA", "mtMet" , "mtVer" , "mtInv", "mtMAM"};
145
146 const char *aa_model_names_chloroplast[] = {"cpREV"};
147
148 const char *aa_model_names_viral[] = {"HIVb", "HIVw", "FLU", "rtREV"};
149
150 const char* aa_freq_names[] = {"", "+F"};
151
152
153 /****** Codon models ******/
154 //const char *codon_model_names[] = {"GY", "MG", "MGK", "KOSI07", "SCHN05","KOSI07_GY1KTV","SCHN05_GY1KTV"};
155 //short int std_genetic_code[] = { 0, 0, 0, 1, 1, 1, 1};
156 const char *codon_model_names[] = { "GY", "MGK", "MG", "KOSI07", "SCHN05"};
157 short int std_genetic_code[] = { 0, 0, 0, 1, 1};
158 const char *codon_model_names_modelomatic[] = {"GY"};
159 short int std_genetic_code_modelomatic[] = { 0};
160
161 const char *codon_freq_names[] = {"+F3X4", "+F1X4", "+F", ""};
162
163 //const double TOL_LIKELIHOOD_MODELTEST = 0.1;
164 const double TOL_GRADIENT_MODELTEST = 0.0001;
165
166 extern double RunKMeans1D(int n, int k, double *points, int *weights, double *centers, int *assignments);
167
168
getSeqTypeName(SeqType seq_type)169 string getSeqTypeName(SeqType seq_type) {
170 switch (seq_type) {
171 case SEQ_BINARY: return "binary";
172 case SEQ_DNA: return "DNA";
173 case SEQ_PROTEIN: return "protein";
174 case SEQ_CODON: return "codon";
175 case SEQ_MORPH: return "morphological";
176 case SEQ_POMO: return "PoMo";
177 case SEQ_UNKNOWN: return "unknown";
178 case SEQ_MULTISTATE: return "MultiState";
179 }
180 }
181
getUsualModelSubst(SeqType seq_type)182 string getUsualModelSubst(SeqType seq_type) {
183 switch (seq_type) {
184 case SEQ_DNA: return dna_model_names[0];
185 case SEQ_PROTEIN: return aa_model_names[0];
186 case SEQ_CODON: return string(codon_model_names[0]) + codon_freq_names[0];
187 case SEQ_BINARY: return bin_model_names[0];
188 case SEQ_MORPH: return morph_model_names[0];
189 case SEQ_POMO: return string(dna_model_names[0]) + "+P";
190 default: ASSERT(0 && "Unprocessed seq_type"); return "";
191 }
192 }
193
194 void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites,
195 string rate_set, StrVector &ratehet);
196
getUsualModel(Alignment * aln)197 size_t CandidateModel::getUsualModel(Alignment *aln) {
198 size_t aln_len = 0;
199 if (aln->isSuperAlignment()) {
200 SuperAlignment *super_aln = (SuperAlignment*)aln;
201 for (auto it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
202 CandidateModel usual_model(*it);
203 if (!subst_name.empty())
204 subst_name += ',';
205 subst_name += usual_model.subst_name;
206 if (!rate_name.empty())
207 rate_name += ',';
208 rate_name += usual_model.rate_name;
209 aln_len += (*it)->getNSite();
210 }
211 } else {
212 subst_name = getUsualModelSubst(aln->seq_type);
213 StrVector ratehet;
214 getRateHet(aln->seq_type, Params::getInstance().model_name, aln->frac_invariant_sites, "1", ratehet);
215 ASSERT(!ratehet.empty());
216 rate_name = ratehet[0];
217 aln_len = aln->getNSite();
218 }
219 orig_subst_name = subst_name;
220 orig_rate_name = rate_name;
221 return aln_len;
222 }
223
computeICScores(size_t sample_size)224 void CandidateModel::computeICScores(size_t sample_size) {
225 computeInformationScores(logl, df, sample_size, AIC_score, AICc_score, BIC_score);
226 }
227
computeICScores()228 void CandidateModel::computeICScores() {
229 size_t sample_size = aln->getNSite();
230 if (aln->isSuperAlignment()) {
231 sample_size = 0;
232 SuperAlignment *super_aln = (SuperAlignment*)aln;
233 for (auto a : super_aln->partitions)
234 sample_size += a->getNSite();
235 }
236 if (hasFlag(MF_SAMPLE_SIZE_TRIPLE))
237 sample_size /= 3;
238 computeInformationScores(logl, df, sample_size, AIC_score, AICc_score, BIC_score);
239 }
240
computeICScore(size_t sample_size)241 double CandidateModel::computeICScore(size_t sample_size) {
242 return computeInformationScore(logl, df, sample_size, Params::getInstance().model_test_criterion);
243 }
244
getScore(ModelTestCriterion mtc)245 double CandidateModel::getScore(ModelTestCriterion mtc) {
246 switch (mtc) {
247 case MTC_AIC:
248 return AIC_score;
249 case MTC_AICC:
250 return AICc_score;
251 case MTC_BIC:
252 return BIC_score;
253 case MTC_ALL:
254 ASSERT(0 && "Unhandled case");
255 return 0.0;
256 }
257 }
258
getScore()259 double CandidateModel::getScore() {
260 return getScore(Params::getInstance().model_test_criterion);
261 }
262
getBestModelID(ModelTestCriterion mtc)263 int CandidateModelSet::getBestModelID(ModelTestCriterion mtc) {
264 double best_score = DBL_MAX;
265 int best_model = -1;
266 for (int model = 0; model < size(); model++)
267 if (at(model).hasFlag(MF_DONE) && best_score > at(model).getScore(mtc)) {
268 best_score = at(model).getScore(mtc);
269 best_model = model;
270 }
271 return best_model;
272 }
273
getBestModel(string & best_model)274 bool ModelCheckpoint::getBestModel(string &best_model) {
275 return getString("best_model_" + criterionName(Params::getInstance().model_test_criterion), best_model);
276 }
277
getBestModelList(string & best_model_list)278 bool ModelCheckpoint::getBestModelList(string &best_model_list) {
279 return getString("best_model_list_" + criterionName(Params::getInstance().model_test_criterion), best_model_list);
280 }
281
putBestModelList(string & best_model_list)282 void ModelCheckpoint::putBestModelList(string &best_model_list) {
283 return put("best_model_list_" + criterionName(Params::getInstance().model_test_criterion), best_model_list);
284 }
285
getBestTree(string & best_tree)286 bool ModelCheckpoint::getBestTree(string &best_tree) {
287 return getString("best_tree_" + criterionName(Params::getInstance().model_test_criterion), best_tree);
288 }
289
getOrderedModels(PhyloTree * tree,CandidateModelSet & ordered_models)290 bool ModelCheckpoint::getOrderedModels(PhyloTree *tree, CandidateModelSet &ordered_models) {
291 double best_score_AIC, best_score_AICc, best_score_BIC;
292 if (tree->isSuperTree()) {
293 PhyloSuperTree *stree = (PhyloSuperTree*)tree;
294 ordered_models.clear();
295 for (int part = 0; part != stree->size(); part++) {
296 startStruct(stree->at(part)->aln->name);
297 CandidateModel info;
298 if (!getBestModel(info.subst_name)) return false;
299 info.restoreCheckpoint(this);
300 info.computeICScores(stree->at(part)->getAlnNSite());
301 endStruct();
302 ordered_models.push_back(info);
303 }
304 return true;
305 } else {
306 CKP_RESTORE2(this, best_score_AIC);
307 CKP_RESTORE2(this, best_score_AICc);
308 CKP_RESTORE2(this, best_score_BIC);
309 double sum_AIC = 0, sum_AICc = 0, sum_BIC = 0;
310 string str;
311 bool ret = getBestModelList(str);
312 if (!ret) return false;
313 istringstream istr(str);
314 string model;
315 ordered_models.clear();
316 while (istr >> model) {
317 CandidateModel info;
318 info.subst_name = model;
319 info.restoreCheckpoint(this);
320 info.computeICScores(tree->getAlnNSite());
321 sum_AIC += info.AIC_weight = exp(-0.5*(info.AIC_score-best_score_AIC));
322 sum_AICc += info.AICc_weight = exp(-0.5*(info.AICc_score-best_score_AICc));
323 sum_BIC += info.BIC_weight = exp(-0.5*(info.BIC_score-best_score_BIC));
324 ordered_models.push_back(info);
325 }
326 sum_AIC = 1.0/sum_AIC;
327 sum_AICc = 1.0/sum_AICc;
328 sum_BIC = 1.0/sum_BIC;
329 for (auto it = ordered_models.begin(); it != ordered_models.end(); it++) {
330 it->AIC_weight *= sum_AIC;
331 it->AICc_weight *= sum_AICc;
332 it->BIC_weight *= sum_BIC;
333 it->AIC_conf = it->AIC_weight > 0.05;
334 it->AICc_conf = it->AICc_weight > 0.05;
335 it->BIC_conf = it->BIC_weight > 0.05;
336 }
337 return true;
338 }
339 }
340
341 /**
342 * copy from cvec to strvec
343 */
copyCString(const char ** cvec,int n,StrVector & strvec,bool touppercase=false)344 void copyCString(const char **cvec, int n, StrVector &strvec, bool touppercase = false) {
345 strvec.resize(n);
346 for (int i = 0; i < n; i++) {
347 strvec[i] = cvec[i];
348 if (touppercase)
349 std::transform(strvec[i].begin(), strvec[i].end(), strvec[i].begin(), ::toupper);
350 }
351 }
352
353 /**
354 * append from cvec to strvec
355 */
appendCString(const char ** cvec,int n,StrVector & strvec,bool touppercase=false)356 void appendCString(const char **cvec, int n, StrVector &strvec, bool touppercase = false) {
357 strvec.reserve(strvec.size()+n);
358 for (int i = 0; i < n; i++) {
359 strvec.push_back(cvec[i]);
360 if (touppercase)
361 std::transform(strvec.back().begin(), strvec.back().end(), strvec.back().begin(), ::toupper);
362 }
363 }
364
365
detectSeqType(const char * model_name,SeqType & seq_type)366 int detectSeqType(const char *model_name, SeqType &seq_type) {
367 bool empirical_model = false;
368 int i;
369 string model_str = model_name;
370 std::transform(model_str.begin(), model_str.end(), model_str.begin(), ::toupper);
371 StrVector model_list;
372
373 seq_type = SEQ_UNKNOWN;
374
375 copyCString(bin_model_names, sizeof(bin_model_names)/sizeof(char*), model_list, true);
376 for (i = 0; i < model_list.size(); i++)
377 if (model_str == model_list[i]) {
378 seq_type = SEQ_BINARY;
379 break;
380 }
381 copyCString(morph_model_names, sizeof(morph_model_names)/sizeof(char*), model_list, true);
382 for (i = 0; i < model_list.size(); i++)
383 if (model_str == model_list[i]) {
384 seq_type = SEQ_MORPH;
385 break;
386 }
387 copyCString(dna_model_names, sizeof(dna_model_names)/sizeof(char*), model_list, true);
388 for (i = 0; i < model_list.size(); i++)
389 if (model_str == model_list[i]) {
390 seq_type = SEQ_DNA;
391 break;
392 }
393 copyCString(aa_model_names, sizeof(aa_model_names)/sizeof(char*), model_list, true);
394 for (i = 0; i < model_list.size(); i++)
395 if (model_str == model_list[i]) {
396 seq_type = SEQ_PROTEIN;
397 empirical_model = true;
398 break;
399 }
400 copyCString(codon_model_names, sizeof(codon_model_names)/sizeof(char*), model_list, true);
401 for (i = 0; i < model_list.size(); i++)
402 if (model_str.substr(0,model_list[i].length()) == model_list[i]) {
403 seq_type = SEQ_CODON;
404 if (std_genetic_code[i]) empirical_model = true;
405 break;
406 }
407
408 return (empirical_model) ? 2 : 1;
409 }
410
detectSeqTypeName(string model_name)411 string detectSeqTypeName(string model_name) {
412 SeqType seq_type;
413 detectSeqType(model_name.c_str(), seq_type);
414 switch (seq_type) {
415 case SEQ_BINARY: return "BIN"; break;
416 case SEQ_MORPH: return "MORPH"; break;
417 case SEQ_DNA: return "DNA"; break;
418 case SEQ_PROTEIN: return "AA"; break;
419 case SEQ_CODON: return "CODON"; break;
420 default: break;
421 }
422 return "";
423 }
424
computeInformationScores(double tree_lh,int df,int ssize,double & AIC,double & AICc,double & BIC)425 void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC) {
426 AIC = -2 * tree_lh + 2 * df;
427 AICc = AIC + 2.0 * df * (df + 1) / max(ssize - df - 1, 1);
428 BIC = -2 * tree_lh + df * log(ssize);
429 }
430
computeInformationScore(double tree_lh,int df,int ssize,ModelTestCriterion mtc)431 double computeInformationScore(double tree_lh, int df, int ssize, ModelTestCriterion mtc) {
432 double AIC, AICc, BIC;
433 computeInformationScores(tree_lh, df, ssize, AIC, AICc, BIC);
434 if (mtc == MTC_AIC)
435 return AIC;
436 if (mtc == MTC_AICC)
437 return AICc;
438 if (mtc == MTC_BIC)
439 return BIC;
440 return 0.0;
441 }
442
criterionName(ModelTestCriterion mtc)443 string criterionName(ModelTestCriterion mtc) {
444 if (mtc == MTC_AIC)
445 return "AIC";
446 if (mtc == MTC_AICC)
447 return "AICc";
448 if (mtc == MTC_BIC)
449 return "BIC";
450 return "";
451 }
452
453
454 /**
455 * select models for all partitions
456 * @param[in,out] model_info (IN/OUT) all model information
457 * @return total number of parameters
458 */
459 void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, ModelCheckpoint &model_info,
460 ModelsBlock *models_block, int num_threads);
461
462
463 /**
464 compute log-adapter function according to Whelan et al. 2015
465 @param orig_aln original codon alignment
466 @param newaln AA alignment
467 @param[out] adjusted_df adjusted degree of freedom factor
468 @return adjusted log-likelihood factor
469 */
computeAdapter(Alignment * orig_aln,Alignment * newaln,int & adjusted_df)470 double computeAdapter(Alignment *orig_aln, Alignment *newaln, int &adjusted_df) {
471 int aa, codon;
472
473 // count codon occurences
474 unsigned int codon_counts[orig_aln->num_states];
475 orig_aln->computeAbsoluteStateFreq(codon_counts);
476
477 // compute AA frequency
478 // double aa_freq[newaln->num_states];
479 // newaln->computeStateFreq(aa_freq);
480
481 // compute codon frequency
482 double codon_freq[orig_aln->num_states];
483 //orig_aln->computeStateFreq(codon_freq);
484
485 double sum = 0.0;
486 for (codon = 0; codon < orig_aln->num_states; codon++)
487 sum += codon_counts[codon];
488 sum = 1.0/sum;
489 for (codon = 0; codon < orig_aln->num_states; codon++)
490 codon_freq[codon] = sum*codon_counts[codon];
491
492 // new rescale codon_freq s.t. codons coding for the same AA
493 // have f summing up to the frequency of this AA
494 for (aa = 0; aa < newaln->num_states; aa++) {
495 double sum = 0;
496 for (codon = 0; codon < orig_aln->num_states; codon++)
497 if (newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]]) == aa)
498 sum += codon_freq[codon];
499 sum = 1.0/sum;
500 for (codon = 0; codon < orig_aln->num_states; codon++)
501 if (newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]]) == aa)
502 codon_freq[codon] *= sum;
503 }
504
505 // now compute adapter function
506 double adapter = 0.0;
507 adjusted_df = 0;
508 vector<bool> has_AA;
509 has_AA.resize(newaln->num_states, false);
510
511 for (codon = 0; codon < orig_aln->num_states; codon++) {
512 if (codon_counts[codon] == 0)
513 continue;
514 has_AA[newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]])] = true;
515 adapter += codon_counts[codon]*log(codon_freq[codon]);
516 adjusted_df++;
517 }
518 for (aa = 0; aa < has_AA.size(); aa++)
519 if (has_AA[aa])
520 adjusted_df--;
521 return adapter;
522 }
523
524 /**
525 compute fast ML tree by stepwise addition MP + ML-NNI
526 @return the tree string
527 */
computeFastMLTree(Params & params,Alignment * aln,ModelCheckpoint & model_info,ModelsBlock * models_block,int & num_threads,int brlen_type,string dist_file)528 string computeFastMLTree(Params ¶ms, Alignment *aln,
529 ModelCheckpoint &model_info, ModelsBlock *models_block,
530 int &num_threads, int brlen_type, string dist_file) {
531 //string model_name;
532 CandidateModel usual_model(aln);
533 StrVector subst_names;
534 StrVector rate_names;
535 convert_string_vec(usual_model.subst_name.c_str(), subst_names);
536 convert_string_vec(usual_model.rate_name.c_str(), rate_names);
537 ASSERT(subst_names.size() == rate_names.size());
538 //set<string> model_set;
539
540 string concat_tree;
541
542 IQTree *iqtree = NULL;
543
544 StrVector saved_model_names;
545
546 if (aln->isSuperAlignment()) {
547 SuperAlignment *saln = (SuperAlignment*)aln;
548 if (params.partition_type == TOPO_UNLINKED)
549 iqtree = new PhyloSuperTreeUnlinked(saln);
550 else if (params.partition_type == BRLEN_OPTIMIZE)
551 iqtree = new PhyloSuperTree(saln);
552 else
553 iqtree = new PhyloSuperTreePlen(saln, brlen_type);
554 for (int part = 0; part != subst_names.size(); part++) {
555 saved_model_names.push_back(saln->partitions[part]->model_name);
556 saln->partitions[part]->model_name = subst_names[part] + rate_names[part];
557 }
558 } else if (posRateHeterotachy(rate_names[0]) != string::npos)
559 iqtree = new PhyloTreeMixlen(aln, 0);
560 else
561 iqtree = new IQTree(aln);
562
563 if ((params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) && !iqtree->isInitializedPLL()) {
564 /* Initialized all data structure for PLL*/
565 iqtree->initializePLL(params);
566 }
567
568 iqtree->setParams(¶ms);
569 iqtree->setLikelihoodKernel(params.SSE);
570 iqtree->optimize_by_newton = params.optimize_by_newton;
571 iqtree->setNumThreads(num_threads);
572 iqtree->setCheckpoint(&model_info);
573
574 iqtree->dist_file = dist_file;
575 iqtree->computeInitialTree(params.SSE);
576 iqtree->restoreCheckpoint();
577
578 //ASSERT(iqtree->root);
579 iqtree->initializeModel(params, usual_model.getName(), models_block);
580 if (!iqtree->getModel()->isMixture() || aln->seq_type == SEQ_POMO) {
581 usual_model.subst_name = iqtree->getSubstName();
582 usual_model.rate_name = iqtree->getRateName();
583 }
584
585 iqtree->getModelFactory()->restoreCheckpoint();
586
587 #ifdef _OPENMP
588 if (num_threads <= 0) {
589 num_threads = iqtree->testNumThreads();
590 omp_set_num_threads(num_threads);
591 } else
592 iqtree->warnNumThreads();
593 #endif
594
595 iqtree->initializeAllPartialLh();
596 double saved_modelEps = params.modelEps;
597 params.modelEps = params.modelfinder_eps;
598 string initTree;
599
600 double start_time = getRealTime();
601
602 cout << "Perform fast likelihood tree search using " << subst_names[0]+rate_names[0] << " model..." << endl;
603
604 if (iqtree->getCheckpoint()->getBool("finishedFastMLTree")) {
605 // model optimization already done: ignore this step
606 iqtree->setCurScore(iqtree->computeLikelihood());
607 initTree = iqtree->getTreeString();
608 cout << "CHECKPOINT: Tree restored, LogL: " << iqtree->getCurScore() << endl;
609 } else {
610 bool saved_opt_gammai = params.opt_gammai;
611 // disable thorough I+G optimization
612 params.opt_gammai = false;
613 initTree = iqtree->optimizeModelParameters(false, params.modelEps*50.0);
614 if (iqtree->isMixlen())
615 initTree = ((ModelFactoryMixlen*)iqtree->getModelFactory())->sortClassesByTreeLength();
616
617 // do quick NNI search
618 if (params.start_tree != STT_USER_TREE) {
619 cout << "Perform nearest neighbor interchange..." << endl;
620 iqtree->doNNISearch(true);
621 initTree = iqtree->getTreeString();
622 }
623 params.opt_gammai = saved_opt_gammai;
624
625 iqtree->saveCheckpoint();
626 iqtree->getModelFactory()->saveCheckpoint();
627 iqtree->getCheckpoint()->putBool("finishedFastMLTree", true);
628 iqtree->getCheckpoint()->dump();
629 // cout << "initTree: " << initTree << endl;
630 cout << "Time for fast ML tree search: " << getRealTime() - start_time << " seconds" << endl;
631 cout << endl;
632 }
633
634 // restore model epsilon
635 params.modelEps = saved_modelEps;
636
637 // save information to the checkpoint for later retrieval
638 if (iqtree->isSuperTree()) {
639 PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
640 int part = 0;
641 for (auto it = stree->begin(); it != stree->end(); it++, part++) {
642 model_info.startStruct((*it)->aln->name);
643 (*it)->saveCheckpoint();
644 (*it)->getModelFactory()->saveCheckpoint();
645 model_info.endStruct();
646 }
647 SuperAlignment *saln = (SuperAlignment*)aln;
648 // restore model_names
649 for (int i = 0; i < saln->partitions.size(); i++)
650 saln->partitions[i]->model_name = saved_model_names[i];
651 } else {
652 iqtree->saveCheckpoint();
653 iqtree->getModelFactory()->saveCheckpoint();
654 }
655
656
657 delete iqtree;
658 return initTree;
659 }
660
661 /**
662 Transfer parameters from ModelFinder into the a checkpoint to speed up later stage
663 */
transferModelFinderParameters(IQTree * iqtree,Checkpoint * target)664 void transferModelFinderParameters(IQTree *iqtree, Checkpoint *target) {
665
666 Checkpoint *source = iqtree->getCheckpoint();
667
668 // transfer the substitution model and site-rate parameters
669 if (iqtree->isSuperTree()) {
670 DoubleVector tree_lens;
671 string struct_name;
672 if (iqtree->params->partition_type == BRLEN_SCALE || iqtree->params->partition_type == BRLEN_FIX)
673 struct_name = "PartitionModelPlen";
674 else
675 struct_name = "PartitionModel";
676 target->startStruct(struct_name);
677 SuperAlignment *super_aln = (SuperAlignment*)iqtree->aln;
678 for (auto aln : super_aln->partitions) {
679 source->transferSubCheckpoint(target, aln->name + CKP_SEP + "Model");
680 source->transferSubCheckpoint(target, aln->name + CKP_SEP + "Rate");
681
682 // transfer partition rates
683 if (iqtree->params->partition_type == BRLEN_SCALE) {
684 source->startStruct(aln->name);
685 CandidateModel info;
686 info.subst_name = aln->model_name;
687 if (info.restoreCheckpoint(source))
688 tree_lens.push_back(info.tree_len);
689 else
690 ASSERT(0 && "Could not restore tree_len");
691 source->endStruct();
692 }
693 }
694
695 if (iqtree->params->partition_type == BRLEN_SCALE) {
696 // now normalize the rates
697 PhyloSuperTree *tree = (PhyloSuperTree*)iqtree;
698 double sum = 0.0;
699 size_t nsite = 0;
700 int i;
701 for (i = 0; i < tree->size(); i++) {
702 sum += tree_lens[i] * tree->at(i)->aln->getNSite();
703 if (tree->at(i)->aln->seq_type == SEQ_CODON && tree->rescale_codon_brlen)
704 nsite += 3*tree->at(i)->aln->getNSite();
705 else
706 nsite += tree->at(i)->aln->getNSite();
707 }
708
709 sum /= nsite;
710 iqtree->restoreCheckpoint();
711 iqtree->scaleLength(sum/iqtree->treeLength());
712 iqtree->saveCheckpoint();
713 sum = 1.0/sum;
714 for (i = 0; i < tree->size(); i++)
715 tree_lens[i] *= sum;
716 target->putVector("part_rates", tree_lens);
717 }
718 target->endStruct();
719 } else {
720 source->transferSubCheckpoint(target, "Model");
721 source->transferSubCheckpoint(target, "Rate");
722 }
723
724 // transfer tree
725 source->transferSubCheckpoint(target, "PhyloTree");
726 }
727
runModelFinder(Params & params,IQTree & iqtree,ModelCheckpoint & model_info)728 void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info)
729 {
730 // iqtree.setCurScore(-DBL_MAX);
731 bool test_only = (params.model_name.find("ONLY") != string::npos) ||
732 (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP");
733
734 bool empty_model_found = params.model_name.empty() && !iqtree.isSuperTree();
735
736 if (params.model_name.empty() && iqtree.isSuperTree()) {
737 // check whether any partition has empty model_name
738 PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
739 for (auto i = stree->begin(); i != stree->end(); i++)
740 if ((*i)->aln->model_name.empty()) {
741 empty_model_found = true;
742 break;
743 }
744 }
745
746 if (params.model_joint)
747 empty_model_found = false;
748
749 // Model already specifed, nothing to do here
750 if (!empty_model_found && params.model_name.substr(0, 4) != "TEST" && params.model_name.substr(0, 2) != "MF")
751 return;
752 if (MPIHelper::getInstance().getNumProcesses() > 1)
753 outError("Please use only 1 MPI process! We are currently working on the MPI parallelization of model selection.");
754 // TODO: check if necessary
755 // if (iqtree.isSuperTree())
756 // ((PhyloSuperTree*) &iqtree)->mapTrees();
757 double cpu_time = getCPUTime();
758 double real_time = getRealTime();
759 model_info.setFileName((string)params.out_prefix + ".model.gz");
760 model_info.setDumpInterval(params.checkpoint_dump_interval);
761
762 bool ok_model_file = false;
763 if (!params.model_test_again) {
764 ok_model_file = model_info.load();
765 }
766
767 cout << endl;
768
769 ok_model_file &= model_info.size() > 0;
770 if (ok_model_file)
771 cout << "NOTE: Restoring information from model checkpoint file " << model_info.getFileName() << endl;
772
773
774 Checkpoint *orig_checkpoint = iqtree.getCheckpoint();
775 iqtree.setCheckpoint(&model_info);
776 iqtree.restoreCheckpoint();
777
778 int partition_type;
779 if (CKP_RESTORE2((&model_info), partition_type)) {
780 if (partition_type != params.partition_type)
781 outError("Mismatch partition type between checkpoint and partition file command option\nRerun with -mredo to ignore .model.gz checkpoint file");
782 } else {
783 partition_type = params.partition_type;
784 CKP_SAVE2((&model_info), partition_type);
785 }
786
787 ModelsBlock *models_block = readModelsDefinition(params);
788
789 // compute initial tree
790 if (params.modelfinder_ml_tree) {
791 // 2019-09-10: Now perform NNI on the initial tree
792 string tree_str = computeFastMLTree(params, iqtree.aln, model_info,
793 models_block, params.num_threads, params.partition_type, iqtree.dist_file);
794 iqtree.restoreCheckpoint();
795 } else {
796 iqtree.computeInitialTree(params.SSE);
797
798 if (iqtree.isSuperTree()) {
799 PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
800 int part = 0;
801 for (auto it = stree->begin(); it != stree->end(); it++, part++) {
802 model_info.startStruct((*it)->aln->name);
803 (*it)->saveCheckpoint();
804 model_info.endStruct();
805 }
806 } else {
807 iqtree.saveCheckpoint();
808 }
809 }
810
811 // also save initial tree to the original .ckp.gz checkpoint
812 // string initTree = iqtree.getTreeString();
813 // CKP_SAVE(initTree);
814 // iqtree.saveCheckpoint();
815 // checkpoint->dump(true);
816
817 CandidateModelSet candidate_models;
818 int max_cats = candidate_models.generate(params, iqtree.aln, params.model_test_separate_rate, false);
819
820 uint64_t mem_size = iqtree.getMemoryRequiredThreaded(max_cats);
821 cout << "NOTE: ModelFinder requires " << (mem_size / 1024) / 1024 << " MB RAM!" << endl;
822 if (mem_size >= getMemorySize()) {
823 outError("Memory required exceeds your computer RAM size!");
824 }
825 #ifdef BINARY32
826 if (mem_size >= 2000000000) {
827 outError("Memory required exceeds 2GB limit of 32-bit executable");
828 }
829 #endif
830
831
832 if (iqtree.isSuperTree()) {
833 // partition model selection
834 PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
835 testPartitionModel(params, stree, model_info, models_block, params.num_threads);
836 stree->mapTrees();
837 string res_models = "";
838 for (auto it = stree->begin(); it != stree->end(); it++) {
839 if (it != stree->begin()) res_models += ",";
840 res_models += (*it)->aln->model_name;
841 }
842 iqtree.aln->model_name = res_models;
843 } else {
844 // single model selection
845 CandidateModel best_model;
846 if (params.openmp_by_model)
847 best_model = CandidateModelSet().evaluateAll(params, &iqtree,
848 model_info, models_block, params.num_threads, BRLEN_OPTIMIZE);
849 else
850 best_model = CandidateModelSet().test(params, &iqtree,
851 model_info, models_block, params.num_threads, BRLEN_OPTIMIZE);
852 iqtree.aln->model_name = best_model.getName();
853
854 Checkpoint *checkpoint = &model_info;
855 string best_model_AIC, best_model_AICc, best_model_BIC;
856 CKP_RESTORE(best_model_AIC);
857 CKP_RESTORE(best_model_AICc);
858 CKP_RESTORE(best_model_BIC);
859 cout << "Akaike Information Criterion: " << best_model_AIC << endl;
860 cout << "Corrected Akaike Information Criterion: " << best_model_AICc << endl;
861 cout << "Bayesian Information Criterion: " << best_model_BIC << endl;
862 cout << "Best-fit model: " << iqtree.aln->model_name << " chosen according to "
863 << criterionName(params.model_test_criterion) << endl;
864 }
865
866 delete models_block;
867
868 // force to dump all checkpointing information
869 model_info.dump(true);
870
871 // transfer models parameters
872 transferModelFinderParameters(&iqtree, orig_checkpoint);
873 iqtree.setCheckpoint(orig_checkpoint);
874
875 params.startCPUTime = cpu_time;
876 params.start_real_time = real_time;
877 cpu_time = getCPUTime() - cpu_time;
878 real_time = getRealTime() - real_time;
879 cout << endl;
880 cout << "All model information printed to " << model_info.getFileName() << endl;
881 cout << "CPU time for ModelFinder: " << cpu_time << " seconds (" << convert_time(cpu_time) << ")" << endl;
882 cout << "Wall-clock time for ModelFinder: " << real_time << " seconds (" << convert_time(real_time) << ")" << endl;
883
884 // alignment = iqtree.aln;
885 if (test_only) {
886 params.min_iterations = 0;
887 }
888 }
889
890 /**
891 * get the list of substitution models
892 */
getModelSubst(SeqType seq_type,bool standard_code,string model_name,string model_set,char * model_subset,StrVector & model_names)893 void getModelSubst(SeqType seq_type, bool standard_code, string model_name,
894 string model_set, char *model_subset, StrVector &model_names) {
895 int i, j;
896
897 if (model_set == "1") {
898 model_names.push_back(getUsualModelSubst(seq_type));
899 return;
900 }
901
902 if (iEquals(model_set, "ALL") || iEquals(model_set, "AUTO"))
903 model_set = "";
904
905 if (seq_type == SEQ_BINARY) {
906 if (model_set.empty()) {
907 copyCString(bin_model_names, sizeof(bin_model_names) / sizeof(char*), model_names);
908 } else if (model_set[0] == '+') {
909 // append model_set into existing models
910 convert_string_vec(model_set.c_str()+1, model_names);
911 appendCString(bin_model_names, sizeof(bin_model_names) / sizeof(char*), model_names);
912 } else {
913 convert_string_vec(model_set.c_str(), model_names);
914 }
915 } else if (seq_type == SEQ_MORPH) {
916 if (model_set.empty()) {
917 copyCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names);
918 } else if (model_set[0] == '+') {
919 // append model_set into existing models
920 convert_string_vec(model_set.c_str()+1, model_names);
921 appendCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names);
922 } else {
923 convert_string_vec(model_set.c_str(), model_names);
924 }
925 } else if (seq_type == SEQ_DNA || seq_type == SEQ_POMO) {
926 if (model_set.empty()) {
927 copyCString(dna_model_names, sizeof(dna_model_names) / sizeof(char*), model_names);
928 // copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
929 } else if (model_set == "partitionfinder" || model_set== "phyml") {
930 copyCString(dna_model_names_old, sizeof(dna_model_names_old) / sizeof(char*), model_names);
931 // copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
932 } else if (model_set == "raxml") {
933 copyCString(dna_model_names_rax, sizeof(dna_model_names_rax) / sizeof(char*), model_names);
934 // copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
935 } else if (model_set == "mrbayes") {
936 copyCString(dna_model_names_mrbayes, sizeof(dna_model_names_mrbayes) / sizeof(char*), model_names);
937 // copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
938 } else if (model_set == "modelomatic") {
939 copyCString(dna_model_names_modelomatic, sizeof(dna_model_names_modelomatic) / sizeof(char*), model_names);
940 } else if (model_set == "liemarkov") {
941 copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
942 appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
943 appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
944 appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
945 } else if (model_set == "liemarkovry") {
946 copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
947 appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
948 } else if (model_set == "liemarkovws") {
949 copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
950 appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
951 } else if (model_set == "liemarkovmk") {
952 copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
953 appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
954 } else if (model_set == "strandsymmetric") {
955 copyCString(dna_model_names_lie_markov_strsym, sizeof(dna_model_names_lie_markov_strsym) / sizeof(char*), model_names);
956 // IMPORTANT NOTE: If you add any more -mset names for sets of Lie Markov models,
957 // you also need to change getPrototypeModel function.
958 } else if (model_set[0] == '+') {
959 // append model_set into existing models
960 convert_string_vec(model_set.c_str()+1, model_names);
961 appendCString(dna_model_names, sizeof(dna_model_names) / sizeof(char*), model_names);
962 } else {
963 convert_string_vec(model_set.c_str(), model_names);
964 }
965
966 if (model_name.find("+LMRY") != string::npos) {
967 appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
968 appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
969 } else if (model_name.find("+LMWS") != string::npos) {
970 appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
971 appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
972 } else if (model_name.find("+LMMK") != string::npos) {
973 appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
974 appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
975 } else if (model_name.find("+LMSS") != string::npos) {
976 appendCString(dna_model_names_lie_markov_strsym, sizeof(dna_model_names_lie_markov_strsym) / sizeof(char*), model_names);
977 } else if (model_name.find("+LM") != string::npos) {
978 appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
979 appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
980 appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
981 appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
982 }
983 } else if (seq_type == SEQ_PROTEIN) {
984 if (model_set.empty()) {
985 copyCString(aa_model_names, sizeof(aa_model_names) / sizeof(char*), model_names);
986 } else if (model_set == "partitionfinder" || model_set == "phyml") {
987 copyCString(aa_model_names_phyml, sizeof(aa_model_names_phyml) / sizeof(char*), model_names);
988 } else if (model_set == "raxml") {
989 copyCString(aa_model_names_rax, sizeof(aa_model_names_rax) / sizeof(char*), model_names);
990 } else if (model_set == "mrbayes") {
991 copyCString(aa_model_names_mrbayes, sizeof(aa_model_names_mrbayes) / sizeof(char*), model_names);
992 } else if (model_set == "modelomatic") {
993 copyCString(aa_model_names_modelomatic, sizeof(aa_model_names_modelomatic) / sizeof(char*), model_names);
994 } else if (model_set[0] == '+') {
995 // append model_set into existing models
996 convert_string_vec(model_set.c_str()+1, model_names);
997 appendCString(aa_model_names, sizeof(aa_model_names) / sizeof(char*), model_names);
998 } else {
999 convert_string_vec(model_set.c_str(), model_names);
1000 }
1001
1002 if (model_subset) {
1003 StrVector submodel_names;
1004 if (strncmp(model_subset, "nuclear", 3) == 0) {
1005 copyCString(aa_model_names_nuclear, sizeof(aa_model_names_nuclear) / sizeof(char*), submodel_names);
1006 } else if (strncmp(model_subset, "mitochondrial", 3) == 0) {
1007 copyCString(aa_model_names_mitochondrial, sizeof(aa_model_names_mitochondrial) / sizeof(char*), submodel_names);
1008 } else if (strncmp(model_subset, "chloroplast", 3) == 0) {
1009 copyCString(aa_model_names_chloroplast, sizeof(aa_model_names_chloroplast) / sizeof(char*), submodel_names);
1010 } else if (strncmp(model_subset, "viral",3) == 0) {
1011 copyCString(aa_model_names_viral, sizeof(aa_model_names_viral) / sizeof(char*), submodel_names);
1012 } else {
1013 outError("Wrong -msub option");
1014 }
1015 for (i = 0; i < model_names.size(); i++) {
1016 bool appear = false;
1017 for (j = 0; j < submodel_names.size(); j++)
1018 if (model_names[i] == submodel_names[j]) {
1019 appear = true;
1020 break;
1021 }
1022 if (!appear) {
1023 model_names.erase(model_names.begin()+i);
1024 i--;
1025 }
1026 }
1027 }
1028
1029 } else if (seq_type == SEQ_CODON) {
1030 if (model_set.empty()) {
1031 if (standard_code)
1032 copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*), model_names);
1033 else {
1034 i = sizeof(codon_model_names) / sizeof(char*);
1035 for (j = 0; j < i; j++)
1036 if (!std_genetic_code[j])
1037 model_names.push_back(codon_model_names[j]);
1038 // copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*) - 1, model_names);
1039 }
1040 } else if (model_set == "modelomatic") {
1041 copyCString(codon_model_names_modelomatic, sizeof(codon_model_names_modelomatic) / sizeof(char*), model_names);
1042 } else if (model_set[0] == '+') {
1043 // append model_set into existing models
1044 convert_string_vec(model_set.c_str()+1, model_names);
1045 if (standard_code)
1046 appendCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*), model_names);
1047 else {
1048 i = sizeof(codon_model_names) / sizeof(char*);
1049 for (j = 0; j < i; j++)
1050 if (!std_genetic_code[j])
1051 model_names.push_back(codon_model_names[j]);
1052 }
1053 } else
1054 convert_string_vec(model_set.c_str(), model_names);
1055 }
1056 }
1057
getStateFreqs(SeqType seq_type,char * state_freq_set,StrVector & freq_names)1058 void getStateFreqs(SeqType seq_type, char *state_freq_set, StrVector &freq_names) {
1059 int j;
1060
1061 switch (seq_type) {
1062 case SEQ_PROTEIN:
1063 copyCString(aa_freq_names, sizeof(aa_freq_names)/sizeof(char*), freq_names);
1064 break;
1065 case SEQ_CODON:
1066 copyCString(codon_freq_names, sizeof(codon_freq_names) / sizeof(char*), freq_names);
1067 break;
1068 default:
1069 break;
1070 }
1071 if (state_freq_set)
1072 convert_string_vec(state_freq_set, freq_names);
1073 for (j = 0; j < freq_names.size(); j++) {
1074 std::transform(freq_names[j].begin(), freq_names[j].end(), freq_names[j].begin(), ::toupper);
1075 if (freq_names[j] != "" && freq_names[j][0] != '+')
1076 freq_names[j] = "+" + freq_names[j];
1077 }
1078 }
1079
1080 /**
1081 get list of rate heterogeneity
1082 */
getRateHet(SeqType seq_type,string model_name,double frac_invariant_sites,string rate_set,StrVector & ratehet)1083 void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites,
1084 string rate_set, StrVector &ratehet) {
1085 const char *rate_options[] = { "", "+I", "+ASC", "+G", "+I+G", "+ASC+G", "+R", "+ASC+R"};
1086 bool test_options_default[] = {true, true, false, true, true, false, false, false};
1087 bool test_options_fast[] = {false, false, false, false, true, false, false, false};
1088 bool test_options_morph[] = {true, false, true, true, false, true, false, false};
1089 bool test_options_morph_fast[]= {false, false, false, false, false, true, false, false};
1090 bool test_options_noASC_I[] = {true, false, false, true, false, false, false, false};
1091 bool test_options_noASC_I_fast[]={false,false, false, true, false, false, false, false};
1092 bool test_options_asc[] ={false, false, true, false, false, true, false, false};
1093 bool test_options_new[] = {true, true, false, true, true, false, true, false};
1094 bool test_options_morph_new[] = {true, false, true, true, false, true, true, true};
1095 bool test_options_noASC_I_new[]= {true, false, false, true, false, false, true, false};
1096 bool test_options_asc_new[] = {false, false, true, false, false, true, false, true};
1097 bool test_options_pomo[] = {true, false, false, true, false, false, false, false};
1098 bool test_options_norate[] = {true, false, false, false, false, false, false, false};
1099 bool *test_options = test_options_default;
1100 // bool test_options_codon[] = {true,false, false,false, false, false};
1101 const int noptions = sizeof(rate_options) / sizeof(char*);
1102 int i, j;
1103
1104 bool with_new = (model_name.find("NEW") != string::npos || model_name.substr(0,2) == "MF" || model_name.empty());
1105 bool with_asc = model_name.find("ASC") != string::npos;
1106
1107 if (seq_type == SEQ_POMO) {
1108 for (i = 0; i < noptions; i++)
1109 test_options[i] = test_options_pomo[i];
1110 }
1111 // If not PoMo, go on with normal treatment.
1112 else if (frac_invariant_sites == 0.0) {
1113 // morphological or SNP data: activate +ASC
1114 if (with_new && rate_set != "1") {
1115 if (with_asc)
1116 test_options = test_options_asc_new;
1117 else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH)
1118 test_options = test_options_morph_new;
1119 else
1120 test_options = test_options_noASC_I_new;
1121 } else if (with_asc)
1122 test_options = test_options_asc;
1123 else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) {
1124 if (rate_set == "1")
1125 test_options = test_options_morph_fast;
1126 else
1127 test_options = test_options_morph;
1128 } else {
1129 if (rate_set == "1")
1130 test_options = test_options_noASC_I_fast;
1131 else
1132 test_options = test_options_noASC_I;
1133 }
1134 } else if (frac_invariant_sites >= 1.0) {
1135 // 2018-06-12: alignment with only invariant sites, no rate variation added
1136 test_options = test_options_norate;
1137 } else {
1138 // normal data, use +I instead
1139 if (with_new && rate_set != "1") {
1140 // change +I+G to +R
1141 if (with_asc)
1142 test_options = test_options_asc_new;
1143 else
1144 test_options = test_options_new;
1145 } else if (with_asc) {
1146 test_options = test_options_asc;
1147 } else if (rate_set == "1")
1148 test_options = test_options_fast;
1149 else
1150 test_options = test_options_default;
1151 if (frac_invariant_sites == 0.0) {
1152 // deactivate +I
1153 for (j = 0; j < noptions; j++)
1154 if (strstr(rate_options[j], "+I"))
1155 test_options[j] = false;
1156 }
1157 }
1158 if (!rate_set.empty() && rate_set != "1" && !iEquals(rate_set, "ALL") && !iEquals(rate_set, "AUTO")) {
1159 // take the rate_options from user-specified models
1160 convert_string_vec(rate_set.c_str(), ratehet);
1161 if (!ratehet.empty() && iEquals(ratehet[0], "ALL")) {
1162 ratehet.erase(ratehet.begin());
1163 StrVector ratedef;
1164 for (j = 0; j < noptions; j++)
1165 if (test_options[j])
1166 ratedef.push_back(rate_options[j]);
1167 ratehet.insert(ratehet.begin(), ratedef.begin(), ratedef.end());
1168 }
1169 for (j = 0; j < ratehet.size(); j++) {
1170 if (ratehet[j] != "" && ratehet[j][0] != '+' && ratehet[j][0] != '*')
1171 ratehet[j] = "+" + ratehet[j];
1172 if (ratehet[j] == "+E") // for equal rate model
1173 ratehet[j] = "";
1174 }
1175 } else {
1176 for (j = 0; j < noptions; j++)
1177 if (test_options[j])
1178 ratehet.push_back(rate_options[j]);
1179
1180 }
1181 }
1182
generate(Params & params,Alignment * aln,bool separate_rate,bool merge_phase)1183 int CandidateModelSet::generate(Params ¶ms, Alignment *aln, bool separate_rate, bool merge_phase) {
1184 StrVector model_names;
1185 StrVector freq_names;
1186 SeqType seq_type = aln->seq_type;
1187
1188 int i, j;
1189 string model_set;
1190
1191 if (merge_phase) {
1192 model_set = params.merge_models;
1193 } else
1194 model_set = params.model_set;
1195
1196 bool auto_model = iEquals(model_set, "AUTO");
1197
1198 getModelSubst(seq_type, aln->isStandardGeneticCode(), params.model_name,
1199 model_set, params.model_subset, model_names);
1200
1201 if (model_names.empty())
1202 return 1;
1203
1204 getStateFreqs(seq_type, params.state_freq_set, freq_names);
1205
1206 // combine model_names with freq_names
1207 if (freq_names.size() > 0) {
1208 StrVector orig_model_names = model_names;
1209 model_names.clear();
1210 for (j = 0; j < orig_model_names.size(); j++) {
1211 if (aln->seq_type == SEQ_CODON) {
1212 SeqType seq_type;
1213 int model_type = detectSeqType(orig_model_names[j].c_str(), seq_type);
1214 for (i = 0; i < freq_names.size(); i++) {
1215 // disallow MG+F
1216 if (freq_names[i] == "+F" && orig_model_names[j].find("MG") != string::npos)
1217 continue;
1218 if (freq_names[i] != "" || (model_type == 2 && orig_model_names[j].find("MG") == string::npos))
1219 // empirical model also allow ""
1220 model_names.push_back(orig_model_names[j] + freq_names[i]);
1221 }
1222 } else {
1223 for (i = 0; i < freq_names.size(); i++)
1224 model_names.push_back(orig_model_names[j] + freq_names[i]);
1225 }
1226 }
1227 }
1228
1229
1230
1231
1232 StrVector ratehet;
1233 int max_cats = params.num_rate_cats;
1234 string ratehet_set;
1235 if (merge_phase) {
1236 ratehet_set = params.merge_rates;
1237 } else
1238 ratehet_set = params.ratehet_set;
1239
1240 //bool auto_rate = iEquals(ratehet_set, "AUTO");
1241
1242 getRateHet(seq_type, params.model_name, aln->frac_invariant_sites, ratehet_set, ratehet);
1243
1244 // add number of rate cateogories for special rate models
1245 const char *rates[] = {"+R", "*R", "+H", "*H"};
1246
1247 for (i = 0; i < sizeof(rates)/sizeof(char*); i++)
1248 if (params.model_name.find(rates[i]) != string::npos)
1249 ratehet.push_back(rates[i]);
1250
1251 size_t pos;
1252
1253 vector<int> flags;
1254 flags.resize(ratehet.size(), 0);
1255
1256 for (i = 0; i < sizeof(rates)/sizeof(char*); i++)
1257 for (j = 0; j < ratehet.size(); j++)
1258 if ((pos = ratehet[j].find(rates[i])) != string::npos &&
1259 (pos >= ratehet[j].length()-2 || !isdigit(ratehet[j][pos+2]) ))
1260 {
1261 string str = ratehet[j];
1262 ratehet[j].insert(pos+2, convertIntToString(params.min_rate_cats));
1263 max_cats = max(max_cats, params.max_rate_cats);
1264 for (int k = params.min_rate_cats+1; k <= params.max_rate_cats; k++) {
1265 int ins_pos = j+k-params.min_rate_cats;
1266 ratehet.insert(ratehet.begin() + ins_pos, str.substr(0, pos+2) + convertIntToString(k) + str.substr(pos+2));
1267 flags.insert(flags.begin() + ins_pos, MF_WAITING);
1268 }
1269 }
1270
1271 ASSERT(ratehet.size() == flags.size());
1272
1273 string pomo_suffix = (seq_type == SEQ_POMO) ? "+P" : "";
1274 // TODO DS: should we allow virtual population size?
1275
1276 // combine substitution models with rate heterogeneity
1277 if (separate_rate) {
1278 for (i = 0; i < model_names.size(); i++)
1279 push_back(CandidateModel(model_names[i], ratehet[0] + pomo_suffix, aln));
1280 for (j = 0; j < ratehet.size(); j++)
1281 if (ratehet[j] != "")
1282 push_back(CandidateModel("", ratehet[j] + pomo_suffix, aln));
1283 } else {
1284 if (auto_model) {
1285 // all rate heterogeneity for the first model
1286 for (j = 0; j < ratehet.size(); j++)
1287 push_back(CandidateModel(model_names[0], ratehet[j] + pomo_suffix, aln, flags[j]));
1288 // now all models the first RHAS
1289 for (i = 1; i < model_names.size(); i++)
1290 push_back(CandidateModel(model_names[i], ratehet[0] + pomo_suffix, aln, flags[0]));
1291 // all remaining models
1292 for (i = 1; i < model_names.size(); i++)
1293 for (j = 1; j < ratehet.size(); j++) {
1294 push_back(CandidateModel(model_names[i], ratehet[j] + pomo_suffix, aln, flags[j]));
1295 }
1296 } else {
1297 // testing all models
1298 for (i = 0; i < model_names.size(); i++)
1299 for (j = 0; j < ratehet.size(); j++) {
1300 push_back(CandidateModel(model_names[i], ratehet[j] + pomo_suffix, aln, flags[j]));
1301 }
1302 }
1303 }
1304 if (params.model_extra_set) {
1305 StrVector extra_model_names;
1306 convert_string_vec(params.model_extra_set, extra_model_names);
1307 for (auto s : extra_model_names)
1308 push_back(CandidateModel(s, "", aln));
1309 }
1310 return max_cats;
1311 }
1312
replaceModelInfo(string & set_name,ModelCheckpoint & model_info,ModelCheckpoint & new_info)1313 void replaceModelInfo(string &set_name, ModelCheckpoint &model_info, ModelCheckpoint &new_info) {
1314 for (auto it = new_info.begin(); it != new_info.end(); it++) {
1315 model_info.put(set_name + CKP_SEP + it->first, it->second);
1316 }
1317 }
1318
extractModelInfo(string & orig_set_name,ModelCheckpoint & model_info,ModelCheckpoint & part_model_info)1319 void extractModelInfo(string &orig_set_name, ModelCheckpoint &model_info, ModelCheckpoint &part_model_info) {
1320 string set_name = orig_set_name + CKP_SEP;
1321 int len = set_name.length();
1322 for (auto it = model_info.lower_bound(set_name); it != model_info.end() && it->first.substr(0, len) == set_name; it++) {
1323 part_model_info.put(it->first.substr(len), it->second);
1324 }
1325 }
1326
getSubsetName(PhyloSuperTree * super_tree,set<int> & subset)1327 string getSubsetName(PhyloSuperTree *super_tree, set<int> &subset) {
1328 string set_name;
1329 for (auto it = subset.begin(); it != subset.end(); it++) {
1330 if (it != subset.begin())
1331 set_name += "+";
1332 set_name += super_tree->at(*it)->aln->name;
1333 }
1334 return set_name;
1335 }
1336
getSubsetAlnLength(PhyloSuperTree * super_tree,set<int> & subset)1337 int getSubsetAlnLength(PhyloSuperTree *super_tree, set<int> &subset) {
1338 int len = 0;
1339 for (auto i : subset) {
1340 len += super_tree->at(i)->aln->getNSite();
1341 }
1342 return len;
1343 }
1344
1345 /**
1346 * transfer model parameters from two subsets to the target subsets
1347 */
transferModelParameters(PhyloSuperTree * super_tree,ModelCheckpoint & model_info,ModelCheckpoint & part_model_info,set<int> & gene_set1,set<int> & gene_set2)1348 void transferModelParameters(PhyloSuperTree *super_tree, ModelCheckpoint &model_info, ModelCheckpoint &part_model_info,
1349 set<int> &gene_set1, set<int> &gene_set2)
1350 {
1351 set<int> merged_set;
1352 merged_set.insert(gene_set1.begin(), gene_set1.end());
1353 merged_set.insert(gene_set2.begin(), gene_set2.end());
1354 string set_name = getSubsetName(super_tree, merged_set);
1355 string set1_name = getSubsetName(super_tree, gene_set1);
1356 string set2_name = getSubsetName(super_tree, gene_set2);
1357 double weight1 = getSubsetAlnLength(super_tree, gene_set1);
1358 double weight2 = getSubsetAlnLength(super_tree, gene_set2);
1359 double weight_sum = weight1 + weight2;
1360 weight1 = weight1/weight_sum;
1361 weight2 = weight2/weight_sum;
1362 enum MeanComp {GEOM_MEAN, ARIT_MEAN};
1363 enum ValType {VAL_SINGLE, VAL_VECTOR};
1364 vector<tuple<ValType, MeanComp,string> > info_strings = {
1365 make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGamma" + CKP_SEP + "gamma_shape"),
1366 make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGammaInvar" + CKP_SEP + "gamma_shape"),
1367 make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGammaInvar" + CKP_SEP + "p_invar"),
1368 make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateInvar" + CKP_SEP + "p_invar")
1369 //make_tuple(VAL_VECTOR, GEOM_MEAN, (string)"ModelDNA" + CKP_SEP + "rates")
1370 };
1371 for (auto info : info_strings) {
1372 switch (std::get<0>(info)) {
1373 case VAL_SINGLE: {
1374 double value1, value2, value;
1375 bool ok1 = model_info.get(set1_name + CKP_SEP + std::get<2>(info), value1);
1376 bool ok2 = model_info.get(set2_name + CKP_SEP + std::get<2>(info), value2);
1377 if (!ok1 || !ok2)
1378 continue;
1379 if (part_model_info.get(std::get<2>(info), value))
1380 continue; // value already exist
1381 switch (std::get<1>(info)) {
1382 case ARIT_MEAN:
1383 value = weight1*value1 + weight2*value2;
1384 break;
1385 case GEOM_MEAN:
1386 value = sqrt(value1*value2);
1387 break;
1388 }
1389 part_model_info.put(std::get<2>(info), value);
1390 break;
1391 }
1392 case VAL_VECTOR: {
1393 DoubleVector value1, value2, value;
1394 bool ok1 = model_info.getVector(set1_name + CKP_SEP + std::get<2>(info), value1);
1395 bool ok2 = model_info.getVector(set2_name + CKP_SEP + std::get<2>(info), value2);
1396 if (!ok1 || !ok2)
1397 continue;
1398 ASSERT(value1.size() == value2.size());
1399 if (part_model_info.getVector(std::get<2>(info), value))
1400 continue; // value already exist
1401 value.reserve(value1.size());
1402 for (int i = 0; i < value1.size(); i++)
1403 switch (std::get<1>(info)) {
1404 case ARIT_MEAN:
1405 value.push_back(weight1*value1[i] + weight2*value2[i]);
1406 break;
1407 case GEOM_MEAN:
1408 value.push_back(sqrt(value1[i]*value2[i]));
1409 break;
1410 }
1411 part_model_info.putVector(std::get<2>(info), value);
1412 break;
1413 }
1414 }
1415 }
1416 }
1417
mergePartitions(PhyloSuperTree * super_tree,vector<set<int>> & gene_sets,StrVector & model_names)1418 void mergePartitions(PhyloSuperTree* super_tree, vector<set<int> > &gene_sets, StrVector &model_names) {
1419 cout << "Merging into " << gene_sets.size() << " partitions..." << endl;
1420 vector<set<int> >::iterator it;
1421 SuperAlignment *super_aln = (SuperAlignment*)super_tree->aln;
1422 vector<PartitionInfo> part_info;
1423 vector<PhyloTree*> tree_vec;
1424 SuperAlignment *new_super_aln = new SuperAlignment();
1425 for (it = gene_sets.begin(); it != gene_sets.end(); it++) {
1426 Alignment *aln = super_aln->concatenateAlignments(*it);
1427 PartitionInfo info;
1428 aln->model_name = model_names[it-gene_sets.begin()];
1429 info.part_rate = 1.0; // BIG FIX: make -spp works with -m TESTMERGE now!
1430 info.evalNNIs = 0;
1431 for (set<int>::iterator i = it->begin(); i != it->end(); i++) {
1432 if (i != it->begin()) {
1433 aln->name += "+";
1434 if (!super_aln->partitions[*i]->position_spec.empty())
1435 aln->position_spec += ", ";
1436 }
1437 aln->name += super_aln->partitions[*i]->name;
1438 aln->position_spec += super_aln->partitions[*i]->position_spec;
1439 if (!super_aln->partitions[*i]->aln_file.empty()) {
1440 if (aln->aln_file.empty())
1441 aln->aln_file = super_aln->partitions[*i]->aln_file;
1442 else if (aln->aln_file != super_aln->partitions[*i]->aln_file) {
1443 aln->aln_file = aln->aln_file + ',' + super_aln->partitions[*i]->aln_file;
1444 }
1445 }
1446 if (!super_aln->partitions[*i]->sequence_type.empty()) {
1447 if (aln->sequence_type.empty())
1448 aln->sequence_type = super_aln->partitions[*i]->sequence_type;
1449 else if (aln->sequence_type != super_aln->partitions[*i]->sequence_type) {
1450 aln->sequence_type = "__NA__";
1451 }
1452 }
1453 }
1454 info.cur_ptnlh = NULL;
1455 info.nniMoves[0].ptnlh = NULL;
1456 info.nniMoves[1].ptnlh = NULL;
1457 part_info.push_back(info);
1458 PhyloTree *tree = super_tree->extractSubtree(*it);
1459 tree->setParams(super_tree->params);
1460 tree->setAlignment(aln);
1461 tree_vec.push_back(tree);
1462 new_super_aln->partitions.push_back(aln);
1463 }
1464
1465 // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
1466 // get the taxa names from existing tree
1467 StrVector seq_names;
1468 if (super_tree->root) {
1469 super_tree->getTaxaName(seq_names);
1470 }
1471 new_super_aln->init(&seq_names);
1472
1473 for (PhyloSuperTree::reverse_iterator tit = super_tree->rbegin(); tit != super_tree->rend(); tit++)
1474 delete (*tit);
1475 super_tree->clear();
1476 super_tree->insert(super_tree->end(), tree_vec.begin(), tree_vec.end());
1477 super_tree->part_info = part_info;
1478
1479 delete super_tree->aln;
1480 // super_tree->aln = new SuperAlignment(super_tree);
1481 super_tree->setAlignment(new_super_aln);
1482 }
1483
1484 /**
1485 called when some partition is changed
1486 */
fixPartitions(PhyloSuperTree * super_tree)1487 void fixPartitions(PhyloSuperTree* super_tree) {
1488 SuperAlignment *super_aln = (SuperAlignment*)super_tree->aln;
1489 int part;
1490 bool aln_changed = false;
1491 for (part = 0; part < super_tree->size(); part++)
1492 if (super_aln->partitions[part] != super_tree->at(part)->aln) {
1493 aln_changed = true;
1494 super_aln->partitions[part] = super_tree->at(part)->aln;
1495 }
1496 if (!aln_changed)
1497 return;
1498 super_aln->buildPattern();
1499 super_aln->orderPatternByNumChars(PAT_VARIANT);
1500 super_tree->deleteAllPartialLh();
1501 }
1502
evaluate(Params & params,ModelCheckpoint & in_model_info,ModelCheckpoint & out_model_info,ModelsBlock * models_block,int & num_threads,int brlen_type)1503 string CandidateModel::evaluate(Params ¶ms,
1504 ModelCheckpoint &in_model_info, ModelCheckpoint &out_model_info,
1505 ModelsBlock *models_block,
1506 int &num_threads, int brlen_type)
1507 {
1508 //string model_name = name;
1509 Alignment *in_aln = aln;
1510 IQTree *iqtree = NULL;
1511 if (in_aln->isSuperAlignment()) {
1512 SuperAlignment *saln = (SuperAlignment*)in_aln;
1513 if (params.partition_type == BRLEN_OPTIMIZE)
1514 iqtree = new PhyloSuperTree(saln);
1515 else
1516 iqtree = new PhyloSuperTreePlen(saln, brlen_type);
1517 StrVector subst_names;
1518 StrVector rate_names;
1519 convert_string_vec(subst_name.c_str(), subst_names);
1520 convert_string_vec(rate_name.c_str(), rate_names);
1521 ASSERT(subst_names.size() == rate_names.size());
1522 for (int part = 0; part != subst_names.size(); part++)
1523 saln->partitions[part]->model_name = subst_names[part]+rate_names[part];
1524 } else if (posRateHeterotachy(getName()) != string::npos)
1525 iqtree = new PhyloTreeMixlen(in_aln, 0);
1526 else
1527 iqtree = new IQTree(in_aln);
1528 iqtree->setParams(¶ms);
1529 iqtree->setLikelihoodKernel(params.SSE);
1530 iqtree->optimize_by_newton = params.optimize_by_newton;
1531 iqtree->setNumThreads(num_threads);
1532
1533 iqtree->setCheckpoint(&in_model_info);
1534 #ifdef _OPENMP
1535 #pragma omp critical
1536 #endif
1537 iqtree->restoreCheckpoint();
1538 ASSERT(iqtree->root);
1539 iqtree->initializeModel(params, getName(), models_block);
1540 if (!iqtree->getModel()->isMixture() || in_aln->seq_type == SEQ_POMO) {
1541 subst_name = iqtree->getSubstName();
1542 rate_name = iqtree->getRateName();
1543 }
1544
1545
1546 if (restoreCheckpoint(&in_model_info)) {
1547 delete iqtree;
1548 return "";
1549 }
1550
1551 #ifdef _OPENMP
1552 #pragma omp critical
1553 #endif
1554 iqtree->getModelFactory()->restoreCheckpoint();
1555
1556 // now switch to the output checkpoint
1557 iqtree->getModelFactory()->setCheckpoint(&out_model_info);
1558 iqtree->setCheckpoint(&out_model_info);
1559
1560 double new_logl;
1561
1562 if (params.model_test_and_tree) {
1563 //--- PERFORM FULL TREE SEARCH PER MODEL ----//
1564
1565 // BQM 2017-03-29: disable bootstrap
1566 int orig_num_bootstrap_samples = params.num_bootstrap_samples;
1567 int orig_gbo_replicates = params.gbo_replicates;
1568 params.num_bootstrap_samples = 0;
1569 params.gbo_replicates = 0;
1570 STOP_CONDITION orig_stop_condition = params.stop_condition;
1571 if (params.stop_condition == SC_BOOTSTRAP_CORRELATION)
1572 params.stop_condition = SC_UNSUCCESS_ITERATION;
1573
1574 iqtree->aln->model_name = getName();
1575
1576 cout << endl << "===> Testing model " << getName() << endl;
1577
1578 if (iqtree->root) {
1579 // start from previous tree
1580 string initTree = iqtree->getTreeString();
1581 iqtree->getCheckpoint()->put("initTree", initTree);
1582 iqtree->saveCheckpoint();
1583 }
1584
1585 #ifdef _OPENMP
1586 if (num_threads <= 0) {
1587 num_threads = iqtree->testNumThreads();
1588 omp_set_num_threads(num_threads);
1589 } else
1590 iqtree->warnNumThreads();
1591 #endif
1592
1593 runTreeReconstruction(params, iqtree);
1594 new_logl = iqtree->computeLikelihood();
1595 tree_len = iqtree->treeLength();
1596 tree = iqtree->getTreeString();
1597
1598 // restore original parameters
1599 // 2017-03-29: restore bootstrap replicates
1600 params.num_bootstrap_samples = orig_num_bootstrap_samples;
1601 params.gbo_replicates = orig_gbo_replicates;
1602 params.stop_condition = orig_stop_condition;
1603
1604 int count = iqtree->getCheckpoint()->eraseKeyPrefix("finished");
1605 cout << count << " finished checkpoint entries erased" << endl;
1606 iqtree->getCheckpoint()->eraseKeyPrefix("CandidateSet");
1607
1608 } else {
1609 //--- FIX TREE TOPOLOGY AND ESTIMATE MODEL PARAMETERS ----//
1610
1611 if (verbose_mode >= VB_MED)
1612 cout << "Optimizing model " << getName() << endl;
1613
1614 #ifdef _OPENMP
1615 if (num_threads <= 0) {
1616 num_threads = iqtree->testNumThreads();
1617 omp_set_num_threads(num_threads);
1618 } else
1619 iqtree->warnNumThreads();
1620 #endif
1621
1622 iqtree->initializeAllPartialLh();
1623
1624 for (int step = 0; step < 2; step++) {
1625 new_logl = iqtree->getModelFactory()->optimizeParameters(brlen_type, false,
1626 params.modelfinder_eps, TOL_GRADIENT_MODELTEST);
1627 tree_len = iqtree->treeLength();
1628 iqtree->getModelFactory()->saveCheckpoint();
1629 iqtree->saveCheckpoint();
1630
1631 // check if logl(+R[k]) is worse than logl(+R[k-1])
1632 CandidateModel prev_info;
1633 if (!prev_info.restoreCheckpointRminus1(&in_model_info, this)) break;
1634 if (prev_info.logl < new_logl + params.modelfinder_eps) break;
1635 if (step == 0) {
1636 iqtree->getRate()->initFromCatMinusOne();
1637 } else if (new_logl < prev_info.logl - params.modelfinder_eps*10.0) {
1638 outWarning("Log-likelihood " + convertDoubleToString(new_logl) + " of " +
1639 getName() + " worse than " + prev_info.getName() + " " + convertDoubleToString(prev_info.logl));
1640 }
1641 }
1642
1643 }
1644
1645 // sum in case of adjusted df and logl already stored
1646 df += iqtree->getModelFactory()->getNParameters(brlen_type);
1647 logl += new_logl;
1648 string tree_string = iqtree->getTreeString();
1649
1650 #ifdef _OPENMP
1651 #pragma omp critical
1652 {
1653 #endif
1654 saveCheckpoint(&in_model_info);
1655 #ifdef _OPENMP
1656 }
1657 #endif
1658
1659 delete iqtree;
1660 return tree_string;
1661 }
1662
1663 /** model information by merging two partitions */
1664 struct ModelPair {
1665 /** score after merging */
1666 double score;
1667 /** ID of partition 1 */
1668 int part1;
1669 /** ID of partition 2 */
1670 int part2;
1671 /** log-likelihood */
1672 double logl;
1673 /** degree of freedom */
1674 int df;
1675 /** tree length */
1676 double tree_len;
1677 /** IDs of merged partitions */
1678 set<int> merged_set;
1679 /** set name */
1680 string set_name;
1681 /* best model name */
1682 string model_name;
1683 };
1684
1685 class ModelPairSet : public multimap<double, ModelPair> {
1686
1687 public:
1688
1689 /** insert a partition pair */
insertPair(ModelPair & pair)1690 void insertPair(ModelPair &pair) {
1691 insert(value_type(pair.score, pair));
1692 }
1693
1694 /**
1695 find the maximum compatible partition pairs
1696 @param num max number of pairs to return
1697 */
getCompatiblePairs(int num,ModelPairSet & res)1698 void getCompatiblePairs(int num, ModelPairSet &res) {
1699 set<int> part_ids;
1700
1701 for (auto it = begin(); it != end() && res.size() < num; it++) {
1702
1703 // check for compatibility
1704 vector<int> overlap;
1705 set_intersection(part_ids.begin(), part_ids.end(),
1706 it->second.merged_set.begin(), it->second.merged_set.end(),
1707 std::back_inserter(overlap));
1708
1709 if (!overlap.empty()) continue;
1710
1711 // take the union
1712 part_ids.insert(it->second.merged_set.begin(), it->second.merged_set.end());
1713
1714 // put the compatible pair to the set
1715 res.insertPair(it->second);
1716 }
1717 }
1718
1719 };
1720
evaluateConcatenation(Params & params,SuperAlignment * super_aln,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads)1721 string CandidateModel::evaluateConcatenation(Params ¶ms, SuperAlignment *super_aln,
1722 ModelCheckpoint &model_info, ModelsBlock *models_block, int num_threads)
1723 {
1724 aln = super_aln->concatenateAlignments();
1725 size_t ssize = getUsualModel(aln);
1726
1727 string concat_tree;
1728
1729 cout << "Testing " << getName() << " on supermatrix..." << endl;
1730 concat_tree = evaluate(params, model_info, model_info,
1731 models_block, num_threads, BRLEN_OPTIMIZE);
1732
1733 computeICScores(ssize);
1734
1735 delete aln;
1736 aln = NULL;
1737 return concat_tree;
1738 }
1739
1740 /**
1741 * k-means clustering of partitions using partition-specific tree length
1742 * @return score (AIC/BIC/etc.) of the clustering
1743 * @param[out] gene_sets
1744 * @param[out[ model_names
1745 */
doKmeansClustering(Params & params,PhyloSuperTree * in_tree,int ncluster,DoubleVector & lenvec,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,vector<set<int>> & gene_sets,StrVector & model_names)1746 double doKmeansClustering(Params ¶ms, PhyloSuperTree *in_tree,
1747 int ncluster, DoubleVector &lenvec,
1748 ModelCheckpoint &model_info, ModelsBlock *models_block,
1749 int num_threads,
1750 vector<set<int> > &gene_sets, StrVector &model_names)
1751 {
1752
1753 cout << "k-means merging into " << ncluster << " partitions..." << endl;
1754
1755 ASSERT(lenvec.size() == in_tree->size());
1756 int npart = in_tree->size();
1757 IntVector weights;
1758 weights.resize(npart, 1);
1759 int *clusters = new int[npart];
1760 double *centers = new double[ncluster];
1761 RunKMeans1D(npart, ncluster, lenvec.data(), weights.data(), centers, clusters);
1762
1763 SuperAlignment *super_aln = ((SuperAlignment*)in_tree->aln);
1764
1765 double lhsum = 0.0;
1766 int dfsum = 0;
1767 if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
1768 dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
1769 if (params.partition_type == BRLEN_SCALE)
1770 dfsum -= 1;
1771 }
1772
1773 for (int cluster = 0; cluster < ncluster; cluster++) {
1774 string set_name;
1775 set<int> merged_set;
1776 for (int i = 0; i < in_tree->size(); i++)
1777 if (clusters[i] == cluster) {
1778 if (!set_name.empty())
1779 set_name += "+";
1780 set_name += in_tree->at(i)->aln->name;
1781 merged_set.insert(i);
1782 }
1783 gene_sets.push_back(merged_set);
1784 CandidateModel best_model;
1785 bool done_before = false;
1786 {
1787 // if pairs previously examined, reuse the information
1788 model_info.startStruct(set_name);
1789 if (model_info.getBestModel(best_model.subst_name)) {
1790 best_model.restoreCheckpoint(&model_info);
1791 done_before = true;
1792 }
1793 model_info.endStruct();
1794 }
1795 ModelCheckpoint part_model_info;
1796 if (!done_before) {
1797 Alignment *aln = super_aln->concatenateAlignments(merged_set);
1798 PhyloTree *tree = in_tree->extractSubtree(merged_set);
1799 tree->setAlignment(aln);
1800 extractModelInfo(set_name, model_info, part_model_info);
1801 tree->num_precision = in_tree->num_precision;
1802 tree->setParams(¶ms);
1803 tree->sse = params.SSE;
1804 tree->optimize_by_newton = params.optimize_by_newton;
1805 tree->num_threads = params.model_test_and_tree ? num_threads : 1;
1806 /*if (params.model_test_and_tree) {
1807 tree->setCheckpoint(new Checkpoint());
1808 tree->saveCheckpoint();
1809 } else*/
1810 {
1811 tree->setCheckpoint(&part_model_info);
1812 // trick to restore checkpoint
1813 tree->restoreCheckpoint();
1814 tree->saveCheckpoint();
1815 }
1816 best_model = CandidateModelSet().test(params, tree, part_model_info, models_block,
1817 params.model_test_and_tree ? num_threads : 1, params.partition_type,
1818 set_name, "", true);
1819 best_model.restoreCheckpoint(&part_model_info);
1820 model_names.push_back(best_model.getName());
1821 delete tree;
1822 delete aln;
1823 }
1824 lhsum += best_model.logl;
1825 dfsum += best_model.df;
1826 {
1827 if (!done_before) {
1828 replaceModelInfo(set_name, model_info, part_model_info);
1829 model_info.dump();
1830 cout.width(4);
1831 cout << right << cluster+1 << " ";
1832 cout.width(12);
1833 cout << left << best_model.getName() << " ";
1834 cout.width(11);
1835 cout << best_model.logl << " " << set_name;
1836 cout << endl;
1837 }
1838 }
1839 }
1840
1841 int ssize = in_tree->getAlnNSite();
1842 double score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
1843 cout << "k-means score for " << ncluster << " partitions: " << score << " (LnL: " << lhsum << " " << "df: " << dfsum << ")" << endl;
1844
1845 delete [] centers;
1846 delete [] clusters;
1847 return score;
1848 }
1849
1850 class SubsetPair : public pair<int,int> {
1851 public:
1852 // distance between two partition pairs
1853 double distance;
1854 };
1855
comparePairs(const SubsetPair & a,const SubsetPair & b)1856 bool comparePairs(const SubsetPair &a, const SubsetPair &b) {
1857 return a.distance < b.distance;
1858 }
1859
comparePartition(const pair<int,double> & a,const pair<int,double> & b)1860 bool comparePartition(const pair<int,double> &a, const pair<int, double> &b) {
1861 return a.second > b.second;
1862 }
1863
1864 /**
1865 find k-closest partition pairs for rcluster algorithm
1866 */
findClosestPairs(SuperAlignment * super_aln,DoubleVector & lenvec,vector<set<int>> & gene_sets,double log_transform,vector<SubsetPair> & closest_pairs)1867 void findClosestPairs(SuperAlignment *super_aln, DoubleVector &lenvec, vector<set<int> > &gene_sets,
1868 double log_transform, vector<SubsetPair> &closest_pairs) {
1869
1870 for (int part1 = 0; part1 < lenvec.size()-1; part1++)
1871 for (int part2 = part1+1; part2 < lenvec.size(); part2++)
1872 if (super_aln->partitions[*gene_sets[part1].begin()]->seq_type == super_aln->partitions[*gene_sets[part2].begin()]->seq_type &&
1873 super_aln->partitions[*gene_sets[part1].begin()]->genetic_code == super_aln->partitions[*gene_sets[part2].begin()]->genetic_code) {
1874 // only merge partitions of the same data type
1875 SubsetPair pair;
1876 pair.first = part1;
1877 pair.second = part2;
1878 if (log_transform)
1879 pair.distance = fabs(log(lenvec[part1]) - log(lenvec[part2]));
1880 else
1881 pair.distance = fabs(lenvec[part1] - lenvec[part2]);
1882 closest_pairs.push_back(pair);
1883 }
1884 if (!closest_pairs.empty() && Params::getInstance().partfinder_rcluster < 100) {
1885 // sort distance
1886 std::sort(closest_pairs.begin(), closest_pairs.end(), comparePairs);
1887 size_t num_pairs = round(closest_pairs.size() * (Params::getInstance().partfinder_rcluster/100.0));
1888 num_pairs = min(num_pairs, Params::getInstance().partfinder_rcluster_max);
1889 if (num_pairs <= 0) num_pairs = 1;
1890 closest_pairs.erase(closest_pairs.begin() + num_pairs, closest_pairs.end());
1891 }
1892 }
1893
1894 /**
1895 merge vector src into dest, eliminating duplicates
1896 */
mergePairs(vector<SubsetPair> & dest,vector<SubsetPair> & src)1897 void mergePairs(vector<SubsetPair> &dest, vector<SubsetPair> &src) {
1898 unordered_set<string> dest_set;
1899 for (SubsetPair s: dest)
1900 dest_set.insert(convertIntToString(s.first) + "-" + convertIntToString(s.second));
1901 for (SubsetPair s: src)
1902 if (dest_set.find(convertIntToString(s.first) + "-" + convertIntToString(s.second)) == dest_set.end())
1903 dest.push_back(s);
1904 }
1905
1906 /**
1907 * select models for all partitions
1908 * @param[in,out] model_info (IN/OUT) all model information
1909 * @return total number of parameters
1910 */
testPartitionModel(Params & params,PhyloSuperTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads)1911 void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, ModelCheckpoint &model_info,
1912 ModelsBlock *models_block, int num_threads)
1913 {
1914 // params.print_partition_info = true;
1915 // params.print_conaln = true;
1916 int i = 0;
1917 // PhyloSuperTree::iterator it;
1918 DoubleVector lhvec; // log-likelihood for each partition
1919 DoubleVector dfvec; // number of parameters for each partition
1920 DoubleVector lenvec; // tree length for each partition
1921 double lhsum = 0.0;
1922 int dfsum = 0;
1923 if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
1924 dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
1925 if (params.partition_type == BRLEN_SCALE)
1926 dfsum -= 1;
1927 }
1928 int ssize = in_tree->getAlnNSite();
1929 int64_t num_model = 0;
1930 int64_t total_num_model = in_tree->size();
1931
1932 // 2017-06-07: -rcluster-max for max absolute number of pairs
1933 if (params.partfinder_rcluster_max == 0)
1934 params.partfinder_rcluster_max = max((size_t)1000, 10*in_tree->size());
1935
1936 if (params.partition_merge != MERGE_NONE) {
1937 double p = params.partfinder_rcluster/100.0;
1938 size_t num_pairs = round(in_tree->size()*(in_tree->size()-1)*p/2);
1939 if (p < 1.0)
1940 num_pairs = min(num_pairs, params.partfinder_rcluster_max);
1941 total_num_model += num_pairs;
1942 for (i = in_tree->size()-2; i > 0; i--)
1943 total_num_model += max(round(i*p), 1.0);
1944 }
1945
1946
1947 #ifdef _OPENMP
1948 if (num_threads <= 0) {
1949 // partition selection scales well with many cores
1950 num_threads = min((int64_t)countPhysicalCPUCores(), total_num_model);
1951 num_threads = min(num_threads, params.num_threads_max);
1952 omp_set_num_threads(num_threads);
1953 cout << "NUMBER OF THREADS FOR PARTITION FINDING: " << num_threads << endl;
1954 }
1955 #endif
1956
1957 double start_time = getRealTime();
1958
1959 SuperAlignment *super_aln = ((SuperAlignment*)in_tree->aln);
1960
1961 cout << "Selecting individual models for " << in_tree->size() << " charsets using " << criterionName(params.model_test_criterion) << "..." << endl;
1962 //cout << " No. AIC AICc BIC Charset" << endl;
1963 cout << " No. Model Score TreeLen Charset" << endl;
1964
1965 lhvec.resize(in_tree->size());
1966 dfvec.resize(in_tree->size());
1967 lenvec.resize(in_tree->size());
1968
1969 // sort partition by computational cost for OpenMP effciency
1970 vector<pair<int,double> > partitionID;
1971
1972 for (i = 0; i < in_tree->size(); i++) {
1973 Alignment *this_aln = in_tree->at(i)->aln;
1974 // computation cost is proportional to #sequences, #patterns, and #states
1975 partitionID.push_back({i, ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states});
1976 }
1977
1978 if (num_threads > 1) {
1979 std::sort(partitionID.begin(), partitionID.end(), comparePartition);
1980 }
1981
1982 bool parallel_over_partitions = false;
1983 int brlen_type = params.partition_type;
1984 if (brlen_type == TOPO_UNLINKED)
1985 brlen_type = BRLEN_OPTIMIZE;
1986
1987 bool test_merge = (params.partition_merge != MERGE_NONE) && params.partition_type != TOPO_UNLINKED && (in_tree->size() > 1);
1988
1989 #ifdef _OPENMP
1990 parallel_over_partitions = !params.model_test_and_tree && (in_tree->size() >= num_threads);
1991 #pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum) if(parallel_over_partitions)
1992 #endif
1993 for (int j = 0; j < in_tree->size(); j++) {
1994 i = partitionID[j].first;
1995 PhyloTree *this_tree = in_tree->at(i);
1996 // scan through models for this partition, assuming the information occurs consecutively
1997 ModelCheckpoint part_model_info;
1998 extractModelInfo(this_tree->aln->name, model_info, part_model_info);
1999 // do the computation
2000 string part_model_name;
2001 if (params.model_name.empty())
2002 part_model_name = this_tree->aln->model_name;
2003 CandidateModel best_model;
2004 best_model = CandidateModelSet().test(params, this_tree, part_model_info, models_block,
2005 (parallel_over_partitions ? 1 : num_threads), brlen_type, this_tree->aln->name, part_model_name, test_merge);
2006
2007 bool check = (best_model.restoreCheckpoint(&part_model_info));
2008 ASSERT(check);
2009
2010 double score = best_model.computeICScore(this_tree->getAlnNSite());
2011 this_tree->aln->model_name = best_model.getName();
2012 lhsum += (lhvec[i] = best_model.logl);
2013 dfsum += (dfvec[i] = best_model.df);
2014 lenvec[i] = best_model.tree_len;
2015
2016 #ifdef _OPENMP
2017 #pragma omp critical
2018 #endif
2019 {
2020 num_model++;
2021 cout.width(4);
2022 cout << right << num_model << " ";
2023 cout.width(12);
2024 cout << left << best_model.getName() << " ";
2025 cout.width(11);
2026 cout << score << " ";
2027 cout.width(11);
2028 cout << best_model.tree_len << " ";
2029 cout << this_tree->aln->name;
2030 if (num_model >= 10) {
2031 double remain_time = (total_num_model-num_model)*(getRealTime()-start_time)/num_model;
2032 cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2033 << convert_time(remain_time) << " left)";
2034 }
2035 cout << endl;
2036 replaceModelInfo(this_tree->aln->name, model_info, part_model_info);
2037 model_info.dump();
2038 }
2039 }
2040
2041 // in case ModelOMatic change the alignment
2042 fixPartitions(in_tree);
2043
2044 double inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2045 cout << "Full partition model " << criterionName(params.model_test_criterion)
2046 << " score: " << inf_score << " (LnL: " << lhsum << " df:" << dfsum << ")" << endl;
2047
2048 if (!test_merge) {
2049 super_aln->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
2050 super_aln->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
2051 model_info.dump();
2052 return;
2053 }
2054
2055 vector<set<int> > gene_sets;
2056 StrVector model_names;
2057 StrVector greedy_model_trees;
2058
2059 gene_sets.resize(in_tree->size());
2060 model_names.resize(in_tree->size());
2061 greedy_model_trees.resize(in_tree->size());
2062 for (i = 0; i < gene_sets.size(); i++) {
2063 gene_sets[i].insert(i);
2064 model_names[i] = in_tree->at(i)->aln->model_name;
2065 greedy_model_trees[i] = in_tree->at(i)->aln->name;
2066 }
2067
2068 if (params.partition_merge == MERGE_KMEANS) {
2069 // kmeans cluster based on parition tree length
2070 double cur_score = inf_score;
2071 for (int ncluster = in_tree->size()-1; ncluster >= 1; ncluster--) {
2072 vector<set<int> > this_gene_sets;
2073 StrVector this_model_names;
2074 //double sum = in_tree->size()/std::accumulate(lenvec.begin(), lenvec.end(), 0.0);
2075 double score = doKmeansClustering(params, in_tree, ncluster, lenvec, model_info,
2076 models_block, num_threads, this_gene_sets, this_model_names);
2077 if (score < cur_score) {
2078 cout << "Better score found: " << score << endl;
2079 cur_score = score;
2080 gene_sets = this_gene_sets;
2081 model_names = this_model_names;
2082 } else {
2083 //break;
2084 }
2085 }
2086 } else {
2087 cout << "Merging models to increase model fit (about " << total_num_model << " total partition schemes)..." << endl;
2088 }
2089
2090 /* following implements the greedy algorithm of Lanfear et al. (2012) */
2091 while (params.partition_merge != MERGE_KMEANS && gene_sets.size() >= 2) {
2092 // stepwise merging charsets
2093
2094 // list of all better pairs of partitions than current partitioning scheme
2095 ModelPairSet better_pairs;
2096
2097 // 2015-06-24: begin rcluster algorithm
2098 // compute distance between gene_sets
2099 ASSERT(gene_sets.size() == lenvec.size());
2100 // find closest partition pairs
2101 vector<SubsetPair> closest_pairs;
2102 findClosestPairs(super_aln, lenvec, gene_sets, false, closest_pairs);
2103 if (params.partfinder_log_rate) {
2104 // additional consider pairs by log-rate
2105 vector<SubsetPair> log_closest_pairs;
2106 findClosestPairs(super_aln, lenvec, gene_sets, true, log_closest_pairs);
2107 mergePairs(closest_pairs, log_closest_pairs);
2108 }
2109 // sort partition by computational cost for OpenMP effciency
2110 for (i = 0; i < closest_pairs.size(); i++) {
2111 // computation cost is proportional to #sequences, #patterns, and #states
2112 Alignment *this_aln = in_tree->at(closest_pairs[i].first)->aln;
2113 closest_pairs[i].distance = -((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
2114 this_aln = in_tree->at(closest_pairs[i].second)->aln;
2115 closest_pairs[i].distance -= ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
2116 }
2117 if (num_threads > 1)
2118 std::sort(closest_pairs.begin(), closest_pairs.end(), comparePairs);
2119
2120 size_t num_pairs = closest_pairs.size();
2121
2122 #ifdef _OPENMP
2123 #pragma omp parallel for private(i) schedule(dynamic) if(!params.model_test_and_tree)
2124 #endif
2125 for (size_t pair = 0; pair < num_pairs; pair++) {
2126 // information of current partitions pair
2127 ModelPair cur_pair;
2128 cur_pair.part1 = closest_pairs[pair].first;
2129 cur_pair.part2 = closest_pairs[pair].second;
2130 ASSERT(cur_pair.part1 < cur_pair.part2);
2131 cur_pair.merged_set.insert(gene_sets[cur_pair.part1].begin(), gene_sets[cur_pair.part1].end());
2132 cur_pair.merged_set.insert(gene_sets[cur_pair.part2].begin(), gene_sets[cur_pair.part2].end());
2133 cur_pair.set_name = getSubsetName(in_tree, cur_pair.merged_set);
2134 double weight1 = getSubsetAlnLength(in_tree, gene_sets[cur_pair.part1]);
2135 double weight2 = getSubsetAlnLength(in_tree, gene_sets[cur_pair.part2]);
2136 double sum = 1.0 / (weight1 + weight2);
2137 weight1 *= sum;
2138 weight2 *= sum;
2139 CandidateModel best_model;
2140 bool done_before = false;
2141 #ifdef _OPENMP
2142 #pragma omp critical
2143 #endif
2144 {
2145 // if pairs previously examined, reuse the information
2146 model_info.startStruct(cur_pair.set_name);
2147 if (model_info.getBestModel(best_model.subst_name)) {
2148 best_model.restoreCheckpoint(&model_info);
2149 done_before = true;
2150 }
2151 model_info.endStruct();
2152 }
2153 ModelCheckpoint part_model_info;
2154 double cur_tree_len = 0.0;
2155 if (!done_before) {
2156 Alignment *aln = super_aln->concatenateAlignments(cur_pair.merged_set);
2157 PhyloTree *tree = in_tree->extractSubtree(cur_pair.merged_set);
2158 //tree->scaleLength((weight1*lenvec[cur_pair.part1] + weight2*lenvec[cur_pair.part2])/tree->treeLength());
2159 tree->scaleLength(sqrt(lenvec[cur_pair.part1]*lenvec[cur_pair.part2])/tree->treeLength());
2160 cur_tree_len = tree->treeLength();
2161 tree->setAlignment(aln);
2162 extractModelInfo(cur_pair.set_name, model_info, part_model_info);
2163 transferModelParameters(in_tree, model_info, part_model_info, gene_sets[cur_pair.part1], gene_sets[cur_pair.part2]);
2164 tree->num_precision = in_tree->num_precision;
2165 tree->setParams(¶ms);
2166 tree->sse = params.SSE;
2167 tree->optimize_by_newton = params.optimize_by_newton;
2168 tree->num_threads = params.model_test_and_tree ? num_threads : 1;
2169 {
2170 tree->setCheckpoint(&part_model_info);
2171 // trick to restore checkpoint
2172 tree->restoreCheckpoint();
2173 tree->saveCheckpoint();
2174 }
2175 best_model = CandidateModelSet().test(params, tree, part_model_info, models_block,
2176 params.model_test_and_tree ? num_threads : 1, params.partition_type, cur_pair.set_name, "", true);
2177 best_model.restoreCheckpoint(&part_model_info);
2178 delete tree;
2179 delete aln;
2180 }
2181 cur_pair.logl = best_model.logl;
2182 cur_pair.df = best_model.df;
2183 cur_pair.model_name = best_model.getName();
2184 cur_pair.tree_len = best_model.tree_len;
2185 double lhnew = lhsum - lhvec[cur_pair.part1] - lhvec[cur_pair.part2] + best_model.logl;
2186 int dfnew = dfsum - dfvec[cur_pair.part1] - dfvec[cur_pair.part2] + best_model.df;
2187 cur_pair.score = computeInformationScore(lhnew, dfnew, ssize, params.model_test_criterion);
2188 #ifdef _OPENMP
2189 #pragma omp critical
2190 #endif
2191 {
2192 if (!done_before) {
2193 replaceModelInfo(cur_pair.set_name, model_info, part_model_info);
2194 model_info.dump();
2195 num_model++;
2196 cout.width(4);
2197 cout << right << num_model << " ";
2198 cout.width(12);
2199 cout << left << best_model.getName() << " ";
2200 cout.width(11);
2201 cout << cur_pair.score << " ";
2202 cout.width(11);
2203 cout << cur_pair.tree_len << " " << cur_pair.set_name;
2204 if (num_model >= 10) {
2205 double remain_time = max(total_num_model-num_model, (int64_t)0)*(getRealTime()-start_time)/num_model;
2206 cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2207 << convert_time(remain_time) << " left)";
2208 }
2209 cout << endl;
2210 }
2211 if (cur_pair.score < inf_score)
2212 better_pairs.insertPair(cur_pair);
2213 }
2214
2215 }
2216 if (better_pairs.empty()) break;
2217 ModelPairSet compatible_pairs;
2218
2219 int num_comp_pairs = params.partition_merge == MERGE_RCLUSTERF ? gene_sets.size()/2 : 1;
2220 better_pairs.getCompatiblePairs(num_comp_pairs, compatible_pairs);
2221 if (compatible_pairs.size() > 1)
2222 cout << compatible_pairs.size() << " compatible better partition pairs found" << endl;
2223
2224 // 2017-12-21: simultaneously merging better pairs
2225 for (auto it_pair = compatible_pairs.begin(); it_pair != compatible_pairs.end(); it_pair++) {
2226 ModelPair opt_pair = it_pair->second;
2227
2228 lhsum = lhsum - lhvec[opt_pair.part1] - lhvec[opt_pair.part2] + opt_pair.logl;
2229 dfsum = dfsum - dfvec[opt_pair.part1] - dfvec[opt_pair.part2] + opt_pair.df;
2230 inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2231 ASSERT(inf_score <= opt_pair.score + 0.1);
2232
2233 cout << "Merging " << opt_pair.set_name << " with " << criterionName(params.model_test_criterion)
2234 << " score: " << inf_score << " (LnL: " << lhsum << " df: " << dfsum << ")" << endl;
2235 // change entry opt_part1 to merged one
2236 gene_sets[opt_pair.part1] = opt_pair.merged_set;
2237 lhvec[opt_pair.part1] = opt_pair.logl;
2238 dfvec[opt_pair.part1] = opt_pair.df;
2239 lenvec[opt_pair.part1] = opt_pair.tree_len;
2240 model_names[opt_pair.part1] = opt_pair.model_name;
2241 greedy_model_trees[opt_pair.part1] = "(" + greedy_model_trees[opt_pair.part1] + "," +
2242 greedy_model_trees[opt_pair.part2] + ")" +
2243 convertIntToString(in_tree->size()-gene_sets.size()+1) + ":" +
2244 convertDoubleToString(inf_score);
2245
2246 // delete entry opt_part2
2247 lhvec.erase(lhvec.begin() + opt_pair.part2);
2248 dfvec.erase(dfvec.begin() + opt_pair.part2);
2249 lenvec.erase(lenvec.begin() + opt_pair.part2);
2250 gene_sets.erase(gene_sets.begin() + opt_pair.part2);
2251 model_names.erase(model_names.begin() + opt_pair.part2);
2252 greedy_model_trees.erase(greedy_model_trees.begin() + opt_pair.part2);
2253
2254 // decrease part ID for all pairs beyond opt_pair.part2
2255 auto next_pair = it_pair;
2256 for (next_pair++; next_pair != compatible_pairs.end(); next_pair++) {
2257 if (next_pair->second.part1 > opt_pair.part2)
2258 next_pair->second.part1--;
2259 if (next_pair->second.part2 > opt_pair.part2)
2260 next_pair->second.part2--;
2261 }
2262 }
2263 }
2264
2265 string final_model_tree;
2266 if (greedy_model_trees.size() == 1)
2267 final_model_tree = greedy_model_trees[0];
2268 else {
2269 final_model_tree = "(";
2270 for (i = 0; i < greedy_model_trees.size(); i++) {
2271 if (i>0)
2272 final_model_tree += ",";
2273 final_model_tree += greedy_model_trees[i];
2274 }
2275 final_model_tree += ")";
2276 }
2277
2278 cout << "Agglomerative model selection: " << final_model_tree << endl;
2279
2280 if (gene_sets.size() < in_tree->size())
2281 mergePartitions(in_tree, gene_sets, model_names);
2282
2283 if (!iEquals(params.merge_models, "all")) {
2284 // test all candidate models again
2285 lhsum = 0.0;
2286 dfsum = 0;
2287 if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
2288 dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
2289 if (params.partition_type == BRLEN_SCALE)
2290 dfsum -= 1;
2291 }
2292
2293 // sort partition by computational cost for OpenMP effciency
2294 partitionID.clear();
2295 for (i = 0; i < in_tree->size(); i++) {
2296 Alignment *this_aln = in_tree->at(i)->aln;
2297 // computation cost is proportional to #sequences, #patterns, and #states
2298 partitionID.push_back({i, ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states});
2299 }
2300
2301 if (num_threads > 1) {
2302 std::sort(partitionID.begin(), partitionID.end(), comparePartition);
2303 }
2304
2305 #ifdef _OPENMP
2306 parallel_over_partitions = !params.model_test_and_tree && (in_tree->size() >= num_threads);
2307 #pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum) if(parallel_over_partitions)
2308 #endif
2309 for (int j = 0; j < in_tree->size(); j++) {
2310 i = partitionID[j].first;
2311 PhyloTree *this_tree = in_tree->at(i);
2312 // scan through models for this partition, assuming the information occurs consecutively
2313 ModelCheckpoint part_model_info;
2314 extractModelInfo(this_tree->aln->name, model_info, part_model_info);
2315 // do the computation
2316 string part_model_name;
2317 if (params.model_name.empty())
2318 part_model_name = this_tree->aln->model_name;
2319 CandidateModel best_model;
2320 best_model = CandidateModelSet().test(params, this_tree, part_model_info, models_block,
2321 (parallel_over_partitions ? 1 : num_threads), brlen_type,
2322 this_tree->aln->name, part_model_name, false);
2323
2324 bool check = (best_model.restoreCheckpoint(&part_model_info));
2325 ASSERT(check);
2326
2327 double score = best_model.computeICScore(this_tree->getAlnNSite());
2328 this_tree->aln->model_name = best_model.getName();
2329 lhsum += (lhvec[i] = best_model.logl);
2330 dfsum += (dfvec[i] = best_model.df);
2331 lenvec[i] = best_model.tree_len;
2332
2333 #ifdef _OPENMP
2334 #pragma omp critical
2335 #endif
2336 {
2337 num_model++;
2338 cout.width(4);
2339 cout << right << num_model << " ";
2340 cout.width(12);
2341 cout << left << best_model.getName() << " ";
2342 cout.width(11);
2343 cout << score << " " << this_tree->aln->name;
2344 if (num_model >= 10) {
2345 double remain_time = (total_num_model-num_model)*(getRealTime()-start_time)/num_model;
2346 cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2347 << convert_time(remain_time) << " left)";
2348 }
2349 cout << endl;
2350 replaceModelInfo(this_tree->aln->name, model_info, part_model_info);
2351 model_info.dump();
2352 }
2353 }
2354 }
2355
2356 inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2357 cout << "Best partition model " << criterionName(params.model_test_criterion) << " score: " << inf_score << " (LnL: " << lhsum << " df:" << dfsum << ")" << endl;
2358
2359 ((SuperAlignment*)in_tree->aln)->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
2360 ((SuperAlignment*)in_tree->aln)->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
2361 model_info.dump();
2362 }
2363
isMixtureModel(ModelsBlock * models_block,string & model_str)2364 bool isMixtureModel(ModelsBlock *models_block, string &model_str) {
2365 size_t mix_pos;
2366 for (mix_pos = 0; mix_pos < model_str.length(); mix_pos++) {
2367 size_t next_mix_pos = model_str.find_first_of("+*", mix_pos);
2368 string sub_model_str = model_str.substr(mix_pos, next_mix_pos-mix_pos);
2369 if (models_block->findMixModel(sub_model_str))
2370 return true;
2371 if (next_mix_pos == string::npos)
2372 break;
2373 mix_pos = next_mix_pos;
2374 }
2375 return false;
2376 }
2377
filterRates(int finished_model)2378 void CandidateModelSet::filterRates(int finished_model) {
2379 if (Params::getInstance().score_diff_thres < 0)
2380 return;
2381 double best_score = DBL_MAX;
2382 ASSERT(finished_model >= 0);
2383 int model;
2384 for (model = 0; model <= finished_model; model++)
2385 if (at(model).subst_name == at(0).subst_name) {
2386 if (!at(model).hasFlag(MF_DONE + MF_IGNORED))
2387 return; // only works if all models done
2388 best_score = min(best_score, at(model).getScore());
2389 }
2390
2391 double ok_score = best_score + Params::getInstance().score_diff_thres;
2392 set<string> ok_rates;
2393 for (model = 0; model <= finished_model; model++)
2394 if (at(model).getScore() <= ok_score) {
2395 string rate_name = at(model).orig_rate_name;
2396 ok_rates.insert(rate_name);
2397 }
2398 for (model = finished_model+1; model < size(); model++)
2399 if (ok_rates.find(at(model).orig_rate_name) == ok_rates.end())
2400 at(model).setFlag(MF_IGNORED);
2401 }
2402
filterSubst(int finished_model)2403 void CandidateModelSet::filterSubst(int finished_model) {
2404 if (Params::getInstance().score_diff_thres < 0)
2405 return;
2406 double best_score = DBL_MAX;
2407 ASSERT(finished_model >= 0);
2408 int model;
2409 for (model = 0; model <= finished_model; model++)
2410 if (at(model).rate_name == at(0).rate_name)
2411 best_score = min(best_score, at(model).getScore());
2412
2413 double ok_score = best_score + Params::getInstance().score_diff_thres;
2414 set<string> ok_model;
2415 for (model = 0; model <= finished_model; model++) {
2416 if (at(model).rate_name != at(0).rate_name)
2417 continue;
2418 if (at(model).getScore() <= ok_score) {
2419 string subst_name = at(model).orig_subst_name;
2420 ok_model.insert(subst_name);
2421 } else
2422 at(model).setFlag(MF_IGNORED);
2423 }
2424 for (model = finished_model+1; model < size(); model++)
2425 if (ok_model.find(at(model).orig_subst_name) == ok_model.end())
2426 at(model).setFlag(MF_IGNORED);
2427 }
2428
test(Params & params,PhyloTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,int brlen_type,string set_name,string in_model_name,bool merge_phase)2429 CandidateModel CandidateModelSet::test(Params ¶ms, PhyloTree* in_tree, ModelCheckpoint &model_info,
2430 ModelsBlock *models_block, int num_threads, int brlen_type,
2431 string set_name, string in_model_name, bool merge_phase)
2432 {
2433 ModelCheckpoint *checkpoint = &model_info;
2434
2435 in_tree->params = ¶ms;
2436
2437 // for ModelOMatic
2438 Alignment *prot_aln = NULL;
2439 Alignment *dna_aln = NULL;
2440 bool do_modelomatic = params.modelomatic && in_tree->aln->seq_type == SEQ_CODON;
2441
2442 if (in_model_name.empty()) {
2443 generate(params, in_tree->aln, params.model_test_separate_rate, merge_phase);
2444 if (do_modelomatic) {
2445 // generate models for protein
2446 // adapter coefficient according to Whelan et al. 2015
2447 prot_aln = in_tree->aln->convertCodonToAA();
2448 int adjusted_df;
2449 double adjusted_logl = computeAdapter(in_tree->aln, prot_aln, adjusted_df);
2450 if (set_name.empty())
2451 cout << "Adjusted LnL: " << adjusted_logl << " df: " << adjusted_df << endl;
2452 size_t start = size();
2453 generate(params, prot_aln, params.model_test_separate_rate, merge_phase);
2454 size_t i;
2455 for (i = start; i < size(); i++) {
2456 at(i).logl = adjusted_logl;
2457 at(i).df = adjusted_df;
2458 }
2459
2460 // generate models for DNA
2461 dna_aln = in_tree->aln->convertCodonToDNA();
2462 start = size();
2463 generate(params, dna_aln, params.model_test_separate_rate, merge_phase);
2464 for (i = start; i < size(); i++) {
2465 at(i).setFlag(MF_SAMPLE_SIZE_TRIPLE);
2466 }
2467 }
2468 } else {
2469 push_back(CandidateModel(in_model_name, "", in_tree->aln));
2470 }
2471
2472 DoubleVector model_scores;
2473 int model;
2474 int best_model = -1;
2475 Alignment *best_aln = in_tree->aln;
2476
2477 int ssize = in_tree->aln->getNSite(); // sample size
2478 //if (adjust)
2479 // ssize = adjust->sample_size;
2480 if (params.model_test_sample_size)
2481 ssize = params.model_test_sample_size;
2482 if (set_name == "") {
2483 cout << "ModelFinder will test up to " << size() << " ";
2484 if (do_modelomatic)
2485 cout << "codon/AA/DNA";
2486 else
2487 cout << getSeqTypeName(in_tree->aln->seq_type);
2488 cout << " models (sample size: " << ssize << ") ..." << endl;
2489 if (params.model_test_and_tree == 0)
2490 cout << " No. Model -LnL df AIC AICc BIC" << endl;
2491 }
2492
2493 // uint64_t RAM_requirement = 0;
2494 int best_model_AIC = -1, best_model_AICc = -1, best_model_BIC = -1;
2495 double best_score_AIC = DBL_MAX, best_score_AICc = DBL_MAX, best_score_BIC = DBL_MAX;
2496 string best_tree_AIC, best_tree_AICc, best_tree_BIC;
2497
2498 // CKP_RESTORE(best_score_AIC);
2499 // CKP_RESTORE(best_score_AICc);
2500 // CKP_RESTORE(best_score_BIC);
2501 // CKP_RESTORE(best_model_AIC);
2502 // CKP_RESTORE(best_model_AICc);
2503 // CKP_RESTORE(best_model_BIC);
2504
2505 CKP_RESTORE(best_tree_AIC);
2506 CKP_RESTORE(best_tree_AICc);
2507 CKP_RESTORE(best_tree_BIC);
2508
2509 // detect rate hetegeneity automatically or not
2510 bool auto_rate = merge_phase ? iEquals(params.merge_rates, "AUTO") : iEquals(params.ratehet_set, "AUTO");
2511 bool auto_subst = merge_phase ? iEquals(params.merge_models, "AUTO") : iEquals(params.model_set, "AUTO");
2512 int rate_block = size();
2513 if (auto_rate) {
2514 for (rate_block = 0; rate_block < size(); rate_block++)
2515 if (rate_block+1 < size() && at(rate_block+1).subst_name != at(rate_block).subst_name)
2516 break;
2517 }
2518
2519 int subst_block = size();
2520 if (auto_subst) {
2521 for (subst_block = size()-1; subst_block >= 0; subst_block--)
2522 if (at(subst_block).rate_name == at(0).rate_name)
2523 break;
2524 }
2525
2526
2527 //------------- MAIN FOR LOOP GOING THROUGH ALL MODELS TO BE TESTED ---------//
2528
2529 for (model = 0; model < size(); model++) {
2530 if (model == rate_block+1)
2531 filterRates(rate_block); // auto filter rate models
2532 if (model == subst_block+1)
2533 filterSubst(subst_block); // auto filter substitution model
2534 if (at(model).hasFlag(MF_IGNORED)) {
2535 model_scores.push_back(DBL_MAX);
2536 continue;
2537 }
2538 //cout << model_names[model] << endl;
2539 if (at(model).subst_name == "") {
2540 // now switching to test rate heterogeneity
2541 if (best_model == -1)
2542 switch (params.model_test_criterion) {
2543 case MTC_AIC:
2544 best_model = best_model_AIC;
2545 break;
2546 case MTC_AICC:
2547 best_model = best_model_AICc;
2548 break;
2549 case MTC_BIC:
2550 best_model = best_model_BIC;
2551 break;
2552 default: ASSERT(0);
2553 }
2554 at(model).subst_name = at(best_model).subst_name;
2555 }
2556
2557 // optimize model parameters
2558 string orig_model_name = at(model).getName();
2559 // keep separate output model_info to only update model_info if better model found
2560 ModelCheckpoint out_model_info;
2561 //CandidateModel info;
2562 //info.set_name = set_name;
2563 at(model).set_name = set_name;
2564 string tree_string;
2565
2566 /***** main call to estimate model parameters ******/
2567 tree_string = at(model).evaluate(params,
2568 model_info, out_model_info, models_block, num_threads, brlen_type);
2569
2570 at(model).computeICScores(ssize);
2571 at(model).setFlag(MF_DONE);
2572
2573 CandidateModel prev_info;
2574
2575 bool skip_model = false;
2576
2577 if (prev_info.restoreCheckpointRminus1(checkpoint, &at(model))) {
2578 // check stop criterion for +R
2579 prev_info.computeICScores(ssize);
2580 switch (params.model_test_criterion) {
2581 case MTC_ALL:
2582 if (at(model).AIC_score > prev_info.AIC_score &&
2583 at(model).AICc_score > prev_info.AICc_score &&
2584 at(model).BIC_score > prev_info.BIC_score) {
2585 // skip remaining model
2586 skip_model = true;
2587 }
2588 break;
2589 case MTC_AIC:
2590 if (at(model).AIC_score > prev_info.AIC_score) {
2591 // skip remaining model
2592 skip_model = true;
2593 }
2594 break;
2595 case MTC_AICC:
2596 if (at(model).AICc_score > prev_info.AICc_score) {
2597 // skip remaining model
2598 skip_model = true;
2599 }
2600 break;
2601 case MTC_BIC:
2602 if (at(model).BIC_score > prev_info.BIC_score) {
2603 // skip remaining model
2604 skip_model = true;
2605 }
2606 break;
2607 }
2608 }
2609
2610 if (at(model).AIC_score < best_score_AIC) {
2611 best_model_AIC = model;
2612 best_score_AIC = at(model).AIC_score;
2613 if (!tree_string.empty())
2614 best_tree_AIC = tree_string;
2615 // only update model_info with better model
2616 if (params.model_test_criterion == MTC_AIC) {
2617 model_info.putSubCheckpoint(&out_model_info, "");
2618 best_aln = at(model).aln;
2619 }
2620 }
2621 if (at(model).AICc_score < best_score_AICc) {
2622 best_model_AICc = model;
2623 best_score_AICc = at(model).AICc_score;
2624 if (!tree_string.empty())
2625 best_tree_AICc = tree_string;
2626 // only update model_info with better model
2627 if (params.model_test_criterion == MTC_AICC) {
2628 model_info.putSubCheckpoint(&out_model_info, "");
2629 best_aln = at(model).aln;
2630 }
2631 }
2632
2633 if (at(model).BIC_score < best_score_BIC) {
2634 best_model_BIC = model;
2635 best_score_BIC = at(model).BIC_score;
2636 if (!tree_string.empty())
2637 best_tree_BIC = tree_string;
2638 // only update model_info with better model
2639 if (params.model_test_criterion == MTC_BIC) {
2640 model_info.putSubCheckpoint(&out_model_info, "");
2641 best_aln = at(model).aln;
2642 }
2643 }
2644
2645 switch (params.model_test_criterion) {
2646 case MTC_AIC: model_scores.push_back(at(model).AIC_score); break;
2647 case MTC_AICC: model_scores.push_back(at(model).AICc_score); break;
2648 default: model_scores.push_back(at(model).BIC_score); break;
2649 }
2650
2651 CKP_SAVE(best_tree_AIC);
2652 CKP_SAVE(best_tree_AICc);
2653 CKP_SAVE(best_tree_BIC);
2654 checkpoint->dump();
2655
2656
2657 if (set_name == "") {
2658 cout.width(3);
2659 cout << right << model+1 << " ";
2660 cout.width(13);
2661 cout << left << at(model).getName() << " ";
2662
2663 cout.precision(3);
2664 cout << fixed;
2665 cout.width(12);
2666 cout << -at(model).logl << " ";
2667 cout.width(3);
2668 cout << at(model).df << " ";
2669 cout.width(12);
2670 cout << at(model).AIC_score << " ";
2671 cout.width(12);
2672 cout << at(model).AICc_score << " " << at(model).BIC_score;
2673 cout << endl;
2674 }
2675
2676 if (skip_model) {
2677 // skip over all +R model of higher categories
2678 const char *rates[] = {"+R", "*R", "+H", "*H"};
2679 size_t posR;
2680 for (int i = 0; i < sizeof(rates)/sizeof(char*); i++)
2681 if ((posR = orig_model_name.find(rates[i])) != string::npos)
2682 break;
2683 string first_part = orig_model_name.substr(0, posR+2);
2684 for (int next = model+1; next < size() && at(next).getName().substr(0, posR+2) == first_part; next++) {
2685 at(next).setFlag(MF_IGNORED);
2686 }
2687 }
2688
2689
2690 }
2691
2692 ASSERT(model_scores.size() == size());
2693
2694 if (best_model_BIC == -1)
2695 outError("No models were examined! Please check messages above");
2696
2697 int *model_rank = new int[model_scores.size()];
2698
2699 // string best_tree; // BQM 2015-07-21: With Lars find best model
2700 /* sort models by their scores */
2701 switch (params.model_test_criterion) {
2702 case MTC_AIC:
2703 best_model = best_model_AIC;
2704 break;
2705 case MTC_AICC:
2706 best_model = best_model_AICc;
2707 break;
2708 case MTC_BIC:
2709 best_model = best_model_BIC;
2710 break;
2711 default: ASSERT(0);
2712 }
2713 sort_index(model_scores.data(), model_scores.data() + model_scores.size(), model_rank);
2714
2715 string model_list;
2716 for (model = 0; model < model_scores.size(); model++) {
2717 if (model_scores[model_rank[model]] == DBL_MAX)
2718 break;
2719 if (model > 0)
2720 model_list += " ";
2721 model_list += at(model_rank[model]).getName();
2722 }
2723
2724 model_info.putBestModelList(model_list);
2725
2726 model_info.put("best_model_AIC", at(best_model_AIC).getName());
2727 model_info.put("best_model_AICc", at(best_model_AICc).getName());
2728 model_info.put("best_model_BIC", at(best_model_BIC).getName());
2729
2730 CKP_SAVE(best_score_AIC);
2731 CKP_SAVE(best_score_AICc);
2732 CKP_SAVE(best_score_BIC);
2733
2734 checkpoint->dump();
2735
2736 delete [] model_rank;
2737
2738 // update alignment if best data type changed
2739 if (best_aln != in_tree->aln) {
2740 delete in_tree->aln;
2741 in_tree->aln = best_aln;
2742 if (best_aln == prot_aln)
2743 prot_aln = NULL;
2744 else
2745 dna_aln = NULL;
2746 }
2747
2748 if (dna_aln)
2749 delete dna_aln;
2750 if (prot_aln)
2751 delete prot_aln;
2752
2753 // in_tree->deleteAllPartialLh();
2754
2755 string best_tree;
2756 model_info.getBestTree(best_tree);
2757
2758 // BQM 2015-07-21 with Lars: load the best_tree
2759 // if (params.model_test_and_tree)
2760 in_tree->readTreeString(best_tree);
2761
2762
2763 return at(best_model);
2764 }
2765
getNextModel()2766 int64_t CandidateModelSet::getNextModel() {
2767 int64_t next_model;
2768 #pragma omp critical
2769 {
2770 if (size() == 0)
2771 next_model = -1;
2772 else if (current_model == -1)
2773 next_model = 0;
2774 else {
2775 for (next_model = current_model+1; next_model != current_model; next_model++) {
2776 if (next_model == size())
2777 next_model = 0;
2778 if (!at(next_model).hasFlag(MF_IGNORED + MF_WAITING + MF_RUNNING)) {
2779 break;
2780 }
2781 }
2782 }
2783 }
2784 if (next_model != current_model) {
2785 current_model = next_model;
2786 at(next_model).setFlag(MF_RUNNING);
2787 return next_model;
2788 } else
2789 return -1;
2790 }
2791
evaluateAll(Params & params,PhyloTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,int brlen_type,string in_model_name,bool merge_phase,bool write_info)2792 CandidateModel CandidateModelSet::evaluateAll(Params ¶ms, PhyloTree* in_tree, ModelCheckpoint &model_info,
2793 ModelsBlock *models_block, int num_threads, int brlen_type,
2794 string in_model_name, bool merge_phase, bool write_info)
2795 {
2796 //ModelCheckpoint *checkpoint = &model_info;
2797
2798 in_tree->params = ¶ms;
2799
2800 Alignment *prot_aln = NULL;
2801 Alignment *dna_aln = NULL;
2802 bool do_modelomatic = params.modelomatic && in_tree->aln->seq_type == SEQ_CODON;
2803
2804
2805
2806 if (in_model_name.empty()) {
2807 generate(params, in_tree->aln, params.model_test_separate_rate, merge_phase);
2808 if (do_modelomatic) {
2809 // generate models for protein
2810 // adapter coefficient according to Whelan et al. 2015
2811 prot_aln = in_tree->aln->convertCodonToAA();
2812 int adjusted_df;
2813 double adjusted_logl = computeAdapter(in_tree->aln, prot_aln, adjusted_df);
2814 if (write_info)
2815 cout << "Adjusted LnL: " << adjusted_logl << " df: " << adjusted_df << endl;
2816 size_t start = size();
2817 generate(params, prot_aln, params.model_test_separate_rate, merge_phase);
2818 size_t i;
2819 for (i = start; i < size(); i++) {
2820 at(i).logl = adjusted_logl;
2821 at(i).df = adjusted_df;
2822 }
2823
2824 // generate models for DNA
2825 dna_aln = in_tree->aln->convertCodonToDNA();
2826 start = size();
2827 generate(params, dna_aln, params.model_test_separate_rate, merge_phase);
2828 for (i = start; i < size(); i++) {
2829 at(i).setFlag(MF_SAMPLE_SIZE_TRIPLE);
2830 }
2831 }
2832 } else {
2833 push_back(CandidateModel(in_model_name, "", in_tree->aln));
2834 }
2835
2836 if (write_info) {
2837 cout << "ModelFinder will test " << size() << " ";
2838 if (do_modelomatic)
2839 cout << "codon/AA/DNA";
2840 else
2841 cout << getSeqTypeName(in_tree->aln->seq_type);
2842 cout << " models (sample size: " << in_tree->aln->getNSite() << ") ..." << endl;
2843 cout << " No. Model -LnL df AIC AICc BIC" << endl;
2844 }
2845
2846 double best_score = DBL_MAX;
2847
2848 // detect rate hetegeneity automatically or not
2849 bool auto_rate = merge_phase ? iEquals(params.merge_rates, "AUTO") : iEquals(params.ratehet_set, "AUTO");
2850 bool auto_subst = merge_phase ? iEquals(params.merge_models, "AUTO") : iEquals(params.model_set, "AUTO");
2851 int rate_block = size();
2852 if (auto_rate) {
2853 for (rate_block = 0; rate_block < size(); rate_block++)
2854 if (rate_block+1 < size() && at(rate_block+1).subst_name != at(rate_block).subst_name)
2855 break;
2856 }
2857
2858 int subst_block = size();
2859 if (auto_subst) {
2860 for (subst_block = size()-1; subst_block >= 0; subst_block--)
2861 if (at(subst_block).rate_name == at(0).rate_name)
2862 break;
2863 }
2864
2865 int64_t num_models = size();
2866 #ifdef _OPENMP
2867 #pragma omp parallel num_threads(num_threads)
2868 #endif
2869 {
2870 int64_t model;
2871 do {
2872 model = getNextModel();
2873 if (model == -1)
2874 break;
2875
2876 // optimize model parameters
2877 string orig_model_name = at(model).getName();
2878 // keep separate output model_info to only update model_info if better model found
2879 ModelCheckpoint out_model_info;
2880 at(model).set_name = at(model).aln->name;
2881 string tree_string;
2882
2883 // main call to estimate model parameters
2884 tree_string = at(model).evaluate(params, model_info, out_model_info,
2885 models_block, num_threads, brlen_type);
2886 at(model).computeICScores();
2887 at(model).setFlag(MF_DONE);
2888
2889 int lower_model = getLowerKModel(model);
2890 if (lower_model >= 0 && at(lower_model).getScore() < at(model).getScore()) {
2891 // ignore all +R_k model with higher category
2892 for (int higher_model = model; higher_model != -1;
2893 higher_model = getHigherKModel(higher_model)) {
2894 at(higher_model).setFlag(MF_IGNORED);
2895 }
2896
2897 }
2898 #ifdef _OPENMP
2899 #pragma omp critical
2900 {
2901 #endif
2902 if (best_score > at(model).getScore()) {
2903 best_score = at(model).getScore();
2904 if (!tree_string.empty()) {
2905 //model_info.put("best_tree_" + criterionName(params.model_test_criterion), tree_string);
2906 }
2907 // only update model_info with better model
2908 model_info.putSubCheckpoint(&out_model_info, "");
2909 }
2910 model_info.dump();
2911 if (write_info) {
2912 cout.width(3);
2913 cout << right << model+1 << " ";
2914 cout.width(13);
2915 cout << left << at(model).getName() << " ";
2916
2917 cout.precision(3);
2918 cout << fixed;
2919 cout.width(12);
2920 cout << -at(model).logl << " ";
2921 cout.width(3);
2922 cout << at(model).df << " ";
2923 cout.width(12);
2924 cout << at(model).AIC_score << " ";
2925 cout.width(12);
2926 cout << at(model).AICc_score << " " << at(model).BIC_score;
2927 cout << endl;
2928
2929 }
2930 if (model >= rate_block)
2931 filterRates(model); // auto filter rate models
2932 if (model >= subst_block)
2933 filterSubst(model); // auto filter substitution model
2934 #ifdef _OPENMP
2935 }
2936 #endif
2937 } while (model != -1);
2938 }
2939
2940 // store the best model
2941 ModelTestCriterion criteria[] = {MTC_AIC, MTC_AICC, MTC_BIC};
2942 for (auto mtc : criteria) {
2943 int best_model = getBestModelID(mtc);
2944 model_info.put("best_score_" + criterionName(mtc), at(best_model).getScore(mtc));
2945 model_info.put("best_model_" + criterionName(mtc), at(best_model).getName());
2946 }
2947
2948
2949 /* sort models by their scores */
2950 multimap<double,int> model_sorted;
2951 for (int64_t model = 0; model < num_models; model++)
2952 if (at(model).hasFlag(MF_DONE)) {
2953 model_sorted.insert(multimap<double,int>::value_type(at(model).getScore(), model));
2954 }
2955 string model_list;
2956 for (auto it = model_sorted.begin(); it != model_sorted.end(); it++) {
2957 if (it != model_sorted.begin())
2958 model_list += " ";
2959 model_list += at(it->second).getName();
2960 }
2961
2962 model_info.putBestModelList(model_list);
2963 model_info.dump();
2964
2965 // update alignment if best data type changed
2966 int best_model = getBestModelID(params.model_test_criterion);
2967 if (at(best_model).aln != in_tree->aln) {
2968 delete in_tree->aln;
2969 in_tree->aln = at(best_model).aln;
2970 if (in_tree->aln == prot_aln)
2971 prot_aln = NULL;
2972 else
2973 dna_aln = NULL;
2974 }
2975
2976 if (dna_aln)
2977 delete dna_aln;
2978 if (prot_aln)
2979 delete prot_aln;
2980
2981 return at(best_model);
2982 }
2983
2984
2985