1 /*
2  * phylotesting.cpp
3  * implementation of ModelFinder and PartitionFinder
4  *  Created on: Aug 23, 2013
5  *      Author: minh
6  */
7 
8 
9 
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 #include <iqtree_config.h>
14 #include <numeric>
15 #include "tree/phylotree.h"
16 #include "tree/iqtree.h"
17 #include "tree/phylosupertree.h"
18 #include "tree/phylotreemixlen.h"
19 #include "phylotesting.h"
20 
21 #include "model/modelmarkov.h"
22 #include "model/modeldna.h"
23 #include "nclextra/myreader.h"
24 #include "model/rateheterogeneity.h"
25 #include "model/rategamma.h"
26 #include "model/rateinvar.h"
27 #include "model/rategammainvar.h"
28 #include "model/ratefree.h"
29 #include "model/ratefreeinvar.h"
30 //#include "modeltest_wrapper.h"
31 #include "model/modelprotein.h"
32 #include "model/modelbin.h"
33 #include "model/modelcodon.h"
34 #include "model/modelmorphology.h"
35 #include "model/modelmixture.h"
36 #include "model/modelliemarkov.h"
37 #include "model/modelpomo.h"
38 #include "utils/timeutil.h"
39 #include "model/modelfactorymixlen.h"
40 #include "tree/phylosupertreeplen.h"
41 #include "tree/phylosupertreeunlinked.h"
42 
43 #include "phyloanalysis.h"
44 #include "gsl/mygsl.h"
45 #include "utils/MPIHelper.h"
46 //#include "vectorclass/vectorclass.h"
47 
48 
49 /******* Binary model set ******/
50 const char* bin_model_names[] = {"GTR2", "JC2"};
51 
52 
53 /******* Morphological model set ******/
54 // 2018-08-20: don't test ORDERED model due to lots of numerical issues
55 //const char* morph_model_names[] = {"MK", "ORDERED"};
56 const char* morph_model_names[] = {"MK"};
57 
58 
59 /******* DNA model set ******/
60 const char* dna_model_names[] = {"GTR", "SYM", "TVM",  "TVMe", "TIM3",
61         "TIM3e", "TIM2", "TIM2e", "TIM", "TIMe", "TPM3u", "TPM3",
62         "TPM2u",  "TPM2",  "K81u", "K81", "TN", "TNe",  "HKY",  "K80", "F81", "JC"};
63 
64 /* DNA models supported by PhyML/PartitionFinder */
65 const char* dna_model_names_old[] ={"GTR",  "SYM", "TVM", "TVMe", "TIM", "TIMe",
66          "K81u", "K81", "TN", "TNe", "HKY", "K80", "F81", "JC"};
67 
68 /* DNA model supported by RAxML */
69 const char* dna_model_names_rax[] ={"GTR"};
70 
71 /* DNA model supported by MrBayes */
72 const char *dna_model_names_mrbayes[] = {"GTR", "SYM", "HKY", "K80", "F81", "JC"};
73 
74 /* DNA model supported by ModelOMatic */
75 const char *dna_model_names_modelomatic[] = {"GTR", "HKY", "K80", "F81", "JC"};
76 
77 //const char* dna_freq_names[] = {"+FO"};
78 
79 // Lie-Markov models without an RY, WS or MK prefix
80 const char *dna_model_names_lie_markov_fullsym[] =
81     {"1.1", "3.3a", "4.4a", "6.7a", "9.20b", "12.12"};
82 // Lie-Markov models with RY symmetry/distinguished pairing
83 const char *dna_model_names_lie_markov_ry[] = {
84           "RY2.2b",  "RY3.3b", "RY3.3c",  "RY3.4",   "RY4.4b",
85           "RY4.5a",  "RY4.5b", "RY5.6a",  "RY5.6b",  "RY5.7a",
86           "RY5.7b",  "RY5.7c", "RY5.11a", "RY5.11b", "RY5.11c",
87           "RY5.16",  "RY6.6",  "RY6.7b",  "RY6.8a",  "RY6.8b",
88           "RY6.17a", "RY6.17b","RY8.8",   "RY8.10a", "RY8.10b",
89           "RY8.16",  "RY8.17", "RY8.18",  "RY9.20a", "RY10.12",
90 	  "RY10.34"
91 };
92 // Lie-Markov models with WS symmetry/distinguished pairing
93 const char *dna_model_names_lie_markov_ws[] = {
94           "WS2.2b",  "WS3.3b", "WS3.3c",  "WS3.4",   "WS4.4b",
95           "WS4.5a",  "WS4.5b", "WS5.6a",  "WS5.6b",  "WS5.7a",
96           "WS5.7b",  "WS5.7c", "WS5.11a", "WS5.11b", "WS5.11c",
97           "WS5.16",  "WS6.6",  "WS6.7b",  "WS6.8a",  "WS6.8b",
98           "WS6.17a", "WS6.17b","WS8.8",   "WS8.10a", "WS8.10b",
99           "WS8.16",  "WS8.17", "WS8.18",  "WS9.20a", "WS10.12",
100 	  "WS10.34"
101 };
102 // Lie-Markov models with MK symmetry/distinguished pairing
103 const char *dna_model_names_lie_markov_mk[] = {
104           "MK2.2b",  "MK3.3b", "MK3.3c",  "MK3.4",   "MK4.4b",
105           "MK4.5a",  "MK4.5b", "MK5.6a",  "MK5.6b",  "MK5.7a",
106           "MK5.7b",  "MK5.7c", "MK5.11a", "MK5.11b", "MK5.11c",
107           "MK5.16",  "MK6.6",  "MK6.7b",  "MK6.8a",  "MK6.8b",
108           "MK6.17a", "MK6.17b","MK8.8",   "MK8.10a", "MK8.10b",
109           "MK8.16",  "MK8.17", "MK8.18",  "MK9.20a", "MK10.12",
110 	  "MK10.34"
111 };
112 // Lie-Markov models which are strand symmetric
113 const char *dna_model_names_lie_markov_strsym[] = {
114           "1.1",    "WS2.2b", "3.3a",   "WS3.3b", "WS3.3c", "WS3.4",
115           "WS4.4b", "WS4.5a", "WS4.5b", "WS5.6a", "WS6.6"
116 };
117 
118 
119 /****** Protein model set ******/
120 const char* aa_model_names[] = {"LG", "WAG", "JTT", "JTTDCMut", "DCMut", "VT", "PMB", "Blosum62", "Dayhoff",
121         "mtREV", "mtART", "mtZOA", "mtMet" , "mtVer" , "mtInv", "mtMAM",
122 		"HIVb", "HIVw", "FLU", "rtREV", "cpREV"};
123 
124 /* Protein models supported by PhyML/PartitionFinder */
125 const char *aa_model_names_phyml[] = {"LG", "WAG", "JTT", "DCMut", "VT", "Blosum62", "Dayhoff",
126 		"mtREV", "mtART", "mtMAM",
127 		"HIVb", "HIVw", "rtREV", "cpREV"};
128 
129 /* Protein models supported by RAxML */
130 const char *aa_model_names_rax[] = {"LG", "WAG", "JTT", "JTTDCMut", "DCMut", "VT", "PMB", "Blosum62", "Dayhoff",
131         "mtREV", "mtART", "mtZOA", "mtMAM",
132 		"HIVb", "HIVw", "FLU", "rtREV", "cpREV"};
133 
134 const char* aa_model_names_mrbayes[] = {"WAG", "JTT", "VT", "Blosum62", "Dayhoff",
135         "mtREV", "mtMAM",
136 		"rtREV", "cpREV"};
137 
138 const char* aa_model_names_modelomatic[] = {"LG", "WAG", "JTT", "VT", "Blosum62", "Dayhoff",
139         "mtART", "mtMAM", "mtREV",
140         "HIVb", "HIVw", "rtREV", "cpREV"};
141 
142 const char *aa_model_names_nuclear[] = {"LG", "WAG", "JTT", "JTTDCMut","DCMut", "VT", "PMB", "Blosum62", "Dayhoff"};
143 
144 const char *aa_model_names_mitochondrial[] = {"mtREV", "mtART", "mtZOA", "mtMet" , "mtVer" , "mtInv", "mtMAM"};
145 
146 const char *aa_model_names_chloroplast[] = {"cpREV"};
147 
148 const char *aa_model_names_viral[] = {"HIVb", "HIVw", "FLU", "rtREV"};
149 
150 const char* aa_freq_names[] = {"", "+F"};
151 
152 
153 /****** Codon models ******/
154 //const char *codon_model_names[] = {"GY", "MG", "MGK", "KOSI07", "SCHN05","KOSI07_GY1KTV","SCHN05_GY1KTV"};
155 //short int std_genetic_code[]    = {   0,    0,     0,        1,        1,              1,              1};
156 const char *codon_model_names[] = { "GY", "MGK", "MG", "KOSI07", "SCHN05"};
157 short int std_genetic_code[]    = {   0,    0,     0,        1,        1};
158 const char *codon_model_names_modelomatic[] = {"GY"};
159 short int std_genetic_code_modelomatic[]    = {   0};
160 
161 const char *codon_freq_names[] = {"+F3X4", "+F1X4", "+F", ""};
162 
163 //const double TOL_LIKELIHOOD_MODELTEST = 0.1;
164 const double TOL_GRADIENT_MODELTEST   = 0.0001;
165 
166 extern double RunKMeans1D(int n, int k, double *points, int *weights, double *centers, int *assignments);
167 
168 
getSeqTypeName(SeqType seq_type)169 string getSeqTypeName(SeqType seq_type) {
170     switch (seq_type) {
171         case SEQ_BINARY: return "binary";
172         case SEQ_DNA: return "DNA";
173         case SEQ_PROTEIN: return "protein";
174         case SEQ_CODON: return "codon";
175         case SEQ_MORPH: return "morphological";
176         case SEQ_POMO: return "PoMo";
177         case SEQ_UNKNOWN: return "unknown";
178         case SEQ_MULTISTATE: return "MultiState";
179     }
180 }
181 
getUsualModelSubst(SeqType seq_type)182 string getUsualModelSubst(SeqType seq_type) {
183     switch (seq_type) {
184         case SEQ_DNA: return dna_model_names[0];
185         case SEQ_PROTEIN: return aa_model_names[0];
186         case SEQ_CODON: return string(codon_model_names[0]) + codon_freq_names[0];
187         case SEQ_BINARY: return bin_model_names[0];
188         case SEQ_MORPH: return morph_model_names[0];
189         case SEQ_POMO: return string(dna_model_names[0]) + "+P";
190         default: ASSERT(0 && "Unprocessed seq_type"); return "";
191     }
192 }
193 
194 void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites,
195                 string rate_set, StrVector &ratehet);
196 
getUsualModel(Alignment * aln)197 size_t CandidateModel::getUsualModel(Alignment *aln) {
198     size_t aln_len = 0;
199     if (aln->isSuperAlignment()) {
200         SuperAlignment *super_aln = (SuperAlignment*)aln;
201         for (auto it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
202             CandidateModel usual_model(*it);
203             if (!subst_name.empty())
204                 subst_name += ',';
205             subst_name += usual_model.subst_name;
206             if (!rate_name.empty())
207                 rate_name += ',';
208             rate_name += usual_model.rate_name;
209             aln_len += (*it)->getNSite();
210         }
211     } else {
212         subst_name = getUsualModelSubst(aln->seq_type);
213         StrVector ratehet;
214         getRateHet(aln->seq_type, Params::getInstance().model_name, aln->frac_invariant_sites, "1", ratehet);
215         ASSERT(!ratehet.empty());
216         rate_name = ratehet[0];
217         aln_len = aln->getNSite();
218     }
219     orig_subst_name = subst_name;
220     orig_rate_name = rate_name;
221     return aln_len;
222 }
223 
computeICScores(size_t sample_size)224 void CandidateModel::computeICScores(size_t sample_size) {
225     computeInformationScores(logl, df, sample_size, AIC_score, AICc_score, BIC_score);
226 }
227 
computeICScores()228 void CandidateModel::computeICScores() {
229     size_t sample_size = aln->getNSite();
230     if (aln->isSuperAlignment()) {
231         sample_size = 0;
232         SuperAlignment *super_aln = (SuperAlignment*)aln;
233         for (auto a : super_aln->partitions)
234             sample_size += a->getNSite();
235     }
236     if (hasFlag(MF_SAMPLE_SIZE_TRIPLE))
237         sample_size /= 3;
238     computeInformationScores(logl, df, sample_size, AIC_score, AICc_score, BIC_score);
239 }
240 
computeICScore(size_t sample_size)241 double CandidateModel::computeICScore(size_t sample_size) {
242     return computeInformationScore(logl, df, sample_size, Params::getInstance().model_test_criterion);
243 }
244 
getScore(ModelTestCriterion mtc)245 double CandidateModel::getScore(ModelTestCriterion mtc) {
246     switch (mtc) {
247         case MTC_AIC:
248             return AIC_score;
249         case MTC_AICC:
250             return AICc_score;
251         case MTC_BIC:
252             return BIC_score;
253         case MTC_ALL:
254             ASSERT(0 && "Unhandled case");
255             return 0.0;
256     }
257 }
258 
getScore()259 double CandidateModel::getScore() {
260     return getScore(Params::getInstance().model_test_criterion);
261 }
262 
getBestModelID(ModelTestCriterion mtc)263 int CandidateModelSet::getBestModelID(ModelTestCriterion mtc) {
264     double best_score = DBL_MAX;
265     int best_model = -1;
266     for (int model = 0; model < size(); model++)
267         if (at(model).hasFlag(MF_DONE) && best_score > at(model).getScore(mtc)) {
268             best_score = at(model).getScore(mtc);
269             best_model = model;
270         }
271     return best_model;
272 }
273 
getBestModel(string & best_model)274 bool ModelCheckpoint::getBestModel(string &best_model) {
275     return getString("best_model_" + criterionName(Params::getInstance().model_test_criterion), best_model);
276 }
277 
getBestModelList(string & best_model_list)278 bool ModelCheckpoint::getBestModelList(string &best_model_list) {
279     return getString("best_model_list_" + criterionName(Params::getInstance().model_test_criterion), best_model_list);
280 }
281 
putBestModelList(string & best_model_list)282 void ModelCheckpoint::putBestModelList(string &best_model_list) {
283     return put("best_model_list_" + criterionName(Params::getInstance().model_test_criterion), best_model_list);
284 }
285 
getBestTree(string & best_tree)286 bool  ModelCheckpoint::getBestTree(string &best_tree) {
287     return getString("best_tree_" + criterionName(Params::getInstance().model_test_criterion), best_tree);
288 }
289 
getOrderedModels(PhyloTree * tree,CandidateModelSet & ordered_models)290 bool ModelCheckpoint::getOrderedModels(PhyloTree *tree, CandidateModelSet &ordered_models) {
291     double best_score_AIC, best_score_AICc, best_score_BIC;
292     if (tree->isSuperTree()) {
293         PhyloSuperTree *stree = (PhyloSuperTree*)tree;
294         ordered_models.clear();
295         for (int part = 0; part != stree->size(); part++) {
296             startStruct(stree->at(part)->aln->name);
297             CandidateModel info;
298             if (!getBestModel(info.subst_name)) return false;
299             info.restoreCheckpoint(this);
300             info.computeICScores(stree->at(part)->getAlnNSite());
301             endStruct();
302             ordered_models.push_back(info);
303         }
304         return true;
305     } else {
306         CKP_RESTORE2(this, best_score_AIC);
307         CKP_RESTORE2(this, best_score_AICc);
308         CKP_RESTORE2(this, best_score_BIC);
309         double sum_AIC = 0, sum_AICc = 0, sum_BIC = 0;
310         string str;
311         bool ret = getBestModelList(str);
312         if (!ret) return false;
313         istringstream istr(str);
314         string model;
315         ordered_models.clear();
316         while (istr >> model) {
317             CandidateModel info;
318             info.subst_name = model;
319             info.restoreCheckpoint(this);
320             info.computeICScores(tree->getAlnNSite());
321             sum_AIC  += info.AIC_weight = exp(-0.5*(info.AIC_score-best_score_AIC));
322             sum_AICc += info.AICc_weight = exp(-0.5*(info.AICc_score-best_score_AICc));
323             sum_BIC  += info.BIC_weight = exp(-0.5*(info.BIC_score-best_score_BIC));
324             ordered_models.push_back(info);
325         }
326         sum_AIC = 1.0/sum_AIC;
327         sum_AICc = 1.0/sum_AICc;
328         sum_BIC = 1.0/sum_BIC;
329         for (auto it = ordered_models.begin(); it != ordered_models.end(); it++) {
330             it->AIC_weight *= sum_AIC;
331             it->AICc_weight *= sum_AICc;
332             it->BIC_weight *= sum_BIC;
333             it->AIC_conf = it->AIC_weight > 0.05;
334             it->AICc_conf = it->AICc_weight > 0.05;
335             it->BIC_conf = it->BIC_weight > 0.05;
336         }
337         return true;
338     }
339 }
340 
341 /**
342  * copy from cvec to strvec
343  */
copyCString(const char ** cvec,int n,StrVector & strvec,bool touppercase=false)344 void copyCString(const char **cvec, int n, StrVector &strvec, bool touppercase = false) {
345 	strvec.resize(n);
346 	for (int i = 0; i < n; i++) {
347 		strvec[i] = cvec[i];
348         if (touppercase)
349             std::transform(strvec[i].begin(), strvec[i].end(), strvec[i].begin(), ::toupper);
350     }
351 }
352 
353 /**
354  * append from cvec to strvec
355  */
appendCString(const char ** cvec,int n,StrVector & strvec,bool touppercase=false)356 void appendCString(const char **cvec, int n, StrVector &strvec, bool touppercase = false) {
357         strvec.reserve(strvec.size()+n);
358 	for (int i = 0; i < n; i++) {
359 	    strvec.push_back(cvec[i]);
360             if (touppercase)
361 	      std::transform(strvec.back().begin(), strvec.back().end(), strvec.back().begin(), ::toupper);
362         }
363 }
364 
365 
detectSeqType(const char * model_name,SeqType & seq_type)366 int detectSeqType(const char *model_name, SeqType &seq_type) {
367     bool empirical_model = false;
368     int i;
369     string model_str = model_name;
370     std::transform(model_str.begin(), model_str.end(), model_str.begin(), ::toupper);
371     StrVector model_list;
372 
373     seq_type = SEQ_UNKNOWN;
374 
375     copyCString(bin_model_names, sizeof(bin_model_names)/sizeof(char*), model_list, true);
376     for (i = 0; i < model_list.size(); i++)
377         if (model_str == model_list[i]) {
378             seq_type = SEQ_BINARY;
379             break;
380         }
381     copyCString(morph_model_names, sizeof(morph_model_names)/sizeof(char*), model_list, true);
382     for (i = 0; i < model_list.size(); i++)
383         if (model_str == model_list[i]) {
384             seq_type = SEQ_MORPH;
385             break;
386         }
387     copyCString(dna_model_names, sizeof(dna_model_names)/sizeof(char*), model_list, true);
388     for (i = 0; i < model_list.size(); i++)
389         if (model_str == model_list[i]) {
390             seq_type = SEQ_DNA;
391             break;
392         }
393     copyCString(aa_model_names, sizeof(aa_model_names)/sizeof(char*), model_list, true);
394     for (i = 0; i < model_list.size(); i++)
395         if (model_str == model_list[i]) {
396             seq_type = SEQ_PROTEIN;
397             empirical_model = true;
398             break;
399         }
400     copyCString(codon_model_names, sizeof(codon_model_names)/sizeof(char*), model_list, true);
401     for (i = 0; i < model_list.size(); i++)
402         if (model_str.substr(0,model_list[i].length()) == model_list[i]) {
403             seq_type = SEQ_CODON;
404             if (std_genetic_code[i]) empirical_model = true;
405             break;
406         }
407 
408     return (empirical_model) ? 2 : 1;
409 }
410 
detectSeqTypeName(string model_name)411 string detectSeqTypeName(string model_name) {
412     SeqType seq_type;
413     detectSeqType(model_name.c_str(), seq_type);
414     switch (seq_type) {
415     case SEQ_BINARY: return "BIN"; break;
416     case SEQ_MORPH: return "MORPH"; break;
417     case SEQ_DNA: return "DNA"; break;
418     case SEQ_PROTEIN: return "AA"; break;
419     case SEQ_CODON: return "CODON"; break;
420     default: break;
421     }
422     return "";
423 }
424 
computeInformationScores(double tree_lh,int df,int ssize,double & AIC,double & AICc,double & BIC)425 void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC) {
426 	AIC = -2 * tree_lh + 2 * df;
427 	AICc = AIC + 2.0 * df * (df + 1) / max(ssize - df - 1, 1);
428 	BIC = -2 * tree_lh + df * log(ssize);
429 }
430 
computeInformationScore(double tree_lh,int df,int ssize,ModelTestCriterion mtc)431 double computeInformationScore(double tree_lh, int df, int ssize, ModelTestCriterion mtc) {
432 	double AIC, AICc, BIC;
433 	computeInformationScores(tree_lh, df, ssize, AIC, AICc, BIC);
434 	if (mtc == MTC_AIC)
435 		return AIC;
436 	if (mtc == MTC_AICC)
437 		return AICc;
438 	if (mtc == MTC_BIC)
439 		return BIC;
440 	return 0.0;
441 }
442 
criterionName(ModelTestCriterion mtc)443 string criterionName(ModelTestCriterion mtc) {
444 	if (mtc == MTC_AIC)
445 		return "AIC";
446 	if (mtc == MTC_AICC)
447 		return "AICc";
448 	if (mtc == MTC_BIC)
449 		return "BIC";
450 	return "";
451 }
452 
453 
454 /**
455  * select models for all partitions
456  * @param[in,out] model_info (IN/OUT) all model information
457  * @return total number of parameters
458  */
459 void testPartitionModel(Params &params, PhyloSuperTree* in_tree, ModelCheckpoint &model_info,
460                         ModelsBlock *models_block, int num_threads);
461 
462 
463 /**
464  compute log-adapter function according to Whelan et al. 2015
465  @param orig_aln original codon alignment
466  @param newaln AA alignment
467  @param[out] adjusted_df adjusted degree of freedom factor
468  @return adjusted log-likelihood factor
469  */
computeAdapter(Alignment * orig_aln,Alignment * newaln,int & adjusted_df)470 double computeAdapter(Alignment *orig_aln, Alignment *newaln, int &adjusted_df) {
471     int aa, codon;
472 
473     // count codon occurences
474     unsigned int codon_counts[orig_aln->num_states];
475     orig_aln->computeAbsoluteStateFreq(codon_counts);
476 
477     // compute AA frequency
478 //    double aa_freq[newaln->num_states];
479 //    newaln->computeStateFreq(aa_freq);
480 
481     // compute codon frequency
482     double codon_freq[orig_aln->num_states];
483     //orig_aln->computeStateFreq(codon_freq);
484 
485     double sum = 0.0;
486     for (codon = 0; codon < orig_aln->num_states; codon++)
487         sum += codon_counts[codon];
488     sum = 1.0/sum;
489     for (codon = 0; codon < orig_aln->num_states; codon++)
490         codon_freq[codon] = sum*codon_counts[codon];
491 
492     // new rescale codon_freq s.t. codons coding for the same AA
493     // have f summing up to the frequency of this AA
494     for (aa = 0; aa < newaln->num_states; aa++) {
495         double sum = 0;
496         for (codon = 0; codon < orig_aln->num_states; codon++)
497             if (newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]]) == aa)
498                 sum += codon_freq[codon];
499         sum = 1.0/sum;
500         for (codon = 0; codon < orig_aln->num_states; codon++)
501             if (newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]]) == aa)
502                 codon_freq[codon] *= sum;
503     }
504 
505     // now compute adapter function
506     double adapter = 0.0;
507     adjusted_df = 0;
508     vector<bool> has_AA;
509     has_AA.resize(newaln->num_states, false);
510 
511     for (codon = 0; codon < orig_aln->num_states; codon++) {
512         if (codon_counts[codon] == 0)
513             continue;
514         has_AA[newaln->convertState(orig_aln->genetic_code[(int)orig_aln->codon_table[codon]])] = true;
515         adapter += codon_counts[codon]*log(codon_freq[codon]);
516         adjusted_df++;
517     }
518     for (aa = 0; aa < has_AA.size(); aa++)
519         if (has_AA[aa])
520             adjusted_df--;
521     return adapter;
522 }
523 
524 /**
525  compute fast ML tree by stepwise addition MP + ML-NNI
526  @return the tree string
527  */
computeFastMLTree(Params & params,Alignment * aln,ModelCheckpoint & model_info,ModelsBlock * models_block,int & num_threads,int brlen_type,string dist_file)528 string computeFastMLTree(Params &params, Alignment *aln,
529                        ModelCheckpoint &model_info, ModelsBlock *models_block,
530                        int &num_threads, int brlen_type, string dist_file) {
531     //string model_name;
532     CandidateModel usual_model(aln);
533     StrVector subst_names;
534     StrVector rate_names;
535     convert_string_vec(usual_model.subst_name.c_str(), subst_names);
536     convert_string_vec(usual_model.rate_name.c_str(), rate_names);
537     ASSERT(subst_names.size() == rate_names.size());
538     //set<string> model_set;
539 
540     string concat_tree;
541 
542     IQTree *iqtree = NULL;
543 
544     StrVector saved_model_names;
545 
546     if (aln->isSuperAlignment()) {
547         SuperAlignment *saln = (SuperAlignment*)aln;
548         if (params.partition_type == TOPO_UNLINKED)
549             iqtree = new PhyloSuperTreeUnlinked(saln);
550         else if (params.partition_type == BRLEN_OPTIMIZE)
551             iqtree = new PhyloSuperTree(saln);
552         else
553             iqtree = new PhyloSuperTreePlen(saln, brlen_type);
554         for (int part = 0; part != subst_names.size(); part++) {
555             saved_model_names.push_back(saln->partitions[part]->model_name);
556             saln->partitions[part]->model_name = subst_names[part] + rate_names[part];
557         }
558     } else if (posRateHeterotachy(rate_names[0]) != string::npos)
559         iqtree = new PhyloTreeMixlen(aln, 0);
560     else
561         iqtree = new IQTree(aln);
562 
563     if ((params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) && !iqtree->isInitializedPLL()) {
564         /* Initialized all data structure for PLL*/
565         iqtree->initializePLL(params);
566     }
567 
568     iqtree->setParams(&params);
569     iqtree->setLikelihoodKernel(params.SSE);
570     iqtree->optimize_by_newton = params.optimize_by_newton;
571     iqtree->setNumThreads(num_threads);
572     iqtree->setCheckpoint(&model_info);
573 
574     iqtree->dist_file = dist_file;
575     iqtree->computeInitialTree(params.SSE);
576     iqtree->restoreCheckpoint();
577 
578     //ASSERT(iqtree->root);
579     iqtree->initializeModel(params, usual_model.getName(), models_block);
580     if (!iqtree->getModel()->isMixture() || aln->seq_type == SEQ_POMO) {
581         usual_model.subst_name = iqtree->getSubstName();
582         usual_model.rate_name = iqtree->getRateName();
583     }
584 
585     iqtree->getModelFactory()->restoreCheckpoint();
586 
587 #ifdef _OPENMP
588     if (num_threads <= 0) {
589         num_threads = iqtree->testNumThreads();
590         omp_set_num_threads(num_threads);
591     } else
592         iqtree->warnNumThreads();
593 #endif
594 
595     iqtree->initializeAllPartialLh();
596     double saved_modelEps = params.modelEps;
597     params.modelEps = params.modelfinder_eps;
598     string initTree;
599 
600     double start_time = getRealTime();
601 
602     cout << "Perform fast likelihood tree search using " << subst_names[0]+rate_names[0] << " model..." << endl;
603 
604     if (iqtree->getCheckpoint()->getBool("finishedFastMLTree")) {
605         // model optimization already done: ignore this step
606         iqtree->setCurScore(iqtree->computeLikelihood());
607         initTree = iqtree->getTreeString();
608         cout << "CHECKPOINT: Tree restored, LogL: " << iqtree->getCurScore() << endl;
609     } else {
610         bool saved_opt_gammai = params.opt_gammai;
611         // disable thorough I+G optimization
612         params.opt_gammai = false;
613         initTree = iqtree->optimizeModelParameters(false, params.modelEps*50.0);
614         if (iqtree->isMixlen())
615             initTree = ((ModelFactoryMixlen*)iqtree->getModelFactory())->sortClassesByTreeLength();
616 
617         // do quick NNI search
618         if (params.start_tree != STT_USER_TREE) {
619             cout << "Perform nearest neighbor interchange..." << endl;
620             iqtree->doNNISearch(true);
621             initTree = iqtree->getTreeString();
622         }
623         params.opt_gammai = saved_opt_gammai;
624 
625         iqtree->saveCheckpoint();
626         iqtree->getModelFactory()->saveCheckpoint();
627         iqtree->getCheckpoint()->putBool("finishedFastMLTree", true);
628         iqtree->getCheckpoint()->dump();
629         //        cout << "initTree: " << initTree << endl;
630         cout << "Time for fast ML tree search: " << getRealTime() - start_time << " seconds" << endl;
631         cout << endl;
632     }
633 
634     // restore model epsilon
635     params.modelEps = saved_modelEps;
636 
637     // save information to the checkpoint for later retrieval
638     if (iqtree->isSuperTree()) {
639         PhyloSuperTree *stree = (PhyloSuperTree*)iqtree;
640         int part = 0;
641         for (auto it = stree->begin(); it != stree->end(); it++, part++) {
642             model_info.startStruct((*it)->aln->name);
643             (*it)->saveCheckpoint();
644             (*it)->getModelFactory()->saveCheckpoint();
645             model_info.endStruct();
646         }
647         SuperAlignment *saln = (SuperAlignment*)aln;
648         // restore model_names
649         for (int i = 0; i < saln->partitions.size(); i++)
650             saln->partitions[i]->model_name = saved_model_names[i];
651     } else {
652         iqtree->saveCheckpoint();
653         iqtree->getModelFactory()->saveCheckpoint();
654     }
655 
656 
657     delete iqtree;
658     return initTree;
659 }
660 
661 /**
662  Transfer parameters from ModelFinder into the a checkpoint to speed up later stage
663  */
transferModelFinderParameters(IQTree * iqtree,Checkpoint * target)664 void transferModelFinderParameters(IQTree *iqtree, Checkpoint *target) {
665 
666     Checkpoint *source = iqtree->getCheckpoint();
667 
668     // transfer the substitution model and site-rate parameters
669     if (iqtree->isSuperTree()) {
670         DoubleVector tree_lens;
671         string struct_name;
672         if (iqtree->params->partition_type == BRLEN_SCALE || iqtree->params->partition_type == BRLEN_FIX)
673             struct_name = "PartitionModelPlen";
674         else
675             struct_name = "PartitionModel";
676         target->startStruct(struct_name);
677         SuperAlignment *super_aln = (SuperAlignment*)iqtree->aln;
678         for (auto aln : super_aln->partitions) {
679             source->transferSubCheckpoint(target, aln->name + CKP_SEP + "Model");
680             source->transferSubCheckpoint(target, aln->name + CKP_SEP + "Rate");
681 
682             // transfer partition rates
683             if (iqtree->params->partition_type == BRLEN_SCALE) {
684                 source->startStruct(aln->name);
685                 CandidateModel info;
686                 info.subst_name = aln->model_name;
687                 if (info.restoreCheckpoint(source))
688                     tree_lens.push_back(info.tree_len);
689                 else
690                     ASSERT(0 && "Could not restore tree_len");
691                 source->endStruct();
692             }
693         }
694 
695         if (iqtree->params->partition_type == BRLEN_SCALE) {
696             // now normalize the rates
697             PhyloSuperTree *tree = (PhyloSuperTree*)iqtree;
698             double sum = 0.0;
699             size_t nsite = 0;
700             int i;
701             for (i = 0; i < tree->size(); i++) {
702                 sum += tree_lens[i] * tree->at(i)->aln->getNSite();
703                 if (tree->at(i)->aln->seq_type == SEQ_CODON && tree->rescale_codon_brlen)
704                     nsite += 3*tree->at(i)->aln->getNSite();
705                 else
706                     nsite += tree->at(i)->aln->getNSite();
707             }
708 
709             sum /= nsite;
710             iqtree->restoreCheckpoint();
711             iqtree->scaleLength(sum/iqtree->treeLength());
712             iqtree->saveCheckpoint();
713             sum = 1.0/sum;
714             for (i = 0; i < tree->size(); i++)
715                 tree_lens[i] *= sum;
716             target->putVector("part_rates", tree_lens);
717         }
718         target->endStruct();
719     } else {
720         source->transferSubCheckpoint(target, "Model");
721         source->transferSubCheckpoint(target, "Rate");
722     }
723 
724     // transfer tree
725     source->transferSubCheckpoint(target, "PhyloTree");
726 }
727 
runModelFinder(Params & params,IQTree & iqtree,ModelCheckpoint & model_info)728 void runModelFinder(Params &params, IQTree &iqtree, ModelCheckpoint &model_info)
729 {
730     //    iqtree.setCurScore(-DBL_MAX);
731     bool test_only = (params.model_name.find("ONLY") != string::npos) ||
732         (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP");
733 
734     bool empty_model_found = params.model_name.empty() && !iqtree.isSuperTree();
735 
736     if (params.model_name.empty() && iqtree.isSuperTree()) {
737         // check whether any partition has empty model_name
738         PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
739         for (auto i = stree->begin(); i != stree->end(); i++)
740             if ((*i)->aln->model_name.empty()) {
741                 empty_model_found = true;
742                 break;
743             }
744     }
745 
746     if (params.model_joint)
747         empty_model_found = false;
748 
749     // Model already specifed, nothing to do here
750     if (!empty_model_found && params.model_name.substr(0, 4) != "TEST" && params.model_name.substr(0, 2) != "MF")
751         return;
752     if (MPIHelper::getInstance().getNumProcesses() > 1)
753         outError("Please use only 1 MPI process! We are currently working on the MPI parallelization of model selection.");
754     // TODO: check if necessary
755     //        if (iqtree.isSuperTree())
756     //            ((PhyloSuperTree*) &iqtree)->mapTrees();
757     double cpu_time = getCPUTime();
758     double real_time = getRealTime();
759     model_info.setFileName((string)params.out_prefix + ".model.gz");
760     model_info.setDumpInterval(params.checkpoint_dump_interval);
761 
762     bool ok_model_file = false;
763     if (!params.model_test_again) {
764         ok_model_file = model_info.load();
765     }
766 
767     cout << endl;
768 
769     ok_model_file &= model_info.size() > 0;
770     if (ok_model_file)
771         cout << "NOTE: Restoring information from model checkpoint file " << model_info.getFileName() << endl;
772 
773 
774     Checkpoint *orig_checkpoint = iqtree.getCheckpoint();
775     iqtree.setCheckpoint(&model_info);
776     iqtree.restoreCheckpoint();
777 
778     int partition_type;
779     if (CKP_RESTORE2((&model_info), partition_type)) {
780         if (partition_type != params.partition_type)
781             outError("Mismatch partition type between checkpoint and partition file command option\nRerun with -mredo to ignore .model.gz checkpoint file");
782     } else {
783         partition_type = params.partition_type;
784         CKP_SAVE2((&model_info), partition_type);
785     }
786 
787     ModelsBlock *models_block = readModelsDefinition(params);
788 
789     // compute initial tree
790     if (params.modelfinder_ml_tree) {
791         // 2019-09-10: Now perform NNI on the initial tree
792         string tree_str = computeFastMLTree(params, iqtree.aln, model_info,
793             models_block, params.num_threads, params.partition_type, iqtree.dist_file);
794         iqtree.restoreCheckpoint();
795     } else {
796         iqtree.computeInitialTree(params.SSE);
797 
798         if (iqtree.isSuperTree()) {
799             PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
800             int part = 0;
801             for (auto it = stree->begin(); it != stree->end(); it++, part++) {
802                 model_info.startStruct((*it)->aln->name);
803                 (*it)->saveCheckpoint();
804                 model_info.endStruct();
805             }
806         } else {
807             iqtree.saveCheckpoint();
808         }
809     }
810 
811     // also save initial tree to the original .ckp.gz checkpoint
812     //        string initTree = iqtree.getTreeString();
813     //        CKP_SAVE(initTree);
814     //        iqtree.saveCheckpoint();
815     //        checkpoint->dump(true);
816 
817     CandidateModelSet candidate_models;
818     int max_cats = candidate_models.generate(params, iqtree.aln, params.model_test_separate_rate, false);
819 
820     uint64_t mem_size = iqtree.getMemoryRequiredThreaded(max_cats);
821     cout << "NOTE: ModelFinder requires " << (mem_size / 1024) / 1024 << " MB RAM!" << endl;
822     if (mem_size >= getMemorySize()) {
823         outError("Memory required exceeds your computer RAM size!");
824     }
825 #ifdef BINARY32
826     if (mem_size >= 2000000000) {
827         outError("Memory required exceeds 2GB limit of 32-bit executable");
828     }
829 #endif
830 
831 
832     if (iqtree.isSuperTree()) {
833         // partition model selection
834         PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
835         testPartitionModel(params, stree, model_info, models_block, params.num_threads);
836         stree->mapTrees();
837         string res_models = "";
838         for (auto it = stree->begin(); it != stree->end(); it++) {
839             if (it != stree->begin()) res_models += ",";
840             res_models += (*it)->aln->model_name;
841         }
842         iqtree.aln->model_name = res_models;
843     } else {
844         // single model selection
845         CandidateModel best_model;
846         if (params.openmp_by_model)
847             best_model = CandidateModelSet().evaluateAll(params, &iqtree,
848                 model_info, models_block, params.num_threads, BRLEN_OPTIMIZE);
849         else
850             best_model = CandidateModelSet().test(params, &iqtree,
851                 model_info, models_block, params.num_threads, BRLEN_OPTIMIZE);
852         iqtree.aln->model_name = best_model.getName();
853 
854         Checkpoint *checkpoint = &model_info;
855         string best_model_AIC, best_model_AICc, best_model_BIC;
856         CKP_RESTORE(best_model_AIC);
857         CKP_RESTORE(best_model_AICc);
858         CKP_RESTORE(best_model_BIC);
859         cout << "Akaike Information Criterion:           " << best_model_AIC << endl;
860         cout << "Corrected Akaike Information Criterion: " << best_model_AICc << endl;
861         cout << "Bayesian Information Criterion:         " << best_model_BIC << endl;
862         cout << "Best-fit model: " << iqtree.aln->model_name << " chosen according to "
863             << criterionName(params.model_test_criterion) << endl;
864     }
865 
866     delete models_block;
867 
868     // force to dump all checkpointing information
869     model_info.dump(true);
870 
871     // transfer models parameters
872     transferModelFinderParameters(&iqtree, orig_checkpoint);
873     iqtree.setCheckpoint(orig_checkpoint);
874 
875     params.startCPUTime = cpu_time;
876     params.start_real_time = real_time;
877     cpu_time = getCPUTime() - cpu_time;
878     real_time = getRealTime() - real_time;
879     cout << endl;
880     cout << "All model information printed to " << model_info.getFileName() << endl;
881     cout << "CPU time for ModelFinder: " << cpu_time << " seconds (" << convert_time(cpu_time) << ")" << endl;
882     cout << "Wall-clock time for ModelFinder: " << real_time << " seconds (" << convert_time(real_time) << ")" << endl;
883 
884     //        alignment = iqtree.aln;
885     if (test_only) {
886         params.min_iterations = 0;
887     }
888 }
889 
890 /**
891  * get the list of substitution models
892  */
getModelSubst(SeqType seq_type,bool standard_code,string model_name,string model_set,char * model_subset,StrVector & model_names)893 void getModelSubst(SeqType seq_type, bool standard_code, string model_name,
894                    string model_set, char *model_subset, StrVector &model_names) {
895     int i, j;
896 
897     if (model_set == "1") {
898         model_names.push_back(getUsualModelSubst(seq_type));
899         return;
900     }
901 
902     if (iEquals(model_set, "ALL") || iEquals(model_set, "AUTO"))
903         model_set = "";
904 
905     if (seq_type == SEQ_BINARY) {
906         if (model_set.empty()) {
907             copyCString(bin_model_names, sizeof(bin_model_names) / sizeof(char*), model_names);
908         } else if (model_set[0] == '+') {
909             // append model_set into existing models
910             convert_string_vec(model_set.c_str()+1, model_names);
911             appendCString(bin_model_names, sizeof(bin_model_names) / sizeof(char*), model_names);
912         } else {
913             convert_string_vec(model_set.c_str(), model_names);
914         }
915     } else if (seq_type == SEQ_MORPH) {
916         if (model_set.empty()) {
917             copyCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names);
918         } else if (model_set[0] == '+') {
919             // append model_set into existing models
920             convert_string_vec(model_set.c_str()+1, model_names);
921             appendCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names);
922         } else {
923             convert_string_vec(model_set.c_str(), model_names);
924         }
925     } else if (seq_type == SEQ_DNA || seq_type == SEQ_POMO) {
926         if (model_set.empty()) {
927             copyCString(dna_model_names, sizeof(dna_model_names) / sizeof(char*), model_names);
928             //            copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
929         } else if (model_set == "partitionfinder" || model_set== "phyml") {
930             copyCString(dna_model_names_old, sizeof(dna_model_names_old) / sizeof(char*), model_names);
931             //            copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
932         } else if (model_set == "raxml") {
933             copyCString(dna_model_names_rax, sizeof(dna_model_names_rax) / sizeof(char*), model_names);
934             //            copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
935         } else if (model_set == "mrbayes") {
936             copyCString(dna_model_names_mrbayes, sizeof(dna_model_names_mrbayes) / sizeof(char*), model_names);
937             //            copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names);
938         } else if (model_set == "modelomatic") {
939             copyCString(dna_model_names_modelomatic, sizeof(dna_model_names_modelomatic) / sizeof(char*), model_names);
940         } else if (model_set == "liemarkov") {
941             copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
942             appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
943             appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
944             appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
945         } else if (model_set == "liemarkovry") {
946             copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
947             appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
948         } else if (model_set == "liemarkovws") {
949             copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
950             appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
951         } else if (model_set == "liemarkovmk") {
952             copyCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
953             appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
954         } else if (model_set == "strandsymmetric") {
955             copyCString(dna_model_names_lie_markov_strsym, sizeof(dna_model_names_lie_markov_strsym) / sizeof(char*), model_names);
956             // IMPORTANT NOTE: If you add any more -mset names for sets of Lie Markov models,
957             // you also need to change getPrototypeModel function.
958         } else if (model_set[0] == '+') {
959             // append model_set into existing models
960             convert_string_vec(model_set.c_str()+1, model_names);
961             appendCString(dna_model_names, sizeof(dna_model_names) / sizeof(char*), model_names);
962         } else {
963             convert_string_vec(model_set.c_str(), model_names);
964         }
965 
966         if (model_name.find("+LMRY") != string::npos) {
967             appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
968             appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
969         } else if (model_name.find("+LMWS") != string::npos) {
970             appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
971             appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
972         } else if (model_name.find("+LMMK") != string::npos) {
973             appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
974             appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
975         } else if (model_name.find("+LMSS") != string::npos) {
976             appendCString(dna_model_names_lie_markov_strsym, sizeof(dna_model_names_lie_markov_strsym) / sizeof(char*), model_names);
977         } else if (model_name.find("+LM") != string::npos) {
978             appendCString(dna_model_names_lie_markov_fullsym, sizeof(dna_model_names_lie_markov_fullsym) / sizeof(char*), model_names);
979             appendCString(dna_model_names_lie_markov_ry, sizeof(dna_model_names_lie_markov_ry) / sizeof(char*), model_names);
980             appendCString(dna_model_names_lie_markov_ws, sizeof(dna_model_names_lie_markov_ws) / sizeof(char*), model_names);
981             appendCString(dna_model_names_lie_markov_mk, sizeof(dna_model_names_lie_markov_mk) / sizeof(char*), model_names);
982         }
983     } else if (seq_type == SEQ_PROTEIN) {
984         if (model_set.empty()) {
985             copyCString(aa_model_names, sizeof(aa_model_names) / sizeof(char*), model_names);
986         } else if (model_set == "partitionfinder" || model_set == "phyml") {
987             copyCString(aa_model_names_phyml, sizeof(aa_model_names_phyml) / sizeof(char*), model_names);
988         } else if (model_set == "raxml") {
989             copyCString(aa_model_names_rax, sizeof(aa_model_names_rax) / sizeof(char*), model_names);
990         } else if (model_set == "mrbayes") {
991             copyCString(aa_model_names_mrbayes, sizeof(aa_model_names_mrbayes) / sizeof(char*), model_names);
992         } else if (model_set == "modelomatic") {
993             copyCString(aa_model_names_modelomatic, sizeof(aa_model_names_modelomatic) / sizeof(char*), model_names);
994         } else if (model_set[0] == '+') {
995             // append model_set into existing models
996             convert_string_vec(model_set.c_str()+1, model_names);
997             appendCString(aa_model_names, sizeof(aa_model_names) / sizeof(char*), model_names);
998         } else {
999             convert_string_vec(model_set.c_str(), model_names);
1000         }
1001 
1002         if (model_subset) {
1003             StrVector submodel_names;
1004             if (strncmp(model_subset, "nuclear", 3) == 0) {
1005                 copyCString(aa_model_names_nuclear, sizeof(aa_model_names_nuclear) / sizeof(char*), submodel_names);
1006             } else if (strncmp(model_subset, "mitochondrial", 3) == 0) {
1007                 copyCString(aa_model_names_mitochondrial, sizeof(aa_model_names_mitochondrial) / sizeof(char*), submodel_names);
1008             } else if (strncmp(model_subset, "chloroplast", 3) == 0) {
1009                 copyCString(aa_model_names_chloroplast, sizeof(aa_model_names_chloroplast) / sizeof(char*), submodel_names);
1010             } else if (strncmp(model_subset, "viral",3) == 0) {
1011                 copyCString(aa_model_names_viral, sizeof(aa_model_names_viral) / sizeof(char*), submodel_names);
1012             } else {
1013                 outError("Wrong -msub option");
1014             }
1015             for (i = 0; i < model_names.size(); i++) {
1016                 bool appear = false;
1017                 for (j = 0; j < submodel_names.size(); j++)
1018                     if (model_names[i] == submodel_names[j]) {
1019                         appear = true;
1020                         break;
1021                     }
1022                 if (!appear) {
1023                     model_names.erase(model_names.begin()+i);
1024                     i--;
1025                 }
1026             }
1027         }
1028 
1029     } else if (seq_type == SEQ_CODON) {
1030         if (model_set.empty()) {
1031             if (standard_code)
1032                 copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*), model_names);
1033             else {
1034                 i = sizeof(codon_model_names) / sizeof(char*);
1035                 for (j = 0; j < i; j++)
1036                     if (!std_genetic_code[j])
1037                         model_names.push_back(codon_model_names[j]);
1038                 //                copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*) - 1, model_names);
1039             }
1040         } else if (model_set == "modelomatic") {
1041             copyCString(codon_model_names_modelomatic, sizeof(codon_model_names_modelomatic) / sizeof(char*), model_names);
1042         } else if (model_set[0] == '+') {
1043             // append model_set into existing models
1044             convert_string_vec(model_set.c_str()+1, model_names);
1045             if (standard_code)
1046                 appendCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*), model_names);
1047             else {
1048                 i = sizeof(codon_model_names) / sizeof(char*);
1049                 for (j = 0; j < i; j++)
1050                     if (!std_genetic_code[j])
1051                         model_names.push_back(codon_model_names[j]);
1052             }
1053         } else
1054             convert_string_vec(model_set.c_str(), model_names);
1055     }
1056 }
1057 
getStateFreqs(SeqType seq_type,char * state_freq_set,StrVector & freq_names)1058 void getStateFreqs(SeqType seq_type, char *state_freq_set, StrVector &freq_names) {
1059     int j;
1060 
1061     switch (seq_type) {
1062         case SEQ_PROTEIN:
1063             copyCString(aa_freq_names, sizeof(aa_freq_names)/sizeof(char*), freq_names);
1064             break;
1065         case SEQ_CODON:
1066             copyCString(codon_freq_names, sizeof(codon_freq_names) / sizeof(char*), freq_names);
1067             break;
1068         default:
1069             break;
1070     }
1071     if (state_freq_set)
1072         convert_string_vec(state_freq_set, freq_names);
1073     for (j = 0; j < freq_names.size(); j++) {
1074         std::transform(freq_names[j].begin(), freq_names[j].end(), freq_names[j].begin(), ::toupper);
1075         if (freq_names[j] != "" && freq_names[j][0] != '+')
1076             freq_names[j] = "+" + freq_names[j];
1077     }
1078 }
1079 
1080 /**
1081  get list of rate heterogeneity
1082  */
getRateHet(SeqType seq_type,string model_name,double frac_invariant_sites,string rate_set,StrVector & ratehet)1083 void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites,
1084                 string rate_set, StrVector &ratehet) {
1085     const char *rate_options[]    = {  "", "+I", "+ASC", "+G", "+I+G", "+ASC+G", "+R", "+ASC+R"};
1086     bool test_options_default[]   = {true,   true, false,  true,  true,   false, false,  false};
1087     bool test_options_fast[]      = {false, false, false, false,  true,   false, false,  false};
1088     bool test_options_morph[]     = {true,  false,  true,  true, false,    true, false,  false};
1089     bool test_options_morph_fast[]= {false, false, false, false, false,    true, false,  false};
1090     bool test_options_noASC_I[]   = {true,  false, false,  true, false,   false, false,  false};
1091     bool test_options_noASC_I_fast[]={false,false, false,  true, false,   false, false,  false};
1092     bool test_options_asc[]       ={false,  false,  true, false, false,    true, false,  false};
1093     bool test_options_new[]       = {true,   true, false,  true,  true,   false,  true,  false};
1094     bool test_options_morph_new[] = {true,  false,  true,  true, false,    true,  true,   true};
1095     bool test_options_noASC_I_new[]= {true, false, false,  true, false,   false,  true,  false};
1096     bool test_options_asc_new[]   = {false, false,  true, false, false,    true, false,   true};
1097     bool test_options_pomo[]      = {true,  false, false,  true, false,   false, false,  false};
1098     bool test_options_norate[]    = {true,  false, false, false, false,   false, false,  false};
1099     bool *test_options = test_options_default;
1100     //    bool test_options_codon[] =  {true,false,  false,false,  false,    false};
1101     const int noptions = sizeof(rate_options) / sizeof(char*);
1102     int i, j;
1103 
1104     bool with_new = (model_name.find("NEW") != string::npos || model_name.substr(0,2) == "MF" || model_name.empty());
1105     bool with_asc = model_name.find("ASC") != string::npos;
1106 
1107     if (seq_type == SEQ_POMO) {
1108         for (i = 0; i < noptions; i++)
1109             test_options[i] = test_options_pomo[i];
1110     }
1111     // If not PoMo, go on with normal treatment.
1112     else if (frac_invariant_sites == 0.0) {
1113         // morphological or SNP data: activate +ASC
1114         if (with_new && rate_set != "1") {
1115             if (with_asc)
1116                 test_options = test_options_asc_new;
1117             else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH)
1118                 test_options = test_options_morph_new;
1119             else
1120                 test_options = test_options_noASC_I_new;
1121         } else if (with_asc)
1122             test_options = test_options_asc;
1123         else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) {
1124             if (rate_set == "1")
1125                 test_options = test_options_morph_fast;
1126             else
1127                 test_options = test_options_morph;
1128         } else {
1129             if (rate_set == "1")
1130                 test_options = test_options_noASC_I_fast;
1131             else
1132                 test_options = test_options_noASC_I;
1133         }
1134     } else if (frac_invariant_sites >= 1.0) {
1135         // 2018-06-12: alignment with only invariant sites, no rate variation added
1136         test_options = test_options_norate;
1137     } else {
1138         // normal data, use +I instead
1139         if (with_new && rate_set != "1") {
1140             // change +I+G to +R
1141             if (with_asc)
1142                 test_options = test_options_asc_new;
1143             else
1144                 test_options = test_options_new;
1145         } else if (with_asc) {
1146             test_options = test_options_asc;
1147         } else if (rate_set == "1")
1148             test_options = test_options_fast;
1149         else
1150             test_options = test_options_default;
1151         if (frac_invariant_sites == 0.0) {
1152             // deactivate +I
1153             for (j = 0; j < noptions; j++)
1154                 if (strstr(rate_options[j], "+I"))
1155                     test_options[j] = false;
1156         }
1157     }
1158     if (!rate_set.empty() && rate_set != "1" && !iEquals(rate_set, "ALL") && !iEquals(rate_set, "AUTO")) {
1159         // take the rate_options from user-specified models
1160         convert_string_vec(rate_set.c_str(), ratehet);
1161         if (!ratehet.empty() && iEquals(ratehet[0], "ALL")) {
1162             ratehet.erase(ratehet.begin());
1163             StrVector ratedef;
1164             for (j = 0; j < noptions; j++)
1165                 if (test_options[j])
1166                     ratedef.push_back(rate_options[j]);
1167             ratehet.insert(ratehet.begin(), ratedef.begin(), ratedef.end());
1168         }
1169         for (j = 0; j < ratehet.size(); j++) {
1170             if (ratehet[j] != "" && ratehet[j][0] != '+' && ratehet[j][0] != '*')
1171                 ratehet[j] = "+" + ratehet[j];
1172             if (ratehet[j] == "+E") // for equal rate model
1173                 ratehet[j] = "";
1174         }
1175     } else {
1176         for (j = 0; j < noptions; j++)
1177             if (test_options[j])
1178                 ratehet.push_back(rate_options[j]);
1179 
1180     }
1181 }
1182 
generate(Params & params,Alignment * aln,bool separate_rate,bool merge_phase)1183 int CandidateModelSet::generate(Params &params, Alignment *aln, bool separate_rate, bool merge_phase) {
1184 	StrVector model_names;
1185     StrVector freq_names;
1186 	SeqType seq_type = aln->seq_type;
1187 
1188 	int i, j;
1189     string model_set;
1190 
1191     if (merge_phase) {
1192         model_set = params.merge_models;
1193     } else
1194         model_set = params.model_set;
1195 
1196     bool auto_model = iEquals(model_set, "AUTO");
1197 
1198     getModelSubst(seq_type, aln->isStandardGeneticCode(), params.model_name,
1199                   model_set, params.model_subset, model_names);
1200 
1201 	if (model_names.empty())
1202         return 1;
1203 
1204     getStateFreqs(seq_type, params.state_freq_set, freq_names);
1205 
1206     // combine model_names with freq_names
1207     if (freq_names.size() > 0) {
1208         StrVector orig_model_names = model_names;
1209         model_names.clear();
1210         for (j = 0; j < orig_model_names.size(); j++) {
1211             if (aln->seq_type == SEQ_CODON) {
1212                 SeqType seq_type;
1213                 int model_type = detectSeqType(orig_model_names[j].c_str(), seq_type);
1214                 for (i = 0; i < freq_names.size(); i++) {
1215                     // disallow MG+F
1216                     if (freq_names[i] == "+F" && orig_model_names[j].find("MG") != string::npos)
1217                         continue;
1218                     if (freq_names[i] != "" || (model_type == 2 && orig_model_names[j].find("MG") == string::npos))
1219                         // empirical model also allow ""
1220                         model_names.push_back(orig_model_names[j] + freq_names[i]);
1221                 }
1222             } else {
1223                 for (i = 0; i < freq_names.size(); i++)
1224                     model_names.push_back(orig_model_names[j] + freq_names[i]);
1225             }
1226         }
1227     }
1228 
1229 
1230 
1231 
1232     StrVector ratehet;
1233     int max_cats = params.num_rate_cats;
1234     string ratehet_set;
1235     if (merge_phase) {
1236         ratehet_set = params.merge_rates;
1237     } else
1238         ratehet_set = params.ratehet_set;
1239 
1240     //bool auto_rate = iEquals(ratehet_set, "AUTO");
1241 
1242     getRateHet(seq_type, params.model_name, aln->frac_invariant_sites, ratehet_set, ratehet);
1243 
1244     // add number of rate cateogories for special rate models
1245     const char *rates[] = {"+R", "*R", "+H", "*H"};
1246 
1247     for (i = 0; i < sizeof(rates)/sizeof(char*); i++)
1248         if (params.model_name.find(rates[i]) != string::npos)
1249             ratehet.push_back(rates[i]);
1250 
1251     size_t pos;
1252 
1253     vector<int> flags;
1254     flags.resize(ratehet.size(), 0);
1255 
1256     for (i = 0; i < sizeof(rates)/sizeof(char*); i++)
1257     for (j = 0; j < ratehet.size(); j++)
1258         if ((pos = ratehet[j].find(rates[i])) != string::npos &&
1259             (pos >= ratehet[j].length()-2 || !isdigit(ratehet[j][pos+2]) ))
1260         {
1261             string str = ratehet[j];
1262             ratehet[j].insert(pos+2, convertIntToString(params.min_rate_cats));
1263             max_cats = max(max_cats, params.max_rate_cats);
1264             for (int k = params.min_rate_cats+1; k <= params.max_rate_cats; k++) {
1265                 int ins_pos = j+k-params.min_rate_cats;
1266                 ratehet.insert(ratehet.begin() + ins_pos, str.substr(0, pos+2) + convertIntToString(k) + str.substr(pos+2));
1267                 flags.insert(flags.begin() + ins_pos, MF_WAITING);
1268             }
1269         }
1270 
1271     ASSERT(ratehet.size() == flags.size());
1272 
1273     string pomo_suffix = (seq_type == SEQ_POMO) ? "+P" : "";
1274     // TODO DS: should we allow virtual population size?
1275 
1276     // combine substitution models with rate heterogeneity
1277     if (separate_rate) {
1278         for (i = 0; i < model_names.size(); i++)
1279             push_back(CandidateModel(model_names[i], ratehet[0] + pomo_suffix, aln));
1280         for (j = 0; j < ratehet.size(); j++)
1281             if (ratehet[j] != "")
1282                 push_back(CandidateModel("", ratehet[j] + pomo_suffix, aln));
1283     } else {
1284         if (auto_model) {
1285             // all rate heterogeneity for the first model
1286             for (j = 0; j < ratehet.size(); j++)
1287                 push_back(CandidateModel(model_names[0], ratehet[j] + pomo_suffix, aln, flags[j]));
1288             // now all models the first RHAS
1289             for (i = 1; i < model_names.size(); i++)
1290                 push_back(CandidateModel(model_names[i], ratehet[0] + pomo_suffix, aln, flags[0]));
1291             // all remaining models
1292             for (i = 1; i < model_names.size(); i++)
1293                 for (j = 1; j < ratehet.size(); j++) {
1294                     push_back(CandidateModel(model_names[i], ratehet[j] + pomo_suffix, aln, flags[j]));
1295                 }
1296         } else {
1297             // testing all models
1298             for (i = 0; i < model_names.size(); i++)
1299                 for (j = 0; j < ratehet.size(); j++) {
1300                     push_back(CandidateModel(model_names[i], ratehet[j] + pomo_suffix, aln, flags[j]));
1301                 }
1302         }
1303     }
1304     if (params.model_extra_set) {
1305         StrVector extra_model_names;
1306         convert_string_vec(params.model_extra_set, extra_model_names);
1307         for (auto s : extra_model_names)
1308             push_back(CandidateModel(s, "", aln));
1309     }
1310     return max_cats;
1311 }
1312 
replaceModelInfo(string & set_name,ModelCheckpoint & model_info,ModelCheckpoint & new_info)1313 void replaceModelInfo(string &set_name, ModelCheckpoint &model_info, ModelCheckpoint &new_info) {
1314     for (auto it = new_info.begin(); it != new_info.end(); it++) {
1315         model_info.put(set_name + CKP_SEP + it->first, it->second);
1316     }
1317 }
1318 
extractModelInfo(string & orig_set_name,ModelCheckpoint & model_info,ModelCheckpoint & part_model_info)1319 void extractModelInfo(string &orig_set_name, ModelCheckpoint &model_info, ModelCheckpoint &part_model_info) {
1320     string set_name = orig_set_name + CKP_SEP;
1321     int len = set_name.length();
1322     for (auto it = model_info.lower_bound(set_name); it != model_info.end() && it->first.substr(0, len) == set_name; it++) {
1323         part_model_info.put(it->first.substr(len), it->second);
1324     }
1325 }
1326 
getSubsetName(PhyloSuperTree * super_tree,set<int> & subset)1327 string getSubsetName(PhyloSuperTree *super_tree, set<int> &subset) {
1328     string set_name;
1329     for (auto it = subset.begin(); it != subset.end(); it++) {
1330         if (it != subset.begin())
1331             set_name += "+";
1332         set_name += super_tree->at(*it)->aln->name;
1333     }
1334     return set_name;
1335 }
1336 
getSubsetAlnLength(PhyloSuperTree * super_tree,set<int> & subset)1337 int getSubsetAlnLength(PhyloSuperTree *super_tree, set<int> &subset) {
1338     int len = 0;
1339     for (auto i : subset) {
1340         len += super_tree->at(i)->aln->getNSite();
1341     }
1342     return len;
1343 }
1344 
1345 /**
1346  * transfer model parameters from two subsets to the target subsets
1347  */
transferModelParameters(PhyloSuperTree * super_tree,ModelCheckpoint & model_info,ModelCheckpoint & part_model_info,set<int> & gene_set1,set<int> & gene_set2)1348 void transferModelParameters(PhyloSuperTree *super_tree, ModelCheckpoint &model_info, ModelCheckpoint &part_model_info,
1349                              set<int> &gene_set1, set<int> &gene_set2)
1350 {
1351     set<int> merged_set;
1352     merged_set.insert(gene_set1.begin(), gene_set1.end());
1353     merged_set.insert(gene_set2.begin(), gene_set2.end());
1354     string set_name = getSubsetName(super_tree, merged_set);
1355     string set1_name = getSubsetName(super_tree, gene_set1);
1356     string set2_name = getSubsetName(super_tree, gene_set2);
1357     double weight1 = getSubsetAlnLength(super_tree, gene_set1);
1358     double weight2 = getSubsetAlnLength(super_tree, gene_set2);
1359     double weight_sum = weight1 + weight2;
1360     weight1 = weight1/weight_sum;
1361     weight2 = weight2/weight_sum;
1362     enum MeanComp {GEOM_MEAN, ARIT_MEAN};
1363     enum ValType {VAL_SINGLE, VAL_VECTOR};
1364     vector<tuple<ValType, MeanComp,string> > info_strings = {
1365         make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGamma" + CKP_SEP + "gamma_shape"),
1366         make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGammaInvar" + CKP_SEP + "gamma_shape"),
1367         make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateGammaInvar" + CKP_SEP + "p_invar"),
1368         make_tuple(VAL_SINGLE, ARIT_MEAN, (string)"RateInvar" + CKP_SEP + "p_invar")
1369         //make_tuple(VAL_VECTOR, GEOM_MEAN, (string)"ModelDNA" + CKP_SEP + "rates")
1370     };
1371     for (auto info : info_strings) {
1372         switch (std::get<0>(info)) {
1373             case VAL_SINGLE: {
1374                 double value1, value2, value;
1375                 bool ok1 = model_info.get(set1_name + CKP_SEP + std::get<2>(info), value1);
1376                 bool ok2 = model_info.get(set2_name + CKP_SEP + std::get<2>(info), value2);
1377                 if (!ok1 || !ok2)
1378                     continue;
1379                 if (part_model_info.get(std::get<2>(info), value))
1380                     continue; // value already exist
1381                 switch (std::get<1>(info)) {
1382                     case ARIT_MEAN:
1383                         value = weight1*value1 + weight2*value2;
1384                         break;
1385                     case GEOM_MEAN:
1386                         value = sqrt(value1*value2);
1387                         break;
1388                 }
1389                 part_model_info.put(std::get<2>(info), value);
1390                 break;
1391             }
1392             case VAL_VECTOR: {
1393                 DoubleVector value1, value2, value;
1394                 bool ok1 = model_info.getVector(set1_name + CKP_SEP + std::get<2>(info), value1);
1395                 bool ok2 = model_info.getVector(set2_name + CKP_SEP + std::get<2>(info), value2);
1396                 if (!ok1 || !ok2)
1397                     continue;
1398                 ASSERT(value1.size() == value2.size());
1399                 if (part_model_info.getVector(std::get<2>(info), value))
1400                     continue; // value already exist
1401                 value.reserve(value1.size());
1402                 for (int i = 0; i < value1.size(); i++)
1403                 switch (std::get<1>(info)) {
1404                     case ARIT_MEAN:
1405                         value.push_back(weight1*value1[i] + weight2*value2[i]);
1406                         break;
1407                     case GEOM_MEAN:
1408                         value.push_back(sqrt(value1[i]*value2[i]));
1409                         break;
1410                 }
1411                 part_model_info.putVector(std::get<2>(info), value);
1412                 break;
1413             }
1414         }
1415     }
1416 }
1417 
mergePartitions(PhyloSuperTree * super_tree,vector<set<int>> & gene_sets,StrVector & model_names)1418 void mergePartitions(PhyloSuperTree* super_tree, vector<set<int> > &gene_sets, StrVector &model_names) {
1419 	cout << "Merging into " << gene_sets.size() << " partitions..." << endl;
1420 	vector<set<int> >::iterator it;
1421 	SuperAlignment *super_aln = (SuperAlignment*)super_tree->aln;
1422 	vector<PartitionInfo> part_info;
1423 	vector<PhyloTree*> tree_vec;
1424     SuperAlignment *new_super_aln = new SuperAlignment();
1425 	for (it = gene_sets.begin(); it != gene_sets.end(); it++) {
1426         Alignment *aln = super_aln->concatenateAlignments(*it);
1427 		PartitionInfo info;
1428 		aln->model_name = model_names[it-gene_sets.begin()];
1429         info.part_rate = 1.0; // BIG FIX: make -spp works with -m TESTMERGE now!
1430         info.evalNNIs = 0;
1431 		for (set<int>::iterator i = it->begin(); i != it->end(); i++) {
1432 			if (i != it->begin()) {
1433 				aln->name += "+";
1434                 if (!super_aln->partitions[*i]->position_spec.empty())
1435                     aln->position_spec += ", ";
1436 			}
1437 			aln->name += super_aln->partitions[*i]->name;
1438 			aln->position_spec += super_aln->partitions[*i]->position_spec;
1439 			if (!super_aln->partitions[*i]->aln_file.empty()) {
1440                 if (aln->aln_file.empty())
1441                     aln->aln_file = super_aln->partitions[*i]->aln_file;
1442                 else if (aln->aln_file != super_aln->partitions[*i]->aln_file) {
1443                     aln->aln_file = aln->aln_file + ',' + super_aln->partitions[*i]->aln_file;
1444                 }
1445 			}
1446 			if (!super_aln->partitions[*i]->sequence_type.empty()) {
1447                 if (aln->sequence_type.empty())
1448                     aln->sequence_type = super_aln->partitions[*i]->sequence_type;
1449                 else if (aln->sequence_type != super_aln->partitions[*i]->sequence_type) {
1450                     aln->sequence_type = "__NA__";
1451                 }
1452 			}
1453 		}
1454 		info.cur_ptnlh = NULL;
1455 		info.nniMoves[0].ptnlh = NULL;
1456 		info.nniMoves[1].ptnlh = NULL;
1457 		part_info.push_back(info);
1458 		PhyloTree *tree = super_tree->extractSubtree(*it);
1459         tree->setParams(super_tree->params);
1460 		tree->setAlignment(aln);
1461 		tree_vec.push_back(tree);
1462         new_super_aln->partitions.push_back(aln);
1463 	}
1464 
1465     // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
1466     // get the taxa names from existing tree
1467     StrVector seq_names;
1468     if (super_tree->root) {
1469         super_tree->getTaxaName(seq_names);
1470     }
1471     new_super_aln->init(&seq_names);
1472 
1473 	for (PhyloSuperTree::reverse_iterator tit = super_tree->rbegin(); tit != super_tree->rend(); tit++)
1474 		delete (*tit);
1475 	super_tree->clear();
1476 	super_tree->insert(super_tree->end(), tree_vec.begin(), tree_vec.end());
1477 	super_tree->part_info = part_info;
1478 
1479 	delete super_tree->aln;
1480 //    super_tree->aln = new SuperAlignment(super_tree);
1481     super_tree->setAlignment(new_super_aln);
1482 }
1483 
1484 /**
1485  called when some partition is changed
1486  */
fixPartitions(PhyloSuperTree * super_tree)1487 void fixPartitions(PhyloSuperTree* super_tree) {
1488     SuperAlignment *super_aln = (SuperAlignment*)super_tree->aln;
1489     int part;
1490     bool aln_changed = false;
1491     for (part = 0; part < super_tree->size(); part++)
1492         if (super_aln->partitions[part] != super_tree->at(part)->aln) {
1493             aln_changed = true;
1494             super_aln->partitions[part] = super_tree->at(part)->aln;
1495         }
1496     if (!aln_changed)
1497         return;
1498     super_aln->buildPattern();
1499     super_aln->orderPatternByNumChars(PAT_VARIANT);
1500     super_tree->deleteAllPartialLh();
1501 }
1502 
evaluate(Params & params,ModelCheckpoint & in_model_info,ModelCheckpoint & out_model_info,ModelsBlock * models_block,int & num_threads,int brlen_type)1503 string CandidateModel::evaluate(Params &params,
1504     ModelCheckpoint &in_model_info, ModelCheckpoint &out_model_info,
1505     ModelsBlock *models_block,
1506     int &num_threads, int brlen_type)
1507 {
1508     //string model_name = name;
1509     Alignment *in_aln = aln;
1510     IQTree *iqtree = NULL;
1511     if (in_aln->isSuperAlignment()) {
1512         SuperAlignment *saln = (SuperAlignment*)in_aln;
1513         if (params.partition_type == BRLEN_OPTIMIZE)
1514             iqtree = new PhyloSuperTree(saln);
1515         else
1516             iqtree = new PhyloSuperTreePlen(saln, brlen_type);
1517         StrVector subst_names;
1518         StrVector rate_names;
1519         convert_string_vec(subst_name.c_str(), subst_names);
1520         convert_string_vec(rate_name.c_str(), rate_names);
1521         ASSERT(subst_names.size() == rate_names.size());
1522         for (int part = 0; part != subst_names.size(); part++)
1523             saln->partitions[part]->model_name = subst_names[part]+rate_names[part];
1524     } else if (posRateHeterotachy(getName()) != string::npos)
1525         iqtree = new PhyloTreeMixlen(in_aln, 0);
1526     else
1527         iqtree = new IQTree(in_aln);
1528     iqtree->setParams(&params);
1529     iqtree->setLikelihoodKernel(params.SSE);
1530     iqtree->optimize_by_newton = params.optimize_by_newton;
1531     iqtree->setNumThreads(num_threads);
1532 
1533     iqtree->setCheckpoint(&in_model_info);
1534 #ifdef _OPENMP
1535 #pragma omp critical
1536 #endif
1537     iqtree->restoreCheckpoint();
1538     ASSERT(iqtree->root);
1539     iqtree->initializeModel(params, getName(), models_block);
1540     if (!iqtree->getModel()->isMixture() || in_aln->seq_type == SEQ_POMO) {
1541         subst_name = iqtree->getSubstName();
1542         rate_name = iqtree->getRateName();
1543     }
1544 
1545 
1546     if (restoreCheckpoint(&in_model_info)) {
1547         delete iqtree;
1548         return "";
1549     }
1550 
1551 #ifdef _OPENMP
1552 #pragma omp critical
1553 #endif
1554     iqtree->getModelFactory()->restoreCheckpoint();
1555 
1556     // now switch to the output checkpoint
1557     iqtree->getModelFactory()->setCheckpoint(&out_model_info);
1558     iqtree->setCheckpoint(&out_model_info);
1559 
1560     double new_logl;
1561 
1562     if (params.model_test_and_tree) {
1563         //--- PERFORM FULL TREE SEARCH PER MODEL ----//
1564 
1565         // BQM 2017-03-29: disable bootstrap
1566         int orig_num_bootstrap_samples = params.num_bootstrap_samples;
1567         int orig_gbo_replicates = params.gbo_replicates;
1568         params.num_bootstrap_samples = 0;
1569         params.gbo_replicates = 0;
1570         STOP_CONDITION orig_stop_condition = params.stop_condition;
1571         if (params.stop_condition == SC_BOOTSTRAP_CORRELATION)
1572             params.stop_condition = SC_UNSUCCESS_ITERATION;
1573 
1574         iqtree->aln->model_name = getName();
1575 
1576         cout << endl << "===> Testing model " << getName() << endl;
1577 
1578         if (iqtree->root) {
1579             // start from previous tree
1580             string initTree = iqtree->getTreeString();
1581             iqtree->getCheckpoint()->put("initTree", initTree);
1582             iqtree->saveCheckpoint();
1583         }
1584 
1585 #ifdef _OPENMP
1586         if (num_threads <= 0) {
1587             num_threads = iqtree->testNumThreads();
1588             omp_set_num_threads(num_threads);
1589         } else
1590             iqtree->warnNumThreads();
1591 #endif
1592 
1593         runTreeReconstruction(params, iqtree);
1594         new_logl = iqtree->computeLikelihood();
1595         tree_len = iqtree->treeLength();
1596         tree = iqtree->getTreeString();
1597 
1598         // restore original parameters
1599         // 2017-03-29: restore bootstrap replicates
1600         params.num_bootstrap_samples = orig_num_bootstrap_samples;
1601         params.gbo_replicates = orig_gbo_replicates;
1602         params.stop_condition = orig_stop_condition;
1603 
1604         int count = iqtree->getCheckpoint()->eraseKeyPrefix("finished");
1605         cout << count << " finished checkpoint entries erased" << endl;
1606         iqtree->getCheckpoint()->eraseKeyPrefix("CandidateSet");
1607 
1608     } else {
1609         //--- FIX TREE TOPOLOGY AND ESTIMATE MODEL PARAMETERS ----//
1610 
1611         if (verbose_mode >= VB_MED)
1612             cout << "Optimizing model " << getName() << endl;
1613 
1614         #ifdef _OPENMP
1615         if (num_threads <= 0) {
1616             num_threads = iqtree->testNumThreads();
1617             omp_set_num_threads(num_threads);
1618         } else
1619             iqtree->warnNumThreads();
1620         #endif
1621 
1622         iqtree->initializeAllPartialLh();
1623 
1624         for (int step = 0; step < 2; step++) {
1625             new_logl = iqtree->getModelFactory()->optimizeParameters(brlen_type, false,
1626                 params.modelfinder_eps, TOL_GRADIENT_MODELTEST);
1627             tree_len = iqtree->treeLength();
1628             iqtree->getModelFactory()->saveCheckpoint();
1629             iqtree->saveCheckpoint();
1630 
1631             // check if logl(+R[k]) is worse than logl(+R[k-1])
1632             CandidateModel prev_info;
1633             if (!prev_info.restoreCheckpointRminus1(&in_model_info, this)) break;
1634             if (prev_info.logl < new_logl + params.modelfinder_eps) break;
1635             if (step == 0) {
1636                 iqtree->getRate()->initFromCatMinusOne();
1637             } else if (new_logl < prev_info.logl - params.modelfinder_eps*10.0) {
1638                 outWarning("Log-likelihood " + convertDoubleToString(new_logl) + " of " +
1639                            getName() + " worse than " + prev_info.getName() + " " + convertDoubleToString(prev_info.logl));
1640             }
1641         }
1642 
1643     }
1644 
1645     // sum in case of adjusted df and logl already stored
1646     df += iqtree->getModelFactory()->getNParameters(brlen_type);
1647     logl += new_logl;
1648     string tree_string = iqtree->getTreeString();
1649 
1650 #ifdef _OPENMP
1651 #pragma omp critical
1652     {
1653 #endif
1654     saveCheckpoint(&in_model_info);
1655 #ifdef _OPENMP
1656     }
1657 #endif
1658 
1659     delete iqtree;
1660     return tree_string;
1661 }
1662 
1663 /** model information by merging two partitions */
1664 struct ModelPair {
1665     /** score after merging */
1666     double score;
1667     /** ID of partition 1 */
1668     int part1;
1669     /** ID of partition 2 */
1670     int part2;
1671     /** log-likelihood */
1672     double logl;
1673     /** degree of freedom */
1674     int df;
1675     /** tree length */
1676     double tree_len;
1677     /** IDs of merged partitions */
1678     set<int> merged_set;
1679     /** set name */
1680     string set_name;
1681     /* best model name */
1682     string model_name;
1683 };
1684 
1685 class ModelPairSet : public multimap<double, ModelPair> {
1686 
1687 public:
1688 
1689     /** insert a partition pair */
insertPair(ModelPair & pair)1690     void insertPair(ModelPair &pair) {
1691         insert(value_type(pair.score, pair));
1692     }
1693 
1694     /**
1695         find the maximum compatible partition pairs
1696         @param num max number of pairs to return
1697     */
getCompatiblePairs(int num,ModelPairSet & res)1698     void getCompatiblePairs(int num, ModelPairSet &res) {
1699         set<int> part_ids;
1700 
1701         for (auto it = begin(); it != end() && res.size() < num; it++) {
1702 
1703             // check for compatibility
1704             vector<int> overlap;
1705             set_intersection(part_ids.begin(), part_ids.end(),
1706                 it->second.merged_set.begin(), it->second.merged_set.end(),
1707                 std::back_inserter(overlap));
1708 
1709             if (!overlap.empty()) continue;
1710 
1711             // take the union
1712             part_ids.insert(it->second.merged_set.begin(), it->second.merged_set.end());
1713 
1714             // put the compatible pair to the set
1715             res.insertPair(it->second);
1716         }
1717     }
1718 
1719 };
1720 
evaluateConcatenation(Params & params,SuperAlignment * super_aln,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads)1721 string CandidateModel::evaluateConcatenation(Params &params, SuperAlignment *super_aln,
1722     ModelCheckpoint &model_info, ModelsBlock *models_block, int num_threads)
1723 {
1724     aln = super_aln->concatenateAlignments();
1725     size_t ssize = getUsualModel(aln);
1726 
1727     string concat_tree;
1728 
1729     cout << "Testing " << getName() << " on supermatrix..." << endl;
1730     concat_tree = evaluate(params, model_info, model_info,
1731         models_block, num_threads, BRLEN_OPTIMIZE);
1732 
1733     computeICScores(ssize);
1734 
1735     delete aln;
1736     aln = NULL;
1737     return concat_tree;
1738 }
1739 
1740 /**
1741  * k-means clustering of partitions using partition-specific tree length
1742  * @return score (AIC/BIC/etc.) of the clustering
1743  * @param[out] gene_sets
1744  * @param[out[ model_names
1745  */
doKmeansClustering(Params & params,PhyloSuperTree * in_tree,int ncluster,DoubleVector & lenvec,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,vector<set<int>> & gene_sets,StrVector & model_names)1746 double doKmeansClustering(Params &params, PhyloSuperTree *in_tree,
1747     int ncluster, DoubleVector &lenvec,
1748     ModelCheckpoint &model_info, ModelsBlock *models_block,
1749     int num_threads,
1750     vector<set<int> > &gene_sets, StrVector &model_names)
1751 {
1752 
1753     cout << "k-means merging into " << ncluster << " partitions..." << endl;
1754 
1755     ASSERT(lenvec.size() == in_tree->size());
1756     int npart = in_tree->size();
1757     IntVector weights;
1758     weights.resize(npart, 1);
1759     int *clusters = new int[npart];
1760     double *centers = new double[ncluster];
1761     RunKMeans1D(npart, ncluster, lenvec.data(), weights.data(), centers, clusters);
1762 
1763     SuperAlignment *super_aln = ((SuperAlignment*)in_tree->aln);
1764 
1765     double lhsum = 0.0;
1766     int dfsum = 0;
1767     if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
1768         dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
1769         if (params.partition_type == BRLEN_SCALE)
1770             dfsum -= 1;
1771     }
1772 
1773     for (int cluster = 0; cluster < ncluster; cluster++) {
1774         string set_name;
1775         set<int> merged_set;
1776         for (int i = 0; i < in_tree->size(); i++)
1777             if (clusters[i] == cluster) {
1778                 if (!set_name.empty())
1779                     set_name += "+";
1780                 set_name += in_tree->at(i)->aln->name;
1781                 merged_set.insert(i);
1782             }
1783         gene_sets.push_back(merged_set);
1784         CandidateModel best_model;
1785         bool done_before = false;
1786         {
1787             // if pairs previously examined, reuse the information
1788             model_info.startStruct(set_name);
1789             if (model_info.getBestModel(best_model.subst_name)) {
1790                 best_model.restoreCheckpoint(&model_info);
1791                 done_before = true;
1792             }
1793             model_info.endStruct();
1794         }
1795         ModelCheckpoint part_model_info;
1796         if (!done_before) {
1797             Alignment *aln = super_aln->concatenateAlignments(merged_set);
1798             PhyloTree *tree = in_tree->extractSubtree(merged_set);
1799             tree->setAlignment(aln);
1800             extractModelInfo(set_name, model_info, part_model_info);
1801             tree->num_precision = in_tree->num_precision;
1802             tree->setParams(&params);
1803             tree->sse = params.SSE;
1804             tree->optimize_by_newton = params.optimize_by_newton;
1805             tree->num_threads = params.model_test_and_tree ? num_threads : 1;
1806             /*if (params.model_test_and_tree) {
1807              tree->setCheckpoint(new Checkpoint());
1808              tree->saveCheckpoint();
1809              } else*/
1810             {
1811             tree->setCheckpoint(&part_model_info);
1812             // trick to restore checkpoint
1813             tree->restoreCheckpoint();
1814             tree->saveCheckpoint();
1815             }
1816             best_model = CandidateModelSet().test(params, tree, part_model_info, models_block,
1817                 params.model_test_and_tree ? num_threads : 1, params.partition_type,
1818                 set_name, "", true);
1819             best_model.restoreCheckpoint(&part_model_info);
1820             model_names.push_back(best_model.getName());
1821             delete tree;
1822             delete aln;
1823         }
1824         lhsum += best_model.logl;
1825         dfsum += best_model.df;
1826         {
1827             if (!done_before) {
1828                 replaceModelInfo(set_name, model_info, part_model_info);
1829                 model_info.dump();
1830                 cout.width(4);
1831                 cout << right << cluster+1 << " ";
1832                 cout.width(12);
1833                 cout << left << best_model.getName() << " ";
1834                 cout.width(11);
1835                 cout << best_model.logl << " " << set_name;
1836                 cout << endl;
1837             }
1838         }
1839     }
1840 
1841     int ssize = in_tree->getAlnNSite();
1842     double score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
1843     cout << "k-means score for " << ncluster << " partitions: " << score << " (LnL: " << lhsum << "  " << "df: " << dfsum << ")" << endl;
1844 
1845     delete [] centers;
1846     delete [] clusters;
1847     return score;
1848 }
1849 
1850 class SubsetPair : public pair<int,int> {
1851 public:
1852     // distance between two partition pairs
1853     double distance;
1854 };
1855 
comparePairs(const SubsetPair & a,const SubsetPair & b)1856 bool comparePairs(const SubsetPair &a, const SubsetPair &b) {
1857     return a.distance < b.distance;
1858 }
1859 
comparePartition(const pair<int,double> & a,const pair<int,double> & b)1860 bool comparePartition(const pair<int,double> &a, const pair<int, double> &b) {
1861     return a.second > b.second;
1862 }
1863 
1864 /**
1865  find k-closest partition pairs for rcluster algorithm
1866  */
findClosestPairs(SuperAlignment * super_aln,DoubleVector & lenvec,vector<set<int>> & gene_sets,double log_transform,vector<SubsetPair> & closest_pairs)1867 void findClosestPairs(SuperAlignment *super_aln, DoubleVector &lenvec, vector<set<int> > &gene_sets,
1868                       double log_transform, vector<SubsetPair> &closest_pairs) {
1869 
1870     for (int part1 = 0; part1 < lenvec.size()-1; part1++)
1871         for (int part2 = part1+1; part2 < lenvec.size(); part2++)
1872             if (super_aln->partitions[*gene_sets[part1].begin()]->seq_type == super_aln->partitions[*gene_sets[part2].begin()]->seq_type &&
1873                 super_aln->partitions[*gene_sets[part1].begin()]->genetic_code == super_aln->partitions[*gene_sets[part2].begin()]->genetic_code) {
1874                 // only merge partitions of the same data type
1875                 SubsetPair pair;
1876                 pair.first = part1;
1877                 pair.second = part2;
1878                 if (log_transform)
1879                     pair.distance = fabs(log(lenvec[part1]) - log(lenvec[part2]));
1880                 else
1881                     pair.distance = fabs(lenvec[part1] - lenvec[part2]);
1882                 closest_pairs.push_back(pair);
1883             }
1884     if (!closest_pairs.empty() && Params::getInstance().partfinder_rcluster < 100) {
1885         // sort distance
1886         std::sort(closest_pairs.begin(), closest_pairs.end(), comparePairs);
1887         size_t num_pairs = round(closest_pairs.size() * (Params::getInstance().partfinder_rcluster/100.0));
1888         num_pairs = min(num_pairs, Params::getInstance().partfinder_rcluster_max);
1889         if (num_pairs <= 0) num_pairs = 1;
1890         closest_pairs.erase(closest_pairs.begin() + num_pairs, closest_pairs.end());
1891     }
1892 }
1893 
1894 /**
1895  merge vector src into dest, eliminating duplicates
1896  */
mergePairs(vector<SubsetPair> & dest,vector<SubsetPair> & src)1897 void mergePairs(vector<SubsetPair> &dest, vector<SubsetPair> &src) {
1898     unordered_set<string> dest_set;
1899     for (SubsetPair s: dest)
1900         dest_set.insert(convertIntToString(s.first) + "-" + convertIntToString(s.second));
1901     for (SubsetPair s: src)
1902         if (dest_set.find(convertIntToString(s.first) + "-" + convertIntToString(s.second)) == dest_set.end())
1903             dest.push_back(s);
1904 }
1905 
1906 /**
1907  * select models for all partitions
1908  * @param[in,out] model_info (IN/OUT) all model information
1909  * @return total number of parameters
1910  */
testPartitionModel(Params & params,PhyloSuperTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads)1911 void testPartitionModel(Params &params, PhyloSuperTree* in_tree, ModelCheckpoint &model_info,
1912     ModelsBlock *models_block, int num_threads)
1913 {
1914 //    params.print_partition_info = true;
1915 //    params.print_conaln = true;
1916 	int i = 0;
1917 //	PhyloSuperTree::iterator it;
1918 	DoubleVector lhvec; // log-likelihood for each partition
1919 	DoubleVector dfvec; // number of parameters for each partition
1920     DoubleVector lenvec; // tree length for each partition
1921 	double lhsum = 0.0;
1922 	int dfsum = 0;
1923     if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
1924         dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
1925         if (params.partition_type == BRLEN_SCALE)
1926             dfsum -= 1;
1927     }
1928 	int ssize = in_tree->getAlnNSite();
1929 	int64_t num_model = 0;
1930     int64_t total_num_model = in_tree->size();
1931 
1932     // 2017-06-07: -rcluster-max for max absolute number of pairs
1933     if (params.partfinder_rcluster_max == 0)
1934         params.partfinder_rcluster_max = max((size_t)1000, 10*in_tree->size());
1935 
1936 	if (params.partition_merge != MERGE_NONE) {
1937         double p = params.partfinder_rcluster/100.0;
1938         size_t num_pairs = round(in_tree->size()*(in_tree->size()-1)*p/2);
1939         if (p < 1.0)
1940             num_pairs = min(num_pairs, params.partfinder_rcluster_max);
1941         total_num_model += num_pairs;
1942         for (i = in_tree->size()-2; i > 0; i--)
1943             total_num_model += max(round(i*p), 1.0);
1944     }
1945 
1946 
1947 #ifdef _OPENMP
1948     if (num_threads <= 0) {
1949         // partition selection scales well with many cores
1950         num_threads = min((int64_t)countPhysicalCPUCores(), total_num_model);
1951         num_threads = min(num_threads, params.num_threads_max);
1952         omp_set_num_threads(num_threads);
1953         cout << "NUMBER OF THREADS FOR PARTITION FINDING: " << num_threads << endl;
1954     }
1955 #endif
1956 
1957     double start_time = getRealTime();
1958 
1959 	SuperAlignment *super_aln = ((SuperAlignment*)in_tree->aln);
1960 
1961 	cout << "Selecting individual models for " << in_tree->size() << " charsets using " << criterionName(params.model_test_criterion) << "..." << endl;
1962 	//cout << " No. AIC         AICc        BIC         Charset" << endl;
1963 	cout << " No. Model        Score       TreeLen     Charset" << endl;
1964 
1965 	lhvec.resize(in_tree->size());
1966 	dfvec.resize(in_tree->size());
1967 	lenvec.resize(in_tree->size());
1968 
1969     // sort partition by computational cost for OpenMP effciency
1970     vector<pair<int,double> > partitionID;
1971 
1972 	for (i = 0; i < in_tree->size(); i++) {
1973         Alignment *this_aln = in_tree->at(i)->aln;
1974         // computation cost is proportional to #sequences, #patterns, and #states
1975         partitionID.push_back({i, ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states});
1976     }
1977 
1978     if (num_threads > 1) {
1979         std::sort(partitionID.begin(), partitionID.end(), comparePartition);
1980     }
1981 
1982     bool parallel_over_partitions = false;
1983     int brlen_type = params.partition_type;
1984     if (brlen_type == TOPO_UNLINKED)
1985         brlen_type = BRLEN_OPTIMIZE;
1986 
1987     bool test_merge = (params.partition_merge != MERGE_NONE) && params.partition_type != TOPO_UNLINKED && (in_tree->size() > 1);
1988 
1989 #ifdef _OPENMP
1990     parallel_over_partitions = !params.model_test_and_tree && (in_tree->size() >= num_threads);
1991 #pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum) if(parallel_over_partitions)
1992 #endif
1993 	for (int j = 0; j < in_tree->size(); j++) {
1994         i = partitionID[j].first;
1995         PhyloTree *this_tree = in_tree->at(i);
1996 		// scan through models for this partition, assuming the information occurs consecutively
1997 		ModelCheckpoint part_model_info;
1998 		extractModelInfo(this_tree->aln->name, model_info, part_model_info);
1999 		// do the computation
2000         string part_model_name;
2001         if (params.model_name.empty())
2002             part_model_name = this_tree->aln->model_name;
2003         CandidateModel best_model;
2004 		best_model = CandidateModelSet().test(params, this_tree, part_model_info, models_block,
2005             (parallel_over_partitions ? 1 : num_threads), brlen_type, this_tree->aln->name, part_model_name, test_merge);
2006 
2007         bool check = (best_model.restoreCheckpoint(&part_model_info));
2008         ASSERT(check);
2009 
2010 		double score = best_model.computeICScore(this_tree->getAlnNSite());
2011 		this_tree->aln->model_name = best_model.getName();
2012 		lhsum += (lhvec[i] = best_model.logl);
2013 		dfsum += (dfvec[i] = best_model.df);
2014         lenvec[i] = best_model.tree_len;
2015 
2016 #ifdef _OPENMP
2017 #pragma omp critical
2018 #endif
2019         {
2020             num_model++;
2021             cout.width(4);
2022             cout << right << num_model << " ";
2023             cout.width(12);
2024             cout << left << best_model.getName() << " ";
2025             cout.width(11);
2026             cout << score << " ";
2027             cout.width(11);
2028             cout << best_model.tree_len << " ";
2029             cout << this_tree->aln->name;
2030             if (num_model >= 10) {
2031                 double remain_time = (total_num_model-num_model)*(getRealTime()-start_time)/num_model;
2032                 cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2033                     << convert_time(remain_time) << " left)";
2034             }
2035             cout << endl;
2036             replaceModelInfo(this_tree->aln->name, model_info, part_model_info);
2037             model_info.dump();
2038         }
2039     }
2040 
2041     // in case ModelOMatic change the alignment
2042     fixPartitions(in_tree);
2043 
2044 	double inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2045 	cout << "Full partition model " << criterionName(params.model_test_criterion)
2046          << " score: " << inf_score << " (LnL: " << lhsum << "  df:" << dfsum << ")" << endl;
2047 
2048 	if (!test_merge) {
2049 		super_aln->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
2050 		super_aln->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
2051         model_info.dump();
2052 		return;
2053 	}
2054 
2055     vector<set<int> > gene_sets;
2056     StrVector model_names;
2057     StrVector greedy_model_trees;
2058 
2059     gene_sets.resize(in_tree->size());
2060     model_names.resize(in_tree->size());
2061     greedy_model_trees.resize(in_tree->size());
2062     for (i = 0; i < gene_sets.size(); i++) {
2063         gene_sets[i].insert(i);
2064         model_names[i] = in_tree->at(i)->aln->model_name;
2065         greedy_model_trees[i] = in_tree->at(i)->aln->name;
2066     }
2067 
2068     if (params.partition_merge == MERGE_KMEANS) {
2069         // kmeans cluster based on parition tree length
2070         double cur_score = inf_score;
2071         for (int ncluster = in_tree->size()-1; ncluster >= 1; ncluster--) {
2072             vector<set<int> > this_gene_sets;
2073             StrVector this_model_names;
2074             //double sum = in_tree->size()/std::accumulate(lenvec.begin(), lenvec.end(), 0.0);
2075             double score = doKmeansClustering(params, in_tree, ncluster, lenvec, model_info,
2076                 models_block, num_threads, this_gene_sets, this_model_names);
2077             if (score < cur_score) {
2078                 cout << "Better score found: " << score << endl;
2079                 cur_score = score;
2080                 gene_sets = this_gene_sets;
2081                 model_names = this_model_names;
2082             } else {
2083                 //break;
2084             }
2085         }
2086     } else {
2087         cout << "Merging models to increase model fit (about " << total_num_model << " total partition schemes)..." << endl;
2088     }
2089 
2090     /* following implements the greedy algorithm of Lanfear et al. (2012) */
2091 	while (params.partition_merge != MERGE_KMEANS && gene_sets.size() >= 2) {
2092 		// stepwise merging charsets
2093 
2094         // list of all better pairs of partitions than current partitioning scheme
2095         ModelPairSet better_pairs;
2096 
2097         // 2015-06-24: begin rcluster algorithm
2098         // compute distance between gene_sets
2099         ASSERT(gene_sets.size() == lenvec.size());
2100         // find closest partition pairs
2101         vector<SubsetPair> closest_pairs;
2102         findClosestPairs(super_aln, lenvec, gene_sets, false, closest_pairs);
2103         if (params.partfinder_log_rate) {
2104             // additional consider pairs by log-rate
2105             vector<SubsetPair> log_closest_pairs;
2106             findClosestPairs(super_aln, lenvec, gene_sets, true, log_closest_pairs);
2107             mergePairs(closest_pairs, log_closest_pairs);
2108         }
2109         // sort partition by computational cost for OpenMP effciency
2110         for (i = 0; i < closest_pairs.size(); i++) {
2111             // computation cost is proportional to #sequences, #patterns, and #states
2112             Alignment *this_aln = in_tree->at(closest_pairs[i].first)->aln;
2113             closest_pairs[i].distance = -((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
2114             this_aln = in_tree->at(closest_pairs[i].second)->aln;
2115             closest_pairs[i].distance -= ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
2116         }
2117         if (num_threads > 1)
2118             std::sort(closest_pairs.begin(), closest_pairs.end(), comparePairs);
2119 
2120         size_t num_pairs = closest_pairs.size();
2121 
2122 #ifdef _OPENMP
2123 #pragma omp parallel for private(i) schedule(dynamic) if(!params.model_test_and_tree)
2124 #endif
2125         for (size_t pair = 0; pair < num_pairs; pair++) {
2126             // information of current partitions pair
2127             ModelPair cur_pair;
2128             cur_pair.part1 = closest_pairs[pair].first;
2129             cur_pair.part2 = closest_pairs[pair].second;
2130             ASSERT(cur_pair.part1 < cur_pair.part2);
2131             cur_pair.merged_set.insert(gene_sets[cur_pair.part1].begin(), gene_sets[cur_pair.part1].end());
2132             cur_pair.merged_set.insert(gene_sets[cur_pair.part2].begin(), gene_sets[cur_pair.part2].end());
2133             cur_pair.set_name = getSubsetName(in_tree, cur_pair.merged_set);
2134             double weight1 = getSubsetAlnLength(in_tree, gene_sets[cur_pair.part1]);
2135             double weight2 = getSubsetAlnLength(in_tree, gene_sets[cur_pair.part2]);
2136             double sum = 1.0 / (weight1 + weight2);
2137             weight1 *= sum;
2138             weight2 *= sum;
2139             CandidateModel best_model;
2140             bool done_before = false;
2141 #ifdef _OPENMP
2142 #pragma omp critical
2143 #endif
2144             {
2145                 // if pairs previously examined, reuse the information
2146                 model_info.startStruct(cur_pair.set_name);
2147                 if (model_info.getBestModel(best_model.subst_name)) {
2148                     best_model.restoreCheckpoint(&model_info);
2149                     done_before = true;
2150                 }
2151                 model_info.endStruct();
2152             }
2153             ModelCheckpoint part_model_info;
2154             double cur_tree_len = 0.0;
2155             if (!done_before) {
2156                 Alignment *aln = super_aln->concatenateAlignments(cur_pair.merged_set);
2157                 PhyloTree *tree = in_tree->extractSubtree(cur_pair.merged_set);
2158                 //tree->scaleLength((weight1*lenvec[cur_pair.part1] + weight2*lenvec[cur_pair.part2])/tree->treeLength());
2159                 tree->scaleLength(sqrt(lenvec[cur_pair.part1]*lenvec[cur_pair.part2])/tree->treeLength());
2160                 cur_tree_len = tree->treeLength();
2161                 tree->setAlignment(aln);
2162                 extractModelInfo(cur_pair.set_name, model_info, part_model_info);
2163                 transferModelParameters(in_tree, model_info, part_model_info, gene_sets[cur_pair.part1], gene_sets[cur_pair.part2]);
2164                 tree->num_precision = in_tree->num_precision;
2165                 tree->setParams(&params);
2166                 tree->sse = params.SSE;
2167                 tree->optimize_by_newton = params.optimize_by_newton;
2168                 tree->num_threads = params.model_test_and_tree ? num_threads : 1;
2169                 {
2170                     tree->setCheckpoint(&part_model_info);
2171                     // trick to restore checkpoint
2172                     tree->restoreCheckpoint();
2173                     tree->saveCheckpoint();
2174                 }
2175                 best_model = CandidateModelSet().test(params, tree, part_model_info, models_block,
2176                     params.model_test_and_tree ? num_threads : 1, params.partition_type, cur_pair.set_name, "", true);
2177                 best_model.restoreCheckpoint(&part_model_info);
2178                 delete tree;
2179                 delete aln;
2180             }
2181             cur_pair.logl = best_model.logl;
2182             cur_pair.df = best_model.df;
2183             cur_pair.model_name = best_model.getName();
2184             cur_pair.tree_len = best_model.tree_len;
2185             double lhnew = lhsum - lhvec[cur_pair.part1] - lhvec[cur_pair.part2] + best_model.logl;
2186             int dfnew = dfsum - dfvec[cur_pair.part1] - dfvec[cur_pair.part2] + best_model.df;
2187             cur_pair.score = computeInformationScore(lhnew, dfnew, ssize, params.model_test_criterion);
2188 #ifdef _OPENMP
2189 #pragma omp critical
2190 #endif
2191 			{
2192 				if (!done_before) {
2193 					replaceModelInfo(cur_pair.set_name, model_info, part_model_info);
2194                     model_info.dump();
2195                     num_model++;
2196 					cout.width(4);
2197 					cout << right << num_model << " ";
2198 					cout.width(12);
2199 					cout << left << best_model.getName() << " ";
2200 					cout.width(11);
2201                     cout << cur_pair.score << " ";
2202                     cout.width(11);
2203                     cout << cur_pair.tree_len << " " << cur_pair.set_name;
2204                     if (num_model >= 10) {
2205                         double remain_time = max(total_num_model-num_model, (int64_t)0)*(getRealTime()-start_time)/num_model;
2206                         cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2207                             << convert_time(remain_time) << " left)";
2208                     }
2209                     cout << endl;
2210 				}
2211                 if (cur_pair.score < inf_score)
2212                     better_pairs.insertPair(cur_pair);
2213 			}
2214 
2215         }
2216 		if (better_pairs.empty()) break;
2217         ModelPairSet compatible_pairs;
2218 
2219         int num_comp_pairs = params.partition_merge == MERGE_RCLUSTERF ? gene_sets.size()/2 : 1;
2220         better_pairs.getCompatiblePairs(num_comp_pairs, compatible_pairs);
2221         if (compatible_pairs.size() > 1)
2222             cout << compatible_pairs.size() << " compatible better partition pairs found" << endl;
2223 
2224         // 2017-12-21: simultaneously merging better pairs
2225         for (auto it_pair = compatible_pairs.begin(); it_pair != compatible_pairs.end(); it_pair++) {
2226             ModelPair opt_pair = it_pair->second;
2227 
2228             lhsum = lhsum - lhvec[opt_pair.part1] - lhvec[opt_pair.part2] + opt_pair.logl;
2229             dfsum = dfsum - dfvec[opt_pair.part1] - dfvec[opt_pair.part2] + opt_pair.df;
2230             inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2231             ASSERT(inf_score <= opt_pair.score + 0.1);
2232 
2233             cout << "Merging " << opt_pair.set_name << " with " << criterionName(params.model_test_criterion)
2234                  << " score: " << inf_score << " (LnL: " << lhsum << "  df: " << dfsum << ")" << endl;
2235             // change entry opt_part1 to merged one
2236             gene_sets[opt_pair.part1] = opt_pair.merged_set;
2237             lhvec[opt_pair.part1] = opt_pair.logl;
2238             dfvec[opt_pair.part1] = opt_pair.df;
2239             lenvec[opt_pair.part1] = opt_pair.tree_len;
2240             model_names[opt_pair.part1] = opt_pair.model_name;
2241             greedy_model_trees[opt_pair.part1] = "(" + greedy_model_trees[opt_pair.part1] + "," +
2242                 greedy_model_trees[opt_pair.part2] + ")" +
2243                 convertIntToString(in_tree->size()-gene_sets.size()+1) + ":" +
2244                 convertDoubleToString(inf_score);
2245 
2246             // delete entry opt_part2
2247             lhvec.erase(lhvec.begin() + opt_pair.part2);
2248             dfvec.erase(dfvec.begin() + opt_pair.part2);
2249             lenvec.erase(lenvec.begin() + opt_pair.part2);
2250             gene_sets.erase(gene_sets.begin() + opt_pair.part2);
2251             model_names.erase(model_names.begin() + opt_pair.part2);
2252             greedy_model_trees.erase(greedy_model_trees.begin() + opt_pair.part2);
2253 
2254             // decrease part ID for all pairs beyond opt_pair.part2
2255             auto next_pair = it_pair;
2256             for (next_pair++; next_pair != compatible_pairs.end(); next_pair++) {
2257                 if (next_pair->second.part1 > opt_pair.part2)
2258                     next_pair->second.part1--;
2259                 if (next_pair->second.part2 > opt_pair.part2)
2260                     next_pair->second.part2--;
2261             }
2262         }
2263 	}
2264 
2265 	string final_model_tree;
2266 	if (greedy_model_trees.size() == 1)
2267 		final_model_tree = greedy_model_trees[0];
2268 	else {
2269 		final_model_tree = "(";
2270 		for (i = 0; i < greedy_model_trees.size(); i++) {
2271 			if (i>0)
2272 				final_model_tree += ",";
2273 			final_model_tree += greedy_model_trees[i];
2274 		}
2275 		final_model_tree += ")";
2276 	}
2277 
2278 	cout << "Agglomerative model selection: " << final_model_tree << endl;
2279 
2280     if (gene_sets.size() < in_tree->size())
2281         mergePartitions(in_tree, gene_sets, model_names);
2282 
2283     if (!iEquals(params.merge_models, "all")) {
2284         // test all candidate models again
2285         lhsum = 0.0;
2286         dfsum = 0;
2287         if (params.partition_type == BRLEN_FIX || params.partition_type == BRLEN_SCALE) {
2288             dfsum = in_tree->getNBranchParameters(BRLEN_OPTIMIZE);
2289             if (params.partition_type == BRLEN_SCALE)
2290                 dfsum -= 1;
2291         }
2292 
2293         // sort partition by computational cost for OpenMP effciency
2294         partitionID.clear();
2295         for (i = 0; i < in_tree->size(); i++) {
2296             Alignment *this_aln = in_tree->at(i)->aln;
2297             // computation cost is proportional to #sequences, #patterns, and #states
2298             partitionID.push_back({i, ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states});
2299         }
2300 
2301         if (num_threads > 1) {
2302             std::sort(partitionID.begin(), partitionID.end(), comparePartition);
2303         }
2304 
2305     #ifdef _OPENMP
2306         parallel_over_partitions = !params.model_test_and_tree && (in_tree->size() >= num_threads);
2307         #pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum) if(parallel_over_partitions)
2308     #endif
2309         for (int j = 0; j < in_tree->size(); j++) {
2310             i = partitionID[j].first;
2311             PhyloTree *this_tree = in_tree->at(i);
2312             // scan through models for this partition, assuming the information occurs consecutively
2313             ModelCheckpoint part_model_info;
2314             extractModelInfo(this_tree->aln->name, model_info, part_model_info);
2315             // do the computation
2316             string part_model_name;
2317             if (params.model_name.empty())
2318                 part_model_name = this_tree->aln->model_name;
2319             CandidateModel best_model;
2320             best_model = CandidateModelSet().test(params, this_tree, part_model_info, models_block,
2321                 (parallel_over_partitions ? 1 : num_threads), brlen_type,
2322                 this_tree->aln->name, part_model_name, false);
2323 
2324             bool check = (best_model.restoreCheckpoint(&part_model_info));
2325             ASSERT(check);
2326 
2327             double score = best_model.computeICScore(this_tree->getAlnNSite());
2328             this_tree->aln->model_name = best_model.getName();
2329             lhsum += (lhvec[i] = best_model.logl);
2330             dfsum += (dfvec[i] = best_model.df);
2331             lenvec[i] = best_model.tree_len;
2332 
2333     #ifdef _OPENMP
2334     #pragma omp critical
2335     #endif
2336             {
2337             num_model++;
2338             cout.width(4);
2339             cout << right << num_model << " ";
2340             cout.width(12);
2341             cout << left << best_model.getName() << " ";
2342             cout.width(11);
2343             cout << score << " " << this_tree->aln->name;
2344             if (num_model >= 10) {
2345                 double remain_time = (total_num_model-num_model)*(getRealTime()-start_time)/num_model;
2346                 cout << "\t" << convert_time(getRealTime()-start_time) << " ("
2347                 << convert_time(remain_time) << " left)";
2348             }
2349             cout << endl;
2350             replaceModelInfo(this_tree->aln->name, model_info, part_model_info);
2351             model_info.dump();
2352             }
2353         }
2354     }
2355 
2356     inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
2357     cout << "Best partition model " << criterionName(params.model_test_criterion) << " score: " << inf_score << " (LnL: " << lhsum << "  df:" << dfsum << ")" << endl;
2358 
2359     ((SuperAlignment*)in_tree->aln)->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
2360 	((SuperAlignment*)in_tree->aln)->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
2361     model_info.dump();
2362 }
2363 
isMixtureModel(ModelsBlock * models_block,string & model_str)2364 bool isMixtureModel(ModelsBlock *models_block, string &model_str) {
2365     size_t mix_pos;
2366     for (mix_pos = 0; mix_pos < model_str.length(); mix_pos++) {
2367         size_t next_mix_pos = model_str.find_first_of("+*", mix_pos);
2368         string sub_model_str = model_str.substr(mix_pos, next_mix_pos-mix_pos);
2369         if (models_block->findMixModel(sub_model_str))
2370             return true;
2371         if (next_mix_pos == string::npos)
2372             break;
2373         mix_pos = next_mix_pos;
2374     }
2375     return false;
2376 }
2377 
filterRates(int finished_model)2378 void CandidateModelSet::filterRates(int finished_model) {
2379     if (Params::getInstance().score_diff_thres < 0)
2380         return;
2381     double best_score = DBL_MAX;
2382     ASSERT(finished_model >= 0);
2383     int model;
2384     for (model = 0; model <= finished_model; model++)
2385         if (at(model).subst_name == at(0).subst_name) {
2386             if (!at(model).hasFlag(MF_DONE + MF_IGNORED))
2387                 return; // only works if all models done
2388             best_score = min(best_score, at(model).getScore());
2389         }
2390 
2391     double ok_score = best_score + Params::getInstance().score_diff_thres;
2392     set<string> ok_rates;
2393     for (model = 0; model <= finished_model; model++)
2394         if (at(model).getScore() <= ok_score) {
2395             string rate_name = at(model).orig_rate_name;
2396             ok_rates.insert(rate_name);
2397         }
2398     for (model = finished_model+1; model < size(); model++)
2399         if (ok_rates.find(at(model).orig_rate_name) == ok_rates.end())
2400             at(model).setFlag(MF_IGNORED);
2401 }
2402 
filterSubst(int finished_model)2403 void CandidateModelSet::filterSubst(int finished_model) {
2404     if (Params::getInstance().score_diff_thres < 0)
2405         return;
2406     double best_score = DBL_MAX;
2407     ASSERT(finished_model >= 0);
2408     int model;
2409     for (model = 0; model <= finished_model; model++)
2410         if (at(model).rate_name == at(0).rate_name)
2411             best_score = min(best_score, at(model).getScore());
2412 
2413     double ok_score = best_score + Params::getInstance().score_diff_thres;
2414     set<string> ok_model;
2415     for (model = 0; model <= finished_model; model++) {
2416         if (at(model).rate_name != at(0).rate_name)
2417             continue;
2418         if (at(model).getScore() <= ok_score) {
2419             string subst_name = at(model).orig_subst_name;
2420             ok_model.insert(subst_name);
2421         } else
2422             at(model).setFlag(MF_IGNORED);
2423     }
2424     for (model = finished_model+1; model < size(); model++)
2425         if (ok_model.find(at(model).orig_subst_name) == ok_model.end())
2426             at(model).setFlag(MF_IGNORED);
2427 }
2428 
test(Params & params,PhyloTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,int brlen_type,string set_name,string in_model_name,bool merge_phase)2429 CandidateModel CandidateModelSet::test(Params &params, PhyloTree* in_tree, ModelCheckpoint &model_info,
2430     ModelsBlock *models_block, int num_threads, int brlen_type,
2431     string set_name, string in_model_name, bool merge_phase)
2432 {
2433     ModelCheckpoint *checkpoint = &model_info;
2434 
2435 	in_tree->params = &params;
2436 
2437     // for ModelOMatic
2438     Alignment *prot_aln = NULL;
2439     Alignment *dna_aln = NULL;
2440     bool do_modelomatic = params.modelomatic && in_tree->aln->seq_type == SEQ_CODON;
2441 
2442     if (in_model_name.empty()) {
2443         generate(params, in_tree->aln, params.model_test_separate_rate, merge_phase);
2444         if (do_modelomatic) {
2445             // generate models for protein
2446             // adapter coefficient according to Whelan et al. 2015
2447             prot_aln = in_tree->aln->convertCodonToAA();
2448             int adjusted_df;
2449             double adjusted_logl = computeAdapter(in_tree->aln, prot_aln, adjusted_df);
2450             if (set_name.empty())
2451                 cout << "Adjusted LnL: " << adjusted_logl << "  df: " << adjusted_df << endl;
2452             size_t start = size();
2453             generate(params, prot_aln, params.model_test_separate_rate, merge_phase);
2454             size_t i;
2455             for (i = start; i < size(); i++) {
2456                 at(i).logl = adjusted_logl;
2457                 at(i).df = adjusted_df;
2458             }
2459 
2460             // generate models for DNA
2461             dna_aln = in_tree->aln->convertCodonToDNA();
2462             start = size();
2463             generate(params, dna_aln, params.model_test_separate_rate, merge_phase);
2464             for (i = start; i < size(); i++) {
2465                 at(i).setFlag(MF_SAMPLE_SIZE_TRIPLE);
2466             }
2467         }
2468     } else {
2469         push_back(CandidateModel(in_model_name, "", in_tree->aln));
2470     }
2471 
2472     DoubleVector model_scores;
2473     int model;
2474 	int best_model = -1;
2475     Alignment *best_aln = in_tree->aln;
2476 
2477 	int ssize = in_tree->aln->getNSite(); // sample size
2478     //if (adjust)
2479     //    ssize = adjust->sample_size;
2480 	if (params.model_test_sample_size)
2481 		ssize = params.model_test_sample_size;
2482 	if (set_name == "") {
2483         cout << "ModelFinder will test up to " << size() << " ";
2484         if (do_modelomatic)
2485             cout << "codon/AA/DNA";
2486         else
2487             cout << getSeqTypeName(in_tree->aln->seq_type);
2488         cout << " models (sample size: " << ssize << ") ..." << endl;
2489         if (params.model_test_and_tree == 0)
2490             cout << " No. Model         -LnL         df  AIC          AICc         BIC" << endl;
2491 	}
2492 
2493     //	uint64_t RAM_requirement = 0;
2494     int best_model_AIC = -1, best_model_AICc = -1, best_model_BIC = -1;
2495     double best_score_AIC = DBL_MAX, best_score_AICc = DBL_MAX, best_score_BIC = DBL_MAX;
2496     string best_tree_AIC, best_tree_AICc, best_tree_BIC;
2497 
2498 //    CKP_RESTORE(best_score_AIC);
2499 //    CKP_RESTORE(best_score_AICc);
2500 //    CKP_RESTORE(best_score_BIC);
2501 //    CKP_RESTORE(best_model_AIC);
2502 //    CKP_RESTORE(best_model_AICc);
2503 //    CKP_RESTORE(best_model_BIC);
2504 
2505     CKP_RESTORE(best_tree_AIC);
2506     CKP_RESTORE(best_tree_AICc);
2507     CKP_RESTORE(best_tree_BIC);
2508 
2509     // detect rate hetegeneity automatically or not
2510     bool auto_rate = merge_phase ? iEquals(params.merge_rates, "AUTO") : iEquals(params.ratehet_set, "AUTO");
2511     bool auto_subst = merge_phase ? iEquals(params.merge_models, "AUTO") : iEquals(params.model_set, "AUTO");
2512     int rate_block = size();
2513     if (auto_rate) {
2514         for (rate_block = 0; rate_block < size(); rate_block++)
2515             if (rate_block+1 < size() && at(rate_block+1).subst_name != at(rate_block).subst_name)
2516                 break;
2517     }
2518 
2519     int subst_block = size();
2520     if (auto_subst) {
2521         for (subst_block = size()-1; subst_block >= 0; subst_block--)
2522             if (at(subst_block).rate_name == at(0).rate_name)
2523                 break;
2524     }
2525 
2526 
2527     //------------- MAIN FOR LOOP GOING THROUGH ALL MODELS TO BE TESTED ---------//
2528 
2529 	for (model = 0; model < size(); model++) {
2530         if (model == rate_block+1)
2531             filterRates(rate_block); // auto filter rate models
2532         if (model == subst_block+1)
2533             filterSubst(subst_block); // auto filter substitution model
2534         if (at(model).hasFlag(MF_IGNORED)) {
2535             model_scores.push_back(DBL_MAX);
2536             continue;
2537         }
2538 		//cout << model_names[model] << endl;
2539         if (at(model).subst_name == "") {
2540             // now switching to test rate heterogeneity
2541             if (best_model == -1)
2542                 switch (params.model_test_criterion) {
2543                 case MTC_AIC:
2544                     best_model = best_model_AIC;
2545                     break;
2546                 case MTC_AICC:
2547                     best_model = best_model_AICc;
2548                     break;
2549                 case MTC_BIC:
2550                     best_model = best_model_BIC;
2551                     break;
2552                 default: ASSERT(0);
2553                 }
2554             at(model).subst_name = at(best_model).subst_name;
2555         }
2556 
2557 		// optimize model parameters
2558         string orig_model_name = at(model).getName();
2559         // keep separate output model_info to only update model_info if better model found
2560         ModelCheckpoint out_model_info;
2561 		//CandidateModel info;
2562 		//info.set_name = set_name;
2563         at(model).set_name = set_name;
2564         string tree_string;
2565 
2566         /***** main call to estimate model parameters ******/
2567         tree_string = at(model).evaluate(params,
2568             model_info, out_model_info, models_block, num_threads, brlen_type);
2569 
2570         at(model).computeICScores(ssize);
2571         at(model).setFlag(MF_DONE);
2572 
2573         CandidateModel prev_info;
2574 
2575         bool skip_model = false;
2576 
2577         if (prev_info.restoreCheckpointRminus1(checkpoint, &at(model))) {
2578             // check stop criterion for +R
2579             prev_info.computeICScores(ssize);
2580             switch (params.model_test_criterion) {
2581             case MTC_ALL:
2582                 if (at(model).AIC_score > prev_info.AIC_score &&
2583                     at(model).AICc_score > prev_info.AICc_score &&
2584                     at(model).BIC_score > prev_info.BIC_score) {
2585                     // skip remaining model
2586                     skip_model = true;
2587                 }
2588                 break;
2589             case MTC_AIC:
2590                 if (at(model).AIC_score > prev_info.AIC_score) {
2591                     // skip remaining model
2592                     skip_model = true;
2593                 }
2594                 break;
2595             case MTC_AICC:
2596                 if (at(model).AICc_score > prev_info.AICc_score) {
2597                     // skip remaining model
2598                     skip_model = true;
2599                 }
2600                 break;
2601             case MTC_BIC:
2602                 if (at(model).BIC_score > prev_info.BIC_score) {
2603                     // skip remaining model
2604                     skip_model = true;
2605                 }
2606                 break;
2607             }
2608         }
2609 
2610 		if (at(model).AIC_score < best_score_AIC) {
2611             best_model_AIC = model;
2612             best_score_AIC = at(model).AIC_score;
2613             if (!tree_string.empty())
2614                 best_tree_AIC = tree_string;
2615             // only update model_info with better model
2616             if (params.model_test_criterion == MTC_AIC) {
2617                 model_info.putSubCheckpoint(&out_model_info, "");
2618                 best_aln = at(model).aln;
2619             }
2620         }
2621 		if (at(model).AICc_score < best_score_AICc) {
2622             best_model_AICc = model;
2623             best_score_AICc = at(model).AICc_score;
2624             if (!tree_string.empty())
2625                 best_tree_AICc = tree_string;
2626             // only update model_info with better model
2627             if (params.model_test_criterion == MTC_AICC) {
2628                 model_info.putSubCheckpoint(&out_model_info, "");
2629                 best_aln = at(model).aln;
2630             }
2631         }
2632 
2633 		if (at(model).BIC_score < best_score_BIC) {
2634 			best_model_BIC = model;
2635             best_score_BIC = at(model).BIC_score;
2636             if (!tree_string.empty())
2637                 best_tree_BIC = tree_string;
2638             // only update model_info with better model
2639             if (params.model_test_criterion == MTC_BIC) {
2640                 model_info.putSubCheckpoint(&out_model_info, "");
2641                 best_aln = at(model).aln;
2642             }
2643         }
2644 
2645         switch (params.model_test_criterion) {
2646             case MTC_AIC: model_scores.push_back(at(model).AIC_score); break;
2647             case MTC_AICC: model_scores.push_back(at(model).AICc_score); break;
2648             default: model_scores.push_back(at(model).BIC_score); break;
2649         }
2650 
2651         CKP_SAVE(best_tree_AIC);
2652         CKP_SAVE(best_tree_AICc);
2653         CKP_SAVE(best_tree_BIC);
2654         checkpoint->dump();
2655 
2656 
2657 		if (set_name == "") {
2658             cout.width(3);
2659             cout << right << model+1 << "  ";
2660             cout.width(13);
2661             cout << left << at(model).getName() << " ";
2662 
2663             cout.precision(3);
2664             cout << fixed;
2665             cout.width(12);
2666             cout << -at(model).logl << " ";
2667             cout.width(3);
2668             cout << at(model).df << " ";
2669             cout.width(12);
2670             cout << at(model).AIC_score << " ";
2671             cout.width(12);
2672             cout << at(model).AICc_score << " " << at(model).BIC_score;
2673             cout << endl;
2674         }
2675 
2676         if (skip_model) {
2677             // skip over all +R model of higher categories
2678             const char *rates[] = {"+R", "*R", "+H", "*H"};
2679             size_t posR;
2680             for (int i = 0; i < sizeof(rates)/sizeof(char*); i++)
2681                 if ((posR = orig_model_name.find(rates[i])) != string::npos)
2682                     break;
2683             string first_part = orig_model_name.substr(0, posR+2);
2684             for (int next = model+1; next < size() && at(next).getName().substr(0, posR+2) == first_part; next++) {
2685                 at(next).setFlag(MF_IGNORED);
2686             }
2687         }
2688 
2689 
2690 	}
2691 
2692     ASSERT(model_scores.size() == size());
2693 
2694     if (best_model_BIC == -1)
2695         outError("No models were examined! Please check messages above");
2696 
2697 	int *model_rank = new int[model_scores.size()];
2698 
2699 //    string best_tree; // BQM 2015-07-21: With Lars find best model
2700 	/* sort models by their scores */
2701 	switch (params.model_test_criterion) {
2702 	case MTC_AIC:
2703 		best_model = best_model_AIC;
2704 		break;
2705 	case MTC_AICC:
2706 		best_model = best_model_AICc;
2707 		break;
2708 	case MTC_BIC:
2709 		best_model = best_model_BIC;
2710 		break;
2711     default: ASSERT(0);
2712 	}
2713 	sort_index(model_scores.data(), model_scores.data() + model_scores.size(), model_rank);
2714 
2715     string model_list;
2716 	for (model = 0; model < model_scores.size(); model++) {
2717         if (model_scores[model_rank[model]] == DBL_MAX)
2718             break;
2719         if (model > 0)
2720             model_list += " ";
2721 		model_list += at(model_rank[model]).getName();
2722     }
2723 
2724     model_info.putBestModelList(model_list);
2725 
2726     model_info.put("best_model_AIC", at(best_model_AIC).getName());
2727     model_info.put("best_model_AICc", at(best_model_AICc).getName());
2728     model_info.put("best_model_BIC", at(best_model_BIC).getName());
2729 
2730     CKP_SAVE(best_score_AIC);
2731     CKP_SAVE(best_score_AICc);
2732     CKP_SAVE(best_score_BIC);
2733 
2734     checkpoint->dump();
2735 
2736 	delete [] model_rank;
2737 
2738     // update alignment if best data type changed
2739     if (best_aln != in_tree->aln) {
2740         delete in_tree->aln;
2741         in_tree->aln = best_aln;
2742         if (best_aln == prot_aln)
2743             prot_aln = NULL;
2744         else
2745             dna_aln = NULL;
2746     }
2747 
2748     if (dna_aln)
2749         delete dna_aln;
2750     if (prot_aln)
2751         delete prot_aln;
2752 
2753 //	in_tree->deleteAllPartialLh();
2754 
2755     string best_tree;
2756     model_info.getBestTree(best_tree);
2757 
2758     // BQM 2015-07-21 with Lars: load the best_tree
2759 //	if (params.model_test_and_tree)
2760 		in_tree->readTreeString(best_tree);
2761 
2762 
2763 	return at(best_model);
2764 }
2765 
getNextModel()2766 int64_t CandidateModelSet::getNextModel() {
2767     int64_t next_model;
2768 #pragma omp critical
2769     {
2770     if (size() == 0)
2771         next_model = -1;
2772     else if (current_model == -1)
2773         next_model = 0;
2774     else {
2775         for (next_model = current_model+1; next_model != current_model; next_model++) {
2776             if (next_model == size())
2777                 next_model = 0;
2778             if (!at(next_model).hasFlag(MF_IGNORED + MF_WAITING + MF_RUNNING)) {
2779                 break;
2780             }
2781         }
2782     }
2783     }
2784     if (next_model != current_model) {
2785         current_model = next_model;
2786         at(next_model).setFlag(MF_RUNNING);
2787         return next_model;
2788     } else
2789         return -1;
2790 }
2791 
evaluateAll(Params & params,PhyloTree * in_tree,ModelCheckpoint & model_info,ModelsBlock * models_block,int num_threads,int brlen_type,string in_model_name,bool merge_phase,bool write_info)2792 CandidateModel CandidateModelSet::evaluateAll(Params &params, PhyloTree* in_tree, ModelCheckpoint &model_info,
2793                                     ModelsBlock *models_block, int num_threads, int brlen_type,
2794                                     string in_model_name, bool merge_phase, bool write_info)
2795 {
2796     //ModelCheckpoint *checkpoint = &model_info;
2797 
2798     in_tree->params = &params;
2799 
2800     Alignment *prot_aln = NULL;
2801     Alignment *dna_aln = NULL;
2802     bool do_modelomatic = params.modelomatic && in_tree->aln->seq_type == SEQ_CODON;
2803 
2804 
2805 
2806     if (in_model_name.empty()) {
2807         generate(params, in_tree->aln, params.model_test_separate_rate, merge_phase);
2808         if (do_modelomatic) {
2809             // generate models for protein
2810             // adapter coefficient according to Whelan et al. 2015
2811             prot_aln = in_tree->aln->convertCodonToAA();
2812             int adjusted_df;
2813             double adjusted_logl = computeAdapter(in_tree->aln, prot_aln, adjusted_df);
2814             if (write_info)
2815                 cout << "Adjusted LnL: " << adjusted_logl << "  df: " << adjusted_df << endl;
2816             size_t start = size();
2817             generate(params, prot_aln, params.model_test_separate_rate, merge_phase);
2818             size_t i;
2819             for (i = start; i < size(); i++) {
2820                 at(i).logl = adjusted_logl;
2821                 at(i).df = adjusted_df;
2822             }
2823 
2824             // generate models for DNA
2825             dna_aln = in_tree->aln->convertCodonToDNA();
2826             start = size();
2827             generate(params, dna_aln, params.model_test_separate_rate, merge_phase);
2828             for (i = start; i < size(); i++) {
2829                 at(i).setFlag(MF_SAMPLE_SIZE_TRIPLE);
2830             }
2831         }
2832     } else {
2833         push_back(CandidateModel(in_model_name, "", in_tree->aln));
2834     }
2835 
2836     if (write_info) {
2837         cout << "ModelFinder will test " << size() << " ";
2838         if (do_modelomatic)
2839             cout << "codon/AA/DNA";
2840         else
2841             cout << getSeqTypeName(in_tree->aln->seq_type);
2842         cout << " models (sample size: " << in_tree->aln->getNSite() << ") ..." << endl;
2843         cout << " No. Model         -LnL         df  AIC          AICc         BIC" << endl;
2844     }
2845 
2846     double best_score = DBL_MAX;
2847 
2848     // detect rate hetegeneity automatically or not
2849     bool auto_rate = merge_phase ? iEquals(params.merge_rates, "AUTO") : iEquals(params.ratehet_set, "AUTO");
2850     bool auto_subst = merge_phase ? iEquals(params.merge_models, "AUTO") : iEquals(params.model_set, "AUTO");
2851     int rate_block = size();
2852     if (auto_rate) {
2853         for (rate_block = 0; rate_block < size(); rate_block++)
2854             if (rate_block+1 < size() && at(rate_block+1).subst_name != at(rate_block).subst_name)
2855                 break;
2856     }
2857 
2858     int subst_block = size();
2859     if (auto_subst) {
2860         for (subst_block = size()-1; subst_block >= 0; subst_block--)
2861             if (at(subst_block).rate_name == at(0).rate_name)
2862                 break;
2863     }
2864 
2865     int64_t num_models = size();
2866 #ifdef _OPENMP
2867 #pragma omp parallel num_threads(num_threads)
2868 #endif
2869     {
2870     int64_t model;
2871     do {
2872         model = getNextModel();
2873         if (model == -1)
2874             break;
2875 
2876         // optimize model parameters
2877         string orig_model_name = at(model).getName();
2878         // keep separate output model_info to only update model_info if better model found
2879         ModelCheckpoint out_model_info;
2880         at(model).set_name = at(model).aln->name;
2881         string tree_string;
2882 
2883         // main call to estimate model parameters
2884         tree_string = at(model).evaluate(params, model_info, out_model_info,
2885                                          models_block, num_threads, brlen_type);
2886         at(model).computeICScores();
2887         at(model).setFlag(MF_DONE);
2888 
2889         int lower_model = getLowerKModel(model);
2890         if (lower_model >= 0 && at(lower_model).getScore() < at(model).getScore()) {
2891             // ignore all +R_k model with higher category
2892             for (int higher_model = model; higher_model != -1;
2893                 higher_model = getHigherKModel(higher_model)) {
2894                 at(higher_model).setFlag(MF_IGNORED);
2895             }
2896 
2897         }
2898 #ifdef _OPENMP
2899 #pragma omp critical
2900         {
2901 #endif
2902         if (best_score > at(model).getScore()) {
2903             best_score = at(model).getScore();
2904             if (!tree_string.empty()) {
2905                 //model_info.put("best_tree_" + criterionName(params.model_test_criterion), tree_string);
2906             }
2907             // only update model_info with better model
2908             model_info.putSubCheckpoint(&out_model_info, "");
2909         }
2910         model_info.dump();
2911         if (write_info) {
2912             cout.width(3);
2913             cout << right << model+1 << "  ";
2914             cout.width(13);
2915             cout << left << at(model).getName() << " ";
2916 
2917             cout.precision(3);
2918             cout << fixed;
2919             cout.width(12);
2920             cout << -at(model).logl << " ";
2921             cout.width(3);
2922             cout << at(model).df << " ";
2923             cout.width(12);
2924             cout << at(model).AIC_score << " ";
2925             cout.width(12);
2926             cout << at(model).AICc_score << " " << at(model).BIC_score;
2927             cout << endl;
2928 
2929         }
2930         if (model >= rate_block)
2931             filterRates(model); // auto filter rate models
2932         if (model >= subst_block)
2933             filterSubst(model); // auto filter substitution model
2934 #ifdef _OPENMP
2935         }
2936 #endif
2937     } while (model != -1);
2938     }
2939 
2940     // store the best model
2941     ModelTestCriterion criteria[] = {MTC_AIC, MTC_AICC, MTC_BIC};
2942     for (auto mtc : criteria) {
2943         int best_model = getBestModelID(mtc);
2944         model_info.put("best_score_" + criterionName(mtc), at(best_model).getScore(mtc));
2945         model_info.put("best_model_" + criterionName(mtc), at(best_model).getName());
2946     }
2947 
2948 
2949     /* sort models by their scores */
2950     multimap<double,int> model_sorted;
2951     for (int64_t model = 0; model < num_models; model++)
2952         if (at(model).hasFlag(MF_DONE)) {
2953             model_sorted.insert(multimap<double,int>::value_type(at(model).getScore(), model));
2954         }
2955     string model_list;
2956     for (auto it = model_sorted.begin(); it != model_sorted.end(); it++) {
2957         if (it != model_sorted.begin())
2958             model_list += " ";
2959         model_list += at(it->second).getName();
2960     }
2961 
2962     model_info.putBestModelList(model_list);
2963     model_info.dump();
2964 
2965     // update alignment if best data type changed
2966     int best_model = getBestModelID(params.model_test_criterion);
2967     if (at(best_model).aln != in_tree->aln) {
2968         delete in_tree->aln;
2969         in_tree->aln = at(best_model).aln;
2970         if (in_tree->aln == prot_aln)
2971             prot_aln = NULL;
2972         else
2973             dna_aln = NULL;
2974     }
2975 
2976     if (dna_aln)
2977         delete dna_aln;
2978     if (prot_aln)
2979         delete prot_aln;
2980 
2981     return at(best_model);
2982 }
2983 
2984 
2985