1 //
2 //  rateheterotachy.cpp
3 //  iqtree
4 //
5 //  Created by Minh Bui on 11/8/16.
6 //
7 //
8 
9 #include "tree/phylotree.h"
10 #include "rateheterotachy.h"
11 
RateHeterotachy(int ncat,string params,PhyloTree * tree)12 RateHeterotachy::RateHeterotachy(int ncat, string params, PhyloTree *tree) : RateHeterogeneity() {
13     phylo_tree = tree;
14     prop = NULL;
15     fix_params = 0;
16     optimize_steps = 0;
17     setNCategory(ncat);
18 
19 	if (params.empty()) return;
20 	DoubleVector params_vec;
21 	try {
22 		convert_double_vec(params.c_str(), params_vec);
23 		if (params_vec.size() != ncategory)
24 			outError("Number of parameters for rate heterotachy model must equal number of categories");
25 		int i;
26 		double sum_prop;
27 		for (i = 0, sum_prop = 0.0; i < ncategory; i++) {
28 			prop[i] = params_vec[i];
29 			sum_prop += prop[i];
30 		}
31 		if (fabs(sum_prop-1.0) > 1e-5)
32 			outError("Sum of category proportions not equal to 1");
33 		// Minh: Please double check this one. It isn't quite so
34 		// clear what fix_params is doing, as it seems to take values
35 		// 0, 1 or 2.  -- MDW
36         //BQM: that OK
37 
38 		if (!(tree->params->optimize_from_given_params)) {
39 		        fix_params = 1;
40 		} // else fix_params == 0 still.
41 	} catch (string &str) {
42 		outError(str);
43 	}
44 }
45 
46 /**
47     destructor
48 */
~RateHeterotachy()49 RateHeterotachy::~RateHeterotachy() {
50     if (prop)
51         delete [] prop;
52     prop = NULL;
53 }
54 
setNCategory(int ncat)55 void RateHeterotachy::setNCategory(int ncat) {
56     ncategory = ncat;
57     if (optimize_steps == 0)
58         optimize_steps = ncat*100;
59     // initialize with gamma rates
60 	if (prop) delete [] prop;
61 	prop  = new double[ncategory];
62 
63     int i;
64 	for (i = 0; i < ncategory; i++)
65         prop[i] = (1.0-getPInvar())/ncategory;
66 
67 	name = "+H";
68 	name += convertIntToString(ncategory);
69 	full_name = "Rate heterotachy";
70 	full_name += " with " + convertIntToString(ncategory) + " categories";
71 }
72 
startCheckpoint()73 void RateHeterotachy::startCheckpoint() {
74     checkpoint->startStruct("RateHeterotachy" + convertIntToString(ncategory));
75 }
76 
77 /**
78     save object into the checkpoint
79 */
saveCheckpoint()80 void RateHeterotachy::saveCheckpoint() {
81     startCheckpoint();
82     CKP_ARRAY_SAVE(ncategory, prop);
83     endCheckpoint();
84     RateHeterogeneity::saveCheckpoint();
85 }
86 
87 /**
88     restore object from the checkpoint
89 */
restoreCheckpoint()90 void RateHeterotachy::restoreCheckpoint() {
91     RateHeterogeneity::restoreCheckpoint();
92     startCheckpoint();
93     CKP_ARRAY_RESTORE(ncategory, prop);
94     endCheckpoint();
95 }
96 
getNDim()97 int RateHeterotachy::getNDim() {
98     if (fix_params) return 0;
99     return ncategory-1;
100 }
101 
102 /**
103  * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
104  */
getNameParams()105 string RateHeterotachy::getNameParams() {
106 	stringstream str;
107 	str << "+H" << ncategory << "{";
108 	for (int i = 0; i < ncategory; i++) {
109 		if (i > 0) str << ",";
110 		str << prop[i];
111 	}
112 	str << "}";
113 	return str.str();
114 }
115 
writeInfo(ostream & out)116 void RateHeterotachy::writeInfo(ostream &out) {
117     if (fix_params != 2) {
118         out << "Heterotachy weights:     ";
119         for (int i = 0; i < ncategory; i++)
120             out << " " << prop[i];
121         out << endl;
122     }
123     DoubleVector lenvec;
124     phylo_tree->treeLengths(lenvec);
125     out << "Heterotachy tree lengths:";
126     for (int j = 0; j < lenvec.size(); j++)
127         out << " " << lenvec[j];
128     out << endl;
129 }
130 
writeParameters(ostream & out)131 void RateHeterotachy::writeParameters(ostream &out) {
132 	for (int i = 0; i < ncategory; i++)
133 		out << "\t" << prop[i];
134 }
135 
136 /**
137     optimize parameters. Default is to optimize gamma shape
138     @return the best likelihood
139 */
optimizeParameters(double gradient_epsilon)140 double RateHeterotachy::optimizeParameters(double gradient_epsilon) {
141     if (fix_params)
142         return phylo_tree->computeLikelihood();
143 	if (verbose_mode >= VB_MED)
144 		cout << "Optimizing " << name << " model parameters by EM algorithm..." << endl;
145 
146     return optimizeWithEM();
147 }
148 
optimizeWithEM()149 double RateHeterotachy::optimizeWithEM() {
150 
151     // first compute _pattern_lh_cat
152     phylo_tree->computePatternLhCat(WSL_RATECAT);
153     size_t ptn, c;
154     size_t nptn = phylo_tree->aln->getNPattern();
155     size_t nmix = ncategory;
156 
157     double *new_prop = aligned_alloc<double>(nmix);
158     double *ratio_prop = aligned_alloc<double>(nmix);
159 
160     // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
161 
162     for (int step = 0; step < optimize_steps; step++) {
163         // E-step
164 
165         if (step > 0) {
166             // convert _pattern_lh_cat taking into account new weights
167             for (ptn = 0; ptn < nptn; ptn++) {
168                 double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
169                 for (c = 0; c < nmix; c++) {
170                     this_lk_cat[c] *= ratio_prop[c];
171                 }
172             }
173         }
174         memset(new_prop, 0, nmix*sizeof(double));
175         for (ptn = 0; ptn < nptn; ptn++) {
176             double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
177             double lk_ptn = phylo_tree->ptn_invar[ptn];
178             for (c = 0; c < nmix; c++) {
179                 lk_ptn += this_lk_cat[c];
180             }
181             ASSERT(lk_ptn != 0.0);
182             lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
183             for (c = 0; c < nmix; c++) {
184                 new_prop[c] += this_lk_cat[c] * lk_ptn;
185             }
186         }
187         bool converged = true;
188         double new_pinvar = 0.0;
189         for (c = 0; c < nmix; c++) {
190             new_prop[c] /= phylo_tree->getAlnNSite();
191             // Make sure that probabilities do not get zero
192             if (new_prop[c] < 1e-10) new_prop[c] = 1e-10;
193             // check for convergence
194             converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
195             ratio_prop[c] = new_prop[c] / prop[c];
196             if (std::isnan(ratio_prop[c])) {
197                 cerr << "BUG: " << new_prop[c] << " " << prop[c] << " " << ratio_prop[c] << endl;
198             }
199             prop[c] = new_prop[c];
200             new_pinvar += prop[c];
201         }
202         new_pinvar = fabs(1.0 - new_pinvar);
203         if (new_pinvar > 1e-6) {
204             converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4);
205             // TODO fix p_pinvar
206             setPInvar(new_pinvar);
207 //            phylo_tree->getRate()->setOptimizePInvar(false);
208             phylo_tree->computePtnInvar();
209             phylo_tree->clearAllPartialLH();
210 
211         }
212         if (converged) break;
213 
214     }
215 
216     aligned_free(ratio_prop);
217     aligned_free(new_prop);
218 //    aligned_free(lk_ptn);
219     return phylo_tree->computeLikelihood();
220 
221 /*
222     size_t ptn, c;
223     size_t nptn = phylo_tree->aln->getNPattern();
224     size_t nmix = ncategory;
225     const double MIN_PROP = 1e-4;
226 
227     double new_prop[nmix];
228 
229     // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
230 
231     // first compute _pattern_lh_cat
232     double score;
233     score = phylo_tree->computePatternLhCat(WSL_RATECAT);
234 
235     memset(new_prop, 0, nmix*sizeof(double));
236 
237     // E-step
238     // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
239     for (ptn = 0; ptn < nptn; ptn++) {
240         double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
241         double lk_ptn = phylo_tree->ptn_invar[ptn];
242         for (c = 0; c < nmix; c++) {
243             lk_ptn += this_lk_cat[c];
244         }
245         assert(lk_ptn != 0.0);
246         lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
247 
248         // transform _pattern_lh_cat into posterior probabilities of each category
249         for (c = 0; c < nmix; c++) {
250             this_lk_cat[c] *= lk_ptn;
251             new_prop[c] += this_lk_cat[c];
252         }
253 
254     }
255 
256     // M-step, update weights according to (*)
257     int maxpropid = 0;
258     double new_pinvar = 0.0;
259     for (c = 0; c < nmix; c++) {
260         new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
261         if (new_prop[c] > new_prop[maxpropid])
262             maxpropid = c;
263     }
264     // regularize prop
265     bool zero_prop = false;
266     for (c = 0; c < nmix; c++) {
267         if (new_prop[c] < MIN_PROP) {
268             new_prop[maxpropid] -= (MIN_PROP - new_prop[c]);
269             new_prop[c] = MIN_PROP;
270             zero_prop = true;
271         }
272     }
273 
274     double sum_prop = 0.0;
275     for (c = 0; c < nmix; c++) {
276         // check for convergence
277         sum_prop += new_prop[c];
278         prop[c] = new_prop[c];
279         new_pinvar += new_prop[c];
280     }
281 
282     new_pinvar = 1.0 - new_pinvar;
283 
284     if (new_pinvar > 1e-4 && getPInvar() != 0.0) {
285         setPInvar(new_pinvar);
286 //            setOptimizePInvar(false);
287         phylo_tree->computePtnInvar();
288     }
289 
290     assert(fabs(sum_prop+new_pinvar-1.0) < MIN_PROP);
291 
292     // sort the rates in increasing order
293     if (sorted_rates) {
294         // TODO sort tree lengths per category
295     }
296 
297     return phylo_tree->computeLikelihood();
298 */
299 }
300