1 //
2 // rateheterotachy.cpp
3 // iqtree
4 //
5 // Created by Minh Bui on 11/8/16.
6 //
7 //
8
9 #include "tree/phylotree.h"
10 #include "rateheterotachy.h"
11
RateHeterotachy(int ncat,string params,PhyloTree * tree)12 RateHeterotachy::RateHeterotachy(int ncat, string params, PhyloTree *tree) : RateHeterogeneity() {
13 phylo_tree = tree;
14 prop = NULL;
15 fix_params = 0;
16 optimize_steps = 0;
17 setNCategory(ncat);
18
19 if (params.empty()) return;
20 DoubleVector params_vec;
21 try {
22 convert_double_vec(params.c_str(), params_vec);
23 if (params_vec.size() != ncategory)
24 outError("Number of parameters for rate heterotachy model must equal number of categories");
25 int i;
26 double sum_prop;
27 for (i = 0, sum_prop = 0.0; i < ncategory; i++) {
28 prop[i] = params_vec[i];
29 sum_prop += prop[i];
30 }
31 if (fabs(sum_prop-1.0) > 1e-5)
32 outError("Sum of category proportions not equal to 1");
33 // Minh: Please double check this one. It isn't quite so
34 // clear what fix_params is doing, as it seems to take values
35 // 0, 1 or 2. -- MDW
36 //BQM: that OK
37
38 if (!(tree->params->optimize_from_given_params)) {
39 fix_params = 1;
40 } // else fix_params == 0 still.
41 } catch (string &str) {
42 outError(str);
43 }
44 }
45
46 /**
47 destructor
48 */
~RateHeterotachy()49 RateHeterotachy::~RateHeterotachy() {
50 if (prop)
51 delete [] prop;
52 prop = NULL;
53 }
54
setNCategory(int ncat)55 void RateHeterotachy::setNCategory(int ncat) {
56 ncategory = ncat;
57 if (optimize_steps == 0)
58 optimize_steps = ncat*100;
59 // initialize with gamma rates
60 if (prop) delete [] prop;
61 prop = new double[ncategory];
62
63 int i;
64 for (i = 0; i < ncategory; i++)
65 prop[i] = (1.0-getPInvar())/ncategory;
66
67 name = "+H";
68 name += convertIntToString(ncategory);
69 full_name = "Rate heterotachy";
70 full_name += " with " + convertIntToString(ncategory) + " categories";
71 }
72
startCheckpoint()73 void RateHeterotachy::startCheckpoint() {
74 checkpoint->startStruct("RateHeterotachy" + convertIntToString(ncategory));
75 }
76
77 /**
78 save object into the checkpoint
79 */
saveCheckpoint()80 void RateHeterotachy::saveCheckpoint() {
81 startCheckpoint();
82 CKP_ARRAY_SAVE(ncategory, prop);
83 endCheckpoint();
84 RateHeterogeneity::saveCheckpoint();
85 }
86
87 /**
88 restore object from the checkpoint
89 */
restoreCheckpoint()90 void RateHeterotachy::restoreCheckpoint() {
91 RateHeterogeneity::restoreCheckpoint();
92 startCheckpoint();
93 CKP_ARRAY_RESTORE(ncategory, prop);
94 endCheckpoint();
95 }
96
getNDim()97 int RateHeterotachy::getNDim() {
98 if (fix_params) return 0;
99 return ncategory-1;
100 }
101
102 /**
103 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
104 */
getNameParams()105 string RateHeterotachy::getNameParams() {
106 stringstream str;
107 str << "+H" << ncategory << "{";
108 for (int i = 0; i < ncategory; i++) {
109 if (i > 0) str << ",";
110 str << prop[i];
111 }
112 str << "}";
113 return str.str();
114 }
115
writeInfo(ostream & out)116 void RateHeterotachy::writeInfo(ostream &out) {
117 if (fix_params != 2) {
118 out << "Heterotachy weights: ";
119 for (int i = 0; i < ncategory; i++)
120 out << " " << prop[i];
121 out << endl;
122 }
123 DoubleVector lenvec;
124 phylo_tree->treeLengths(lenvec);
125 out << "Heterotachy tree lengths:";
126 for (int j = 0; j < lenvec.size(); j++)
127 out << " " << lenvec[j];
128 out << endl;
129 }
130
writeParameters(ostream & out)131 void RateHeterotachy::writeParameters(ostream &out) {
132 for (int i = 0; i < ncategory; i++)
133 out << "\t" << prop[i];
134 }
135
136 /**
137 optimize parameters. Default is to optimize gamma shape
138 @return the best likelihood
139 */
optimizeParameters(double gradient_epsilon)140 double RateHeterotachy::optimizeParameters(double gradient_epsilon) {
141 if (fix_params)
142 return phylo_tree->computeLikelihood();
143 if (verbose_mode >= VB_MED)
144 cout << "Optimizing " << name << " model parameters by EM algorithm..." << endl;
145
146 return optimizeWithEM();
147 }
148
optimizeWithEM()149 double RateHeterotachy::optimizeWithEM() {
150
151 // first compute _pattern_lh_cat
152 phylo_tree->computePatternLhCat(WSL_RATECAT);
153 size_t ptn, c;
154 size_t nptn = phylo_tree->aln->getNPattern();
155 size_t nmix = ncategory;
156
157 double *new_prop = aligned_alloc<double>(nmix);
158 double *ratio_prop = aligned_alloc<double>(nmix);
159
160 // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
161
162 for (int step = 0; step < optimize_steps; step++) {
163 // E-step
164
165 if (step > 0) {
166 // convert _pattern_lh_cat taking into account new weights
167 for (ptn = 0; ptn < nptn; ptn++) {
168 double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
169 for (c = 0; c < nmix; c++) {
170 this_lk_cat[c] *= ratio_prop[c];
171 }
172 }
173 }
174 memset(new_prop, 0, nmix*sizeof(double));
175 for (ptn = 0; ptn < nptn; ptn++) {
176 double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
177 double lk_ptn = phylo_tree->ptn_invar[ptn];
178 for (c = 0; c < nmix; c++) {
179 lk_ptn += this_lk_cat[c];
180 }
181 ASSERT(lk_ptn != 0.0);
182 lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
183 for (c = 0; c < nmix; c++) {
184 new_prop[c] += this_lk_cat[c] * lk_ptn;
185 }
186 }
187 bool converged = true;
188 double new_pinvar = 0.0;
189 for (c = 0; c < nmix; c++) {
190 new_prop[c] /= phylo_tree->getAlnNSite();
191 // Make sure that probabilities do not get zero
192 if (new_prop[c] < 1e-10) new_prop[c] = 1e-10;
193 // check for convergence
194 converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
195 ratio_prop[c] = new_prop[c] / prop[c];
196 if (std::isnan(ratio_prop[c])) {
197 cerr << "BUG: " << new_prop[c] << " " << prop[c] << " " << ratio_prop[c] << endl;
198 }
199 prop[c] = new_prop[c];
200 new_pinvar += prop[c];
201 }
202 new_pinvar = fabs(1.0 - new_pinvar);
203 if (new_pinvar > 1e-6) {
204 converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4);
205 // TODO fix p_pinvar
206 setPInvar(new_pinvar);
207 // phylo_tree->getRate()->setOptimizePInvar(false);
208 phylo_tree->computePtnInvar();
209 phylo_tree->clearAllPartialLH();
210
211 }
212 if (converged) break;
213
214 }
215
216 aligned_free(ratio_prop);
217 aligned_free(new_prop);
218 // aligned_free(lk_ptn);
219 return phylo_tree->computeLikelihood();
220
221 /*
222 size_t ptn, c;
223 size_t nptn = phylo_tree->aln->getNPattern();
224 size_t nmix = ncategory;
225 const double MIN_PROP = 1e-4;
226
227 double new_prop[nmix];
228
229 // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
230
231 // first compute _pattern_lh_cat
232 double score;
233 score = phylo_tree->computePatternLhCat(WSL_RATECAT);
234
235 memset(new_prop, 0, nmix*sizeof(double));
236
237 // E-step
238 // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
239 for (ptn = 0; ptn < nptn; ptn++) {
240 double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
241 double lk_ptn = phylo_tree->ptn_invar[ptn];
242 for (c = 0; c < nmix; c++) {
243 lk_ptn += this_lk_cat[c];
244 }
245 assert(lk_ptn != 0.0);
246 lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
247
248 // transform _pattern_lh_cat into posterior probabilities of each category
249 for (c = 0; c < nmix; c++) {
250 this_lk_cat[c] *= lk_ptn;
251 new_prop[c] += this_lk_cat[c];
252 }
253
254 }
255
256 // M-step, update weights according to (*)
257 int maxpropid = 0;
258 double new_pinvar = 0.0;
259 for (c = 0; c < nmix; c++) {
260 new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
261 if (new_prop[c] > new_prop[maxpropid])
262 maxpropid = c;
263 }
264 // regularize prop
265 bool zero_prop = false;
266 for (c = 0; c < nmix; c++) {
267 if (new_prop[c] < MIN_PROP) {
268 new_prop[maxpropid] -= (MIN_PROP - new_prop[c]);
269 new_prop[c] = MIN_PROP;
270 zero_prop = true;
271 }
272 }
273
274 double sum_prop = 0.0;
275 for (c = 0; c < nmix; c++) {
276 // check for convergence
277 sum_prop += new_prop[c];
278 prop[c] = new_prop[c];
279 new_pinvar += new_prop[c];
280 }
281
282 new_pinvar = 1.0 - new_pinvar;
283
284 if (new_pinvar > 1e-4 && getPInvar() != 0.0) {
285 setPInvar(new_pinvar);
286 // setOptimizePInvar(false);
287 phylo_tree->computePtnInvar();
288 }
289
290 assert(fabs(sum_prop+new_pinvar-1.0) < MIN_PROP);
291
292 // sort the rates in increasing order
293 if (sorted_rates) {
294 // TODO sort tree lengths per category
295 }
296
297 return phylo_tree->computeLikelihood();
298 */
299 }
300