1 /***************************************************************************
2 * Copyright (C) 2009-2015 by *
3 * BUI Quang Minh <minh.bui@univie.ac.at> *
4 * Lam-Tung Nguyen <nltung@gmail.com> *
5 * *
6 * *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
21 ***************************************************************************/
22 #include "rategammainvar.h"
23
RateGammaInvar(int ncat,double shape,bool median,double p_invar_sites,string optimize_alg,PhyloTree * tree,bool testParamDone)24 RateGammaInvar::RateGammaInvar(int ncat, double shape, bool median,
25 double p_invar_sites, string optimize_alg, PhyloTree *tree, bool testParamDone) :
26 RateInvar(p_invar_sites, tree), RateGamma(ncat, shape, median, tree) {
27 name = "+I" + name;
28 full_name = "Invar+" + full_name;
29 this->optimize_alg = optimize_alg;
30 cur_optimize = 0;
31 this->testParamDone = testParamDone;
32 for (int cat = 0; cat < ncategory; cat++)
33 rates[cat] = 1.0/(1.0-p_invar);
34 computeRates();
35 }
36
setPInvar(double pInvar)37 void RateGammaInvar::setPInvar(double pInvar) {
38 p_invar = pInvar;
39 for (int cat = 0; cat < ncategory; cat++)
40 rates[cat] = 1.0/(1.0-p_invar);
41 computeRates();
42 }
43
startCheckpoint()44 void RateGammaInvar::startCheckpoint() {
45 checkpoint->startStruct("RateGammaInvar");
46 }
47
saveCheckpoint()48 void RateGammaInvar::saveCheckpoint() {
49 RateInvar::saveCheckpoint();
50 RateGamma::saveCheckpoint();
51 }
52
restoreCheckpoint()53 void RateGammaInvar::restoreCheckpoint() {
54 // should restore p_invar first before gamma, because RateGamma will call computeRates()
55 RateInvar::restoreCheckpoint();
56 for (int cat = 0; cat < ncategory; cat++)
57 rates[cat] = 1.0/(1.0-p_invar);
58 RateGamma::restoreCheckpoint();
59 }
60
setNCategory(int ncat)61 void RateGammaInvar::setNCategory(int ncat) {
62 RateGamma::setNCategory(ncat);
63 for (int cat = 0; cat < ncategory; cat++)
64 rates[cat] = 1.0/(1.0-p_invar);
65 name = "+I" + name;
66 full_name = "Invar+" + full_name;
67 computeRates();
68 }
69
getNameParams()70 string RateGammaInvar::getNameParams() {
71 return RateInvar::getNameParams() + RateGamma::getNameParams();
72 }
73
computeFunction(double value)74 double RateGammaInvar::computeFunction(double value) {
75 if (cur_optimize == 0)
76 gamma_shape = value;
77 else {
78 p_invar = value;
79 for (int cat = 0; cat < ncategory; cat++)
80 rates[cat] = 1.0/(1.0-p_invar);
81 }
82 // need to compute rates again if p_inv or Gamma shape changes!
83 computeRates();
84 phylo_tree->clearAllPartialLH();
85 return -phylo_tree->computeLikelihood();
86 }
87
writeInfo(ostream & out)88 void RateGammaInvar::writeInfo(ostream &out) {
89 RateInvar::writeInfo(out);
90 RateGamma::writeInfo(out);
91 }
92
writeParameters(ostream & out)93 void RateGammaInvar::writeParameters(ostream &out) {
94 RateInvar::writeParameters(out);
95 RateGamma::writeParameters(out);
96 }
97
setVariables(double * variables)98 void RateGammaInvar::setVariables(double *variables) {
99 RateGamma::setVariables(variables);
100 int gid = RateGamma::getNDim();
101 RateInvar::setVariables(variables+gid);
102 }
103
getVariables(double * variables)104 bool RateGammaInvar::getVariables(double *variables) {
105 int gid = RateGamma::getNDim();
106 bool changed = RateGamma::getVariables(variables);
107 changed |= RateInvar::getVariables(variables+gid);
108 if (changed) {
109 // need to compute rates again if p_inv or Gamma shape changes!
110 RateGamma::computeRates();
111 }
112 return changed;
113 }
114
targetFunk(double x[])115 double RateGammaInvar::targetFunk(double x[]) {
116 ASSERT(phylo_tree);
117 getVariables(x);
118 phylo_tree->clearAllPartialLH();
119 return -phylo_tree->computeLikelihood();
120 }
121
setBounds(double * lower_bound,double * upper_bound,bool * bound_check)122 void RateGammaInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
123 int gid = RateGamma::getNDim();
124 RateGamma::setBounds(lower_bound, upper_bound, bound_check);
125 RateInvar::setBounds(lower_bound+gid, upper_bound+gid, bound_check+gid);
126 }
127
optimizeParameters(double gradient_epsilon)128 double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
129
130 int ndim = getNDim();
131
132 // return if nothing to be optimized
133 if (ndim == 0)
134 return phylo_tree->computeLikelihood();
135
136 if (verbose_mode >= VB_MED)
137 cout << "Optimizing " << name << " model parameters by " << optimize_alg << " algorithm..." << endl;
138
139 if (optimize_alg.find("EM_RR") != string::npos) {
140 return randomRestartOptimization(gradient_epsilon);
141 } else if (optimize_alg.find("Brent") != string::npos || phylo_tree->aln->frac_const_sites == 0.0 || isFixPInvar() || isFixGammaShape()) {
142 double lh = phylo_tree->computeLikelihood();
143 cur_optimize = 0;
144 double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
145 ASSERT(gamma_lh >= lh-0.1);
146 cur_optimize = 1;
147 double invar_lh = -DBL_MAX;
148 invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
149 ASSERT(invar_lh >= gamma_lh-0.1);
150 cur_optimize = 0;
151 return invar_lh;
152 } else if (optimize_alg.find("EM") != string::npos) {
153 return optimizeWithEM(gradient_epsilon);
154 } else if (optimize_alg.find("BFGS") != string::npos) {
155 //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
156 double *variables = new double[ndim+1];
157 double *upper_bound = new double[ndim+1];
158 double *lower_bound = new double[ndim+1];
159 bool *bound_check = new bool[ndim+1];
160 double score;
161
162 // by BFGS algorithm
163 setVariables(variables);
164 setBounds(lower_bound, upper_bound, bound_check);
165
166 score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_GAMMA_SHAPE));
167
168 getVariables(variables);
169
170 phylo_tree->clearAllPartialLH();
171 score = phylo_tree->computeLikelihood();
172
173 delete [] bound_check;
174 delete [] lower_bound;
175 delete [] upper_bound;
176 delete [] variables;
177
178 return score;
179 } else {
180 string errMsg = "Unknown optimization algorithm: " + optimize_alg;
181 outError(errMsg.c_str());
182 return 0.0;
183 }
184 }
185
186
computePatternRates(DoubleVector & pattern_rates,IntVector & pattern_cat)187 int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
188 //cout << "Computing Gamma site rates by empirical Bayes..." << endl;
189
190 phylo_tree->computePatternLhCat(WSL_RATECAT);
191
192 int npattern = phylo_tree->aln->getNPattern();
193 pattern_rates.resize(npattern);
194 pattern_cat.resize(npattern);
195
196 double *lh_cat = phylo_tree->_pattern_lh_cat;
197 for (int i = 0; i < npattern; i++) {
198 double sum_rate = 0.0, sum_lh = phylo_tree->ptn_invar[i];
199 int best = 0;
200 double best_lh = phylo_tree->ptn_invar[i];
201 for (int c = 0; c < ncategory; c++) {
202 sum_rate += rates[c] * lh_cat[c];
203 sum_lh += lh_cat[c];
204 if (lh_cat[c] > best_lh || (lh_cat[c] == best_lh && random_double()<0.5)) { // break tie at random
205 best = c+1;
206 best_lh = lh_cat[c];
207 }
208 }
209 pattern_rates[i] = sum_rate / sum_lh;
210 pattern_cat[i] = best;
211 lh_cat += ncategory;
212 }
213 return ncategory+1;
214 }
215
optimizeWithEM(double gradient_epsilon)216 double RateGammaInvar::optimizeWithEM(double gradient_epsilon) {
217 double curlh = phylo_tree->computeLikelihood();
218
219 cur_optimize = 0;
220 double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
221 ASSERT(gamma_lh > curlh - 1.0);
222 curlh = gamma_lh;
223
224 size_t ncat = getNRate();
225 size_t nptn = phylo_tree->aln->getNPattern();
226 size_t nSites = phylo_tree->aln->getNSite();
227
228 // Compute the pattern likelihood for each category (invariable and variable category)
229 phylo_tree->computePatternLhCat(WSL_RATECAT);
230 phylo_tree->computePtnInvar();
231
232 double ppInvar = 0;
233 for (size_t ptn = 0; ptn < nptn; ptn++) {
234 double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn * ncat;
235 double lk_ptn = phylo_tree->ptn_invar[ptn];
236 for (size_t cat = 0; cat < ncat; cat++) {
237 lk_ptn += this_lk_cat[cat];
238 }
239 ASSERT(lk_ptn != 0.0);
240 ppInvar += (phylo_tree->ptn_invar[ptn]) * phylo_tree->ptn_freq[ptn] / lk_ptn;
241 }
242
243 double newPInvar = ppInvar / nSites;
244 ASSERT(newPInvar < 1.0);
245 //double curPInv = getPInvar();
246 // setPInvar(newPInvar);
247 p_invar = newPInvar;
248 phylo_tree->clearAllPartialLH();
249 // phylo_tree->scaleLength((1-newPInvar)/(1-curPInv));
250 double pinvLH = phylo_tree->computeLikelihood();
251 ASSERT(pinvLH > curlh - 1.0);
252 return pinvLH;
253 }
254
randomRestartOptimization(double gradient_epsilon)255 double RateGammaInvar::randomRestartOptimization(double gradient_epsilon) {
256 if (testParamDone) {
257 return optimizeWithEM(gradient_epsilon);
258 }
259 double frac_const = phylo_tree->aln->frac_const_sites;
260 double bestLogl = phylo_tree->getCurScore();
261 double bestAlpha = 0.0;
262 double bestPInvar = 0.0;
263 double testInterval = (frac_const - MIN_PINVAR*2) / 10;
264 double initPInv = MIN_PINVAR;
265 double initAlpha = getGammaShape();
266
267 while (initPInv <= frac_const) {
268 if (verbose_mode >= VB_MED) {
269 cout << endl;
270 cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
271 }
272 setPInvar(initPInv);
273 setGammaShape(initAlpha);
274 phylo_tree->clearAllPartialLH();
275 double logl = optimizeWithEM(gradient_epsilon);
276 double estAlpha = getGammaShape();
277 double estPInv = getPInvar();
278
279 if (verbose_mode >= VB_MED) {
280 cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
281 << " / Logl: " << logl << endl;
282 }
283
284 initPInv = initPInv + testInterval;
285
286 if (logl > bestLogl) {
287 bestLogl = logl;
288 bestAlpha = estAlpha;
289 bestPInvar = estPInv;
290 }
291 }
292
293 if (verbose_mode >= VB_MED) {
294 cout << "Best gamma shape: " << bestAlpha << " / best p_inv: " << bestPInvar
295 << " / logl: " << bestLogl << endl;
296 }
297
298 setPInvar(bestPInvar);
299 setGammaShape(bestAlpha);
300 phylo_tree->clearAllPartialLH();
301 testParamDone = true;
302 return phylo_tree->computeLikelihood();
303 }
304
305
meanRates()306 double RateGammaInvar::meanRates() {
307 double ret = 0.0;
308 for (int i = 0; i < ncategory; i++)
309 ret += rates[i];
310 ret *= (1.0-getPInvar())/ncategory;
311 return ret;
312 }
313
314 /**
315 * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
316 * @return rescaling factor
317 */
rescaleRates()318 double RateGammaInvar::rescaleRates() {
319 double norm = meanRates();
320 for (int i = 0; i < ncategory; i++)
321 rates[i] /= norm;
322 return norm;
323 }
324
325
326
327
328