1 /***************************************************************************
2  *   Copyright (C) 2009-2015 by                                            *
3  *   BUI Quang Minh <minh.bui@univie.ac.at>                                *
4  *   Lam-Tung Nguyen <nltung@gmail.com>                                    *
5  *                                                                         *
6  *                                                                         *
7  *   This program is free software; you can redistribute it and/or modify  *
8  *   it under the terms of the GNU General Public License as published by  *
9  *   the Free Software Foundation; either version 2 of the License, or     *
10  *   (at your option) any later version.                                   *
11  *                                                                         *
12  *   This program is distributed in the hope that it will be useful,       *
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15  *   GNU General Public License for more details.                          *
16  *                                                                         *
17  *   You should have received a copy of the GNU General Public License     *
18  *   along with this program; if not, write to the                         *
19  *   Free Software Foundation, Inc.,                                       *
20  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
21  ***************************************************************************/
22 #include "rategammainvar.h"
23 
RateGammaInvar(int ncat,double shape,bool median,double p_invar_sites,string optimize_alg,PhyloTree * tree,bool testParamDone)24 RateGammaInvar::RateGammaInvar(int ncat, double shape, bool median,
25 		double p_invar_sites, string optimize_alg, PhyloTree *tree, bool testParamDone) :
26 		RateInvar(p_invar_sites, tree), RateGamma(ncat, shape, median, tree) {
27 	name = "+I" + name;
28 	full_name = "Invar+" + full_name;
29     this->optimize_alg = optimize_alg;
30     cur_optimize = 0;
31     this->testParamDone = testParamDone;
32     for (int cat = 0; cat < ncategory; cat++)
33         rates[cat] = 1.0/(1.0-p_invar);
34 	computeRates();
35 }
36 
setPInvar(double pInvar)37 void RateGammaInvar::setPInvar(double pInvar) {
38     p_invar = pInvar;
39     for (int cat = 0; cat < ncategory; cat++)
40         rates[cat] = 1.0/(1.0-p_invar);
41     computeRates();
42 }
43 
startCheckpoint()44 void RateGammaInvar::startCheckpoint() {
45     checkpoint->startStruct("RateGammaInvar");
46 }
47 
saveCheckpoint()48 void RateGammaInvar::saveCheckpoint() {
49     RateInvar::saveCheckpoint();
50     RateGamma::saveCheckpoint();
51 }
52 
restoreCheckpoint()53 void RateGammaInvar::restoreCheckpoint() {
54     // should restore p_invar first before gamma, because RateGamma will call computeRates()
55     RateInvar::restoreCheckpoint();
56     for (int cat = 0; cat < ncategory; cat++)
57         rates[cat] = 1.0/(1.0-p_invar);
58     RateGamma::restoreCheckpoint();
59 }
60 
setNCategory(int ncat)61 void RateGammaInvar::setNCategory(int ncat) {
62 	RateGamma::setNCategory(ncat);
63     for (int cat = 0; cat < ncategory; cat++)
64         rates[cat] = 1.0/(1.0-p_invar);
65 	name = "+I" + name;
66 	full_name = "Invar+" + full_name;
67 	computeRates();
68 }
69 
getNameParams()70 string RateGammaInvar::getNameParams() {
71 	return RateInvar::getNameParams() + RateGamma::getNameParams();
72 }
73 
computeFunction(double value)74 double RateGammaInvar::computeFunction(double value) {
75 	if (cur_optimize == 0)
76 		gamma_shape = value;
77 	else {
78 		p_invar = value;
79         for (int cat = 0; cat < ncategory; cat++)
80             rates[cat] = 1.0/(1.0-p_invar);
81     }
82 	// need to compute rates again if p_inv or Gamma shape changes!
83 	computeRates();
84 	phylo_tree->clearAllPartialLH();
85 	return -phylo_tree->computeLikelihood();
86 }
87 
writeInfo(ostream & out)88 void RateGammaInvar::writeInfo(ostream &out) {
89 	RateInvar::writeInfo(out);
90 	RateGamma::writeInfo(out);
91 }
92 
writeParameters(ostream & out)93 void RateGammaInvar::writeParameters(ostream &out) {
94 	RateInvar::writeParameters(out);
95 	RateGamma::writeParameters(out);
96 }
97 
setVariables(double * variables)98 void RateGammaInvar::setVariables(double *variables) {
99 	RateGamma::setVariables(variables);
100 	int gid = RateGamma::getNDim();
101 	RateInvar::setVariables(variables+gid);
102 }
103 
getVariables(double * variables)104 bool RateGammaInvar::getVariables(double *variables) {
105 	int gid = RateGamma::getNDim();
106 	bool changed = RateGamma::getVariables(variables);
107 	changed |= RateInvar::getVariables(variables+gid);
108     if (changed) {
109         // need to compute rates again if p_inv or Gamma shape changes!
110         RateGamma::computeRates();
111     }
112     return changed;
113 }
114 
targetFunk(double x[])115 double RateGammaInvar::targetFunk(double x[]) {
116 	ASSERT(phylo_tree);
117 	getVariables(x);
118 	phylo_tree->clearAllPartialLH();
119 	return -phylo_tree->computeLikelihood();
120 }
121 
setBounds(double * lower_bound,double * upper_bound,bool * bound_check)122 void RateGammaInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
123 	int gid = RateGamma::getNDim();
124 	RateGamma::setBounds(lower_bound, upper_bound, bound_check);
125 	RateInvar::setBounds(lower_bound+gid, upper_bound+gid, bound_check+gid);
126 }
127 
optimizeParameters(double gradient_epsilon)128 double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
129 
130 	int ndim = getNDim();
131 
132 	// return if nothing to be optimized
133 	if (ndim == 0)
134 		return phylo_tree->computeLikelihood();
135 
136     if (verbose_mode >= VB_MED)
137         cout << "Optimizing " << name << " model parameters by " << optimize_alg << " algorithm..." << endl;
138 
139 	if (optimize_alg.find("EM_RR") != string::npos) {
140         return randomRestartOptimization(gradient_epsilon);
141     } else if (optimize_alg.find("Brent") != string::npos || phylo_tree->aln->frac_const_sites == 0.0 || isFixPInvar() || isFixGammaShape()) {
142 		double lh = phylo_tree->computeLikelihood();
143 		cur_optimize = 0;
144 		double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
145 		ASSERT(gamma_lh >= lh-0.1);
146 		cur_optimize = 1;
147 		double invar_lh = -DBL_MAX;
148         invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
149 		ASSERT(invar_lh >= gamma_lh-0.1);
150         cur_optimize = 0;
151         return invar_lh;
152 	} else if (optimize_alg.find("EM") != string::npos) {
153         return optimizeWithEM(gradient_epsilon);
154     } else if (optimize_alg.find("BFGS") != string::npos) {
155         //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
156         double *variables = new double[ndim+1];
157         double *upper_bound = new double[ndim+1];
158         double *lower_bound = new double[ndim+1];
159         bool *bound_check = new bool[ndim+1];
160         double score;
161 
162         // by BFGS algorithm
163         setVariables(variables);
164         setBounds(lower_bound, upper_bound, bound_check);
165 
166         score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_GAMMA_SHAPE));
167 
168         getVariables(variables);
169 
170         phylo_tree->clearAllPartialLH();
171         score = phylo_tree->computeLikelihood();
172 
173         delete [] bound_check;
174         delete [] lower_bound;
175         delete [] upper_bound;
176         delete [] variables;
177 
178         return score;
179     } else {
180         string errMsg = "Unknown optimization algorithm: " + optimize_alg;
181         outError(errMsg.c_str());
182         return 0.0;
183     }
184 }
185 
186 
computePatternRates(DoubleVector & pattern_rates,IntVector & pattern_cat)187 int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
188 	//cout << "Computing Gamma site rates by empirical Bayes..." << endl;
189 
190 	phylo_tree->computePatternLhCat(WSL_RATECAT);
191 
192 	int npattern = phylo_tree->aln->getNPattern();
193 	pattern_rates.resize(npattern);
194 	pattern_cat.resize(npattern);
195 
196 	double *lh_cat = phylo_tree->_pattern_lh_cat;
197 	for (int i = 0; i < npattern; i++) {
198 		double sum_rate = 0.0, sum_lh = phylo_tree->ptn_invar[i];
199 		int best = 0;
200         double best_lh = phylo_tree->ptn_invar[i];
201 		for (int c = 0; c < ncategory; c++) {
202 			sum_rate += rates[c] * lh_cat[c];
203 			sum_lh += lh_cat[c];
204 			if (lh_cat[c] > best_lh || (lh_cat[c] == best_lh && random_double()<0.5)) { // break tie at random
205                 best = c+1;
206                 best_lh = lh_cat[c];
207             }
208 		}
209 		pattern_rates[i] = sum_rate / sum_lh;
210 		pattern_cat[i] = best;
211 		lh_cat += ncategory;
212 	}
213     return ncategory+1;
214 }
215 
optimizeWithEM(double gradient_epsilon)216 double RateGammaInvar::optimizeWithEM(double gradient_epsilon) {
217     double curlh = phylo_tree->computeLikelihood();
218 
219     cur_optimize = 0;
220     double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
221     ASSERT(gamma_lh > curlh - 1.0);
222     curlh = gamma_lh;
223 
224     size_t ncat = getNRate();
225     size_t nptn = phylo_tree->aln->getNPattern();
226     size_t nSites = phylo_tree->aln->getNSite();
227 
228     // Compute the pattern likelihood for each category (invariable and variable category)
229     phylo_tree->computePatternLhCat(WSL_RATECAT);
230     phylo_tree->computePtnInvar();
231 
232     double ppInvar = 0;
233     for (size_t ptn = 0; ptn < nptn; ptn++) {
234         double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn * ncat;
235         double lk_ptn = phylo_tree->ptn_invar[ptn];
236         for (size_t cat = 0; cat < ncat; cat++) {
237             lk_ptn += this_lk_cat[cat];
238         }
239         ASSERT(lk_ptn != 0.0);
240         ppInvar += (phylo_tree->ptn_invar[ptn]) * phylo_tree->ptn_freq[ptn] / lk_ptn;
241     }
242 
243     double newPInvar = ppInvar / nSites;
244     ASSERT(newPInvar < 1.0);
245     //double curPInv = getPInvar();
246 //    setPInvar(newPInvar);
247     p_invar = newPInvar;
248     phylo_tree->clearAllPartialLH();
249 //    phylo_tree->scaleLength((1-newPInvar)/(1-curPInv));
250     double pinvLH = phylo_tree->computeLikelihood();
251     ASSERT(pinvLH > curlh - 1.0);
252     return pinvLH;
253 }
254 
randomRestartOptimization(double gradient_epsilon)255 double RateGammaInvar::randomRestartOptimization(double gradient_epsilon) {
256     if (testParamDone) {
257         return optimizeWithEM(gradient_epsilon);
258     }
259     double frac_const = phylo_tree->aln->frac_const_sites;
260     double bestLogl = phylo_tree->getCurScore();
261     double bestAlpha = 0.0;
262     double bestPInvar = 0.0;
263     double testInterval = (frac_const - MIN_PINVAR*2) / 10;
264     double initPInv = MIN_PINVAR;
265     double initAlpha = getGammaShape();
266 
267     while (initPInv <= frac_const) {
268         if (verbose_mode >= VB_MED) {
269             cout << endl;
270             cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
271         }
272         setPInvar(initPInv);
273         setGammaShape(initAlpha);
274         phylo_tree->clearAllPartialLH();
275         double logl = optimizeWithEM(gradient_epsilon);
276         double estAlpha = getGammaShape();
277         double estPInv = getPInvar();
278 
279         if (verbose_mode >= VB_MED) {
280             cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
281             << " / Logl: " << logl << endl;
282         }
283 
284         initPInv = initPInv + testInterval;
285 
286         if (logl > bestLogl) {
287             bestLogl = logl;
288             bestAlpha = estAlpha;
289             bestPInvar = estPInv;
290         }
291     }
292 
293     if (verbose_mode >= VB_MED) {
294         cout << "Best gamma shape: " << bestAlpha << " / best p_inv: " << bestPInvar
295         << " / logl: " << bestLogl << endl;
296     }
297 
298     setPInvar(bestPInvar);
299     setGammaShape(bestAlpha);
300     phylo_tree->clearAllPartialLH();
301     testParamDone = true;
302     return phylo_tree->computeLikelihood();
303 }
304 
305 
meanRates()306 double RateGammaInvar::meanRates() {
307 	double ret = 0.0;
308 	for (int i = 0; i < ncategory; i++)
309 		ret += rates[i];
310     ret *= (1.0-getPInvar())/ncategory;
311 	return ret;
312 }
313 
314 /**
315  * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
316  * @return rescaling factor
317  */
rescaleRates()318 double RateGammaInvar::rescaleRates() {
319 	double norm = meanRates();
320 	for (int i = 0; i < ncategory; i++)
321 		rates[i] /= norm;
322 	return norm;
323 }
324 
325 
326 
327 
328