/* * mlparam.c * * * Part of TREE-PUZZLE 5.2 (July 2004) * * (c) 2003-2004 by Heiko A. Schmidt, Korbinian Strimmer, and Arndt von Haeseler * (c) 1999-2003 by Heiko A. Schmidt, Korbinian Strimmer, * M. Vingron, and Arndt von Haeseler * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler * * All parts of the source except where indicated are distributed under * the GNU public licence. See http://www.opensource.org for details. * * ($Id$) * */ #ifdef HAVE_CONFIG_H # include #endif #define EXTERN extern /* prototypes */ #include #include #include #include "puzzle.h" #include "util.h" #include "ml.h" #include "gamma.h" /******************************************************************************/ /* discrete Gamma-distribution and related stuff */ /******************************************************************************/ /* compare general base frequencies with frequencies of taxon i with chi square */ double homogentest(int taxon) { return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail); } /* discrete Gamma according to Yang 1994 (JME 39:306-314) */ void YangDiscreteGamma (double shape, int c, dvector x) { double twoc, mu; int i; twoc = 2.0*c; mu = 0.0; for (i = 0; i < c; i++) { /* corresponding rates */ x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape); mu += x[i]; } mu = mu/c; /* rescale for avarage rate of 1.0 */ for (i = 0; i < c; i++) { x[i] /= mu; } } /* compute rates of each category when rates are Gamma-distributed */ void updaterates() { int i; double alpha; if (numcats == 1) { Rates[0] = 1.0; return; } if (Geta == 0.0) { for (i = 0; i < numcats; i++) Rates[i] = 1.0; return; } alpha = (1.0 - Geta)/Geta; YangDiscreteGamma (alpha, numcats, Rates); /* if invariable sites are present */ for (i = 0; i < numcats; i++) Rates[i] = Rates[i]/(1.0-fracinv); /* check for very small rates */ for (i = 0; i < numcats; i++) if (Rates[i] < 0.000001) Rates[i] = 0.000001; } /******************************************************************************/ /* parameter estimation */ /******************************************************************************/ /* compute sample mean and standard deviation of sample mean */ void computestat(double *data, int n, double *mean, double *err) { int i; double sum; double temp; double avg; /* compute sample mean */ sum = 0; for (i = 0; i < n; i++) sum += data[i]; avg = sum/(double) n; (*mean) = avg; /* compute std. error of sample mean with curvature method */ sum = 0; for (i = 0; i < n; i++) { temp = data[i] - avg; sum += temp*temp; } if (n != 1) (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */ else (*err) = 0.0; /* if n == 1 */ } /* compute ML value of quartet (a,b,c,d) for parameter estimation with quartet subsampling */ double quartetml(int a, int b, int c, int d) { double d1, d2, d3; /* compute ML for all topologies */ if (approxp_optn) { /* approximate parameter mode */ d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */ d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */ d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */ } else { d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */ d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */ d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */ } /* looking for max(d1, d2, d3) */ if (d1 < d2) { /* d2 > d1 */ if (d2 < d3) { /* d3 > d2 > d1 */ /* d3 maximum */ return d3; } else { /* d2 >= d3 > d1 */ /* d2 maximum */ return d2; } } else { /* d1 >= d2 */ if (d1 < d3) { /* d3 > d1 >= d2 */ /* d3 maximum */ return d3; } else { /* d1 >= d2 && d1 >= d3 */ /* d1 maximum */ return d1; } } } /* optimization function TSparam - quartets */ double opttsq(double x) { if (x < MINTS) TSparam = MINTS; else if (x > MAXTS) TSparam = MAXTS; else TSparam = x; tranprobmat(); distupdate(qca, qcb, qcc, qcd); return (-quartetml(qca, qcb, qcc, qcd)); } /* optimization function YRparam - quartets */ double optyrq(double x) { if (x < MINYR) YRparam = MINYR; else if (x > MAXYR) YRparam = MAXYR; else YRparam = x; tranprobmat(); distupdate(qca, qcb, qcc, qcd); return (-quartetml(qca, qcb, qcc, qcd)); } /* estimate substitution process parameters - random quartets */ void optimseqevolparamsquart() { double tsmeanold, yrmeanold; dvector tslist, yrlist; int fin; ivector taxon; uli minqts, maxqts, n; taxon = new_ivector(4); /* number of quartets to be investigated */ minqts = (uli) floor(0.25 * MINPERTAXUM * Maxspc) + 1; maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1; if (Maxspc == 4) { minqts = (uli) 1; maxqts = (uli) 1; } tslist = new_dvector(maxqts); yrlist = new_dvector(maxqts); /* initialize averages */ tsmean = TSparam; yrmean = YRparam; fin = FALSE; /* investigate maxqts random quartets */ for (n = 0; n < maxqts; n++) { /* choose random quartet */ chooser(Maxspc, 4, taxon); /* * optimize parameters on this quartet */ qca = taxon[0]; qcb = taxon[1]; qcc = taxon[2]; qcd = taxon[3]; /* initialize start values with average value */ if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean; if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean; /* estimation */ twodimenmin(EPSILON_SUBSTPARAM, (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), MINTS, &TSparam, MAXTS, opttsq, &tserr, (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), MINYR, &YRparam, MAXYR, optyrq, &yrerr); tsmeanold = tsmean; yrmeanold = yrmean; tslist[n] = TSparam; yrlist[n] = YRparam; computestat(tslist, n+1 , &tsmean, &tserr); computestat(yrlist, n+1 , &yrmean, &yrerr); /* check whether the means are converging */ if (n > minqts-2) { if ((fabs(tsmean-tsmeanold) < TSDIFF) && (fabs(yrmean-yrmeanold) < YRDIFF)) fin = TRUE; } /* investigate at least minqts quartets */ if (n > minqts-2 && (fin || n > maxqts-2)) break; } /* round estimated numbers to 2 digits after the decimal point */ if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0; if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0; /* update ML engine */ TSparam = tsmean; YRparam = yrmean; tranprobmat(); free_ivector(taxon); } /* optimization function TSparam - tree */ double opttst(double x) { double result; if (x < MINTS) TSparam = MINTS; else if (x > MAXTS) TSparam = MAXTS; else TSparam = x; tranprobmat(); /*epe*/ # if PARALLEL PP_Update_EEI(); # endif computedistan(); /* pre-set branchlengths = FALSE; parallel estimation = TRUE */ if (approxp_optn) result = usertree_alklhd(FALSE, TRUE); else result = usertree_lklhd(FALSE, TRUE); return (-result); } /* optimization function YRparam - tree */ double optyrt(double x) { double result; if (x < MINYR) YRparam = MINYR; else if (x > MAXYR) YRparam = MAXYR; else YRparam = x; tranprobmat(); /*epe*/ # if PARALLEL PP_Update_EEI(); # endif computedistan(); /* pre-set branchlengths = FALSE; parallel estimation = TRUE */ if (approxp_optn) result = usertree_alklhd(FALSE, TRUE); else result = usertree_lklhd(FALSE, TRUE); return (-result); } /* optimize substitution process parameters - tree */ void optimseqevolparamstree() { twodimenmin(EPSILON_SUBSTPARAM, (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), MINTS, &TSparam, MAXTS, opttst, &tserr, (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), MINYR, &YRparam, MAXYR, optyrt, &yrerr); } /* optimization function fracinv */ double optfi(double x) { double result; if (x < MINFI) fracinv = MINFI; else if (x > MAXFI) fracinv = MAXFI; else fracinv = x; /*epe*/ # if PARALLEL PP_Update_fracinv(); # endif computedistan(); /* pre-set branchlengths = FALSE; parallel estimation = TRUE */ if (approxp_optn) result = usertree_alklhd(FALSE, TRUE); else result = usertree_lklhd(FALSE, TRUE); return (-result); } /* optimization function Geta */ double optge(double x) { double result; if (x < MINGE) Geta = MINGE; else if (x > MAXGE) Geta = MAXGE; else Geta = x; updaterates(); /*epe*/ # if PARALLEL PP_Update_Rates(); # endif computedistan(); /* pre-set branchlengths = FALSE; parallel estimation = TRUE */ if (approxp_optn) result = usertree_alklhd(FALSE, TRUE); else result = usertree_lklhd(FALSE, TRUE); return (-result); } /* optimize rate heterogeneity parameters */ void optimrateparams() { twodimenmin(EPSILON_RATEPARAM, fracinv_optim, MINFI, &fracinv, fracconst, optfi, &fierr, grate_optim, MINGE, &Geta, MAXGE, optge, &geerr); } /******************************************************************************/ /* main routines to estimate parameters from tree or quartets */ /******************************************************************************/ /* estimate parameters of substitution process and rate heterogeneity - no tree n-taxon tree is not needed because of quartet method or NJ tree topology */ void estimateparametersnotree() { int it, nump, change; double TSold, YRold, FIold, GEold; it = 0; nump = 0; /* count number of parameters */ if (data_optn == NUCLEOTIDE && optim_optn) nump++; if (fracinv_optim || grate_optim) nump++; do { /* repeat until nothing changes any more */ it++; change = FALSE; /* optimize substitution parameters */ if (data_optn == NUCLEOTIDE && optim_optn) { TSold = TSparam; YRold = YRparam; /* * optimize */ fprintf(STDOUT, "Optimizing missing substitution process parameters\n"); fflush(STDOUT); if (qcalg_optn) { /* quartet sampling */ optimseqevolparamsquart(); } else { /* NJ tree */ tmpfp = tmpfile(); njtree(tmpfp); rewind(tmpfp); readusertree(tmpfp, FALSE); closefile(tmpfp); optimseqevolparamstree(); } /*epe*/ #if PARALLEL PP_NoUpdate(); # endif computedistan(); /* update ML distances */ /* same tolerance as 1D minimization */ if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) || (fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM) ) change = TRUE; } /* optimize rate heterogeneity variables */ if (fracinv_optim || grate_optim) { FIold = fracinv; GEold = Geta; /* * optimize */ fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n"); fflush(STDOUT); /* compute NJ tree */ tmpfp = tmpfile(); njtree(tmpfp); /* use NJ tree topology to estimate parameters */ rewind(tmpfp); readusertree(tmpfp, FALSE); closefile(tmpfp); optimrateparams(); # if PARALLEL PP_NoUpdate(); # endif /* PARALLEL */ computedistan(); /* update ML distances */ /* same tolerance as 1D minimization */ if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) || (fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM) ) change = TRUE; } if (nump == 1) return; } while (it != MAXITS && change); return; } /* estimateparametersnotree */ /* estimate parameters of substitution process and rate heterogeneity - tree same as above but here the n-taxon tree is already in memory */ void estimateparameterstree() { int it, nump, change; double TSold, YRold, FIold, GEold; it = 0; nump = 0; /* count number of parameters */ if (data_optn == NUCLEOTIDE && optim_optn) nump++; if (fracinv_optim || grate_optim) nump++; do { /* repeat until nothing changes any more */ it++; change = FALSE; /* optimize substitution process parameters */ if (data_optn == NUCLEOTIDE && optim_optn) { TSold = TSparam; YRold = YRparam; /* * optimize */ fprintf(STDOUT, "Optimizing missing substitution process parameters\n"); fflush(STDOUT); optimseqevolparamstree(); /*epe*/ /*??? - auskommentiert, HAS ;-) */ #ifdef EPE_DEBUG fprintf(STDOUT, "\n\nRED ALERT\n\n\n"); #endif /* EPE_DEBUG */ computedistan(); /* update ML distances */ /* same tolerance as 1D minimization */ if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) || (fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM) ) change = TRUE; } /* optimize rate heterogeneity variables */ if (fracinv_optim || grate_optim) { FIold = fracinv; GEold = Geta; /* * optimize */ fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n"); fflush(STDOUT); optimrateparams(); /*epe*/ /*??? - auskommentiert, HAS ;-) */ #ifdef EPE_DEBUG fprintf(STDOUT, "\n\nRED ALERT\n\n\n"); #endif /* EPE_DEBUG */ computedistan(); /* update ML distances */ /* same tolerance as 1D minimization */ if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) || (fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM) ) change = TRUE; } if (nump == 1) return; } while (it != MAXITS && change); return; } /* estimateparameterstree */