1 /*
2  * mlparam.c
3  *
4  *
5  * Part of TREE-PUZZLE 5.2 (July 2004)
6  *
7  * (c) 2003-2004 by Heiko A. Schmidt, Korbinian Strimmer, and Arndt von Haeseler
8  * (c) 1999-2003 by Heiko A. Schmidt, Korbinian Strimmer,
9  *                  M. Vingron, and Arndt von Haeseler
10  * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler
11  *
12  * All parts of the source except where indicated are distributed under
13  * the GNU public licence.  See http://www.opensource.org for details.
14  *
15  * ($Id$)
16  *
17  */
18 
19 #ifdef HAVE_CONFIG_H
20 #  include <config.h>
21 #endif
22 
23 
24 #define EXTERN extern
25 
26 
27 /* prototypes */
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <math.h>
31 #include "puzzle.h"
32 #include "util.h"
33 #include "ml.h"
34 #include "gamma.h"
35 
36 
37 
38 /******************************************************************************/
39 /* discrete Gamma-distribution and related stuff                              */
40 /******************************************************************************/
41 
42 /* compare general base frequencies with frequencies of taxon i with chi square */
homogentest(int taxon)43 double homogentest(int taxon)
44 {
45 	return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail);
46 }
47 
48 
49 /* discrete Gamma according to Yang 1994 (JME 39:306-314) */
YangDiscreteGamma(double shape,int c,dvector x)50 void YangDiscreteGamma (double shape, int c, dvector x)
51 {
52 	double twoc, mu;
53 	int i;
54 
55 	twoc = 2.0*c;
56 	mu = 0.0;
57 	for (i = 0; i < c; i++)
58 	{
59 		/* corresponding rates */
60 		x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape);
61 		mu += x[i];
62 	}
63 	mu = mu/c;
64 
65 	/* rescale for avarage rate of 1.0 */
66 	for (i = 0; i < c; i++)
67 	{
68 		x[i] /= mu;
69 	}
70 }
71 
72 /* compute rates of each category when rates are Gamma-distributed */
updaterates()73 void updaterates()
74 {
75 	int i;
76 	double alpha;
77 
78 	if (numcats == 1)
79 	{
80 		Rates[0] = 1.0;
81 		return;
82 	}
83 	if (Geta == 0.0)
84 	{
85 		for (i = 0; i < numcats; i++)
86 			Rates[i] = 1.0;
87 		return;
88 	}
89 	alpha = (1.0 - Geta)/Geta;
90 
91 	YangDiscreteGamma (alpha, numcats, Rates);
92 
93 	/* if invariable sites are present */
94 	for (i = 0; i < numcats; i++)
95 		Rates[i] = Rates[i]/(1.0-fracinv);
96 
97 	/* check for very small rates */
98 	for (i = 0; i < numcats; i++)
99 		if (Rates[i] < 0.000001) Rates[i] = 0.000001;
100 }
101 
102 
103 
104 /******************************************************************************/
105 /* parameter estimation                                                       */
106 /******************************************************************************/
107 
108 /* compute sample mean and standard deviation of sample mean */
computestat(double * data,int n,double * mean,double * err)109 void computestat(double *data, int n, double *mean, double *err)
110 {
111 	int i;
112 	double sum;
113 	double temp;
114 	double avg;
115 
116 	/* compute sample mean */
117 	sum = 0;
118 	for (i = 0; i < n; i++) sum += data[i];
119 	avg = sum/(double) n;
120 	(*mean) = avg;
121 
122 	/* compute std. error of sample mean with curvature method */
123 	sum = 0;
124 	for (i = 0; i < n; i++) {
125 		temp = data[i] - avg;
126 		sum += temp*temp;
127 	}
128 	if (n != 1)
129 		(*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */
130 	else
131 		(*err) = 0.0; /* if n == 1 */
132 }
133 
134 /* compute ML value of quartet (a,b,c,d) for parameter estimation with quartet subsampling */
quartetml(int a,int b,int c,int d)135 double quartetml(int a, int b, int c, int d)
136 {
137 	double d1, d2, d3;
138 
139 	/* compute ML for all topologies */
140 	if (approxp_optn) { /* approximate parameter mode */
141 		d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */
142 		d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */
143 		d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */
144 	} else {
145 		d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */
146 		d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */
147 		d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */
148 	}
149 
150 	/* looking for max(d1, d2, d3) */
151 	if (d1 < d2) { /* d2 > d1 */
152 		if (d2 < d3) { /* d3 > d2 > d1 */
153 			/* d3 maximum */
154 			return d3;
155 		} else { /* d2 >= d3 > d1 */
156 			/* d2 maximum */
157 			return d2;
158 		}
159 	} else { /* d1 >= d2 */
160 		if (d1 < d3) { /* d3 > d1 >= d2 */
161 			/* d3 maximum */
162 			return d3;
163 		} else { /* d1 >= d2 && d1 >= d3 */
164 			/* d1 maximum */
165 			return d1;
166 		}
167 	}
168 }
169 
170 /* optimization function TSparam - quartets */
opttsq(double x)171 double opttsq(double x)
172 {
173 	if (x < MINTS) TSparam = MINTS;
174 	else if (x > MAXTS) TSparam = MAXTS;
175 	else TSparam = x;
176 	tranprobmat();
177 	distupdate(qca, qcb, qcc, qcd);
178 	return (-quartetml(qca, qcb, qcc, qcd));
179 }
180 
181 /* optimization function YRparam - quartets */
optyrq(double x)182 double optyrq(double x)
183 {
184 	if (x < MINYR) YRparam = MINYR;
185 	else if (x > MAXYR) YRparam = MAXYR;
186 	else YRparam = x;
187 	tranprobmat();
188 	distupdate(qca, qcb, qcc, qcd);
189 	return (-quartetml(qca, qcb, qcc, qcd));
190 }
191 
192 /* estimate substitution process parameters - random quartets */
optimseqevolparamsquart()193 void optimseqevolparamsquart()
194 {
195 	double tsmeanold, yrmeanold;
196 	dvector tslist, yrlist;
197 	int fin;
198 	ivector taxon;
199 	uli minqts, maxqts, n;
200 
201 
202 	taxon = new_ivector(4);
203 
204 	/* number of quartets to be investigated */
205 	minqts = (uli) floor(0.25 * MINPERTAXUM * Maxspc) + 1;
206 	maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1;
207 	if (Maxspc == 4) {
208 		minqts = (uli) 1;
209 		maxqts = (uli) 1;
210 	}
211 
212 	tslist = new_dvector(maxqts);
213 	yrlist = new_dvector(maxqts);
214 
215 	/* initialize averages */
216 	tsmean = TSparam;
217 	yrmean = YRparam;
218 
219 	fin = FALSE;
220 
221 	/* investigate maxqts random quartets */
222 	for (n = 0; n < maxqts; n++) {
223 
224 		/* choose random quartet */
225 		chooser(Maxspc, 4, taxon);
226 
227 		/*
228 		 * optimize parameters on this quartet
229 		 */
230 
231 		qca = taxon[0];
232 		qcb = taxon[1];
233 		qcc = taxon[2];
234 		qcd = taxon[3];
235 
236 		/* initialize start values with average value */
237 		if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean;
238 		if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean;
239 
240 		/* estimation */
241 		twodimenmin(EPSILON_SUBSTPARAM,
242 			(SH_optn || nuc_optn) && optim_optn && (data_optn == 0),
243 				MINTS, &TSparam, MAXTS, opttsq, &tserr,
244 			(nuc_optn && TN_optn) && optim_optn && (data_optn == 0),
245 				MINYR, &YRparam, MAXYR, optyrq, &yrerr);
246 
247 
248 		tsmeanold = tsmean;
249 		yrmeanold = yrmean;
250 		tslist[n] = TSparam;
251 		yrlist[n] = YRparam;
252 		computestat(tslist, n+1 , &tsmean, &tserr);
253 		computestat(yrlist, n+1 , &yrmean, &yrerr);
254 
255 		/* check whether the means are converging */
256 		if (n > minqts-2) {
257 			if ((fabs(tsmean-tsmeanold) < TSDIFF) &&
258 				(fabs(yrmean-yrmeanold) < YRDIFF))
259 					fin = TRUE;
260 		}
261 
262 		/* investigate at least minqts quartets */
263 		if (n > minqts-2 && (fin || n > maxqts-2)) break;
264 	}
265 
266 	/* round estimated numbers to 2 digits after the decimal point */
267 	if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0;
268 	if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0;
269 
270 	/* update ML engine */
271 	TSparam = tsmean;
272 	YRparam = yrmean;
273 	tranprobmat();
274 
275 	free_ivector(taxon);
276 }
277 
278 /* optimization function TSparam - tree */
opttst(double x)279 double opttst(double x)
280 {
281 	double result;
282 
283 	if (x < MINTS) TSparam = MINTS;
284 	else if (x > MAXTS) TSparam = MAXTS;
285 	else TSparam = x;
286 	tranprobmat();
287 
288 /*epe*/
289 #	if PARALLEL
290 		PP_Update_EEI();
291 #	endif
292 	computedistan();
293 	/* pre-set branchlengths = FALSE; parallel estimation = TRUE */
294 	if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
295 	else result = usertree_lklhd(FALSE, TRUE);
296 
297 	return (-result);
298 }
299 
300 /* optimization function YRparam - tree */
optyrt(double x)301 double optyrt(double x)
302 {
303 	double result;
304 
305 	if (x < MINYR) YRparam = MINYR;
306 	else if (x > MAXYR) YRparam = MAXYR;
307 	else YRparam = x;
308 	tranprobmat();
309 /*epe*/
310 #	if PARALLEL
311 		PP_Update_EEI();
312 #	endif
313 	computedistan();
314 	/* pre-set branchlengths = FALSE; parallel estimation = TRUE */
315 	if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
316 	else result = usertree_lklhd(FALSE, TRUE);
317 
318 	return (-result);
319 }
320 
321 
322 /* optimize substitution process parameters - tree */
optimseqevolparamstree()323 void optimseqevolparamstree()
324 {
325 	twodimenmin(EPSILON_SUBSTPARAM,
326 		(SH_optn || nuc_optn) && optim_optn && (data_optn == 0),
327 			MINTS, &TSparam, MAXTS, opttst, &tserr,
328 		(nuc_optn && TN_optn) && optim_optn && (data_optn == 0),
329 			MINYR, &YRparam, MAXYR, optyrt, &yrerr);
330 }
331 
332 
333 /* optimization function fracinv */
optfi(double x)334 double optfi(double x)
335 {
336 	double result;
337 
338 	if (x < MINFI) fracinv = MINFI;
339 	else if (x > MAXFI) fracinv = MAXFI;
340 	else fracinv = x;
341 
342 /*epe*/
343 #	if PARALLEL
344 		PP_Update_fracinv();
345 #	endif
346 
347 	computedistan();
348 	/* pre-set branchlengths = FALSE; parallel estimation = TRUE */
349 	if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
350 	else result = usertree_lklhd(FALSE, TRUE);
351 
352 	return (-result);
353 }
354 
355 
356 /* optimization function Geta */
optge(double x)357 double optge(double x)
358 {
359 	double result;
360 
361 	if (x < MINGE) Geta = MINGE;
362 	else if (x > MAXGE) Geta = MAXGE;
363 	else Geta = x;
364 
365 	updaterates();
366 
367 /*epe*/
368 #	if PARALLEL
369 		PP_Update_Rates();
370 #	endif
371 
372 	computedistan();
373 	/* pre-set branchlengths = FALSE; parallel estimation = TRUE */
374 	if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
375 	else result = usertree_lklhd(FALSE, TRUE);
376 
377 	return (-result);
378 }
379 
380 
381 /* optimize rate heterogeneity parameters */
optimrateparams()382 void optimrateparams()
383 {
384 	twodimenmin(EPSILON_RATEPARAM,
385 		fracinv_optim,
386 			MINFI, &fracinv, fracconst, optfi, &fierr,
387 		grate_optim,
388 			MINGE, &Geta, MAXGE, optge, &geerr);
389 
390 }
391 
392 
393 /******************************************************************************/
394 /* main routines to estimate parameters from tree or quartets                 */
395 /******************************************************************************/
396 
397 /* estimate parameters of substitution process and rate heterogeneity - no tree
398    n-taxon tree is not needed because of quartet method or NJ tree topology */
estimateparametersnotree()399 void estimateparametersnotree()
400 {
401 	int it, nump, change;
402 	double TSold, YRold, FIold, GEold;
403 
404 	it = 0;
405 	nump = 0;
406 
407 	/* count number of parameters */
408 	if (data_optn == NUCLEOTIDE && optim_optn) nump++;
409 	if (fracinv_optim || grate_optim) nump++;
410 
411 	do { /* repeat until nothing changes any more */
412 		it++;
413 		change = FALSE;
414 
415 		/* optimize substitution parameters */
416 		if (data_optn == NUCLEOTIDE && optim_optn) {
417 
418 			TSold = TSparam;
419 			YRold = YRparam;
420 
421 
422 			/*
423 			 * optimize
424 			 */
425 
426 			fprintf(STDOUT, "Optimizing missing substitution process parameters\n");
427 			fflush(STDOUT);
428 
429 			if (qcalg_optn) { /* quartet sampling */
430 				optimseqevolparamsquart();
431 			} else { /* NJ tree */
432 				tmpfp = tmpfile();
433 				njtree(tmpfp);
434 				rewind(tmpfp);
435 				readusertree(tmpfp, FALSE);
436 				closefile(tmpfp);
437 				optimseqevolparamstree();
438 			}
439 
440 /*epe*/
441 #if			PARALLEL
442 				PP_NoUpdate();
443 #			endif
444 			computedistan(); /* update ML distances */
445 
446 			/* same tolerance as 1D minimization */
447 			if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) ||
448 				(fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM)
449 			) change = TRUE;
450 
451 		}
452 
453 		/* optimize rate heterogeneity variables */
454 		if (fracinv_optim || grate_optim) {
455 
456 			FIold = fracinv;
457 			GEold = Geta;
458 
459 
460 			/*
461 			 * optimize
462 			 */
463 
464 			fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n");
465 			fflush(STDOUT);
466 			/* compute NJ tree */
467 			tmpfp = tmpfile();
468 			njtree(tmpfp);
469 			/* use NJ tree topology to estimate parameters */
470 			rewind(tmpfp);
471 			readusertree(tmpfp, FALSE);
472 			closefile(tmpfp);
473 
474 			optimrateparams();
475 #			if PARALLEL
476 				PP_NoUpdate();
477 #			endif /* PARALLEL */
478 			computedistan(); /* update ML distances */
479 
480 
481 			/* same tolerance as 1D minimization */
482 			if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) ||
483 				(fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM)
484 			) change = TRUE;
485 
486 		}
487 
488 		if (nump == 1) return;
489 
490 	} while (it != MAXITS && change);
491 
492 	return;
493 } /* estimateparametersnotree */
494 
495 
496 /* estimate parameters of substitution process and rate heterogeneity - tree
497    same as above but here the n-taxon tree is already in memory */
estimateparameterstree()498 void estimateparameterstree()
499 {
500 	int it, nump, change;
501 	double TSold, YRold, FIold, GEold;
502 
503 	it = 0;
504 	nump = 0;
505 
506 	/* count number of parameters */
507 	if (data_optn == NUCLEOTIDE && optim_optn) nump++;
508 	if (fracinv_optim || grate_optim) nump++;
509 
510 	do { /* repeat until nothing changes any more */
511 		it++;
512 		change = FALSE;
513 
514 		/* optimize substitution process parameters */
515 		if (data_optn == NUCLEOTIDE && optim_optn) {
516 
517 			TSold = TSparam;
518 			YRold = YRparam;
519 
520 
521 			/*
522 			 * optimize
523 			 */
524 
525 			fprintf(STDOUT, "Optimizing missing substitution process parameters\n");
526 			fflush(STDOUT);
527 			optimseqevolparamstree();
528 			/*epe*/ /*??? - auskommentiert, HAS ;-) */
529 #ifdef EPE_DEBUG
530 			fprintf(STDOUT, "\n\nRED ALERT\n\n\n");
531 #endif /* EPE_DEBUG */
532 			computedistan(); /* update ML distances */
533 
534 
535 			/* same tolerance as 1D minimization */
536 			if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) ||
537 				(fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM)
538 			) change = TRUE;
539 
540 		}
541 
542 		/* optimize rate heterogeneity variables */
543 		if (fracinv_optim || grate_optim) {
544 
545 			FIold = fracinv;
546 			GEold = Geta;
547 
548 
549 			/*
550 			 * optimize
551 			 */
552 
553 			fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n");
554 			fflush(STDOUT);
555 			optimrateparams();
556 			/*epe*/ /*??? - auskommentiert, HAS ;-) */
557 #ifdef EPE_DEBUG
558 			fprintf(STDOUT, "\n\nRED ALERT\n\n\n");
559 #endif /* EPE_DEBUG */
560 			computedistan(); /* update ML distances */
561 
562 
563 			/* same tolerance as 1D minimization */
564 			if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) ||
565 				(fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM)
566 			) change = TRUE;
567 
568 		}
569 
570 		if (nump == 1) return;
571 
572 	} while (it != MAXITS && change);
573 
574 	return;
575 } /* estimateparameterstree */
576 
577