1 /*
2 * mlparam.c
3 *
4 *
5 * Part of TREE-PUZZLE 5.2 (July 2004)
6 *
7 * (c) 2003-2004 by Heiko A. Schmidt, Korbinian Strimmer, and Arndt von Haeseler
8 * (c) 1999-2003 by Heiko A. Schmidt, Korbinian Strimmer,
9 * M. Vingron, and Arndt von Haeseler
10 * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler
11 *
12 * All parts of the source except where indicated are distributed under
13 * the GNU public licence. See http://www.opensource.org for details.
14 *
15 * ($Id$)
16 *
17 */
18
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22
23
24 #define EXTERN extern
25
26
27 /* prototypes */
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <math.h>
31 #include "puzzle.h"
32 #include "util.h"
33 #include "ml.h"
34 #include "gamma.h"
35
36
37
38 /******************************************************************************/
39 /* discrete Gamma-distribution and related stuff */
40 /******************************************************************************/
41
42 /* compare general base frequencies with frequencies of taxon i with chi square */
homogentest(int taxon)43 double homogentest(int taxon)
44 {
45 return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail);
46 }
47
48
49 /* discrete Gamma according to Yang 1994 (JME 39:306-314) */
YangDiscreteGamma(double shape,int c,dvector x)50 void YangDiscreteGamma (double shape, int c, dvector x)
51 {
52 double twoc, mu;
53 int i;
54
55 twoc = 2.0*c;
56 mu = 0.0;
57 for (i = 0; i < c; i++)
58 {
59 /* corresponding rates */
60 x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape);
61 mu += x[i];
62 }
63 mu = mu/c;
64
65 /* rescale for avarage rate of 1.0 */
66 for (i = 0; i < c; i++)
67 {
68 x[i] /= mu;
69 }
70 }
71
72 /* compute rates of each category when rates are Gamma-distributed */
updaterates()73 void updaterates()
74 {
75 int i;
76 double alpha;
77
78 if (numcats == 1)
79 {
80 Rates[0] = 1.0;
81 return;
82 }
83 if (Geta == 0.0)
84 {
85 for (i = 0; i < numcats; i++)
86 Rates[i] = 1.0;
87 return;
88 }
89 alpha = (1.0 - Geta)/Geta;
90
91 YangDiscreteGamma (alpha, numcats, Rates);
92
93 /* if invariable sites are present */
94 for (i = 0; i < numcats; i++)
95 Rates[i] = Rates[i]/(1.0-fracinv);
96
97 /* check for very small rates */
98 for (i = 0; i < numcats; i++)
99 if (Rates[i] < 0.000001) Rates[i] = 0.000001;
100 }
101
102
103
104 /******************************************************************************/
105 /* parameter estimation */
106 /******************************************************************************/
107
108 /* compute sample mean and standard deviation of sample mean */
computestat(double * data,int n,double * mean,double * err)109 void computestat(double *data, int n, double *mean, double *err)
110 {
111 int i;
112 double sum;
113 double temp;
114 double avg;
115
116 /* compute sample mean */
117 sum = 0;
118 for (i = 0; i < n; i++) sum += data[i];
119 avg = sum/(double) n;
120 (*mean) = avg;
121
122 /* compute std. error of sample mean with curvature method */
123 sum = 0;
124 for (i = 0; i < n; i++) {
125 temp = data[i] - avg;
126 sum += temp*temp;
127 }
128 if (n != 1)
129 (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */
130 else
131 (*err) = 0.0; /* if n == 1 */
132 }
133
134 /* compute ML value of quartet (a,b,c,d) for parameter estimation with quartet subsampling */
quartetml(int a,int b,int c,int d)135 double quartetml(int a, int b, int c, int d)
136 {
137 double d1, d2, d3;
138
139 /* compute ML for all topologies */
140 if (approxp_optn) { /* approximate parameter mode */
141 d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */
142 d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */
143 d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */
144 } else {
145 d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */
146 d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */
147 d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */
148 }
149
150 /* looking for max(d1, d2, d3) */
151 if (d1 < d2) { /* d2 > d1 */
152 if (d2 < d3) { /* d3 > d2 > d1 */
153 /* d3 maximum */
154 return d3;
155 } else { /* d2 >= d3 > d1 */
156 /* d2 maximum */
157 return d2;
158 }
159 } else { /* d1 >= d2 */
160 if (d1 < d3) { /* d3 > d1 >= d2 */
161 /* d3 maximum */
162 return d3;
163 } else { /* d1 >= d2 && d1 >= d3 */
164 /* d1 maximum */
165 return d1;
166 }
167 }
168 }
169
170 /* optimization function TSparam - quartets */
opttsq(double x)171 double opttsq(double x)
172 {
173 if (x < MINTS) TSparam = MINTS;
174 else if (x > MAXTS) TSparam = MAXTS;
175 else TSparam = x;
176 tranprobmat();
177 distupdate(qca, qcb, qcc, qcd);
178 return (-quartetml(qca, qcb, qcc, qcd));
179 }
180
181 /* optimization function YRparam - quartets */
optyrq(double x)182 double optyrq(double x)
183 {
184 if (x < MINYR) YRparam = MINYR;
185 else if (x > MAXYR) YRparam = MAXYR;
186 else YRparam = x;
187 tranprobmat();
188 distupdate(qca, qcb, qcc, qcd);
189 return (-quartetml(qca, qcb, qcc, qcd));
190 }
191
192 /* estimate substitution process parameters - random quartets */
optimseqevolparamsquart()193 void optimseqevolparamsquart()
194 {
195 double tsmeanold, yrmeanold;
196 dvector tslist, yrlist;
197 int fin;
198 ivector taxon;
199 uli minqts, maxqts, n;
200
201
202 taxon = new_ivector(4);
203
204 /* number of quartets to be investigated */
205 minqts = (uli) floor(0.25 * MINPERTAXUM * Maxspc) + 1;
206 maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1;
207 if (Maxspc == 4) {
208 minqts = (uli) 1;
209 maxqts = (uli) 1;
210 }
211
212 tslist = new_dvector(maxqts);
213 yrlist = new_dvector(maxqts);
214
215 /* initialize averages */
216 tsmean = TSparam;
217 yrmean = YRparam;
218
219 fin = FALSE;
220
221 /* investigate maxqts random quartets */
222 for (n = 0; n < maxqts; n++) {
223
224 /* choose random quartet */
225 chooser(Maxspc, 4, taxon);
226
227 /*
228 * optimize parameters on this quartet
229 */
230
231 qca = taxon[0];
232 qcb = taxon[1];
233 qcc = taxon[2];
234 qcd = taxon[3];
235
236 /* initialize start values with average value */
237 if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean;
238 if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean;
239
240 /* estimation */
241 twodimenmin(EPSILON_SUBSTPARAM,
242 (SH_optn || nuc_optn) && optim_optn && (data_optn == 0),
243 MINTS, &TSparam, MAXTS, opttsq, &tserr,
244 (nuc_optn && TN_optn) && optim_optn && (data_optn == 0),
245 MINYR, &YRparam, MAXYR, optyrq, &yrerr);
246
247
248 tsmeanold = tsmean;
249 yrmeanold = yrmean;
250 tslist[n] = TSparam;
251 yrlist[n] = YRparam;
252 computestat(tslist, n+1 , &tsmean, &tserr);
253 computestat(yrlist, n+1 , &yrmean, &yrerr);
254
255 /* check whether the means are converging */
256 if (n > minqts-2) {
257 if ((fabs(tsmean-tsmeanold) < TSDIFF) &&
258 (fabs(yrmean-yrmeanold) < YRDIFF))
259 fin = TRUE;
260 }
261
262 /* investigate at least minqts quartets */
263 if (n > minqts-2 && (fin || n > maxqts-2)) break;
264 }
265
266 /* round estimated numbers to 2 digits after the decimal point */
267 if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0;
268 if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0;
269
270 /* update ML engine */
271 TSparam = tsmean;
272 YRparam = yrmean;
273 tranprobmat();
274
275 free_ivector(taxon);
276 }
277
278 /* optimization function TSparam - tree */
opttst(double x)279 double opttst(double x)
280 {
281 double result;
282
283 if (x < MINTS) TSparam = MINTS;
284 else if (x > MAXTS) TSparam = MAXTS;
285 else TSparam = x;
286 tranprobmat();
287
288 /*epe*/
289 # if PARALLEL
290 PP_Update_EEI();
291 # endif
292 computedistan();
293 /* pre-set branchlengths = FALSE; parallel estimation = TRUE */
294 if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
295 else result = usertree_lklhd(FALSE, TRUE);
296
297 return (-result);
298 }
299
300 /* optimization function YRparam - tree */
optyrt(double x)301 double optyrt(double x)
302 {
303 double result;
304
305 if (x < MINYR) YRparam = MINYR;
306 else if (x > MAXYR) YRparam = MAXYR;
307 else YRparam = x;
308 tranprobmat();
309 /*epe*/
310 # if PARALLEL
311 PP_Update_EEI();
312 # endif
313 computedistan();
314 /* pre-set branchlengths = FALSE; parallel estimation = TRUE */
315 if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
316 else result = usertree_lklhd(FALSE, TRUE);
317
318 return (-result);
319 }
320
321
322 /* optimize substitution process parameters - tree */
optimseqevolparamstree()323 void optimseqevolparamstree()
324 {
325 twodimenmin(EPSILON_SUBSTPARAM,
326 (SH_optn || nuc_optn) && optim_optn && (data_optn == 0),
327 MINTS, &TSparam, MAXTS, opttst, &tserr,
328 (nuc_optn && TN_optn) && optim_optn && (data_optn == 0),
329 MINYR, &YRparam, MAXYR, optyrt, &yrerr);
330 }
331
332
333 /* optimization function fracinv */
optfi(double x)334 double optfi(double x)
335 {
336 double result;
337
338 if (x < MINFI) fracinv = MINFI;
339 else if (x > MAXFI) fracinv = MAXFI;
340 else fracinv = x;
341
342 /*epe*/
343 # if PARALLEL
344 PP_Update_fracinv();
345 # endif
346
347 computedistan();
348 /* pre-set branchlengths = FALSE; parallel estimation = TRUE */
349 if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
350 else result = usertree_lklhd(FALSE, TRUE);
351
352 return (-result);
353 }
354
355
356 /* optimization function Geta */
optge(double x)357 double optge(double x)
358 {
359 double result;
360
361 if (x < MINGE) Geta = MINGE;
362 else if (x > MAXGE) Geta = MAXGE;
363 else Geta = x;
364
365 updaterates();
366
367 /*epe*/
368 # if PARALLEL
369 PP_Update_Rates();
370 # endif
371
372 computedistan();
373 /* pre-set branchlengths = FALSE; parallel estimation = TRUE */
374 if (approxp_optn) result = usertree_alklhd(FALSE, TRUE);
375 else result = usertree_lklhd(FALSE, TRUE);
376
377 return (-result);
378 }
379
380
381 /* optimize rate heterogeneity parameters */
optimrateparams()382 void optimrateparams()
383 {
384 twodimenmin(EPSILON_RATEPARAM,
385 fracinv_optim,
386 MINFI, &fracinv, fracconst, optfi, &fierr,
387 grate_optim,
388 MINGE, &Geta, MAXGE, optge, &geerr);
389
390 }
391
392
393 /******************************************************************************/
394 /* main routines to estimate parameters from tree or quartets */
395 /******************************************************************************/
396
397 /* estimate parameters of substitution process and rate heterogeneity - no tree
398 n-taxon tree is not needed because of quartet method or NJ tree topology */
estimateparametersnotree()399 void estimateparametersnotree()
400 {
401 int it, nump, change;
402 double TSold, YRold, FIold, GEold;
403
404 it = 0;
405 nump = 0;
406
407 /* count number of parameters */
408 if (data_optn == NUCLEOTIDE && optim_optn) nump++;
409 if (fracinv_optim || grate_optim) nump++;
410
411 do { /* repeat until nothing changes any more */
412 it++;
413 change = FALSE;
414
415 /* optimize substitution parameters */
416 if (data_optn == NUCLEOTIDE && optim_optn) {
417
418 TSold = TSparam;
419 YRold = YRparam;
420
421
422 /*
423 * optimize
424 */
425
426 fprintf(STDOUT, "Optimizing missing substitution process parameters\n");
427 fflush(STDOUT);
428
429 if (qcalg_optn) { /* quartet sampling */
430 optimseqevolparamsquart();
431 } else { /* NJ tree */
432 tmpfp = tmpfile();
433 njtree(tmpfp);
434 rewind(tmpfp);
435 readusertree(tmpfp, FALSE);
436 closefile(tmpfp);
437 optimseqevolparamstree();
438 }
439
440 /*epe*/
441 #if PARALLEL
442 PP_NoUpdate();
443 # endif
444 computedistan(); /* update ML distances */
445
446 /* same tolerance as 1D minimization */
447 if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) ||
448 (fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM)
449 ) change = TRUE;
450
451 }
452
453 /* optimize rate heterogeneity variables */
454 if (fracinv_optim || grate_optim) {
455
456 FIold = fracinv;
457 GEold = Geta;
458
459
460 /*
461 * optimize
462 */
463
464 fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n");
465 fflush(STDOUT);
466 /* compute NJ tree */
467 tmpfp = tmpfile();
468 njtree(tmpfp);
469 /* use NJ tree topology to estimate parameters */
470 rewind(tmpfp);
471 readusertree(tmpfp, FALSE);
472 closefile(tmpfp);
473
474 optimrateparams();
475 # if PARALLEL
476 PP_NoUpdate();
477 # endif /* PARALLEL */
478 computedistan(); /* update ML distances */
479
480
481 /* same tolerance as 1D minimization */
482 if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) ||
483 (fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM)
484 ) change = TRUE;
485
486 }
487
488 if (nump == 1) return;
489
490 } while (it != MAXITS && change);
491
492 return;
493 } /* estimateparametersnotree */
494
495
496 /* estimate parameters of substitution process and rate heterogeneity - tree
497 same as above but here the n-taxon tree is already in memory */
estimateparameterstree()498 void estimateparameterstree()
499 {
500 int it, nump, change;
501 double TSold, YRold, FIold, GEold;
502
503 it = 0;
504 nump = 0;
505
506 /* count number of parameters */
507 if (data_optn == NUCLEOTIDE && optim_optn) nump++;
508 if (fracinv_optim || grate_optim) nump++;
509
510 do { /* repeat until nothing changes any more */
511 it++;
512 change = FALSE;
513
514 /* optimize substitution process parameters */
515 if (data_optn == NUCLEOTIDE && optim_optn) {
516
517 TSold = TSparam;
518 YRold = YRparam;
519
520
521 /*
522 * optimize
523 */
524
525 fprintf(STDOUT, "Optimizing missing substitution process parameters\n");
526 fflush(STDOUT);
527 optimseqevolparamstree();
528 /*epe*/ /*??? - auskommentiert, HAS ;-) */
529 #ifdef EPE_DEBUG
530 fprintf(STDOUT, "\n\nRED ALERT\n\n\n");
531 #endif /* EPE_DEBUG */
532 computedistan(); /* update ML distances */
533
534
535 /* same tolerance as 1D minimization */
536 if ((fabs(TSparam - TSold) > 3.3*EPSILON_SUBSTPARAM) ||
537 (fabs(YRparam - YRold) > 3.3*EPSILON_SUBSTPARAM)
538 ) change = TRUE;
539
540 }
541
542 /* optimize rate heterogeneity variables */
543 if (fracinv_optim || grate_optim) {
544
545 FIold = fracinv;
546 GEold = Geta;
547
548
549 /*
550 * optimize
551 */
552
553 fprintf(STDOUT, "Optimizing missing rate heterogeneity parameters\n");
554 fflush(STDOUT);
555 optimrateparams();
556 /*epe*/ /*??? - auskommentiert, HAS ;-) */
557 #ifdef EPE_DEBUG
558 fprintf(STDOUT, "\n\nRED ALERT\n\n\n");
559 #endif /* EPE_DEBUG */
560 computedistan(); /* update ML distances */
561
562
563 /* same tolerance as 1D minimization */
564 if ((fabs(fracinv - FIold) > 3.3*EPSILON_RATEPARAM) ||
565 (fabs(Geta - GEold) > 3.3*EPSILON_RATEPARAM)
566 ) change = TRUE;
567
568 }
569
570 if (nump == 1) return;
571
572 } while (it != MAXITS && change);
573
574 return;
575 } /* estimateparameterstree */
576
577