1 /*
2  * ml.h
3  *
4  *
5  * Part of TREE-PUZZLE 5.2 (July 2004)
6  *
7  * (c) 2003-2004 by Heiko A. Schmidt, Korbinian Strimmer, and Arndt von Haeseler
8  * (c) 1999-2003 by Heiko A. Schmidt, Korbinian Strimmer,
9  *                  M. Vingron, and Arndt von Haeseler
10  * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler
11  *
12  * All parts of the source except where indicated are distributed under
13  * the GNU public licence.  See http://www.opensource.org for details.
14  *
15  * ($Id$)
16  *
17  */
18 
19 
20 #ifndef _ML_
21 #define _ML_
22 
23 /* values for mlmode */
24 #define ML_QUART 1
25 #define ML_TREE  2
26 #define NJ_TREE  3
27 
28 #if 0
29 /* values for rhettype */
30 #define MLUNIFORMRATE 0
31 #define MLGAMMARATE   1
32 #define MLTWORATE     2
33 #define MLMIXEDRATE   3
34 #endif
35 
36 /* values for rhetmode */
37 #define UNIFORMRATE 0
38 #define GAMMARATE   1
39 #define TWORATE     2
40 #define MIXEDRATE   3
41 
42 
43 /* definitions */
44 
45 #define MINTS 0.20  /* Ts/Tv parameter */
46 #define MAXTS 30.0
47 #define MINYR 0.10  /* Y/R Ts parameter */
48 #define MAXYR 6.00
49 #define MINFI 0.00  /* fraction invariable sites */
50 #define MAXFI 0.99  /* only for input */
51 #define MINGE 0.01  /* rate heterogeneity parameter */
52 #define MAXGE 0.99
53 #define MINCAT 4    /* discrete Gamma categories */
54 #define MAXCAT 16
55 
56 #define RMHROOT         5.0     /* upper relative bound for height of root   */
57 #define MAXARC          900.0   /* upper limit on branch length (PAM) = 6.0  */
58 #define MINARC          0.001   /* lower limit on branch length (PAM) = 0.00001 */
59 
60 #ifdef USE_ADJUSTABLE_EPS
61     /* error in branch length (PAM) = 0.000001   */
62 #   define EPSILON_BRANCH_DEFAULT  0.0001
63 #   define EPSILON_BRANCH epsilon_branch
64     EXTERN double epsilon_branch;
65 #else
66 #   define EPSILON_BRANCH  0.0001
67 #endif
68 
69 #ifdef USE_ADJUSTABLE_EPS
70 /* error in node and root heights            */
71 #   define EPSILON_HEIGHTS_DEFAULT 0.0001
72 #   define EPSILON_HEIGHTS epsilon_heights
73     EXTERN double epsilon_heights;
74 #else
75 #   define EPSILON_HEIGHTS 0.0001
76 #endif
77 
78 #ifdef USE_ADJUSTABLE_EPS
79 #   if (EXTERN != extern)
80 	double epsilon_branch = EPSILON_BRANCH_DEFAULT;
81 	double epsilon_heights = EPSILON_HEIGHTS_DEFAULT;
82 #   endif
83 #endif
84 
85 #define MAXIT           100     /* maximum number of iterates of smoothing   */
86 #define MINFDIFF        0.00002 /* lower limit on base frequency differences */
87 #define MINFREQ         0.0001  /* lower limit on base frequencies = 0.01%   */
88 #define NUMQBRNCH       5       /* number of branches in a quartet           */
89 #define NUMQIBRNCH      1       /* number of internal branches in a quartet  */
90 #define NUMQSPC         4       /* number of sequences in a quartet          */
91 
92 /* 2D minimisation */
93 
94 #ifdef USE_ADJUSTABLE_EPS
95     /* epsilon substitution process estimation */
96 #   define EPSILON_SUBSTPARAM_DEFAULT 0.01
97 #   define EPSILON_SUBSTPARAM epsilon_substparam
98     EXTERN double epsilon_substparam;
99 #else
100 #   define EPSILON_SUBSTPARAM 0.01
101 #endif
102 
103 #ifdef USE_ADJUSTABLE_EPS
104     /* epsilon rate heterogeneity estimation  */
105 #   define EPSILON_RATEPARAM_DEFAULT  0.01
106 #   define EPSILON_RATEPARAM  epsilon_rateparam
107     EXTERN double epsilon_rateparam;
108 #else
109 #   define EPSILON_RATEPARAM 0.01
110 #endif
111 
112 #ifdef USE_ADJUSTABLE_EPS
113 #   if (EXTERN != extern)
114 	double epsilon_substparam = EPSILON_SUBSTPARAM_DEFAULT;
115 	double epsilon_rateparam = EPSILON_RATEPARAM_DEFAULT;
116 #   endif
117 #endif
118 
119 /* quartet series */
120 #define MINPERTAXUM 2
121 #define MAXPERTAXUM 6
122 #define TSDIFF 0.20
123 #define YRDIFF 0.10
124 
125 /* type definitions */
126 
127 typedef struct node
128 {
129 	struct node *isop;	/* pointers within node (nodes) */
130 	struct node *kinp;	/* pointers between nodes (branches) */
131 	int descen;
132 	int number;
133 	double length;		/* branch length inferred (no clock) */
134 	double lengthc;		/* branch length inferred (clock) */
135 	double lengthext;	/* external length from usertree */
136 	int    lengthset;	/* length set by usertree */
137 	double varlen;
138 	double height;
139 	double varheight;
140 	ivector paths;		/* path directions for Korbi's p-step */
141 	cvector eprob;
142 	dcube partials;		/* partial likelihoods */
143 	char *label;		/* internal labels */
144 } Node;
145 
146 typedef struct tree
147 {
148 	Node    *rootp;
149 	Node   **ebrnchp;	/* list of pointers to external branches     */
150 	Node   **ibrnchp;       /* list of pointers to internal branches     */
151 	double   lklhd;	        /* total log-likelihood                      */
152 	double   lklhdc;        /* total log-likelihood clock                */
153 	dmatrix  condlkl;	/* lhs for each pattern and non-zero rate    */
154 	double   rssleast;
155 } Tree;
156 
157 
158 /* global variables */
159 
160 EXTERN Node    *chep;      /* pointer to current height node                 */
161 EXTERN Node    *rootbr;    /* pointer to root branch                         */
162 EXTERN Node   **heights;   /* pointer to height nodes in unrooted tree       */
163 EXTERN int      Numhts;    /* number of height nodes in unrooted tree        */
164 EXTERN double   hroot;     /* height of root                                 */
165 EXTERN double   varhroot;  /* variance of height of root                     */
166 EXTERN double   maxhroot;  /* maximal height of root                         */
167 EXTERN int      locroot;   /* location of root                               */
168 EXTERN int      numbestroot; /* number of best locations for root            */
169 EXTERN int      clockmode; /* clocklike vs. nonclocklike computation         */
170 EXTERN cmatrix  Identif;   /* sequence names                                 */
171 EXTERN cmatrix  Namestr;   /* sequence names (delimited by '\0')             */
172 EXTERN cmatrix  Seqchar;   /* ML sequence data                               */
173 EXTERN cmatrix  Seqpat;    /* ordered site patterns                          */
174 EXTERN ivector  constpat;  /* indicates constant site patterns               */
175 EXTERN cvector  seqchi;
176 EXTERN cvector  seqchj;
177 EXTERN dcube    partiali;
178 EXTERN dcube    partialj;
179 EXTERN dcube    ltprobr;   /* transition probabilites (for all non-zero rates*/
180 EXTERN dmatrix  Distanmat;   /* matrix with maximum likelihood distances     */
181 EXTERN dmatrix  Evec;        /* Eigenvectors                                 */
182 EXTERN dmatrix  Ievc;        /* Inverse eigenvectors                         */
183 EXTERN double   TSparam;     /* Ts/Tv parameter                              */
184 EXTERN double   GTR_ACrate;  /* A <-> G mutation rate for GTR                */
185 EXTERN double   GTR_AGrate;  /* A <-> C mutation rate for GTR                */
186 EXTERN double   GTR_ATrate;  /* A <-> T mutation rate for GTR                */
187 EXTERN double   GTR_CGrate;  /* C <-> G mutation rate for GTR                */
188 EXTERN double   GTR_CTrate;  /* C <-> T mutation rate for GTR                */
189 EXTERN double   GTR_GTrate;  /* G <-> T mutation rate for GTR                */
190 EXTERN double   tsmean, yrmean;
191 EXTERN double   YRparam;     /* Y/R Ts parameter                             */
192 EXTERN double   geerr;       /* estimated error of rate heterogeneity        */
193 EXTERN double   Geta;        /* rate heterogeneity parameter                 */
194 EXTERN double   fracconst;   /* fraction of constant sites                   */
195 EXTERN double   fracconstpat;/* fraction of constant patterns                */
196 EXTERN double   Proportion;  /* for tree drawing                             */
197 EXTERN double   tserr;       /* estimated error of TSparam                   */
198 EXTERN double   yrerr;       /* estimated error of YRparam                   */
199 EXTERN double   fracinv;     /* fraction of invariable sites                 */
200 EXTERN double   fierr;       /* estimated error of fracinv                   */
201 EXTERN dvector  Brnlength;
202 EXTERN dvector  Distanvec;
203 EXTERN dvector  Eval;        /* Eigenvalues of 1 PAM rate matrix             */
204 EXTERN dvector  Freqtpm;     /* base frequencies                             */
205 EXTERN dvector  Rates;       /* rate of each of the categories               */
206 EXTERN dmatrix  iexp;
207 EXTERN imatrix  Basecomp;    /* base composition of each taxon               */
208 EXTERN ivector  usedtaxa;    /* list needed in the input treefile procedure  */
209 EXTERN int      numtc;       /* auxiliary variable for printing rooted tree  */
210 EXTERN int      qcalg_optn;  /* use quartet subsampling algorithm            */
211 EXTERN int      approxp_optn;/* approximate parameter estimation             */
212 EXTERN int      chi2fail;    /* flag for chi2 test                           */
213 EXTERN int      Converg;     /* flag for ML convergence (no clock)           */
214 EXTERN int      Convergc;    /* flag for ML convergence (clock)              */
215 EXTERN int      data_optn;   /* type of sequence input data                  */
216 EXTERN int      Dayhf_optn;  /* Dayhoff model                                */
217 EXTERN int      HKY_optn;    /* use HKY model                                */
218 EXTERN int      Jtt_optn;    /* JTT model                                    */
219 EXTERN int      print_GTR_optn; /* print (restricted) GTR matrix */
220 EXTERN int      GTR_optn;    /* GTR model                                    */
221 EXTERN int      blosum62_optn; /* BLOSUM 62 model                            */
222 EXTERN int      mtrev_optn;  /* mtREV model                                  */
223 EXTERN int      cprev_optn;  /* cpREV model                                  */
224 EXTERN int      vtmv_optn;   /* VT model                                     */
225 EXTERN int      wag_optn;    /* WAG model                                    */
226 EXTERN int      Maxsite;     /* number of ML characters per taxum            */
227 EXTERN int      Maxspc;      /* number of sequences                          */
228 EXTERN int      mlmode;      /* quartet ML or user defined tree ML           */
229 EXTERN int      nuc_optn;    /* nucleotide (4x4) models                      */
230 EXTERN int      Numbrnch;    /* number of branches of current tree           */
231 EXTERN int      numcats;     /* number of rate categories                    */
232 EXTERN int      Numconst;    /* number of constant sites                     */
233 EXTERN int      Numconstpat; /* number of constant patterns                  */
234 EXTERN int      Numibrnch;   /* number of internal branches of current tree  */
235 EXTERN int      Numitc;      /* number of ML iterations assumning clock      */
236 EXTERN int      Numit;       /* number of ML iterations if convergence       */
237 EXTERN int      Numptrn;     /* number of site patterns                      */
238 EXTERN int      Numspc;      /* number of sequences of current tree          */
239 EXTERN int      optim_optn;  /* optimize model parameters                    */
240 EXTERN int      grate_optim; /* optimize Gamma rate heterogeneity parameter  */
241 EXTERN int      SH_optn;     /* SH nucleotide (16x16) model                  */
242 EXTERN int      TN_optn;     /* use TN model                                 */
243 EXTERN int      tpmradix;    /* number of different states                   */
244 EXTERN int      fracinv_optim;  /* optimize fraction of invariable sites     */
245 EXTERN int      typ_optn;    /* type of PUZZLE analysis                      */
246 EXTERN ivector  Weight;      /* weight of each site pattern                  */
247 EXTERN Tree    *Ctree;       /* pointer to current tree                      */
248 EXTERN int      qca, qcb, qcc, qcd; /* quartet currently optimized           */
249 EXTERN ivector  Alias;       /* link site -> corresponding site pattern      */
250 EXTERN ivector  bestrate;    /* optimal assignment of rates to sequence sites*/
251 
252 EXTERN int      bestratefound;  /* best rate per site already reconstructed ? */
253 
254 /* function prototypes of all ml function */
255 
256 void convfreq(dvector);
257 void tp_radixsort(cmatrix, ivector, int, int, int *);
258 void condenceseq(cmatrix, ivector, cmatrix, ivector, int, int, int);
259 void countconstantsites(cmatrix, ivector, int, int, int *, int*);
260 void evaluateseqs(void);
261 void elmhes(dmatrix, ivector, int);
262 void eltran(dmatrix, dmatrix, ivector, int);
263 void mcdiv(double, double, double, double, double *, double *);
264 void hqr2(int, int, int, dmatrix, dmatrix, dvector, dvector);
265 void onepamratematrix(dmatrix);
266 void eigensystem(dvector, dmatrix);
267 void luinverse(dmatrix, dmatrix, int);
268 void checkevector(dmatrix, dmatrix, int);
269 void tranprobmat(void);
270 void tprobmtrx(double, dmatrix);
271 double comptotloglkl(dmatrix);
272 void allsitelkl(dmatrix, dvector);
273 void writesitelklmatrixphy(FILE *outf, int numut, int numsite, dmatrix allslkl, dmatrix allslklc, int clock);
274 void writesitelklvectorphy(FILE *outf, char *name, int numsite, dvector aslkl);
275 /* void writesitelklbin(FILE *, dvector); */ /* TODO */
276 double pairlkl(double);
277 double mldistance(int, int, double); /*epe*/
278 /* double mldistance(int, int); */
279 void initdistan(void);
280 void computedistan(void);
281 void productpartials(Node *);
282 void partialsinternal(Node *);
283 void partialsexternal(Node *);
284 void initpartials(Tree *);
285 double intlkl(double);
286 void optinternalbranch(Node *);
287 double extlkl(double);
288 void optexternalbranch(Node *);
289 void finishlkl(Node *);
290 double optlkl(Tree *);
291 double treelkl(Tree *);
292 void luequation(dmatrix, dvector, int);
293 void lslength(Tree *, dvector, int, int, dvector);
294 #if PARALLEL
295 	void lslength_par(Tree *, dvector, int, int, dvector);
296 #endif
297 void setextlength(Tree *, int, int, dvector);
298 void getusertree(FILE *, cvector, int, int);
299 Node *internalnode(Tree *, char **, int *);
300 void constructtree(Tree *, cvector, int);
301 void removebasalbif(cvector);
302 void makeusertree(FILE *, int);
303 Tree *new_tree(int, int, cmatrix);
304 Tree *new_quartet(int, cmatrix);
305 void free_tree(Tree *, int);
306 void make_quartet(int, int, int, int);
307 void changedistan(dmatrix, dvector, int);
308 double quartet_lklhd(int, int, int, int);
309 double quartet_alklhd(int, int, int, int);
310 void readusertree(FILE *, int);
311 double usertree_lklhd(int, int);
312 double usertree_alklhd(int, int);
313 void mlstart(void);
314 void distupdate(int, int, int, int);
315 void mlfinish(void);
316 void prbranch(Node *, int, int, int, ivector, ivector, FILE *);
317 void getproportion(double *, dvector, int);
318 void prtopology(FILE *);
319 void fputphylogeny(FILE *);
320 void resulttree(FILE *outfp, int usebranch);
321 void njtree(FILE *);
322 void njdistantree(Tree *);
323 void findbestratecombination(void);
324 void printbestratecombination(FILE *);
325 void printbestratecombinationtofile(FILE *, int);
326 int checkedge(int);
327 void fputsubstree(FILE *, Node *);
328 void fputrooted(FILE *, int);
329 void findheights(Node *);
330 void initclock(int);
331 double clock_alklhd(int);
332 double heightlkl(double);
333 void optheight(void);
334 double rheightlkl(double);
335 void optrheight(void);
336 double clock_lklhd(int);
337 int findrootedge(void);
338 void resultheights(FILE *);
339 
340 /* mlparam.c */
341 double homogentest(int);
342 void YangDiscreteGamma(double, int, double *);
343 void updaterates(void);
344 void computestat(double *, int, double *, double *);
345 double quartetml(int, int, int, int);
346 double opttsq(double);
347 double optyrq(double);
348 void optimseqevolparamsquart(void);
349 double opttst(double);
350 double optyrt(double);
351 void optimseqevolparamstree(void);
352 double optfi(double);
353 double optge(double);
354 void optimrateparams(void);
355 
356 void estimateparametersnotree(void); /* moved from puzzle1.c */
357 void estimateparameterstree(void);   /* moved from puzzle1.c */
358 /* mlparam.c end */
359 
360 int gettpmradix(void);
361 void rtfdata(dmatrix, double *);
362 int code2int(cvector, int *, int *);
363 char *int2code(int);
364 
365 void jttdata(dmatrix, double *);
366 void dyhfdata(dmatrix, double *);
367 void mtrevdata(dmatrix, double *);
368 void cprev45data(dmatrix, double *);
369 void blosum62data(dmatrix, double *);
370 void vtmvdata(dmatrix, double *);
371 void wagdata(dmatrix, double *);
372 
373 #endif
374