1 /*
2
3 PhyML: a program that computes maximum likelihood phylogenies from
4 DNA or AA homologous sequences.
5
6 Copyright (C) Stephane Guindon. Oct 2003 onward.
7
8 All parts of the source except where indicated are distributed under
9 the GNU public licence. See http://www.opensource.org for details.
10
11 */
12
13
14 /* Routines for Markov-Modulated Markov Models (M4) */
15
16 #include "m4.h"
17
M4_main(int argc,char ** argv)18 int M4_main(int argc, char **argv)
19 {
20
21 calign *cdata;
22 option *io;
23 t_tree *tree;
24 int num_data_set;
25 int num_tree,num_rand_tree;
26 t_mod *mod;
27 time_t t_beg,t_end;
28 phydbl best_lnL;
29 int r_seed;
30 char *most_likely_tree=NULL;
31
32
33 #ifdef MPI
34 int rc;
35 rc = MPI_Init(&argc,&argv);
36 if (rc != MPI_SUCCESS) {
37 PhyML_Printf("\n== Err. starting MPI program. Terminating.\n");
38 MPI_Abort(MPI_COMM_WORLD, rc);
39 }
40 MPI_Comm_size(MPI_COMM_WORLD,&Global_numTask);
41 MPI_Comm_rank(MPI_COMM_WORLD,&Global_myRank);
42 #endif
43
44 #ifdef QUIET
45 setvbuf(stdout,NULL,_IOFBF,2048);
46 #endif
47
48
49 tree = NULL;
50 mod = NULL;
51 best_lnL = UNLIKELY;
52
53 io = (option *)Get_Input(argc,argv);
54 r_seed = (io->r_seed < 0)?(time(NULL)):(io->r_seed);
55 srand(r_seed);
56 io->r_seed = r_seed;
57
58
59 if(io->in_tree == 2) Test_Multiple_Data_Set_Format(io);
60 else io->n_trees = 1;
61
62
63 if((io->n_data_sets > 1) && (io->n_trees > 1))
64 {
65 io->n_data_sets = MIN(io->n_trees,io->n_data_sets);
66 io->n_trees = MIN(io->n_trees,io->n_data_sets);
67 }
68
69 for(num_data_set=0;num_data_set<io->n_data_sets;num_data_set++)
70 {
71 best_lnL = UNLIKELY;
72 Get_Seq(io);
73 Make_Model_Complete(io->mod);
74 Set_Model_Name(io->mod);
75 Print_Settings(io);
76 mod = io->mod;
77
78 if(io->data)
79 {
80 if(io->n_data_sets > 1) PhyML_Printf("\n. Data set [#%d]\n",num_data_set+1);
81 cdata = Compact_Data(io->data,io);
82
83 Free_Seq(io->data,cdata->n_otu);
84
85 if(cdata) Check_Ambiguities(cdata,io->datatype,io->state_len);
86 else
87 {
88 PhyML_Printf("\n== Err. in file %s at line %d\n",__FILE__,__LINE__);
89 Warn_And_Exit("");
90 }
91
92 for(num_tree=(io->n_trees == 1)?(0):(num_data_set);num_tree < io->n_trees;num_tree++)
93 {
94 if(!io->mod->s_opt->random_input_tree) io->mod->s_opt->n_rand_starts = 1;
95
96 for(num_rand_tree=0;num_rand_tree<io->mod->s_opt->n_rand_starts;num_rand_tree++)
97 {
98 if((io->mod->s_opt->random_input_tree) && (io->mod->s_opt->topo_search != NNI_MOVE))
99 if(!io->quiet) PhyML_Printf("\n. [Random start %3d/%3d]\n",num_rand_tree+1,io->mod->s_opt->n_rand_starts);
100
101 Init_Model(cdata,mod,io);
102
103 if(io->mod->use_m4mod) M4_Init_Model(mod->m4mod,cdata,mod);
104
105 switch(io->in_tree)
106 {
107 case 0 : case 1 : { tree = Dist_And_BioNJ(cdata,mod,io); break; }
108 case 2 : { tree = Read_User_Tree(cdata,mod,io); break; }
109 }
110
111
112 if(!tree) continue;
113
114 time(&t_beg);
115 time(&(tree->t_beg));
116
117
118 tree->mod = mod;
119 tree->io = io;
120 tree->data = cdata;
121 tree->n_pattern = tree->data->crunch_len;
122
123 Set_Both_Sides(YES,tree);
124
125 if(mod->s_opt->random_input_tree) Random_Tree(tree);
126
127 if((!num_data_set) && (!num_tree) && (!num_rand_tree)) Check_Memory_Amount(tree);
128
129 Make_Tree_For_Pars(tree);
130 Make_Tree_For_Lk(tree);
131 Make_Spr(tree);
132
133 if(io->do_alias_subpatt)
134 {
135 MIXT_Set_Alias_Subpatt(YES,tree);
136 Lk(NULL,tree);
137 MIXT_Set_Alias_Subpatt(NO,tree);
138 }
139
140 if(tree->mod->s_opt->opt_topo)
141 {
142 if(tree->mod->s_opt->topo_search == NNI_MOVE) Simu_Loop(tree);
143 else if(tree->mod->s_opt->topo_search == SPR_MOVE) Global_Spr_Search(tree);
144 else Best_Of_NNI_And_SPR(tree);
145 }
146 else
147 {
148 if(tree->mod->s_opt->opt_subst_param ||
149 tree->mod->s_opt->opt_bl) Round_Optimize(tree,ROUND_MAX);
150 else Lk(NULL,tree);
151 }
152
153
154 Set_Both_Sides(YES,tree);
155 Lk(NULL,tree);
156 Pars(NULL,tree);
157 Get_Tree_Size(tree);
158 PhyML_Printf("\n. Log likelihood of the current tree: %f.\n",tree->c_lnL);
159
160 Exit("\n");
161
162 /* */
163 M4_Compute_Proba_Hidden_States_On_Edges(tree);
164 /* */
165
166 Get_Best_Root_Position(tree);
167
168 /* Print the tree estimated using the current random (or BioNJ) starting tree */
169 if(io->mod->s_opt->n_rand_starts > 1)
170 {
171 Br_Len_Involving_Invar(tree);
172 Print_Tree(io->fp_out_trees,tree);
173 fflush(NULL);
174 }
175
176 /* Record the most likely tree in a string of characters */
177 if(tree->c_lnL > best_lnL)
178 {
179 best_lnL = tree->c_lnL;
180 Br_Len_Involving_Invar(tree);
181 if(most_likely_tree) Free(most_likely_tree);
182 most_likely_tree = Write_Tree(tree);
183 Get_Tree_Size(tree);
184 }
185
186 /* JF(tree); */
187
188 time(&t_end);
189
190 Print_Fp_Out(io->fp_out_stats,t_beg,t_end,tree,
191 io,num_data_set+1,
192 (tree->mod->s_opt->n_rand_starts > 1)?
193 (num_rand_tree):(num_tree),YES, io->precision);
194
195 if(tree->io->print_site_lnl) Print_Site_Lk(tree,io->fp_out_lk);
196
197 /* Start from BioNJ tree */
198 if((num_rand_tree == io->mod->s_opt->n_rand_starts-1) && (tree->mod->s_opt->random_input_tree))
199 {
200 /* Do one more iteration in the loop, but don't randomize the tree */
201 num_rand_tree--;
202 tree->mod->s_opt->random_input_tree = 0;
203 }
204
205 Free_Spr_List_One_Edge(tree);
206 Free_One_Spr(tree->best_spr);
207 if(tree->mat) Free_Mat(tree->mat);
208 Free_Tree_Pars(tree);
209 Free_Tree_Lk(tree);
210 Free_Tree(tree);
211 }
212
213
214 /* Launch bootstrap analysis */
215 if(io->do_boot || io->do_tbe)
216 {
217 if(!io->quiet) PhyML_Printf("\n. Launch bootstrap analysis on the most likely tree...\n");
218
219 #ifdef MPI
220 MPI_Bcast (most_likely_tree, strlen(most_likely_tree)+1, MPI_CHAR, 0, MPI_COMM_WORLD);
221 if(!io->quiet) PhyML_Printf("\n. The bootstrap analysis will use %d CPUs.\n",Global_numTask);
222 #endif
223
224 most_likely_tree = Bootstrap_From_String(most_likely_tree,cdata,mod,io);
225 }
226 else if(io->ratio_test)
227 {
228 /* Launch aLRT */
229 if(!io->quiet) PhyML_Printf("\n. Compute aLRT branch supports on the most likely tree...\n");
230 most_likely_tree = aLRT_From_String(most_likely_tree,cdata,mod,io);
231 }
232
233 /* Print the most likely tree in the output file */
234 if(!io->quiet) PhyML_Printf("\n. Printing the most likely tree in file '%s'...\n", Basename(io->out_tree_file));
235 if(io->n_data_sets == 1) rewind(io->fp_out_tree);
236 PhyML_Fprintf(io->fp_out_tree,"%s\n",most_likely_tree);
237
238
239 if(io->n_trees > 1 && io->n_data_sets > 1) break;
240 }
241 Free_Calign(cdata);
242 }
243 else
244 {
245 PhyML_Printf("\n. No data was found.\n");
246 PhyML_Printf("\n. Err in file %s at line %d\n",__FILE__,__LINE__);
247 Warn_And_Exit("");
248 }
249 Free_Model_Complete(mod);
250 }
251
252 if(most_likely_tree) Free(most_likely_tree);
253
254 if(mod->s_opt->n_rand_starts > 1) PhyML_Printf("\n. Best log likelihood: %f\n",best_lnL);
255
256 Free_Optimiz(mod->s_opt);
257 Free_Model_Basic(mod);
258
259 if(io->fp_in_align) fclose(io->fp_in_align);
260 if(io->fp_in_tree) fclose(io->fp_in_tree);
261 if(io->fp_out_lk) fclose(io->fp_out_lk);
262 if(io->fp_out_tree) fclose(io->fp_out_tree);
263 if(io->fp_out_trees) fclose(io->fp_out_trees);
264 if(io->fp_out_stats) fclose(io->fp_out_stats);
265
266 Free_Input(io);
267
268 time(&t_end);
269 Print_Time_Info(t_beg,t_end);
270
271 #ifdef MPI
272 MPI_Finalize();
273 #endif
274
275 return 0;
276 }
277
278 //////////////////////////////////////////////////////////////
279 //////////////////////////////////////////////////////////////
280
281
282 /* Allocate memory */
M4_Make_Light()283 m4 *M4_Make_Light()
284 {
285 m4 *m4mod;
286
287 m4mod = (m4 *)mCalloc(1,sizeof(m4));
288 M4_Set_M4mod_Default(m4mod);
289 return m4mod;
290 }
291
292 //////////////////////////////////////////////////////////////
293 //////////////////////////////////////////////////////////////
294
295
M4_Set_M4mod_Default(m4 * m4mod)296 void M4_Set_M4mod_Default(m4 *m4mod)
297 {
298 m4mod->use_cov_alpha = 1;
299 m4mod->use_cov_alpha = 0;
300 m4mod->n_h = 3;
301 m4mod->n_o = 4;
302 }
303
304 //////////////////////////////////////////////////////////////
305 //////////////////////////////////////////////////////////////
306
307 /* Allocate memory */
M4_Make_Complete(int n_h,int n_o,m4 * m4mod)308 void M4_Make_Complete(int n_h, int n_o, m4 *m4mod)
309 {
310 int i;
311
312 m4mod->n_h = n_h;
313 m4mod->n_o = n_o;
314 m4mod->n_o = n_o;
315 m4mod->o_rr = (phydbl *)mCalloc(n_o*n_o,sizeof(phydbl));
316 m4mod->o_fq = (phydbl *)mCalloc(n_o,sizeof(phydbl));
317 m4mod->o_mats = (phydbl **)mCalloc(n_h,sizeof(phydbl *));
318 for(i=0;i<n_h;i++) m4mod->o_mats[i] = (phydbl *)mCalloc(n_o*n_o,sizeof(phydbl));
319 m4mod->h_mat = (phydbl *)mCalloc(n_h*n_h,sizeof(phydbl));
320 m4mod->h_rr = (phydbl *)mCalloc(n_h*n_h,sizeof(phydbl));
321 m4mod->h_fq = (phydbl *)mCalloc(n_h,sizeof(phydbl));
322 m4mod->multipl = (phydbl *)mCalloc(n_h,sizeof(phydbl));
323 m4mod->multipl_unscaled = (phydbl *)mCalloc(n_h,sizeof(phydbl));
324 m4mod->h_fq_unscaled = (phydbl *)mCalloc(n_h,sizeof(phydbl));
325 }
326
327 //////////////////////////////////////////////////////////////
328 //////////////////////////////////////////////////////////////
329
330 /* Fill in the (big) rate matrix of the M4 t_mod */
M4_Update_Qmat(m4 * m4mod,t_mod * mod)331 void M4_Update_Qmat(m4 *m4mod, t_mod *mod)
332 {
333 int i,j;
334 int n_s, n_o, n_h;
335 phydbl mr, sum;
336
337 /* The number of states in M4 models is the product
338 of the number of hidden states (or classes) by the
339 number of observable states
340 */
341 n_s = mod->ns;
342 n_o = m4mod->n_o;
343 n_h = m4mod->n_h;
344
345 /* Set the relative substitution rates */
346 if(mod->m4mod->use_cov_alpha)
347 {
348 DiscreteGamma(m4mod->h_fq,m4mod->multipl,m4mod->alpha,m4mod->alpha,m4mod->n_h,mod->ras->gamma_median);
349 }
350 else if(mod->m4mod->use_cov_free)
351 {
352 sum = .0;
353 for(i=0;i<mod->m4mod->n_h;i++) sum += FABS(mod->m4mod->h_fq_unscaled[i]);
354 for(i=0;i<mod->m4mod->n_h;i++) mod->m4mod->h_fq[i] = FABS(mod->m4mod->h_fq_unscaled[i])/sum;
355
356 do
357 {
358 sum = .0;
359 for(i=0;i<mod->m4mod->n_h;i++)
360 {
361 if(mod->m4mod->h_fq[i] < 0.01) mod->m4mod->h_fq[i]=0.01;
362 if(mod->m4mod->h_fq[i] > 0.99) mod->m4mod->h_fq[i]=0.99;
363 sum += mod->m4mod->h_fq[i];
364 }
365
366 for(i=0;i<mod->m4mod->n_h;i++) mod->m4mod->h_fq[i]/=sum;
367 }
368 while((sum > 1.01) || (sum < 0.99));
369
370
371 /* Make sure the multipliers are centered on 1.0 */
372 sum = .0;
373 for(i=0;i<mod->m4mod->n_h;i++) sum += FABS(mod->m4mod->multipl_unscaled[i]) * mod->m4mod->h_fq[i];
374 for(i=0;i<mod->m4mod->n_h;i++) mod->m4mod->multipl[i] = mod->m4mod->multipl_unscaled[i] / sum;
375
376 /* printf("\n. WARNING\n"); */
377 /* mod->m4mod->h_fq[0] = 1./3; */
378 /* mod->m4mod->h_fq[1] = 1./3; */
379 /* mod->m4mod->h_fq[2] = 1./3; */
380
381 /* mod->m4mod->multipl[0] = 1.0; */
382 /* mod->m4mod->multipl[1] = 1.0; */
383 /* mod->m4mod->multipl[2] = 1.0; */
384
385 sum = 0;
386 for(i=0;i<mod->m4mod->n_h;i++) sum += mod->m4mod->multipl[i] * mod->m4mod->h_fq[i];
387 if(sum < 0.99 || sum > 1.01)
388 {
389 PhyML_Printf("\n. sum = %f",sum);
390 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
391 Warn_And_Exit("\n");
392 }
393
394 /* PhyML_Printf("\n__ "); */
395 /* for(i=0;i<mod->m4mod->n_h;i++) PhyML_Printf("\n.%f %f %f", */
396 /* mod->m4mod->h_fq[i], */
397 /* mod->m4mod->h_fq_unscaled[i], */
398 /* mod->m4mod->multipl[i]); */
399 }
400
401
402 /* PhyML_Printf("\n."); */
403 /* PhyML_Printf("\n. M4 model parameters"); */
404 /* m4mod->delta=.0; */
405 /* PhyML_Printf("\n. Delta = %f",m4mod->delta); */
406 /* for(i=0;i<mod->m4mod->n_h;i++) PhyML_Printf("\n. multipl %d = %f",i,m4mod->multipl[i]); */
407 /* for(i=0;i<mod->m4mod->n_h;i++) PhyML_Printf("\n. fq %d = %f",i,m4mod->h_fq[i]); */
408
409
410 /* Set up the stationary frequency vector */
411 for(i=0;i<n_s;i++) mod->e_frq->pi->v[i] = m4mod->o_fq[i%n_o] * m4mod->h_fq[i/n_o];
412
413
414 if(mod->whichmodel != CUSTOM &&
415 mod->whichmodel != GTR &&
416 mod->io->datatype == NT)
417 {
418 phydbl kappa1,kappa2;
419
420 if((mod->whichmodel != F84) && (mod->whichmodel != TN93)) mod->lambda->v = 1.;
421 else if(mod->whichmodel == F84)
422 {
423 mod->lambda->v = Get_Lambda_F84(mod->e_frq->pi->v,&(mod->kappa->v));
424 }
425
426 kappa2 = mod->kappa->v*2./(1.+mod->lambda->v);
427 kappa1 = kappa2 * mod->lambda->v;
428
429 /* A <-> C */ m4mod->o_rr[0] = 1.0;
430 /* A <-> G */ m4mod->o_rr[1] = kappa2;
431 /* A <-> T */ m4mod->o_rr[2] = 1.0;
432 /* C <-> G */ m4mod->o_rr[3] = 1.0;
433 /* C <-> T */ m4mod->o_rr[4] = kappa1;
434 }
435
436 /* Fill in the matrices of nucleotide or amino-acid substitution rates here */
437 Update_Qmat_Generic(m4mod->o_rr, m4mod->o_fq, m4mod->n_o, m4mod->o_mats[0]);
438
439 /* Print_Square_Matrix_Generic(n_o,m4mod->o_mats[0]); */
440
441 /* Multiply each of these matrices by a relative substitution rate */
442 for(i=1;i<m4mod->n_h;i++) For(j,n_o*n_o) m4mod->o_mats[i][j] = m4mod->o_mats[0][j]*m4mod->multipl[i];
443 For(j,n_o*n_o) m4mod->o_mats[0][j] *= m4mod->multipl[0];
444
445 For(i,n_s*n_s) mod->r_mat->qmat->v[i] = .0;
446
447 /* Diagonal blocks (i.e, nucleotide substitutions), symmetric */
448 for(i=0;i<n_s;i++)
449 {
450 for(j=i+1;j<n_s;j++)
451 {
452 if((int)(j/n_o) == (int)(i/n_o))
453 {
454 mod->r_mat->qmat->v[i*n_s+j] = m4mod->o_mats[(int)(i/n_o)][(i%n_o)*n_o+j%n_o];
455 mod->r_mat->qmat->v[j*n_s+i] = mod->r_mat->qmat->v[i*n_s+j] * m4mod->o_fq[i%n_o] / m4mod->o_fq[j%n_o];
456 }
457 }
458 }
459
460 /* Work out scaling factor such that the expected number of observed state substitution
461 along a branch of length 1 is 1.*/
462 mr = .0;
463 for(i=0;i<n_s;i++)
464 {
465 sum = .0;
466 for(j=0;j<n_s;j++) sum += mod->r_mat->qmat->v[i*n_s+j];
467 mr += sum * m4mod->o_fq[i%n_o] * m4mod->h_fq[(int)(i/n_o)];
468 }
469
470 /* Scale the diagonal blocks */
471 For(i,n_s*n_s) mod->r_mat->qmat->v[i] /= mr;
472
473 /* We are done with the diagonal blocks. Let's fill the non-diagonal ones now. */
474
475 /* Fill the matrix of substitution rate across classes (switches) here */
476 Update_Qmat_Generic(m4mod->h_rr, m4mod->h_fq, m4mod->n_h, m4mod->h_mat);
477
478 /* Print_Square_Matrix_Generic(m4mod->n_h,m4mod->h_mat); */
479
480 /* Multiply this matrix by the switching rate */
481 For(i,n_h*n_h) m4mod->h_mat[i] *= m4mod->delta;
482
483 /* Fill the non diagonal blocks */
484 for(i=0;i<n_s;i++)
485 {
486 for(j=i+1;j<n_s;j++)
487 {
488 if((int)(j/n_o) != (int)(i/n_o))
489 {
490 if(i%n_o == j%n_o)
491 {
492 mod->r_mat->qmat->v[i*n_s+j] = m4mod->h_mat[(int)(i/n_o)*n_h+(int)(j/n_o)];
493 mod->r_mat->qmat->v[j*n_s+i] = mod->r_mat->qmat->v[i*n_s+j] * m4mod->h_fq[(int)(i/n_o)] / m4mod->h_fq[(int)(j/n_o)];
494 }
495 }
496 }
497 }
498
499 /* Note: class equilibrium frequencies are already built in the h_mat matrix.
500 No need to 'add' these frequencies later on. */
501
502 /* We are done with the non diagonal blocks */
503
504
505 /* Diagonal cells */
506 for(i=0;i<n_s;i++)
507 {
508 sum = .0;
509 for(j=0;j<n_s;j++)
510 {
511 if(j != i)
512 sum += mod->r_mat->qmat->v[i*n_s+j];
513 }
514 mod->r_mat->qmat->v[i*n_s+i] = -sum;
515 }
516
517 For(i,n_s*n_s) mod->eigen->q[i] = mod->r_mat->qmat->v[i];
518 }
519
520 //////////////////////////////////////////////////////////////
521 //////////////////////////////////////////////////////////////
522
523
M4_Init_Partial_Lk_Tips_Double(t_tree * tree)524 void M4_Init_Partial_Lk_Tips_Double(t_tree *tree)
525 {
526 int curr_site,i,j,k,l,dim1,dim2,dim3;
527
528 dim1 = tree->mod->ras->n_catg * tree->mod->m4mod->n_h * tree->mod->m4mod->n_o;
529 dim2 = tree->mod->m4mod->n_h * tree->mod->m4mod->n_o;
530 dim3 = tree->mod->m4mod->n_o;
531
532
533 Fors(curr_site,tree->data->crunch_len,tree->mod->io->state_len)
534 {
535 for(i=0;i<tree->n_otu;i++)
536 {
537 for(j=1;j<tree->mod->m4mod->n_h;j++)
538 {
539 for(k=0;k<tree->mod->m4mod->n_o;k++)
540 {
541 tree->a_nodes[i]->b[0]->p_lk_rght[curr_site*dim1 + 0*dim2 + j*dim3+k] =
542 tree->a_nodes[i]->b[0]->p_lk_rght[curr_site*dim1 + 0*dim2 + 0*dim3+k];
543
544 printf("\n() i=%d plk=%f",
545 curr_site*dim1 + 0*dim2 + j*dim3+k,
546 tree->a_nodes[i]->b[0]->p_lk_rght[curr_site*dim1 + 0*dim2 + j*dim3+k]);
547
548 /* tree->a_nodes[i]->b[0]->p_lk_rght[curr_site][0][j*tree->mod->m4mod->n_o+k] = */
549 /* tree->a_nodes[i]->b[0]->p_lk_rght[curr_site][0][k]; */
550 }
551
552
553 for(k=0;k<tree->mod->m4mod->n_o;k++)
554 for(l=1;l<tree->mod->ras->n_catg;l++)
555 tree->a_nodes[i]->b[0]->p_lk_rght[curr_site*dim1 + l*dim2 + j*dim3+k] =
556 tree->a_nodes[i]->b[0]->p_lk_rght[curr_site*dim1 + 0*dim2 + j*dim3+k];
557 /* tree->a_nodes[i]->b[0]->p_lk_rght[curr_site][l][j*tree->mod->m4mod->n_o+k] = */
558 /* tree->a_nodes[i]->b[0]->p_lk_rght[curr_site][0][j*tree->mod->m4mod->n_o+k]; */
559 }
560 }
561 }
562 }
563
564 //////////////////////////////////////////////////////////////
565 //////////////////////////////////////////////////////////////
566
567
M4_Init_Partial_Lk_Tips_Int(t_tree * tree)568 void M4_Init_Partial_Lk_Tips_Int(t_tree *tree)
569 {
570 int curr_site,i,j,k,dim2,dim3;
571
572 dim2 = tree->mod->m4mod->n_h * tree->mod->m4mod->n_o;
573 dim3 = tree->mod->m4mod->n_o;
574
575 Fors(curr_site,tree->data->crunch_len,tree->mod->io->state_len)
576 {
577 for(i=0;i<tree->n_otu;i++)
578 {
579 for(j=1;j<tree->mod->m4mod->n_h;j++)
580 {
581 for(k=0;k<tree->mod->m4mod->n_o;k++)
582 {
583 tree->a_nodes[i]->b[0]->p_lk_tip_r[curr_site*dim2 + j*dim3+k] =
584 tree->a_nodes[i]->b[0]->p_lk_tip_r[curr_site*dim2 + 0*dim3+k];
585 /* tree->a_nodes[i]->b[0]->p_lk_tip_r[curr_site][j*tree->mod->m4mod->n_o+k] = */
586 /* tree->a_nodes[i]->b[0]->p_lk_tip_r[curr_site][k]; */
587 }
588 }
589 }
590 }
591 }
592
593 //////////////////////////////////////////////////////////////
594 //////////////////////////////////////////////////////////////
595
596
M4_Integral_Term_On_One_Edge(t_edge * b,t_tree * tree)597 phydbl ****M4_Integral_Term_On_One_Edge(t_edge *b, t_tree *tree)
598 {
599 phydbl ****integral,*P1,*P2;
600 int ns;
601 int g,i,j,k,l;
602 int step;
603
604
605 ns = tree->mod->ns;
606
607
608 P1 = (phydbl *)mCalloc(tree->mod->ras->n_catg*ns*ns,sizeof(phydbl));
609 P2 = (phydbl *)mCalloc(tree->mod->ras->n_catg*ns*ns,sizeof(phydbl));
610
611
612 integral = (phydbl ****)mCalloc(tree->mod->ras->n_catg,sizeof(phydbl ***));
613 for(g=0;g<tree->mod->ras->n_catg;g++)
614 {
615 integral[g] = (phydbl ***)mCalloc(ns,sizeof(phydbl **));
616 for(j=0;j<ns;j++)
617 {
618 integral[g][j] = (phydbl **)mCalloc(ns,sizeof(phydbl *));
619 for(k=0;k<ns;k++) integral[g][j][k] = (phydbl *)mCalloc(ns,sizeof(phydbl));
620 }
621 }
622
623 /* Integral calculation */
624 step = 100;
625
626 PhyML_Printf("\n. [");
627 for(i=0;i<step;i++)
628 {
629 for(g=0;g<tree->mod->ras->n_catg;g++)
630 {
631 PMat(((phydbl)(i+0.5)/step)*b->l->v*tree->mod->ras->gamma_rr->v[g],tree->mod,g*ns*ns,P1,NULL);
632 PMat(((phydbl)(step-i-0.5)/step)*b->l->v*tree->mod->ras->gamma_rr->v[g],tree->mod,g*ns*ns,P2,NULL);
633
634 for(j=0;j<ns;j++)
635 {
636 for(k=0;k<ns;k++)
637 {
638 for(l=0;l<ns;l++)
639 {
640 /* integral[g][j][k][l] += P1[g][j][k] * P2[g][j][l] / ((phydbl)(step)); */
641 integral[g][j][k][l] += P1[g*ns*ns + j*ns+k] * P2[g*ns*ns + j*ns+l] / ((phydbl)(step));
642 }
643 }
644 }
645 }
646 PhyML_Printf("."); fflush(NULL);
647 }
648 PhyML_Printf("]\n");
649
650 Free(P1);
651 Free(P2);
652
653 return integral;
654 }
655
656 //////////////////////////////////////////////////////////////
657 //////////////////////////////////////////////////////////////
658
659
M4_Post_Prob_H_Class_Edge_Site(t_edge * b,phydbl **** integral,phydbl * postprob,t_tree * tree)660 void M4_Post_Prob_H_Class_Edge_Site(t_edge *b, phydbl ****integral, phydbl *postprob, t_tree *tree)
661 {
662 /* Calculation of the expected frequencies of each hidden
663 class at a given site. */
664
665 phydbl site_lk;
666 int g,i,j,k,l;
667 int n_h;
668 phydbl sum;
669 int dim1,dim2;
670
671 dim1 = tree->mod->ras->n_catg * tree->mod->ns;
672 dim2 = tree->mod->ns;
673
674 n_h = tree->mod->m4mod->n_h; /* number of classes, i.e., number of hidden states */
675
676 site_lk = (phydbl)exp(tree->cur_site_lk[tree->curr_site]);
677
678 if(b->rght->tax)
679 {
680 sum = .0;
681 for(i=0;i<n_h;i++)
682 {
683 postprob[i] = .0;
684 for(j=0;j<tree->mod->m4mod->n_o;j++)
685 {
686 for(g=0;g<tree->mod->ras->n_catg;g++)
687 {
688 for(k=0;k<tree->mod->ns;k++)
689 {
690 for(l=0;l<tree->mod->ns;l++)
691 {
692 postprob[i] +=
693
694 (1./site_lk) *
695 tree->mod->ras->gamma_r_proba->v[g] *
696 tree->mod->m4mod->h_fq[i] *
697 tree->mod->m4mod->o_fq[j] *
698 b->p_lk_left[tree->curr_site*dim1 + g*dim2 + k] *
699 b->p_lk_tip_r[tree->curr_site*dim2 + l] *
700 integral[g][i*tree->mod->m4mod->n_o+j][k][l];
701
702 /* (1./site_lk) * */
703 /* tree->mod->ras->gamma_r_proba[g] * */
704 /* tree->mod->m4mod->h_fq[i] * */
705 /* tree->mod->m4mod->o_fq[j] * */
706 /* b->p_lk_left[tree->curr_site][g][k] * */
707 /* b->p_lk_tip_r[tree->curr_site][l] * */
708 /* /\* b->p_lk_rght[tree->curr_site][0][l] * *\/ */
709 /* integral[g][i*tree->mod->m4mod->n_o+j][k][l]; */
710 }
711 }
712 }
713 }
714 sum += postprob[i];
715 }
716
717 /* TO DO */
718 for(i=0;i<n_h;i++) postprob[i] *= exp(b->sum_scale_left[tree->curr_site]);
719
720 }
721 else
722 {
723 sum = .0;
724 for(i=0;i<n_h;i++)
725 {
726 postprob[i] = .0;
727 for(j=0;j<tree->mod->m4mod->n_o;j++)
728 {
729 for(g=0;g<tree->mod->ras->n_catg;g++)
730 {
731 for(k=0;k<tree->mod->ns;k++)
732 {
733 for(l=0;l<tree->mod->ns;l++)
734 {
735 postprob[i] +=
736
737 (1./site_lk) *
738 tree->mod->ras->gamma_r_proba->v[g] *
739 tree->mod->m4mod->h_fq[i] *
740 tree->mod->m4mod->o_fq[j] *
741 b->p_lk_left[tree->curr_site*dim1 + g*dim2 + k] *
742 b->p_lk_rght[tree->curr_site*dim1 + g*dim2 + l] *
743 integral[g][i*tree->mod->m4mod->n_o+j][k][l];
744
745 /* (1./site_lk) * */
746 /* tree->mod->ras->gamma_r_proba[g] * */
747 /* tree->mod->m4mod->h_fq[i] * */
748 /* tree->mod->m4mod->o_fq[j] * */
749 /* b->p_lk_left[tree->curr_site][g][k] * */
750 /* b->p_lk_rght[tree->curr_site][g][l] * */
751 /* integral[g][i*tree->mod->m4mod->n_o+j][k][l]; */
752 }
753 }
754 }
755 }
756 sum += postprob[i];
757 }
758
759 /* TO DO */
760 for(i=0;i<n_h;i++) postprob[i] *= exp(b->sum_scale_left[tree->curr_site] + b->sum_scale_rght[tree->curr_site]);
761
762 }
763
764 for(i=0;i<n_h;i++)
765 if((postprob[i] < -1.E-5) || (postprob[i] > 1.0+1.E-5))
766 {
767 PhyML_Printf("\n. Cat : %d Prob : %f\n",i,postprob[i]);
768 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
769 Warn_And_Exit("\n");
770 }
771
772 sum = 0.0;
773 for(i=0;i<n_h;i++) sum += postprob[i];
774
775 if((sum > 1.0+1.E-2) || (sum < 1.0-1.E-2))
776 {
777 PhyML_Printf("\n. Sum = %f\n",sum);
778 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
779 Exit("\n");
780 }
781
782 return;
783 }
784
785 //////////////////////////////////////////////////////////////
786 //////////////////////////////////////////////////////////////
787
M4_Compute_Proba_Hidden_States_On_Edges(t_tree * tree)788 phydbl ***M4_Compute_Proba_Hidden_States_On_Edges(t_tree *tree)
789 {
790 int i;
791 phydbl ***post_probs;
792 phydbl ****integral;
793
794
795 post_probs = (phydbl ***)mCalloc(2*tree->n_otu-3,sizeof(phydbl **));
796
797 For(i,2*tree->n_otu-3)
798 {
799 post_probs[i] = (phydbl **)mCalloc(tree->n_pattern,sizeof(phydbl *));
800 for(tree->curr_site=0;tree->curr_site<tree->n_pattern;tree->curr_site++)
801 post_probs[i][tree->curr_site] = (phydbl *)mCalloc(tree->mod->m4mod->n_h,sizeof(phydbl));
802 }
803
804
805 /* Compute posterior probabilities of each hidden class (usually a rate class)
806 on each edge, at each site.
807 */
808 For(i,2*tree->n_otu-3)
809 {
810 PhyML_Printf("\n. Edge %4d/%4d",i+1,2*tree->n_otu-3);
811
812 integral = M4_Integral_Term_On_One_Edge(tree->a_edges[i],tree);
813
814 for(tree->curr_site=0;tree->curr_site<tree->n_pattern;tree->curr_site++)
815 M4_Post_Prob_H_Class_Edge_Site(tree->a_edges[i],
816 integral,
817 post_probs[i][tree->curr_site],
818 tree);
819
820 M4_Free_Integral_Term_On_One_Edge(integral,tree);
821 }
822 return post_probs;
823 }
824
825 //////////////////////////////////////////////////////////////
826 //////////////////////////////////////////////////////////////
827
828
829 /* Estimate the (posterior) mean relative rate of substitution on each branch
830 at each site. The posterior mean rates averaged over sites is also estimated
831 for each edge. The corresponding trees are printed in a postscript file. Tree 0
832 is the tree with posterior mean rates averaged over the sites. The following trees
833 have posterior mean rates computed for each site.
834 */
M4_Compute_Posterior_Mean_Rates(phydbl *** post_probs,t_tree * tree)835 void M4_Compute_Posterior_Mean_Rates(phydbl ***post_probs, t_tree *tree)
836 {
837 char *s;
838 int i;
839 phydbl **mean_post_probs;
840 phydbl *mrr;
841 phydbl sum;
842 int patt,br,rcat;
843 phydbl *mean_br_len;
844 int best_r,len_var;
845 phydbl max_prob;
846
847 mean_br_len = (phydbl *)mCalloc(2*tree->n_otu-3,sizeof(phydbl));
848 mean_post_probs = (phydbl **)mCalloc(2*tree->n_otu-3,sizeof(phydbl *));
849 For(i,2*tree->n_otu-3) mean_post_probs[i] = (phydbl *)mCalloc(tree->mod->m4mod->n_h,sizeof(phydbl ));
850 mrr = (phydbl *)mCalloc(2*tree->n_otu-3,sizeof(phydbl));
851
852 Record_Br_Len(tree);
853 M4_Scale_Br_Len(tree);
854
855 /* Compute the posterior mean relative rate on each branch averaged over the
856 whole set of patterns (sites) */
857 len_var = 0;
858 for(patt=0;patt<tree->n_pattern;patt++)
859 {
860 if(!Is_Invar(patt,1,NT,tree->data))
861 {
862 For(br,2*tree->n_otu-3)
863 {
864 max_prob = -1.;
865 best_r = -1;
866 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
867 {
868 if(post_probs[br][patt][rcat] > max_prob)
869 {
870 max_prob = post_probs[br][patt][rcat];
871 best_r = rcat;
872 }
873 }
874
875 /* /\* Add weight on each category, weight is proportional to the corresponding posterior probability *\/ */
876 /* for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++) */
877 /* { */
878 /* mean_post_probs[br][rcat] += post_probs[br][patt][rcat] * tree->data->wght[patt]; */
879 /* } */
880
881 /* Add weight on the most probable rate category only */
882 mean_post_probs[br][best_r] += tree->data->wght[patt];
883 }
884 len_var += tree->data->wght[patt];
885 }
886 }
887
888 For(br,2*tree->n_otu-3)
889 {
890 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
891 {
892 mean_post_probs[br][rcat] /= (phydbl)len_var;
893 }
894 }
895
896 /* Compute the posterior mean relative rate and scale
897 each branch length using this factor */
898 For(br,2*tree->n_otu-3)
899 {
900 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
901 {
902 mrr[br] += mean_post_probs[br][rcat] * tree->mod->m4mod->multipl[rcat];
903 }
904 tree->a_edges[br]->l->v *= mrr[br];
905 }
906
907 PhyML_Fprintf(tree->io->fp_out_stats,"\n. Mean posterior probabilities & rates\n");
908 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++) PhyML_Fprintf(tree->io->fp_out_stats,"%2.4f ",tree->mod->m4mod->multipl[rcat]);
909 PhyML_Fprintf(tree->io->fp_out_stats,"\n");
910 For(br,2*tree->n_otu-3)
911 {
912 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
913 {
914 PhyML_Fprintf(tree->io->fp_out_stats,"%2.4f ",mean_post_probs[br][rcat]);
915 }
916 /* PhyML_Fprintf(tree->io->fp_out_stats," -- %f -> %f x %f = %f",mrr[br],tree->a_edges[br]->l->v,mrr[br],tree->a_edges[br]->l->v*mrr[br]); */
917
918 PhyML_Fprintf(tree->io->fp_out_stats," mrr=%f ",mrr[br]);
919
920 if(mrr[br] > 1.) PhyML_Fprintf(tree->io->fp_out_stats,"FAST ");
921 else PhyML_Fprintf(tree->io->fp_out_stats,"SLOW ");
922
923 PhyML_Fprintf(tree->io->fp_out_stats,"%s",tree->a_edges[br]->labels[0]);
924
925 PhyML_Fprintf(tree->io->fp_out_stats,"\n");
926 }
927
928 /* Print the tree */
929 PhyML_Fprintf(tree->io->fp_out_tree,"Constrained tree with corrected branch lengths = ");
930 s = Write_Tree(tree);
931 PhyML_Fprintf(tree->io->fp_out_tree,"%s\n",s);
932 Free(s);
933 tree->ps_tree = DR_Make_Tdraw_Struct(tree);
934 DR_Print_Postscript_Header(tree->n_pattern,tree->io->fp_out_ps);
935 tree->ps_page_number = 0;
936 DR_Print_Tree_Postscript(tree->ps_page_number++,YES,tree->io->fp_out_ps,tree);
937
938 /* Go back to the initial scaled branch lengths */
939 For(br,2*tree->n_otu-3) tree->a_edges[br]->l->v /= mrr[br];
940
941 /* Compute the posterior mean relative rate at each site, for each branch
942 and each rate category. Scale branch lengths using these factors and
943 print each tree (i.e., on tree per site pattern) */
944 for(patt=0;patt<tree->n_pattern;patt++)
945 {
946 For(br,2*tree->n_otu-3)
947 {
948 mrr[br] = .0;
949 max_prob = -1.;
950 best_r = -1;
951 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++) /* For each rate class */
952 {
953 mrr[br] += post_probs[br][patt][rcat] * tree->mod->m4mod->multipl[rcat];
954 if(post_probs[br][patt][rcat] > max_prob)
955 {
956 max_prob = post_probs[br][patt][rcat];
957 best_r = rcat;
958 }
959 }
960 /* mrr[br] = tree->mod->m4mod->multipl[best_r]; /\* Use the most probable relative rate instead of mean *\/ */
961 tree->a_edges[br]->l->v *= mrr[br];
962 }
963
964 For(br,2*tree->n_otu-3) mean_br_len[br] += tree->a_edges[br]->l->v * tree->data->wght[patt];
965
966 PhyML_Fprintf(tree->io->fp_out_stats,"\n. Posterior probabilities site %4d (weight=%d, is_inv=%d)\n",
967 patt,
968 tree->data->wght[patt],
969 Is_Invar(patt,1,NT,tree->data));
970
971 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++) PhyML_Fprintf(tree->io->fp_out_stats,"%2.4f ",tree->mod->m4mod->multipl[rcat]);
972 PhyML_Fprintf(tree->io->fp_out_stats,"\n");
973 For(br,2*tree->n_otu-3)
974 {
975 PhyML_Fprintf(tree->io->fp_out_stats,"Edge %3d ",br);
976 max_prob = -1.0;
977 best_r = -1;
978 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
979 {
980 if(post_probs[br][patt][rcat] > max_prob)
981 {
982 max_prob = post_probs[br][patt][rcat];
983 best_r = rcat;
984 }
985 }
986
987 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
988 {
989 PhyML_Fprintf(tree->io->fp_out_stats,"%2.4f",post_probs[br][patt][rcat]);
990 if(rcat == best_r) PhyML_Fprintf(tree->io->fp_out_stats,"* ");
991 else PhyML_Fprintf(tree->io->fp_out_stats," ");
992 }
993
994 /* PhyML_Fprintf(tree->io->fp_out_stats," -- %f -> %f x %f = %f",mrr[br],tree->a_edges[br]->l->v,mrr[br],tree->a_edges[br]->l->v*mrr[br]); */
995
996 if(mrr[br] > 1.01) PhyML_Fprintf(tree->io->fp_out_stats," %s ","FAST");
997 else if(mrr[br] < 0.99) PhyML_Fprintf(tree->io->fp_out_stats," %s ","SLOW");
998 else PhyML_Fprintf(tree->io->fp_out_stats," %s ","MEDIUM");
999 PhyML_Fprintf(tree->io->fp_out_stats,"%s ",tree->a_edges[br]->labels[0]);
1000 PhyML_Fprintf(tree->io->fp_out_stats,"\n");
1001 }
1002
1003 PhyML_Fprintf(tree->io->fp_out_tree,"tree %d = ",patt+1);
1004 s = Write_Tree(tree);
1005 PhyML_Fprintf(tree->io->fp_out_tree,"%s\n",s);
1006 Free(s);
1007 DR_Print_Tree_Postscript(tree->ps_page_number++,YES,tree->io->fp_out_ps,tree);
1008
1009 /* Go back to the initial scaled branch lengths */
1010 For(br,2*tree->n_otu-3) tree->a_edges[br]->l->v /= mrr[br];
1011
1012 For(br,2*tree->n_otu-3)
1013 {
1014 sum = .0;
1015 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++)
1016 {
1017 sum += post_probs[br][patt][rcat];
1018 }
1019
1020 if((sum < 0.99) || (sum > 1.01))
1021 {
1022 PhyML_Fprintf(tree->io->fp_out_stats,"\n. sum = %f\n",sum);
1023 PhyML_Fprintf(tree->io->fp_out_stats,"\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
1024 Warn_And_Exit("\n");
1025 }
1026 }
1027 }
1028
1029 /* Mean branch lengths */
1030 For(br,2*tree->n_otu-3)
1031 {
1032 mean_br_len[br] /= (phydbl)tree->data->init_len;
1033 tree->a_edges[br]->l->v = mean_br_len[br];
1034 }
1035 PhyML_Fprintf(tree->io->fp_out_tree,"Mean branch lengths=");
1036 s = Write_Tree(tree);
1037 PhyML_Fprintf(tree->io->fp_out_tree,"%s\n",s);
1038 Free(s);
1039 /* DR_Print_Tree_Postscript(tree->ps_page_number++,tree->io->fp_out_ps,tree); */
1040
1041 Restore_Br_Len(tree);
1042
1043 DR_Print_Postscript_EOF(tree->io->fp_out_ps);
1044
1045 For(br,2*tree->n_otu-3)
1046 {
1047 for(tree->curr_site=0;tree->curr_site<tree->n_pattern;tree->curr_site++)
1048 Free(post_probs[br][tree->curr_site]);
1049 Free(post_probs[br]);
1050 }
1051 Free(post_probs);
1052 For(i,2*tree->n_otu-3) Free(mean_post_probs[i]);
1053 Free(mean_post_probs);
1054 Free(mrr);
1055 Free(mean_br_len);
1056 }
1057
1058 //////////////////////////////////////////////////////////////
1059 //////////////////////////////////////////////////////////////
1060
1061
1062 /* Classifiy each branch, at each site, among one of the rate classes */
M4_Site_Branch_Classification(phydbl *** post_probs,t_tree * tree)1063 phydbl **M4_Site_Branch_Classification(phydbl ***post_probs, t_tree *tree)
1064 {
1065 int patt, br, rcat, i;
1066 phydbl **best_probs;
1067 phydbl post_prob_fast, post_prob_slow;
1068
1069 best_probs = (phydbl **)mCalloc(tree->n_pattern,sizeof(phydbl *));
1070 for(i=0;i<tree->n_pattern;i++) best_probs[i] = (phydbl *)mCalloc(2*tree->n_otu-3,sizeof(phydbl));
1071
1072 tree->write_labels = YES;
1073
1074 for(patt=0;patt<tree->n_pattern;patt++)
1075 {
1076 For(br,2*tree->n_otu-3)
1077 {
1078 post_prob_fast = .0;
1079 post_prob_slow = .0;
1080
1081 for(rcat=0;rcat<tree->mod->m4mod->n_h;rcat++) /* For each rate class */
1082 {
1083 if(tree->mod->m4mod->multipl[rcat] > 1.0)
1084 post_prob_fast += post_probs[br][patt][rcat];
1085 else
1086 post_prob_slow += post_probs[br][patt][rcat];
1087 }
1088
1089 best_probs[patt][br] = (post_prob_fast > post_prob_slow)?(post_prob_fast):(post_prob_slow);
1090
1091 if(!(tree->a_edges[br]->n_labels%BLOCK_LABELS)) Make_New_Edge_Label(tree->a_edges[br]);
1092
1093 /* if((post_prob_fast > post_prob_slow) && (best_probs[patt][br] > 0.95)) */
1094 /* strcpy(tree->a_edges[br]->labels[tree->a_edges[br]->n_labels],"FASTER"); */
1095 /* else if((post_prob_fast < post_prob_slow) && (best_probs[patt][br] > 0.95)) */
1096 /* strcpy(tree->a_edges[br]->labels[tree->a_edges[br]->n_labels],"SLOWER"); */
1097 /* else */
1098 /* strcpy(tree->a_edges[br]->labels[tree->a_edges[br]->n_labels],"UNKNOWN"); */
1099
1100 if(post_prob_fast > post_prob_slow)
1101 strcpy(tree->a_edges[br]->labels[tree->a_edges[br]->n_labels],"FASTER");
1102 else
1103 strcpy(tree->a_edges[br]->labels[tree->a_edges[br]->n_labels],"SLOWER");
1104
1105 tree->a_edges[br]->n_labels++;
1106 }
1107 }
1108 return best_probs;
1109 }
1110
1111 //////////////////////////////////////////////////////////////
1112 //////////////////////////////////////////////////////////////
1113
1114
M4_Site_Branch_Classification_Experiment(t_tree * tree)1115 void M4_Site_Branch_Classification_Experiment(t_tree *tree)
1116 {
1117 calign *cpy_data;
1118 short int **true_rclass, **est_rclass;
1119 phydbl **best_probs;
1120 int i,j;
1121 phydbl correct_class, mis_class, unknown;
1122
1123 true_rclass = (short int **)mCalloc(tree->data->init_len, sizeof(short int *));
1124 est_rclass = (short int **)mCalloc(tree->data->init_len, sizeof(short int *));
1125
1126 for(i=0;i<tree->data->init_len;i++)
1127 {
1128 true_rclass[i] = (short int *)mCalloc(2*tree->n_otu-3,sizeof(short int));
1129 est_rclass[i] = (short int *)mCalloc(2*tree->n_otu-3,sizeof(short int));
1130 }
1131
1132 if(tree->io->datatype != NT && tree->io->datatype != AA)
1133 {
1134 PhyML_Printf("\n. Not implemented yet.");
1135 PhyML_Printf("\n. Err in file %s at line %d\n",__FILE__,__LINE__);
1136 Warn_And_Exit("");
1137 }
1138
1139 cpy_data = Copy_Cseq(tree->data,tree->io);
1140
1141 /* Generate a simulated data set under H0, with the right sequence length. */
1142 PhyML_Printf("\n. Evolving sequences (delta=%f, alpha=%f) ...\n",tree->mod->m4mod->delta,tree->mod->m4mod->alpha);
1143 Evolve(cpy_data,tree->mod,0,tree);
1144
1145 for(i=0;i<cpy_data->init_len;i++)
1146 {
1147 For(j,2*tree->n_otu-3)
1148 {
1149 if(!strcmp(tree->a_edges[j]->labels[i],"FASTER"))
1150 {
1151 true_rclass[i][j] = 1;
1152 }
1153 else if(!strcmp(tree->a_edges[j]->labels[i],"SLOWER"))
1154 {
1155 true_rclass[i][j] = 0;
1156 }
1157 else
1158 {
1159 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
1160 Warn_And_Exit("\n");
1161 }
1162 }
1163 }
1164
1165 For(j,2*tree->n_otu-3)
1166 {
1167 Free_Edge_Labels(tree->a_edges[j]);
1168 tree->a_edges[j]->n_labels = 0;
1169 }
1170
1171 /* Generate the memory needed for likelihood calculation because
1172 we will need bigger arrays
1173 */
1174 Free_Tree_Lk(tree);
1175 Free_Tree_Pars(tree);
1176
1177 tree->data = cpy_data;
1178 tree->n_pattern = tree->data->crunch_len;
1179
1180 /* Allocate memory and initialize likelihood structure with
1181 simulated sequences (i.e., columns are not compressed)
1182 */
1183 Make_Tree_For_Pars(tree);
1184 Make_Tree_For_Lk(tree);
1185
1186 /* Estimate model parameters */
1187 PhyML_Printf("\n. Estimating model parameters...\n");
1188 tree->mod->s_opt->opt_cov_alpha = 1;
1189 tree->mod->s_opt->opt_cov_delta = 1;
1190 Round_Optimize(tree,ROUND_MAX);
1191
1192 tree->both_sides = 1;
1193 Lk(NULL,tree);
1194
1195 /* Classify branches */
1196 best_probs = M4_Site_Branch_Classification(M4_Compute_Proba_Hidden_States_On_Edges(tree),tree);
1197
1198 for(i=0;i<tree->data->init_len;i++)
1199 {
1200 For(j,2*tree->n_otu-3)
1201 {
1202 if(!strcmp(tree->a_edges[j]->labels[i],"FASTER"))
1203 {
1204 est_rclass[i][j] = 1;
1205 }
1206 else if(!strcmp(tree->a_edges[j]->labels[i],"SLOWER"))
1207 {
1208 est_rclass[i][j] = 0;
1209 }
1210 else if(!strcmp(tree->a_edges[j]->labels[i],"UNKNOWN"))
1211 {
1212 est_rclass[i][j] = -1;
1213 }
1214 else
1215 {
1216 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
1217 Warn_And_Exit("\n");
1218 }
1219 }
1220 }
1221
1222 unknown = .0;
1223 correct_class = .0;
1224 mis_class = .0;
1225 for(i=0;i<tree->data->init_len;i++)
1226 {
1227 For(j,2*tree->n_otu-3)
1228 {
1229 /* PhyML_Printf("\n. Edge %3d %4d %4d - %f",j,true_rclass[i][j],est_rclass[i][j],best_probs[i][j]); */
1230 if(est_rclass[i][j] == -1)
1231 {
1232 unknown += 1.;
1233 }
1234 else if(est_rclass[i][j] == true_rclass[i][j])
1235 {
1236 correct_class += 1.;
1237 }
1238 else if(est_rclass[i][j] != true_rclass[i][j])
1239 {
1240 mis_class += 1.;
1241 }
1242 else
1243 {
1244 PhyML_Printf("\n. Err in file %s at line %d\n\n",__FILE__,__LINE__);
1245 Warn_And_Exit("\n");
1246 }
1247 }
1248 /* PhyML_Printf("\n"); */
1249 }
1250
1251 correct_class /= ((tree->data->init_len * (2*tree->n_otu-3)) - unknown);
1252 mis_class /= ((tree->data->init_len * (2*tree->n_otu-3)) - unknown);
1253 unknown /= (tree->data->init_len * (2*tree->n_otu-3));
1254
1255 PhyML_Printf("\n. correct_class = %f mis_class = %f unknown = %f",
1256 correct_class,
1257 mis_class,
1258 unknown);
1259
1260
1261 for(i=0;i<tree->data->init_len;i++)
1262 {
1263 Free(true_rclass[i]);
1264 Free(est_rclass[i]);
1265 Free(best_probs[i]);
1266 }
1267 Free(true_rclass);
1268 Free(est_rclass);
1269 Free(best_probs);
1270
1271 }
1272
1273 //////////////////////////////////////////////////////////////
1274 //////////////////////////////////////////////////////////////
1275
1276
1277 /* Scale branch lengths such that they express expected number
1278 of nucleotide or amino-acid substitutions */
1279
M4_Scale_Br_Len(t_tree * tree)1280 void M4_Scale_Br_Len(t_tree *tree)
1281 {
1282 phydbl scale_fact,mrs;
1283 int i,j;
1284
1285 /* (1) Work out the relative mean rate of switches */
1286 mrs = .0;
1287 for(i=0;i<tree->mod->m4mod->n_h;i++)
1288 {
1289 for(j=0;j<tree->mod->m4mod->n_h;j++)
1290 {
1291 if(j != i)
1292 mrs += tree->mod->m4mod->h_fq[i] * tree->mod->m4mod->h_mat[i*tree->mod->m4mod->n_h+j];
1293 }
1294 }
1295
1296 /* (2) scale_fact = (1 + delta x mrs) */
1297 scale_fact = 1.0 + tree->mod->m4mod->delta * mrs;
1298
1299 /* (3) Scale branch lengths */
1300 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v /= scale_fact;
1301 }
1302
1303 //////////////////////////////////////////////////////////////
1304 //////////////////////////////////////////////////////////////
1305
1306
M4_Free_Integral_Term_On_One_Edge(phydbl **** integral,t_tree * tree)1307 void M4_Free_Integral_Term_On_One_Edge(phydbl ****integral, t_tree *tree)
1308 {
1309 int g,i,j;
1310
1311 for(g=0;g<tree->mod->ras->n_catg;g++)
1312 {
1313 for(i=0;i<tree->mod->m4mod->n_h;i++)
1314 {
1315 for(j=0;j<tree->mod->m4mod->n_h;j++)
1316 {
1317 Free(integral[g][i][j]);
1318 }
1319 Free(integral[g][i]);
1320 }
1321 Free(integral[g]);
1322 }
1323 Free(integral);
1324 }
1325
1326 //////////////////////////////////////////////////////////////
1327 //////////////////////////////////////////////////////////////
1328
1329
M4_Detect_Site_Switches_Experiment(t_tree * tree)1330 void M4_Detect_Site_Switches_Experiment(t_tree *tree)
1331 {
1332 t_mod *nocov_mod,*cov_mod,*ori_mod;
1333 calign *ori_data,*cpy_data;
1334 int i,n_iter;
1335 phydbl *nocov_bl,*cov_bl;
1336 phydbl *site_lnl_nocov, *site_lnl_cov;
1337
1338 nocov_bl = (phydbl *)mCalloc(2*tree->n_otu-3,sizeof(phydbl));
1339 cov_bl = (phydbl *)mCalloc(2*tree->n_otu-3,sizeof(phydbl));
1340 site_lnl_nocov = (phydbl *)mCalloc(tree->data->init_len,sizeof(phydbl));
1341 site_lnl_cov = (phydbl *)mCalloc(tree->data->init_len,sizeof(phydbl));
1342
1343 ori_data = tree->data;
1344 ori_mod = tree->mod;
1345
1346 if(tree->io->datatype != NT && tree->io->datatype != AA)
1347 {
1348 PhyML_Printf("\n== Not implemented yet.");
1349 PhyML_Printf("\n== Err in file %s at line %d\n",__FILE__,__LINE__);
1350 Warn_And_Exit("");
1351 }
1352
1353 cpy_data = Copy_Cseq(tree->data,tree->io);
1354
1355 PhyML_Printf("\n. Estimate model parameters under non-switching substitution model.\n");
1356 Switch_From_M4mod_To_Mod(tree->mod);
1357 Simu_Loop(tree);
1358 nocov_mod = (t_mod *)Copy_Model(tree->mod); /* Record model parameters */
1359 For(i,2*tree->n_otu-3) nocov_bl[i] = tree->a_edges[i]->l->v; /* Record branch lengths */
1360 for(i=0;i<tree->data->crunch_len;i++) site_lnl_nocov[i] = tree->cur_site_lk[i];
1361 Print_Lk(tree,"[LnL under non-switching substitution model]");
1362
1363 PhyML_Printf("\n. Estimate model parameters under switching substitution model.\n");
1364 Switch_From_Mod_To_M4mod(tree->mod);
1365 Simu_Loop(tree);
1366 cov_mod = (t_mod *)Copy_Model(tree->mod); /* Record model parameters */
1367 For(i,2*tree->n_otu-3) cov_bl[i] = tree->a_edges[i]->l->v; /* Record branch lengths */
1368 for(i=0;i<tree->data->crunch_len;i++) site_lnl_cov[i] = tree->cur_site_lk[i];
1369 Print_Lk(tree,"[LnL under switching substitution model]");
1370
1371
1372 PhyML_Printf("\n");
1373 for(i=0;i<tree->data->crunch_len;i++) PhyML_Printf("TRUTH %f %f\n",site_lnl_nocov[i],site_lnl_cov[i]);
1374
1375 /* Generate a simulated data set under H0, with the right sequence length. */
1376 tree->mod = nocov_mod;
1377 Evolve(cpy_data, nocov_mod, 0, tree);
1378
1379 /* Generate the memory needed for likelihood calculation because
1380 we will need bigger arrays
1381 */
1382 tree->mod = cov_mod;
1383 Free_Tree_Lk(tree);
1384 Free_Tree_Pars(tree);
1385
1386 tree->data = cpy_data;
1387 tree->n_pattern = tree->data->crunch_len;
1388 tree->mod = cov_mod;
1389
1390 /* Allocate memory and initialize likelihood structure with
1391 simulated sequences (i.e., columns are not compressed)
1392 */
1393 Make_Tree_For_Pars(tree);
1394 Make_Tree_For_Lk(tree);
1395
1396
1397 n_iter = 0;
1398 do
1399 {
1400 /* Get the transition proba right to generate sequences */
1401 tree->mod = nocov_mod;
1402 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v = nocov_bl[i];
1403 For(i,2*tree->n_otu-3) Update_PMat_At_Given_Edge(tree->a_edges[i],tree);
1404
1405 /* Generate sequences */
1406 Evolve(cpy_data, nocov_mod, 0, tree);
1407 tree->data = cpy_data;
1408
1409 if(tree->mod->s_opt->greedy) Init_Partial_Lk_Tips_Double(tree);
1410 else Init_Partial_Lk_Tips_Int(tree);
1411
1412 tree->mod = nocov_mod;
1413 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v = nocov_bl[i];
1414 Lk(NULL,tree);
1415 for(i=0;i<tree->data->crunch_len;i++) site_lnl_nocov[i] = tree->cur_site_lk[i];
1416 Print_Lk(tree,"[CPY LnL under non-switching substitution model]");
1417
1418 tree->mod = cov_mod;
1419 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v = cov_bl[i];
1420 Lk(NULL,tree);
1421 for(i=0;i<tree->data->crunch_len;i++) site_lnl_cov[i] = tree->cur_site_lk[i];
1422 Print_Lk(tree,"[CPY LnL under switching substitution model]");
1423
1424 PhyML_Printf("\n");
1425 for(i=0;i<tree->data->crunch_len;i++) PhyML_Printf("SYNTH %f %f\n",site_lnl_nocov[i],site_lnl_cov[i]);
1426 }
1427 while(++n_iter < 200);
1428
1429 Free_Tree_Lk(tree);
1430 Free_Tree_Pars(tree);
1431
1432 /* Back to the original data set to check that everything is ok */
1433 tree->data = ori_data;
1434 tree->n_pattern = tree->data->crunch_len;
1435
1436 Make_Tree_For_Pars(tree);
1437 Make_Tree_For_Lk(tree);
1438
1439 tree->mod = nocov_mod;
1440 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v = nocov_bl[i];
1441 Lk(NULL,tree);
1442 Print_Lk(tree,"[FINAL LnL under non-switching substitution model]");
1443
1444 tree->mod = cov_mod;
1445 For(i,2*tree->n_otu-3) tree->a_edges[i]->l->v = cov_bl[i];
1446 Lk(NULL,tree);
1447 Print_Lk(tree,"[FINAL LnL under switching substitution model]");
1448
1449 tree->mod = ori_mod;
1450
1451 Free_Model(cov_mod);
1452 Free_Model(nocov_mod);
1453 Free(site_lnl_cov);
1454 Free(site_lnl_nocov);
1455
1456 Free_Calign(cpy_data);
1457 Free(nocov_bl);
1458 Free(cov_bl);
1459 }
1460
1461 //////////////////////////////////////////////////////////////
1462 //////////////////////////////////////////////////////////////
1463
1464
M4_Posterior_Prediction_Experiment(t_tree * tree)1465 void M4_Posterior_Prediction_Experiment(t_tree *tree)
1466 {
1467 calign *ori_data,*cpy_data;
1468 int i,n_iter,n_simul;
1469 FILE *fp_nocov,*fp_cov,*fp_obs;
1470 char *s;
1471 t_edge *best_edge;
1472
1473 s = (char *)mCalloc(100,sizeof(char));
1474
1475 best_edge = NULL;
1476
1477 strcpy(s,tree->io->in_align_file);
1478 fp_nocov = Openfile(strcat(s,"_nocov"),1);
1479 strcpy(s,tree->io->in_align_file);
1480 fp_cov = Openfile(strcat(s,"_cov"),1);
1481 strcpy(s,tree->io->in_align_file);
1482 fp_obs = Openfile(strcat(s,"_obs"),1);
1483
1484 Free(s);
1485
1486 Print_Diversity_Header(fp_nocov, tree);
1487 Print_Diversity_Header(fp_cov, tree);
1488 Print_Diversity_Header(fp_obs, tree);
1489
1490 ori_data = tree->data;
1491
1492 if(tree->io->datatype != NT && tree->io->datatype != AA)
1493 {
1494 PhyML_Printf("\n. Not implemented yet.");
1495 PhyML_Printf("\n. Err in file %s at line %d\n",__FILE__,__LINE__);
1496 Warn_And_Exit("");
1497 }
1498
1499 cpy_data = Copy_Cseq(tree->data,tree->io);
1500
1501 /* Generate a simulated data set under H0, with the right sequence length. */
1502 Set_Model_Parameters(tree->mod);
1503 For(i,2*tree->n_otu-3) Update_PMat_At_Given_Edge(tree->a_edges[i],tree);
1504 Evolve(cpy_data,tree->mod,0,tree);
1505
1506 /* Generate the memory needed for likelihood calculation because
1507 we will need bigger arrays
1508 */
1509 Free_Tree_Lk(tree);
1510 Free_Tree_Pars(tree);
1511
1512 tree->data = cpy_data;
1513 tree->n_pattern = tree->data->crunch_len;
1514
1515 /* Allocate memory and initialize likelihood structure with
1516 simulated sequences (i.e., columns are not compressed)
1517 */
1518 Make_Tree_For_Pars(tree);
1519 Make_Tree_For_Lk(tree);
1520
1521 /* Go back to the original data set */
1522 tree->data = ori_data;
1523 tree->n_pattern = ori_data->crunch_len;
1524
1525 if(tree->mod->s_opt->greedy) Init_Partial_Lk_Tips_Double(tree);
1526 else Init_Partial_Lk_Tips_Int(tree);
1527
1528 PhyML_Printf("\n. Estimate model parameters under non-switching substitution model.\n");
1529 Switch_From_M4mod_To_Mod(tree->mod);
1530
1531 tree->bl_from_node_stamps = 1;
1532 /* best_edge = TIMES_Find_Best_Root_Position(tree); */
1533 PhyML_Printf("\n. Put root on t_edge %3d",i);
1534 TIMES_Least_Square_Node_Times(best_edge,tree);
1535 TIMES_Adjust_Node_Times(tree);
1536 /* TIMES_Round_Optimize(tree); */
1537
1538 /* Round_Optimize(tree,tree->data); */
1539 /* Simu_Loop(tree); */
1540 Print_Lk(tree,"[LnL under non-switching substitution model]");
1541 Print_Tree(tree->io->fp_out_tree,tree);
1542
1543 /* Print observed diversities */
1544 Init_Ui_Tips(tree);
1545 Site_Diversity(tree);
1546 Print_Diversity(fp_obs,tree);
1547
1548 n_simul = 100;
1549 n_iter = 0;
1550 do
1551 {
1552 Evolve(cpy_data,tree->mod,0,tree);
1553 tree->data = cpy_data;
1554 tree->n_pattern = cpy_data->init_len;
1555
1556 if(tree->mod->s_opt->greedy) Init_Partial_Lk_Tips_Double(tree);
1557 else Init_Partial_Lk_Tips_Int(tree);
1558
1559 Lk(NULL,tree);
1560
1561 Init_Ui_Tips(tree);
1562 Site_Diversity(tree);
1563 Print_Diversity(fp_nocov,tree);
1564
1565 Print_Lk(tree,"[CPY under non-switching substitution model]");
1566 }while(++n_iter < n_simul);
1567
1568
1569 /* Go back to the original data set */
1570 tree->data = ori_data;
1571 tree->n_pattern = ori_data->crunch_len;
1572
1573 if(tree->mod->s_opt->greedy) Init_Partial_Lk_Tips_Double(tree);
1574 else Init_Partial_Lk_Tips_Int(tree);
1575
1576 PhyML_Printf("\n. Estimate model parameters under switching substitution model.\n");
1577 Switch_From_Mod_To_M4mod(tree->mod);
1578 /* TIME_Round_Optimize(tree); */
1579 /* Round_Optimize(tree,tree->data); */
1580 /* Simu_Loop(tree); */
1581 Print_Lk(tree,"[LnL under switching substitution model]");
1582 Print_Tree(tree->io->fp_out_tree,tree);
1583
1584 n_iter = 0;
1585 do
1586 {
1587 Evolve(cpy_data,tree->mod,0,tree);
1588 tree->data = cpy_data;
1589 tree->n_pattern = cpy_data->init_len;
1590 if(tree->mod->s_opt->greedy) Init_Partial_Lk_Tips_Double(tree);
1591 else Init_Partial_Lk_Tips_Int(tree);
1592
1593 Lk(NULL,tree);
1594
1595 Init_Ui_Tips(tree);
1596 Site_Diversity(tree);
1597 Print_Diversity(fp_cov,tree);
1598
1599 Print_Lk(tree,"[LnL under switching substitution model]");
1600 }while(++n_iter < n_simul);
1601
1602 fclose(fp_obs);
1603 fclose(fp_nocov);
1604 fclose(fp_cov);
1605 }
1606
1607 //////////////////////////////////////////////////////////////
1608 //////////////////////////////////////////////////////////////
1609
M4_Copy_M4_Model(t_mod * ori_mod,m4 * ori_m4mod)1610 m4 *M4_Copy_M4_Model(t_mod *ori_mod, m4 *ori_m4mod)
1611 {
1612 int i,j,n_h, n_o;
1613 m4 *cpy_m4mod;
1614
1615 if(ori_mod->io->datatype != NT && ori_mod->io->datatype != AA)
1616 {
1617 PhyML_Printf("\n== Not implemented yet.");
1618 PhyML_Printf("\n== Err in file %s at line %d\n",__FILE__,__LINE__);
1619 Exit("\n");
1620 }
1621
1622
1623 cpy_m4mod = (m4 *)M4_Make_Light();
1624 cpy_m4mod->n_o = ori_m4mod->n_o;
1625 cpy_m4mod->n_h = ori_m4mod->n_h;
1626
1627 if(ori_mod->use_m4mod)
1628 {
1629 M4_Make_Complete(cpy_m4mod->n_h,
1630 cpy_m4mod->n_o,
1631 cpy_m4mod);
1632
1633 n_h = cpy_m4mod->n_h;
1634 n_o = cpy_m4mod->n_o;
1635
1636 cpy_m4mod->n_h = ori_m4mod->n_h;
1637 cpy_m4mod->n_o = ori_m4mod->n_o;
1638 for(i=0;i<n_h;i++) For(j,n_o*n_o) cpy_m4mod->o_mats[i][j] = ori_m4mod->o_mats[i][j];
1639 for(i=0;i<n_h;i++) cpy_m4mod->multipl[i] = ori_m4mod->multipl[i];
1640 for(i=0;i<n_h;i++) cpy_m4mod->multipl_unscaled[i] = ori_m4mod->multipl_unscaled[i];
1641 For(i,n_o*n_o) cpy_m4mod->o_rr[i] = ori_m4mod->o_rr[i];
1642 For(i,n_h*n_h) cpy_m4mod->h_rr[i] = ori_m4mod->h_rr[i];
1643 For(i,n_h*n_h) cpy_m4mod->h_mat[i] = ori_m4mod->h_mat[i];
1644 for(i=0;i<n_o;i++) cpy_m4mod->o_fq[i] = ori_m4mod->o_fq[i];
1645 for(i=0;i<n_h;i++) cpy_m4mod->h_fq[i] = ori_m4mod->h_fq[i];
1646 for(i=0;i<n_h;i++) cpy_m4mod->h_fq_unscaled[i] = ori_m4mod->h_fq_unscaled[i];
1647 cpy_m4mod->delta = ori_m4mod->delta;
1648 cpy_m4mod->alpha = ori_m4mod->alpha;
1649 }
1650
1651 return cpy_m4mod;
1652 }
1653
1654 //////////////////////////////////////////////////////////////
1655 //////////////////////////////////////////////////////////////
1656
1657