1 /*
2
3 PhyML: a program that computes maximum likelihood phylogenies from
4 DNA or AA homologous sequences.
5
6 Copyright (C) Stephane Guindon. Oct 2003 onward.
7
8 All parts of the source except where indicated are distributed under
9 the GNU public licence. See http://www.opensource.org for details.
10
11 */
12
13 #include "ancestral.h"
14
Sample_Ancestral_Seq(int fullmutmap,int fromprior,t_tree * tree)15 void Sample_Ancestral_Seq(int fullmutmap, int fromprior, t_tree *tree)
16 {
17 int rate_cat;
18 int i,j,k,l;
19 phydbl *probs;
20 phydbl sum;
21 int n_mut;
22 FILE *fp;
23 phydbl *muttime;
24 int *muttype,*muttax;
25 char *s,*mut_string;
26 int *ordering;
27
28
29 if(tree->is_mixt_tree == YES)
30 {
31 MIXT_Sample_Ancestral_Seq(fullmutmap,fromprior,tree);
32 return;
33 }
34
35 probs = (phydbl *)mCalloc(tree->mod->ras->n_catg,sizeof(phydbl));
36
37 muttime = (phydbl *)mCalloc((2*tree->n_otu-3)*5, // At most 5 mutations per branch on average
38 sizeof(phydbl));
39
40 muttype = (int *)mCalloc((2*tree->n_otu-3)*5, // At most 5 mutations per branch on average
41 sizeof(int));
42
43 muttax = (int *)mCalloc((2*tree->n_otu-3)*5, // At most 5 mutations per branch on average
44 sizeof(int));
45
46 ordering = (int *)mCalloc((2*tree->n_otu-3)*5, // At most 5 mutations per branch on average
47 sizeof(int));
48
49 s = (char *)mCalloc(T_MAX_NAME,sizeof(char));
50
51 for(i=0;i<2*tree->n_otu-1;++i)
52 if(tree->a_nodes[i]->tax == NO)
53 {
54 tree->a_nodes[i]->c_seq_anc = (align *)mCalloc(1,sizeof(align));;
55 tree->a_nodes[i]->c_seq_anc->state = (char *)mCalloc(tree->n_pattern,sizeof(char));
56 }
57
58
59 /* Update P(D_x|X=i) for each state i and node X */
60 Set_Both_Sides(YES,tree);
61 Lk(NULL,tree);
62
63
64 for(i=0;i<tree->n_pattern;++i)
65 {
66 /* Sample the rate class from its posterior density */
67 for(j=0;j<tree->mod->ras->n_catg;j++)
68 {
69 if(fromprior == NO)
70 probs[j] =
71 tree->unscaled_site_lk_cat[i*tree->mod->ras->n_catg+j]*
72 tree->mod->ras->gamma_r_proba->v[j];
73 else
74 probs[j] = tree->mod->ras->gamma_r_proba->v[j];
75 }
76
77 /* Scale probas. */
78 sum = .0;
79 for(j=0;j<tree->mod->ras->n_catg;j++) sum += probs[j];
80 for(j=0;j<tree->mod->ras->n_catg;j++) probs[j]/=sum;
81
82 rate_cat = Sample_i_With_Proba_pi(probs,tree->mod->ras->n_catg);
83
84 n_mut = 0;
85 if(tree->n_root != NULL)
86 {
87 Sample_Ancestral_Seq_Pre(tree->n_root,tree->n_root->v[1],tree->n_root->b[1],
88 i,rate_cat,
89 muttype,muttime,muttax,&n_mut,
90 fullmutmap,fromprior,tree);
91
92
93 Sample_Ancestral_Seq_Pre(tree->n_root,tree->n_root->v[2],tree->n_root->b[2],
94 i,rate_cat,
95 muttype,muttime,muttax,&n_mut,
96 fullmutmap,fromprior,tree);
97
98
99 }
100 else
101 {
102 Sample_Ancestral_Seq_Pre(tree->a_nodes[0],tree->a_nodes[0]->v[0],tree->a_nodes[0]->b[0],
103 i,rate_cat,
104 muttype,muttime,muttax,&n_mut,
105 fullmutmap,fromprior,tree);
106 }
107
108 for(j=0;j<n_mut;j++) ordering[j] = 0;
109
110 for(j=0;j<n_mut-1;j++)
111 {
112 for(k=j+1;k<n_mut;k++)
113 {
114 if(muttime[k] > muttime[j]) ordering[k]++;
115 else ordering[j]++;
116 }
117 }
118
119 strcpy(s,"mutmap.");
120 sprintf(s+strlen(s),"%d.",tree->io->r_seed);
121 sprintf(s+strlen(s),"%d",i);
122 fp = fopen(s,"a");
123 PhyML_Fprintf(fp,"\n-1 -1 -1.0 -1 %d",tree->mixt_tree ? tree->mixt_tree->mcmc->run : tree->mcmc->run);
124
125 Reverse_Muttime(muttime,n_mut,tree);
126
127 for(j=0;j<n_mut;j++)
128 {
129 for(k=0;k<n_mut;k++)
130 {
131 if(ordering[k] == j)
132 {
133 for(l=0;l<tree->data->init_len;l++) if(tree->data->sitepatt[l] == i) break;
134 mut_string = Mutation_Id(muttype[k],tree);
135 PhyML_Fprintf(fp,"\n%4d %s %g %4d %s",j,mut_string,muttime[k],l,tree->a_nodes[muttax[k]]->name);
136 Free(mut_string);
137 break;
138 }
139 }
140 }
141
142
143 for(j=0;j<n_mut;j++)
144 {
145 muttype[j] = -2;
146 muttime[j] = +1.;
147 }
148
149 fclose(fp);
150 }
151
152 Free(s);
153 Free(muttype);
154 Free(muttime);
155 Free(muttax);
156 Free(ordering);
157 Free(probs);
158 }
159
160 //////////////////////////////////////////////////////////////
161 //////////////////////////////////////////////////////////////
162
Sample_Ancestral_Seq_Pre(t_node * a,t_node * d,t_edge * b,int site,int r_cat,int * muttype,phydbl * muttime,int * muttax,int * n_mut,int fullmutmap,int fromprior,t_tree * tree)163 void Sample_Ancestral_Seq_Pre(t_node *a, t_node *d, t_edge *b,
164 int site, int r_cat,
165 int *muttype, phydbl *muttime, int *muttax, int *n_mut,
166 int fullmutmap, int fromprior, t_tree *tree)
167 {
168
169 int i;
170 int sa, sd;
171 int ns;
172 phydbl *probs,sum;
173
174 /* PhyML_Printf("\n>> a: %d d: %d b->left: %d b->rght: %d",a?a->num:-1,d?d->num:-1,b?b->left->num:-1,b?b->rght->num:-1); */
175
176 ns = tree->mod->ns;
177
178 probs = (phydbl *)mCalloc(ns,sizeof(phydbl));
179
180 if(a->tax == TRUE) // Sample state at tip if observed state is ambiguous
181 {
182 assert(b);
183 if(a == b->left) for(i=0;i<ns;++i) probs[i] = b->p_lk_tip_l[site*ns+i];
184 else for(i=0;i<ns;++i) probs[i] = b->p_lk_tip_r[site*ns+i];
185 for(i=0;i<ns;++i) probs[i] *= tree->mod->e_frq->pi->v[i];
186 sum = 0.0;
187 for(i=0;i<ns;++i) sum += probs[i];
188 for(i=0;i<ns;++i) probs[i] /= sum;
189 sa = Sample_i_With_Proba_pi(probs,ns);
190 }
191 else if(a == tree->n_root)
192 {
193 sa = Sample_Ancestral_Seq_Core(NULL,tree->n_root,NULL,r_cat,site,tree);
194 }
195 else
196 {
197 sa = Assign_State(a->c_seq_anc->state + site,
198 tree->mod->io->datatype,
199 tree->mod->io->state_len);
200 }
201
202 sd = Sample_Ancestral_Seq_Core(a,d,b,r_cat,site,tree);
203
204 if(fullmutmap == YES) Map_Mutations(a,d,sa,sd,b,site,r_cat,muttype,muttime,muttax,n_mut,tree);
205
206 if(d->tax) return;
207 else
208 {
209 for(i=0;i<3;++i)
210 {
211 if(d->v[i] != a && d->b[i] != tree->e_root)
212 {
213 Sample_Ancestral_Seq_Pre(d,d->v[i],d->b[i],site,r_cat,muttype,muttime,muttax,n_mut,fullmutmap,fromprior,tree);
214 }
215 }
216 }
217 }
218
219 //////////////////////////////////////////////////////////////
220 //////////////////////////////////////////////////////////////
221
Sample_Ancestral_Seq_Core(t_node * a,t_node * d,t_edge * b,int r_cat,int site,t_tree * tree)222 int Sample_Ancestral_Seq_Core(t_node *a, t_node *d, t_edge *b, int r_cat, int site, t_tree *tree)
223 {
224 int i,j;
225 int ns;
226
227 ns = tree->mod->ns;
228
229 int dim1 = tree->mod->ras->n_catg * ns;
230 int dim2 = ns;
231 int dim3 = ns * ns;
232
233 phydbl *probs,*Pij,sum;
234
235 int state;
236
237
238 probs = (phydbl *)mCalloc(ns,sizeof(phydbl));
239 state = -1;
240
241 if(d->tax == YES)
242 {
243 assert(b);
244 if(d == b->left) for(i=0;i<ns;++i) probs[i] = b->p_lk_tip_l[site*ns+i];
245 else if(d == b->rght) for(i=0;i<ns;++i) probs[i] = b->p_lk_tip_r[site*ns+i];
246 else assert(FALSE);
247 for(i=0;i<ns;++i) probs[i] *= tree->mod->e_frq->pi->v[i];
248 sum = 0.0;
249 for(i=0;i<ns;++i) sum += probs[i];
250 for(i=0;i<ns;++i) probs[i] /= sum;
251 state = Sample_i_With_Proba_pi(probs,ns);
252 }
253 else
254 {
255 if(a == NULL) // d is root node
256 {
257 assert(d == tree->n_root);
258
259 phydbl r,l;
260
261 Update_PMat_At_Given_Edge(tree->n_root->b[1],tree);
262 Update_PMat_At_Given_Edge(tree->n_root->b[2],tree);
263
264 for(i=0;i<ns;++i)
265 {
266 if(tree->e_root->left == tree->n_root->v[1])
267 Pij = tree->n_root->b[1]->Pij_rr;
268 else
269 Pij = tree->n_root->b[2]->Pij_rr;
270
271 l = 0.0;
272 for(j=0;j<ns;++j)
273 {
274 if(tree->e_root->left->tax == NO)
275 l += tree->e_root->p_lk_left[site*dim1+r_cat*dim2+j] * Pij[r_cat*dim3+i*dim2+j];
276 else
277 l += tree->e_root->p_lk_tip_l[site*dim2+j] * Pij[r_cat*dim3+i*dim2+j];
278 }
279
280
281 if(tree->e_root->rght == tree->n_root->v[1])
282 Pij = tree->n_root->b[1]->Pij_rr;
283 else
284 Pij = tree->n_root->b[2]->Pij_rr;
285
286 r = 0.0;
287 for(j=0;j<ns;++j)
288 {
289 if(tree->e_root->rght->tax == NO)
290 r += tree->e_root->p_lk_rght[site*dim1+r_cat*dim2+j] * Pij[r_cat*dim3+i*dim2+j];
291 else
292 r += tree->e_root->p_lk_tip_r[site*dim2+j] * Pij[r_cat*dim3+i*dim2+j];
293 }
294
295 probs[i] = r*l*tree->mod->e_frq->pi->v[i];
296 }
297
298 sum = 0.0;
299 for(i=0;i<ns;++i) sum += probs[i];
300 for(i=0;i<ns;++i) probs[i] /= sum;
301
302 state = Sample_i_With_Proba_pi(probs,ns);
303
304 d->c_seq_anc->state[site] = Reciproc_Assign_State(state,tree->io->datatype);
305 }
306 else
307 {
308 assert(b);
309
310 // State (already) sampled at node a
311 state = Assign_State(a->c_seq_anc->state + site,
312 tree->mod->io->datatype,
313 tree->mod->io->state_len);
314
315 Pij = b->Pij_rr;
316
317 for(i=0;i<ns;++i)
318 {
319 if(d == b->left)
320 probs[i] = b->p_lk_left[site*dim1+r_cat*dim2+i] * Pij[r_cat*dim3+state*dim2+i];
321 else if(d == b->rght)
322 probs[i] = b->p_lk_rght[site*dim1+r_cat*dim2+i] * Pij[r_cat*dim3+state*dim2+i];
323 else assert(FALSE);
324 }
325
326 sum = 0.0;
327 for(i=0;i<ns;++i) sum += probs[i];
328 for(i=0;i<ns;++i) probs[i] /= sum;
329
330 state = Sample_i_With_Proba_pi(probs,ns);
331
332 d->c_seq_anc->state[site] = Reciproc_Assign_State(state,tree->io->datatype);
333 }
334 }
335
336 Free(probs);
337 return state;
338
339 }
340
341
342 //////////////////////////////////////////////////////////////
343 //////////////////////////////////////////////////////////////
344
Map_Mutations(t_node * a,t_node * d,int sa,int sd,t_edge * b,int site,int rcat,int * muttype,phydbl * muttime,int * muttax,int * n_mut,t_tree * tree)345 void Map_Mutations(t_node *a, t_node *d, int sa, int sd, t_edge *b, int site, int rcat, int *muttype, phydbl *muttime, int *muttax, int *n_mut, t_tree *tree)
346 {
347 int i,j;
348 phydbl *probs,*all_probs;
349 int slast,snew; // Last state visited
350 phydbl tlast;
351 phydbl *Q;
352 phydbl u,sum;
353 int *mut; // Array of mutations
354 int thismut;
355 int n_mut_branch;
356 int n_iter;
357 int first_mut;
358 phydbl T;
359 int ns,tax_idx;
360 phydbl rr;
361
362 ns = tree->mod->ns;
363
364 all_probs = (phydbl *)mCalloc(ns*ns,sizeof(phydbl));
365 mut = (int *)mCalloc(ns*ns,sizeof(int));
366
367 // Site relative rate
368 rr = tree->mod->ras->gamma_rr->v[rcat];
369
370 // Rate matrix
371 Q = tree->mod->r_mat->qmat->v;
372
373 // Length of the 'time' interval considered.
374 T = 0.0;
375 #ifdef PHYTIME
376 T = tree->rates->cur_l[d->num]*rr;
377 #else
378 T = b->l->v*rr;
379 #endif
380
381
382 /* PhyML_Printf("\n. Mutmap: a:%d d:%d ta:%G td:%G cr:%G rr:%G l:%G", */
383 /* a?a->num:-1,d?d->num:-1, */
384 /* tree->times->nd_t[a->num], */
385 /* tree->times->nd_t[d->num], */
386 /* cr,rr, */
387 /* fabs(tree->times->nd_t[a->num]-tree->times->nd_t[d->num])*cr*rr); */
388
389 // Matrix of change probabilities
390 for(i=0;i<ns;i++)
391 {
392 // We only care about the non-diagonal elements here
393 for(j=0;j<ns;j++) all_probs[i*ns+j] = Q[i*ns+j];
394
395 // Set the diagonal to 0 so that p(i->i)=0.0;
396 all_probs[i*ns+i] = 0.0;
397
398 // Normalise so that \sum_j p(i->j) = 1.0;
399 sum = 0;
400 for(j=0;j<ns;j++) sum += all_probs[i*ns+j];
401 for(j=0;j<ns;j++) all_probs[i*ns+j] /= sum;
402 }
403
404 for(i=0;i<ns*ns;++i) mut[i] = 0;
405 tlast = .0;
406 slast = sa;
407 snew = sa;
408 probs = NULL;
409 n_mut_branch = 0;
410 n_iter = 0;
411 first_mut = YES;
412
413 do
414 {
415 if((sa != sd) && (first_mut == YES)) // ancestral and descendant states are distinct
416 {
417 // Sample a time for the first mutation conditional on at least one mutation
418 // occurring (see formula 2.1 in Hobolth and Stone, 2009).
419 u = Uni();
420 tlast = -log(1. - u*(1.-exp(Q[sa*ns+sa]*T)))/-Q[sa*ns+sa];
421 }
422 else
423 {
424 // Sample a time for the next mutation
425 tlast = tlast + Rexp(-Q[slast*ns+slast]);
426 }
427
428 // Select the appropriate vector of change probabilities
429 probs = all_probs+slast*ns;
430
431
432 /* printf("\n. sa=%2d sd=%2d slast=%2d tlast=%12G T=%12G rcat=%2d site=%4d",sa,sd,slast,tlast,T,rcat,site); */
433
434 // The time for the next mutation does not exceed the length
435 // of the time interval -> sample a new mutation event
436 if(tlast < T)
437 {
438 first_mut = NO;
439
440 n_mut_branch++;
441
442 if(n_mut_branch > 5 && n_mut_branch%10 == 0)
443 PhyML_Printf("\n. # of mutations on edge %d (length: %g) exceeds 5 (%d) ! The program will probably crash soon...:(",
444 b->num,
445 tree->rates ? tree->rates->cur_l[d->num] : b->l->v,
446 n_mut_branch);
447
448 snew = Sample_i_With_Proba_pi(probs,ns);
449
450 // Record mutation type
451 mut[slast*ns+snew]++;
452
453 // Record mutation type in the site mutation array
454 thismut = slast*ns+snew;
455
456 muttype[(*n_mut)+n_mut_branch-1] = thismut;
457
458 // Record time of mutation
459 muttime[(*n_mut)+n_mut_branch-1] = tlast;
460
461 #ifdef PHYTIME
462 // Transform into time in calendar units
463 muttime[(*n_mut)+n_mut_branch-1] /= tree->rates->cur_l[d->num];
464 muttime[(*n_mut)+n_mut_branch-1] *= fabs(tree->times->nd_t[a->num]-tree->times->nd_t[d->num]);
465 #endif
466
467 tax_idx = -1;
468 Random_Tax_Idx(a,d,&tax_idx,tree);
469 muttax[(*n_mut)+n_mut_branch-1] = tax_idx;
470
471 // Update the last state
472 slast = snew;
473 }
474 else
475 {
476 if(slast == sd) break;
477 else
478 {
479 // Restart from the beginning
480 for(i=0;i<ns*ns;++i) mut[i] = 0;
481 for(i=0;i<n_mut_branch;i++) muttype[(*n_mut)+n_mut_branch-1] = -2;
482 for(i=0;i<n_mut_branch;i++) muttime[(*n_mut)+n_mut_branch-1] = +1.;
483 tlast = 0.0;
484 slast = sa;
485 snew = sa;
486 n_mut_branch = 0;
487 first_mut = YES;
488 n_iter++;
489 }
490 }
491 }
492 while(++n_iter < 10000);
493
494
495 if(n_iter == 10000)
496 {
497 PhyML_Printf("\n. sa=%2d sd=%2d slast=%2d tlast=%12G T=%12G rcat=%12f site=%4d",sa,sd,slast,tlast,T,rcat,site);
498 assert(FALSE);
499 }
500
501 (*n_mut) += n_mut_branch;
502
503
504 /* for(i=0;i<ns;i++) */
505 /* { */
506 /* for(j=i+1;j<ns;j++) */
507 /* { */
508 /* if(mut[i*ns+j] + mut[j*ns+i] > 0) */
509 /* { */
510 /* thismut = MIN(i,j) * ns + MAX(i,j) - (MIN(i,j)+1+(int)POW(MIN(i,j)+1,2))/2; */
511
512 /* if(tree->mixt_tree != NULL) */
513 /* tree->mixt_tree->mutmap[thismut*(tree->n_pattern)*(2*tree->n_otu-3) + b->num*(tree->n_pattern) + site]++; */
514 /* else */
515 /* tree->mutmap[thismut*(tree->n_pattern)*(2*tree->n_otu-3) + b->num*(tree->n_pattern) + site]++; */
516 /* } */
517 /* } */
518 /* } */
519
520 Free(all_probs);
521 Free(mut);
522 }
523
524 /*////////////////////////////////////////////////////////////
525 ////////////////////////////////////////////////////////////*/
526
Ancestral_Sequences(t_tree * tree,int print)527 void Ancestral_Sequences(t_tree *tree, int print)
528 {
529 int i;
530
531 if(print == YES)
532 {
533
534 PhyML_Printf("\n\n. Estimating ancestral sequences...");
535
536 strcpy(tree->io->out_ancestral_seq_file,tree->io->out_file);
537 strcat(tree->io->out_ancestral_seq_file,"_phyml_ancestral_");
538 if(tree->io->append_run_ID)
539 {
540 strcat(tree->io->out_ancestral_seq_file,tree->io->run_id_string);
541 strcat(tree->io->out_ancestral_seq_file,"_");
542 }
543 strcat(tree->io->out_ancestral_seq_file,"seq.txt");
544 tree->io->fp_out_ancestral_seq = Openfile(tree->io->out_ancestral_seq_file,1);
545
546
547 strcpy(tree->io->out_ancestral_tree_file,tree->io->out_file);
548 strcat(tree->io->out_ancestral_tree_file,"_phyml_ancestral_");
549 if(tree->io->append_run_ID)
550 {
551 strcat(tree->io->out_ancestral_tree_file,tree->io->run_id_string);
552 strcat(tree->io->out_ancestral_tree_file,"_");
553 }
554 strcat(tree->io->out_ancestral_tree_file,"tree.txt");
555 tree->io->fp_out_ancestral_tree = Openfile(tree->io->out_ancestral_tree_file,1);
556
557
558 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n\n\n");
559 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Printing marginal probabilities of ancestral sequences at each site");
560 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. of the alignment and each node of the tree. The tree in Newick format");
561 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. with internal nodes labels corresponding to those given below can be");
562 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. found in the file '%s'.",tree->io->out_ancestral_tree_file);
563 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
564 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
565 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Ancestral reconstruction is conducted based on the \"Minimum Posterior Expected");
566 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Error\" (MPEE) criterion, which accomodates for uncertainty in the selection of ");
567 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. ancestral character states.");
568 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
569 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
570 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Recommended citation:");
571 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Oliva A., Pulicani S., Lefort V., Brehelin L., Gascuel O. and S. Guindon,");
572 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. \"Accounting for ambiguity in ancestral sequence reconstruction\",");
573 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n. Bioinformatics, Volume 35, Issue 21, 1 November 2019.");
574 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
575
576 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n\n");
577 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"Site\tNodeLabel\t");
578 for(i=0;i<tree->mod->ns;i++) PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"%10c\t",Reciproc_Assign_State(i,tree->io->datatype));
579 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"MPEE\t");
580 PhyML_Fprintf(tree->io->fp_out_ancestral_seq,"\n");
581
582
583 short int bck_support = tree->io->print_support_val;
584
585 tree->io->print_node_num = YES;
586 tree->io->print_support_val = NO;
587 char *s = Write_Tree(tree);
588 PhyML_Fprintf(tree->io->fp_out_ancestral_tree,"%s",s);
589 tree->io->print_node_num = NO;
590 tree->io->print_support_val = bck_support;
591
592 Free(s);
593 fclose(tree->io->fp_out_ancestral_tree);
594 }
595
596 for(i=0;i<2*tree->n_otu-2;i++)
597 if(tree->a_nodes[i]->tax == NO)
598 Ancestral_Sequences_One_Node(tree->a_nodes[i],tree,print);
599
600 if(tree->n_root) Ancestral_Sequences_One_Node(tree->n_root,tree,print);
601
602
603 fclose(tree->io->fp_out_ancestral_seq);
604 }
605
606 //////////////////////////////////////////////////////////////
607 //////////////////////////////////////////////////////////////
608
Ancestral_Sequences_One_Node(t_node * d,t_tree * tree,int print)609 void Ancestral_Sequences_One_Node(t_node *d, t_tree *tree, int print)
610 {
611 if(d->tax) return;
612 else
613 {
614 if(tree->is_mixt_tree)
615 {
616 MIXT_Ancestral_Sequences_One_Node(d,tree,print);
617 }
618 else
619 {
620 t_node *v0,*v1,*v2; // three neighbours of d
621 t_edge *b0,*b1,*b2;
622 int i,j;
623 int catg;
624 phydbl p0, p1, p2;
625 phydbl *p;
626 int site,csite;
627 phydbl *p_lk0, *p_lk1, *p_lk2;
628 int *sum_scale0, *sum_scale1, *sum_scale2;
629 phydbl sum_probas;
630 phydbl *Pij0, *Pij1, *Pij2;
631 phydbl inc,sum_scale;
632 FILE *fp;
633
634 unsigned const int ncatg = tree->mod->ras->n_catg;
635 unsigned const int ns = tree->mod->ns;
636 unsigned const int nsns = ns*ns;
637 unsigned const int ncatgns = ns*ncatg;
638
639
640 if(tree->scaling_method == SCALE_RATE_SPECIFIC)
641 {
642 PhyML_Fprintf(stderr,"\n. Likelihood rescaling method not compatible with the calculation of ancestral state probabilities.");
643 Exit("\n");
644 }
645
646
647 if(!d) return;
648
649 fp = tree->io->fp_out_ancestral_seq;
650 assert(fp != NULL);
651
652
653 p = (phydbl *)mCalloc(ns,sizeof(phydbl));
654
655 for(site=0;site<tree->data->init_len;site++) // For each site in the current partition element
656 {
657 csite = tree->data->sitepatt[site];
658
659 for(i=0;i<ns;i++) p[i] = .0;
660
661 v0 = d->v[0];
662 v1 = d->v[1];
663 v2 = d->v[2];
664
665 b0 = d->b[0];
666 b1 = d->b[1];
667 b2 = d->b[2];
668
669 Pij0 = b0->Pij_rr;
670 Pij1 = b1->Pij_rr;
671 Pij2 = b2->Pij_rr;
672
673 sum_scale = 0.0;
674
675 if(v0 == b0->left)
676 {
677 p_lk0 = b0->p_lk_left;
678 sum_scale0 = b0->sum_scale_left;
679 }
680 else
681 {
682 p_lk0 = b0->p_lk_rght;
683 sum_scale0 = b0->sum_scale_rght;
684 }
685
686 if(v1 == b1->left)
687 {
688 p_lk1 = b1->p_lk_left;
689 sum_scale1 = b1->sum_scale_left;
690 }
691 else
692 {
693 p_lk1 = b1->p_lk_rght;
694 sum_scale1 = b1->sum_scale_rght;
695 }
696
697 if(v2 == b2->left)
698 {
699 p_lk2 = b2->p_lk_left;
700 sum_scale2 = b2->sum_scale_left;
701 }
702 else
703 {
704 p_lk2 = b2->p_lk_rght;
705 sum_scale2 = b2->sum_scale_rght;
706 }
707
708
709 for(catg=0;catg<ncatg;++catg)
710 {
711 for(i=0;i<ns;++i)
712 {
713 p0 = .0;
714 if(v0->tax)
715 {
716 for(j=0;j<ns;++j)
717 {
718 p0 += v0->b[0]->p_lk_tip_r[csite*ns+j] * Pij0[catg*nsns+i*ns+j];
719
720 if(isinf(p0) || isnan(p0))
721 {
722 PhyML_Fprintf(stderr,"\n. p0: %G v0->b[0]->p_lk_tip_r[csite*ns+j]: %G Pij0[catg*nsns+i*ns+j]: %G\n",
723 p0,
724 v0->b[0]->p_lk_tip_r[csite*ns+j],
725 Pij0[catg*nsns+i*ns+j]);
726 Exit("\n");
727 }
728 }
729 }
730 else
731 {
732 for(j=0;j<ns;j++)
733 {
734 p0 += p_lk0[csite*ncatgns+catg*ns+j] * Pij0[catg*nsns+i*ns+j];
735
736 if(isinf(p0) || isnan(p0))
737 {
738 PhyML_Fprintf(stderr,"\n. p0: %G p_lk0[csite*ncatgns+catg*ns+j]: %G Pij0[catg*nsns+i*ns+j]: %G (phydbl)POW(2,sum_scale0[csite*ncatg+catg]): %G sum_scale0: %d\n",
739 p0,
740 p_lk0[csite*ncatgns+catg*ns+j],
741 Pij0[catg*nsns+i*ns+j],
742 (phydbl)POW(2,sum_scale0[csite]),
743 sum_scale0[csite]);
744 Exit("\n");
745 }
746 }
747 if(catg == 0 && i == 0) sum_scale += sum_scale0[csite];
748 }
749
750 p1 = .0;
751 if(v1->tax)
752 {
753 for(j=0;j<ns;j++)
754 {
755 p1 += v1->b[0]->p_lk_tip_r[csite*ns+j] * Pij1[catg*nsns+i*ns+j];
756
757 if(isinf(p1) || isnan(p1))
758 {
759 PhyML_Fprintf(stderr,"\n. p1: %G v1->b[0]->p_lk_tip_r[csite*ns+j]: %G Pij1[catg*nsns+i*ns+j]: %G\n",
760 p1,
761 v1->b[0]->p_lk_tip_r[csite*ns+j],
762 Pij1[catg*nsns+i*ns+j]);
763 Exit("\n");
764 }
765 }
766 }
767 else
768 {
769 for(j=0;j<ns;j++)
770 {
771 p1 += p_lk1[csite*ncatgns+catg*ns+j] * Pij1[catg*nsns+i*ns+j];
772
773 if(isinf(p1) || isnan(p1))
774 {
775 PhyML_Fprintf(stderr,"\n. p1: %G p_lk1[csite*ncatgns+catg*ns+j]: %G Pij1[catg*nsns+i*ns+j]: %G (phydbl)POW(2,sum_scale1[csite*ncatg+catg]): %G\n",
776 p1,
777 p_lk1[csite*ncatgns+catg*ns+j],
778 Pij1[catg*nsns+i*ns+j],
779 (phydbl)POW(2,sum_scale1[csite]));
780 Exit("\n");
781 }
782 }
783 if(catg == 0 && i == 0) sum_scale += sum_scale1[csite];
784 }
785
786 p2 = .0;
787 if(v2->tax)
788 {
789 for(j=0;j<ns;j++)
790 {
791 p2 += v2->b[0]->p_lk_tip_r[csite*ns+j] * Pij2[catg*nsns+i*ns+j];
792 }
793 }
794 else
795 {
796 for(j=0;j<ns;j++)
797 {
798
799 p2 += p_lk2[csite*ncatgns+catg*ns+j] * Pij2[catg*nsns+i*ns+j];
800
801 if(isinf(p2) || isnan(p2))
802 {
803 PhyML_Fprintf(stderr,"\n. p2: %G p_lk2[csite*ncatgns+catg*ns+j]: %G Pij2[catg*nsns+i*ns+j]: %G (phydbl)POW(2,sum_scale2[csite]): %G\n",
804 p2,
805 p_lk2[csite*ncatgns+catg*ns+j],
806 Pij2[catg*nsns+i*ns+j],
807 (phydbl)POW(2,sum_scale2[csite]));
808 Exit("\n");
809 }
810 }
811 if(catg == 0 && i == 0) sum_scale += sum_scale2[csite];
812 }
813
814 inc =
815 p0*p1*p2*
816 tree->mod->e_frq->pi->v[i] *
817 tree->mod->ras->gamma_r_proba->v[catg];
818
819 p[i] += inc;
820
821
822 if(isinf(p[i]) || isnan(p[i]))
823 {
824 PhyML_Fprintf(stderr,"\n. site: %4d p0: %G p1: %G p2: %G tree->mod->e_frq->pi->v[i]: %G tree->cur_site_lk[csite]: %G tree->mod->ras->gamma_r_proba->v[catg]: %G tree->c_lnL_sorted[csite]: %G",
825 csite,
826 p0,p1,p2,
827 tree->mod->e_frq->pi->v[i] ,
828 tree->cur_site_lk[csite] ,
829 tree->mod->ras->gamma_r_proba->v[catg],
830 tree->c_lnL_sorted[csite]);
831 Exit("\n");
832 }
833 }
834
835 /* printf("\n. site: %d || %d %d %d", */
836 /* csite, */
837 /* v0->tax ? -1 : sum_scale0[csite*ncatg+catg], */
838 /* v1->tax ? -1 : sum_scale1[csite*ncatg+catg], */
839 /* v2->tax ? -1 : sum_scale2[csite*ncatg+catg]); */
840
841 }
842
843 if(tree->mod->ras->invar == YES)
844 {
845 int num_prec_issue = NO;
846 phydbl inv_site_lk = Invariant_Lk(sum_scale,csite,&num_prec_issue,tree);
847
848
849 switch(num_prec_issue)
850 {
851 case YES :
852 {
853 assert(isinf(inv_site_lk));
854 inv_site_lk = Invariant_Lk(0,csite,&num_prec_issue,tree);
855 for(i=0;i<ns;i++) p[i] = inv_site_lk * tree->mod->ras->pinvar->v * tree->mod->e_frq->pi->v[i];
856 break;
857 }
858 case NO :
859 {
860 for(i=0;i<ns;i++) p[i] = p[i] * (1.-tree->mod->ras->pinvar->v) + inv_site_lk * tree->mod->ras->pinvar->v * tree->mod->e_frq->pi->v[i];
861 break;
862 }
863 }
864
865 }
866
867
868 for(i=0;i<ns;i++) p[i] = log(p[i]) - (phydbl)LOG2 * sum_scale;
869 for(i=0;i<ns;i++) p[i] -= tree->c_lnL_sorted[csite];
870 for(i=0;i<ns;i++) p[i] = exp(p[i]);
871
872
873
874 /* sum_probas = 0.0; */
875 /* for(i=0;i<ns;i++) sum_probas += p[i]; */
876 /* for(i=0;i<ns;i++) p[i]/=sum_probas; */
877
878 sum_probas = 0.0;
879 for(i=0;i<ns;i++) sum_probas += p[i];
880 if(Are_Equal(sum_probas,1.0,0.01) == NO)
881 {
882 PhyML_Fprintf(stderr,"\n. Probabilities do not sum to 1.0! Aborting.");
883 for(i=0;i<ns;++i) PhyML_Fprintf(stderr,"\n. p[%2d]=%G",i,p[i]);
884 Exit("\n");
885 }
886
887 if(print == YES)
888 {
889 PhyML_Fprintf(fp,"%4d\t%9d\t",site+1,d->num);
890 for(i=0;i<ns;i++) PhyML_Fprintf(fp,"%10g\t",p[i]);
891 PhyML_Fprintf(fp,"%s",Bit_To_Character_String(Integer_To_Bit(MPEE_Infer(p,ns),ns),ns));
892 PhyML_Fprintf(fp,"\n");
893 }
894
895
896
897 }
898 Free(p);
899 }
900 }
901 }
902
903 /*////////////////////////////////////////////////////////////
904 ////////////////////////////////////////////////////////////*/
905
MPEE_Infer(const phydbl * p,const int ns)906 int MPEE_Infer(const phydbl *p, const int ns)
907 {
908 unsigned int i,j,sorted;
909 phydbl *alpha;
910 const unsigned mesh = 50;
911 int *best_state,*idx,*count,highest_count,highest_count_idx,most_frequent_state;
912
913 idx = (int *)mCalloc(ns,sizeof(int));
914
915 alpha = (phydbl *)mCalloc(mesh,sizeof(phydbl));
916 best_state = (int *)mCalloc(mesh+1,sizeof(int));
917 count = (int *)mCalloc(mesh+1,sizeof(int));
918
919
920 // idx[0] gives the index of the largest value in p.
921 // idx[1] gives the index of the second largest value in p
922 // etc.
923 for(i=0;i<ns;++i) idx[i] = i;
924 do
925 {
926 sorted = YES;
927 for(i=0;i<ns-1;++i)
928 {
929 if(p[idx[i]] < p[idx[i+1]])
930 {
931 sorted = NO;
932
933 j = idx[i];
934 idx[i] = idx[i+1];
935 idx[i+1] = j;
936 }
937 }
938 }
939 while(sorted == NO);
940
941 /* for(i=0;i<ns;++i) */
942 /* { */
943 /* PhyML_Printf("\n. p[%d]=%g",i,p[i]); */
944 /* } */
945
946 /* for(i=0;i<ns;++i) */
947 /* { */
948 /* PhyML_Printf("\n. idx[%d]=%d",i,idx[i]); */
949 /* } */
950
951 for(i=0;i<mesh+1;++i)
952 {
953 for(j=0;j<ns;++j) alpha[j] = i*((phydbl)j/(j+1))/mesh;
954 best_state[i] = MPEE_Score(alpha,idx,p,ns);
955 }
956
957
958 /* for(i=0;i<mesh;++i) */
959 /* { */
960 /* PhyML_Printf("\n. best_state[%d]=%d",i,best_state[i]); */
961 /* } */
962
963 for(i=0;i<mesh+1;++i)
964 {
965 count[i] = 0;
966 for(j=0;j<mesh+1;++j)
967 {
968 if(best_state[i] == best_state[j]) count[i]++;
969 }
970 }
971
972 highest_count = 0;
973 highest_count_idx = -1;
974 for(i=0;i<mesh+1;++i)
975 {
976 if(count[i] > highest_count)
977 {
978 highest_count = count[i];
979 highest_count_idx = i;
980 }
981 }
982
983 most_frequent_state = best_state[highest_count_idx];
984
985 Free(alpha);
986 Free(best_state);
987 Free(idx);
988 Free(count);
989
990 return(most_frequent_state);
991 }
992
993 /*////////////////////////////////////////////////////////////
994 ////////////////////////////////////////////////////////////*/
995
MPEE_Score(const phydbl * alpha,int * idx,const phydbl * p,const int ns)996 int MPEE_Score(const phydbl *alpha, int *idx, const phydbl *p, const int ns)
997 {
998 unsigned int i;
999 phydbl *mpee_score,*cdf_sorted;
1000 int min_idx,res;
1001 phydbl a,b;
1002 phydbl min;
1003
1004 mpee_score = (phydbl *)mCalloc(ns,sizeof(phydbl));
1005 cdf_sorted = (phydbl *)mCalloc(ns,sizeof(phydbl));
1006
1007
1008 cdf_sorted[0] = p[idx[0]];
1009 for(i=1;i<ns;++i) cdf_sorted[i] = cdf_sorted[i-1] + p[idx[i]];
1010
1011 for(i=0;i<ns-1;++i) // for the different levels of ambiguity
1012 {
1013 a = alpha[i];
1014 b = (phydbl)(ns-1.-a*(i+1))/(ns-i-1);
1015
1016 mpee_score[i] = a + (b-a)*(1-cdf_sorted[i]);
1017 }
1018 mpee_score[ns-1] = (phydbl)(ns-1.)/ns;
1019
1020 min = mpee_score[0];
1021 min_idx = 0;
1022 for(i=1;i<ns;++i)
1023 if(mpee_score[i] < min)
1024 {
1025 min = mpee_score[i];
1026 min_idx = i;
1027 }
1028
1029 res = 0;
1030 for(i=0;i<min_idx+1;++i)
1031 {
1032 res += (int)pow(2,ns-1-idx[i]);
1033 }
1034 /* If best score ambiguity level is say 2 and A and G have highest
1035 posterior probs, then res = 2^(4-1-0) + 2^(4-1-2) = 10, which is
1036 bit representation is [1,0,1,0]
1037 */
1038
1039 Free(mpee_score);
1040 Free(cdf_sorted);
1041
1042 return(res);
1043 }
1044
1045 /*////////////////////////////////////////////////////////////
1046 ////////////////////////////////////////////////////////////*/
1047
Reverse_Muttime(phydbl * muttime,int n_mut,t_tree * tree)1048 void Reverse_Muttime(phydbl *muttime, int n_mut, t_tree *tree)
1049 {
1050 phydbl h;
1051 int i;
1052 h = Tree_Height(tree);
1053 for(i=0;i<n_mut;++i) muttime[i] = h - muttime[i];
1054
1055 }
1056
1057
1058 /*////////////////////////////////////////////////////////////
1059 ////////////////////////////////////////////////////////////*/
1060 /*////////////////////////////////////////////////////////////
1061 ////////////////////////////////////////////////////////////*/
1062 /*////////////////////////////////////////////////////////////
1063 ////////////////////////////////////////////////////////////*/
1064 /*////////////////////////////////////////////////////////////
1065 ////////////////////////////////////////////////////////////*/
1066 /*////////////////////////////////////////////////////////////
1067 ////////////////////////////////////////////////////////////*/
1068 /*////////////////////////////////////////////////////////////
1069 ////////////////////////////////////////////////////////////*/
1070 /*////////////////////////////////////////////////////////////
1071 ////////////////////////////////////////////////////////////*/
1072