1 /* evolver.c
2 Copyright, Ziheng Yang, April 1995.
3
4 cl -Ox evolver.c tools.c
5 cl -Ox -DCodonNSbranches -FeevolverNSbranches.exe evolver.c tools.c
6 cl -Ox -DCodonNSsites -FeevolverNSsites.exe evolver.c tools.c
7 cl -Ox -DCodonNSbranchsites -FeevolverNSbranchsites.exe evolver.c tools.c
8
9 cc -O3 -o evolver evolver.c tools.c -lm
10 cc -O4 -DCodonNSbranches -o evolverNSbranches evolver.c tools.c -lm
11 cc -O4 -DCodonNSsites -o evolverNSsites evolver.c tools.c -lm
12 cc -O4 -DCodonNSbranchsites -o evolverNSbranchsites evolver.c tools.c -lm
13
14 evolver
15 evolver 5 MCbase.dat
16 evolver 6 MCcodon.dat
17 evolver 7 MCaa.dat
18 evolver 9 <TreesFile> <MasterTreeFile>
19 */
20
21 /*
22 #define CodonNSbranches
23 #define CodonNSsites
24 #define CodonNSbranchsites
25 */
26
27 #include "paml.h"
28
29 #define NS 5000
30 #define NBRANCH (NS*2-2)
31 #define MAXNSONS 20
32 #define LSPNAME 96
33 #define NCODE 64
34 #define NCATG 40
35
36
37 struct CommonInfo {
38 unsigned char *z[2 * NS - 1];
39 unsigned char *spname[NS], daafile[512], cleandata, readpattern;
40 int ns, ls, npatt, np, ntime, ncode, clock, rooted, model, icode;
41 int seqtype, *pose, ncatG, NSsites;
42 int ngene, lgene[1], posG[1 + 1]; /* not used */
43 double piG[1][4], rgene[1]; /* not used */
44 double *fpatt, kappa, omega, alpha, pi[64], *conP, daa[20 * 20];
45 double freqK[NCATG], rK[NCATG];
46 char *siteID; /* used if ncatG>1 */
47 double *siterates; /* rates for gamma or omega for site or branch-site models */
48 double *omegaBS, *QfactorBS; /* omega IDs for branch-site models */
49 } com;
50 struct TREEB {
51 int nbranch, nnode, root, branches[NBRANCH][2];
52 } tree;
53 struct TREEN {
54 int father, nson, sons[MAXNSONS], ibranch;
55 double branch, age, omega, label, label2, *conP;
56 char *annotation, fossil;
57 } *nodes;
58
59 extern char BASEs[];
60 extern int GeneticCode[][64], noisy;
61 int LASTROUND = 0; /* not used */
62
63 #define EVOLVER
64 #define NODESTRUCTURE
65 #define BIRTHDEATH
66 #include "treesub.c"
67 #include "treespace.c"
68
69 void TreeDistances(FILE* fout);
70 void Simulate(char *ctlf);
71 void MakeSeq(char *z, int ls);
72 int EigenQbase(double rates[], double pi[], double Root[], double U[], double V[], double Q[]);
73 int EigenQcodon(int getstats, double kappa, double omega, double pi[], double Root[], double U[], double V[], double Q[]);
74 int EigenQaa(double pi[], double Root[], double U[], double V[], double Q[]);
75 void CladeMrBayesProbabilities(char treefile[]);
76 int between_f_and_x(void);
77 void LabelClades(FILE *fout);
78
79 char *MCctlf0[] = { "MCbase.dat","MCcodon.dat","MCaa.dat" };
80 char *seqf[] = { "mc.paml", "mc.paml", "mc.nex", "mc.nex" };
81
82 enum { JC69, K80, F81, F84, HKY85, T92, TN93, REV } BaseModels;
83 char *basemodels[] = { "JC69","K80","F81","F84","HKY85","T92","TN93","REV" };
84 enum { Poisson, EqualInput, Empirical, Empirical_F } AAModels;
85 char *aamodels[] = { "Poisson", "EqualInput", "Empirical", "Empirical_F" };
86
87
88 double PMat[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], Root[NCODE];
89 static double Qfactor = -1, Qrates[5]; /* Qrates[] hold kappa's for nucleotides */
90
91
main(int argc,char * argv[])92 int main(int argc, char *argv[])
93 {
94 char *MCctlf = NULL, outf[512] = "evolver.out", treefile[512] = "mcmc.txt", mastertreefile[512] = "\0";
95 int i, option = -1, ntree = 1, rooted, BD = 0, gotoption = 0, pick1tree = 0;
96 double bfactor = 1, birth = -1, death = -1, sample = -1, mut = -1, *space;
97 FILE *fout = gfopen(outf, "w");
98
99 printf("EVOLVER in %s\n", pamlVerStr);
100 com.alpha = 0; com.cleandata = 1; com.model = 0; com.NSsites = 0;
101
102 if (argc > 2 && !strcmp(argv[argc - 1], "--stdout-no-buf"))
103 setvbuf(stdout, NULL, _IONBF, 0);
104 if (argc > 1) {
105 gotoption = 1; sscanf(argv[1], "%d", &option);
106 }
107 if (argc == 1)
108 printf("Results for options 1-4 & 8 go into %s\n", outf);
109 else if (option != 5 && option != 6 && option != 7 && option != 9) {
110 puts("Usage: \n\tevolver \n\tevolver option datafile"); exit(-1);
111 }
112 if (option >= 4 && option <= 6)
113 MCctlf = argv[2];
114 else if (option == 9) {
115 strcpy(treefile, argv[2]);
116 if (argc > 3) strcpy(mastertreefile, argv[3]);
117 if (argc > 4) sscanf(argv[4], "%d", &pick1tree);
118 }
119
120 for (i = 0; i < NS; i++) {
121 com.spname[i] = (char*)malloc(LSPNAME * sizeof(char*));
122 if (com.spname[i] == NULL) error2("oom");
123 }
124
125 #if defined (CodonNSbranches)
126 option = 6; com.model = 1;
127 MCctlf = (argc == 3 ? argv[2] : "MCcodonNSbranches.dat");
128 gotoption = 1;
129 #elif defined (CodonNSsites)
130 option = 6; com.NSsites = 3;
131 MCctlf = (argc == 3 ? argv[2] : "MCcodonNSsites.dat");
132 gotoption = 1;
133 #elif defined (CodonNSbranchsites)
134 option = 6; com.model = 1; com.NSsites = 3;
135 MCctlf = (argc == 3 ? argv[2] : "MCcodonNSbranchsites.dat");
136 gotoption = 1;
137 #endif
138
139 if (!gotoption) {
140 for (; ;) {
141 fflush(fout);
142 printf("\n\t(1) Get random UNROOTED trees?\n");
143 printf("\t(2) Get random ROOTED trees?\n");
144 printf("\t(3) List all UNROOTED trees?\n");
145 printf("\t(4) List all ROOTED trees?\n");
146 printf("\t(5) Simulate nucleotide data sets (use %s)?\n", MCctlf0[0]);
147 printf("\t(6) Simulate codon data sets (use %s)?\n", MCctlf0[1]);
148 printf("\t(7) Simulate amino acid data sets (use %s)?\n", MCctlf0[2]);
149 printf("\t(8) Calculate identical bi-partitions between trees?\n");
150 printf("\t(9) Calculate clade support values (evolver 9 treefile mastertreefile <pick1tree>)?\n");
151 printf("\t(11) Label clades?\n");
152 printf("\t(0) Quit?\n");
153
154 option = 5;
155 scanf("%d", &option);
156
157 if (option == 0) exit(0);
158 if (option >= 5 && option <= 7) break;
159 if (option < 5) {
160 printf("No. of species: ");
161 scanf("%d", &com.ns);
162 }
163 if (com.ns > NS) error2("Too many species. Raise NS.");
164 if ((space = (double*)malloc(10000 * sizeof(double))) == NULL) error2("oom");
165 rooted = !(option % 2);
166 if (option < 3) {
167 printf("\nnumber of trees & random number seed? ");
168 scanf("%d%d", &ntree, &i);
169 SetSeed(i, 1);
170 printf("Want branch lengths from the birth-death process (0/1)? ");
171 scanf("%d", &BD);
172 }
173 if (option <= 4) {
174 if (com.ns < 3) error2("no need to do this?");
175 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
176 if ((nodes = (struct TREEN*)malloc(i)) == NULL)
177 error2("oom");
178 }
179 switch (option) {
180 case(1): /* random UNROOTED trees */
181 case(2): /* random ROOTED trees */
182 /* default names */
183 if (com.ns <= 52)
184 for (i = 0; i < com.ns; i++) sprintf(com.spname[i], "%c", (i < 26 ? 'A' + i : 'a' + i - 26));
185 else
186 for (i = 0; i < com.ns; i++) sprintf(com.spname[i], "S%d", i + 1);
187
188 if (BD) {
189 printf("\nbirth rate, death rate, sampling fraction, and ");
190 printf("mutation rate (tree height)?\n");
191 scanf("%lf%lf%lf%lf", &birth, &death, &sample, &mut);
192 }
193 for (i = 0; i < ntree; i++) {
194 RandomLHistory(rooted, space);
195 if (BD)
196 BranchLengthBD(1, birth, death, sample, mut);
197 if (com.ns < 20 && ntree < 10) { OutTreeN(F0, 0, BD); puts("\n"); }
198 OutTreeN(fout, 1, BD); FPN(fout);
199 }
200 /*
201 for (i=0; i<com.ns-2-!rooted; i++)
202 Ib[i] = (int)((3.+i)*rndu());
203 MakeTreeIb (com.ns, Ib, rooted);
204 */
205 break;
206 case(3):
207 case(4):
208 ListTrees(fout, com.ns, rooted);
209 break;
210 case(8): TreeDistances(fout); break;
211 case(9):
212 printf("tree file names? ");
213 scanf("%s%s", treefile, mastertreefile);
214 break;
215 case(10): between_f_and_x(); break;
216 case(11): LabelClades(fout); break;
217 default: exit(0);
218 }
219 }
220 }
221
222 if (option >= 5 && option <= 7) {
223 com.seqtype = option - 5; /* 0, 1, 2 for bases, codons, & amino acids */
224 Simulate(MCctlf ? MCctlf : MCctlf0[option - 5]);
225 }
226 else if (option == 9) {
227
228 CladeSupport(fout, treefile, 1, mastertreefile, pick1tree);
229 /* CladeMrBayesProbabilities("\data\Lakner2008SB/M1510YY03.nex.t"); */
230 }
231 return(0);
232 }
233
234
between_f_and_x(void)235 int between_f_and_x(void)
236 {
237 /* this helps with the exponential transform for frequency parameters */
238 int i, n, fromf = 0;
239 double x[100];
240
241 for (;;) {
242 printf("\ndirection (0:x=>f; 1:f=>x; -1:end) & #classes? ");
243 scanf("%d", &fromf);
244 if (fromf == -1) return(0);
245 scanf("%d", &n); if (n > 100) error2("too many classes");
246 printf("input the first %d values for %s? ", n - 1, (fromf ? "f" : "x"));
247 FOR(i, n - 1) scanf("%lf", &x[i]);
248 x[n - 1] = (fromf ? 1 - sum(x, n - 1) : 0);
249 f_and_x(x, x, n, fromf, 1);
250 matout(F0, x, 1, n);
251 }
252 }
253
254
LabelClades(FILE * fout)255 void LabelClades(FILE *fout)
256 {
257 /* This reads in a tree and scan species names to check whether they form a
258 paraphyletic group and then label the clade.
259 It assumes that the tree is unrooted, and so goes through two rounds to check
260 whether the remaining seqs form a monophyletic clade.
261 */
262 FILE *ftree;
263 int unrooted = 1, iclade, sizeclade, mrca, paraphyl, is, imrca, i, j, k, lasts, haslength;
264 char key[96] = "A", treef[64] = "/A/F/flu/HA.all.prankcodon.tre", *p, chosen[NS], *endstr = "end";
265 int *anc[NS - 1], loc, bitmask, SI = sizeof(int) * 8;
266 int debug;
267
268 printf("Tree file name? ");
269 scanf("%s", treef);
270 printf("Treat tree as unrooted (0 no, 1 yes)? ");
271 scanf("%d", &unrooted);
272
273 ftree = gfopen(treef, "r");
274 fscanf(ftree, "%d%d", &com.ns, &j);
275 if (com.ns <= 0) error2("need ns in tree file");
276 debug = (com.ns < 20);
277
278 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
279 if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
280 for (i = 0; i < com.ns * 2 - 1; i++) nodes[i].annotation = NULL;
281 for (i = 0; i < com.ns - 1; i++) {
282 anc[i] = (int*)malloc((com.ns / SI + 1) * sizeof(int));
283 if (anc[i] == NULL) error2("oom");
284 }
285 ReadTreeN(ftree, &haslength, 1, 0);
286 fclose(ftree);
287 if (debug) { OutTreeN(F0, 1, PrNodeNum); FPN(F0); }
288
289 for (iclade = 0; iclade < com.ns - 1; iclade++) {
290 printf("\nString for selecting sequences (followed by non-digit) (end to end)? ");
291 scanf("%s", key);
292 if (strcmp(endstr, key) == 0)
293 break;
294 for (i = 0; i < com.ns; i++)
295 chosen[i] = '\0';
296
297 k = strlen(key);
298 for (i = 0; i < com.ns; i++) {
299 if ((p = strstr(com.spname[i], key))
300 && !isdigit(p[k]))
301 chosen[i] = 1;
302 }
303
304 /*
305 for(i=0; i<com.ns; i++)
306 if(strstr(com.spname[i], key)) chosen[i] = 1;
307 */
308
309 /* look for MRCA, going through two rounds, assuming unrooted tree */
310 for (imrca = 0; imrca < 1 + unrooted; imrca++) {
311 if (imrca)
312 for (i = 0; i < com.ns; i++) chosen[i] = 1 - chosen[i];
313
314 for (i = 0, sizeclade = 0; i < com.ns; i++)
315 if (chosen[i]) {
316 sizeclade++;
317 lasts = i;
318 }
319
320 if (sizeclade <= 1 || sizeclade >= com.ns - 1) {
321 puts("unable to form a clade. <2 seqs.");
322 break;
323 }
324 for (i = 0; i < com.ns - 1; i++) for (j = 0; j < com.ns / SI + 1; j++)
325 anc[i][j] = 0;
326 for (is = 0; is < com.ns; is++) {
327 if (chosen[is] == 0) continue;
328 loc = is / SI; bitmask = 1 << (is%SI);
329 for (j = nodes[is].father; j != -1; j = nodes[j].father) {
330 anc[j - com.ns][loc] |= bitmask;
331 if (is == lasts) {
332 for (i = 0, k = 0; i < com.ns; i++)
333 if (anc[j - com.ns][i / SI] & (1 << (i%SI)))
334 k++;
335 if (k == sizeclade) {
336 mrca = j; break;
337 }
338 }
339 }
340 }
341 if (imrca == 0 && mrca != tree.root) /* 1st round is enough */
342 break;
343 }
344
345 if (sizeclade <= 1 || sizeclade >= com.ns - 1 || mrca == tree.root) {
346 printf("Unable to label. Ignored.");
347 continue;
348 }
349
350 if (debug)
351 for (is = 0; is < com.ns - 1; is++) {
352 printf("\nnode %4d: ", is + com.ns);
353 for (j = 0; j < com.ns; j++) {
354 loc = j / SI; bitmask = 1 << (j%SI);
355 printf(" %d", (anc[is][loc] & bitmask) != 0);
356 }
357 }
358
359 printf("\nClade #%d (%s): %d seqs selected, MRCA is %d\n", iclade + 1, key, sizeclade, mrca + 1);
360 for (is = 0, paraphyl = 0; is < com.ns; is++) {
361 if (chosen[is] == 0)
362 for (j = nodes[is].father; j != -1; j = nodes[j].father)
363 if (j == mrca) { paraphyl++; break; }
364 }
365 if (paraphyl)
366 printf("\nThis clade is paraphyletic, & includes %d other sequences\n", paraphyl);
367
368 nodes[mrca].label = iclade + 1;
369 if (debug) OutTreeN(F0, 1, haslength | PrLabel);
370 }
371
372 for (i = 0; i < com.ns - 1; i++) free(anc[i]);
373 OutTreeN(fout, 1, haslength | PrLabel); FPN(fout);
374 printf("Printed final tree with labels in evolver.out\n");
375 exit(0);
376 }
377
TreeDistanceDistribution(FILE * fout)378 void TreeDistanceDistribution(FILE* fout)
379 {
380 /* This calculates figure 3.7 of Yang (2006).
381 This reads the file of all trees (such as 7s.all.trees), and calculates the
382 distribution of partition distance in all pairwise comparisons.
383 */
384 int i, j, ntree, k, *nib, nsame, IBsame[NS], lpart = 0;
385 char treef[64] = "5s.all.trees", *partition;
386 FILE *ftree;
387 double mPD[NS], PD1[NS]; /* distribution of partition distances */
388
389 puts("Tree file name?");
390 scanf("%s", treef);
391
392 ftree = gfopen(treef, "r");
393 fscanf(ftree, "%d%d", &com.ns, &ntree);
394 printf("%2d sequences %2d trees.\n", com.ns, ntree);
395 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
396 if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
397
398 lpart = (com.ns - 1)*com.ns * sizeof(char);
399 i = ntree*lpart;
400 printf("\n%d bytes of space requested.\n", i);
401 partition = (char*)malloc(i);
402 nib = (int*)malloc(ntree * sizeof(int));
403 if (partition == NULL || nib == NULL) error2("out of memory");
404
405 puts("\ntree #: mean prop of tree pairs with 0 1 2 ... shared bipartitions\n");
406 fputs("\ntree #: prop of tree pairs with 0 1 2 ... shared bipartitions\n", fout);
407 for (i = 0; i < ntree; i++) {
408 ReadTreeN(ftree, &j, 0, 1);
409 nib[i] = tree.nbranch - com.ns;
410 Tree2Partition(partition + i*lpart);
411 }
412 for (k = 0; k < com.ns - 3; k++) mPD[k] = 0;
413 for (i = 0; i < ntree; i++, FPN(fout)) {
414 for (k = 0; k < com.ns - 3; k++) PD1[k] = 0;
415 for (j = 0; j < ntree; j++) {
416 if (j == i) continue;
417 nsame = NSameBranch(partition + i*lpart, partition + j*lpart, nib[i], nib[j], IBsame);
418 PD1[nsame] ++;
419 }
420 for (k = 0; k < com.ns - 3; k++) PD1[k] /= (ntree - 1.);
421 for (k = 0; k < com.ns - 3; k++) mPD[k] = (mPD[k] * i + PD1[k]) / (i + 1.);
422 printf("%8d (%5.1f%%):", i + 1, (i + 1.) / ntree * 100);
423 for (k = 0; k < com.ns - 3; k++) printf(" %7.4f", mPD[k]);
424 fprintf(fout, "%8d:", i + 1); for (k = 0; k < com.ns - 3; k++) fprintf(fout, " %7.4f", PD1[k]);
425 printf("%s", (com.ns < 8 || (i + 1) % 100 == 0 ? "\n" : "\r"));
426 }
427 free(partition); free(nodes); free(nib); fclose(ftree);
428 exit(0);
429 }
430
431
TreeDistances(FILE * fout)432 void TreeDistances(FILE* fout)
433 {
434 /* I think this is broken after i changed the routine Tree2Partition().
435 */
436 int i, j, ntree, k, *nib, parti2B[NS], nsame, IBsame[NS], nIBsame[NS], lpart = 0;
437 char treef[64] = "5s.all.trees", *partition;
438 FILE *ftree;
439 double psame, mp, vp;
440
441 /*
442 TreeDistanceDistribution(fout);
443 */
444
445 puts("\nNumber of identical bi-partitions between trees.\nTree file name?");
446 scanf("%s", treef);
447
448 ftree = gfopen(treef, "r");
449 fscanf(ftree, "%d%d", &com.ns, &ntree);
450 printf("%2d sequences %2d trees.\n", com.ns, ntree);
451 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
452 if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
453
454 if (ntree < 2) error2("ntree");
455 printf("\n%d species, %d trees\n", com.ns, ntree);
456 puts("\n\t1: first vs. rest?\n\t2: all pairwise comparisons?\n");
457 k = 2;
458 scanf("%d", &k);
459
460 lpart = (com.ns - 1)*com.ns * sizeof(char);
461 i = (k == 1 ? 2 : ntree)*lpart;
462 printf("\n%d bytes of space requested.\n", i);
463 partition = (char*)malloc(i);
464 nib = (int*)malloc(ntree * sizeof(int));
465 if (partition == NULL || nib == NULL) error2("out of memory");
466
467 if (k == 2) { /* pairwise comparisons */
468 fputs("Number of identical bi-partitions in pairwise comparisons\n", fout);
469 for (i = 0; i < ntree; i++) {
470 ReadTreeN(ftree, &j, 0, 1);
471 nib[i] = tree.nbranch - com.ns;
472 Tree2Partition(partition + i*lpart);
473 }
474 for (i = 0; i < ntree; i++, FPN(F0), FPN(fout)) {
475 printf("%2d (%2d):", i + 1, nib[i]);
476 fprintf(fout, "%2d (%2d):", i + 1, nib[i]);
477 for (j = 0; j < i; j++) {
478 nsame = NSameBranch(partition + i*lpart, partition + j*lpart, nib[i], nib[j], IBsame);
479 printf(" %2d", nsame);
480 fprintf(fout, " %2d", nsame);
481 }
482 }
483 }
484 else { /* first vs. others */
485 ReadTreeN(ftree, &j, 0, 1);
486 nib[0] = tree.nbranch - com.ns;
487 if (nib[0] == 0) error2("1st tree is a star tree..");
488 Tree2Partition(partition);
489 fputs("Comparing the first tree with the others\nFirst tree:\n", fout);
490 OutTreeN(fout, 0, 0); FPN(fout); OutTreeB(fout); FPN(fout);
491 fputs("\nInternal branches in the first tree:\n", fout);
492 FOR(i, nib[0]) {
493 k = parti2B[i];
494 fprintf(fout, "%3d (%2d..%-2d): ( ",
495 i + 1, tree.branches[k][0] + 1, tree.branches[k][1] + 1);
496 FOR(j, com.ns) if (partition[i*com.ns + j]) fprintf(fout, "%d ", j + 1);
497 fputs(")\n", fout);
498 }
499 if (nodes[tree.root].nson <= 2)
500 fputs("\nRooted tree, results may not be correct.\n", fout);
501 fputs("\nCorrect internal branches compared with the 1st tree:\n", fout);
502 FOR(k, nib[0]) nIBsame[k] = 0;
503 for (i = 1, mp = vp = 0; i < ntree; i++, FPN(fout)) {
504 ReadTreeN(ftree, &j, 0, 1);
505 nib[1] = tree.nbranch - com.ns;
506 Tree2Partition(partition + lpart);
507 nsame = NSameBranch(partition, partition + lpart, nib[0], nib[1], IBsame);
508
509 psame = nsame / (double)nib[0];
510 FOR(k, nib[0]) nIBsame[k] += IBsame[k];
511 fprintf(fout, "1 vs. %3d: %4d: ", i + 1, nsame);
512 FOR(k, nib[0]) if (IBsame[k]) fprintf(fout, " %2d", k + 1);
513 printf("1 vs. %5d: %6d/%d %10.4f\n", i + 1, nsame, nib[0], psame);
514 vp += square(psame - mp)*(i - 1.) / i;
515 mp = (mp*(i - 1.) + psame) / i;
516 }
517 vp = (ntree <= 2 ? 0 : sqrt(vp / ((ntree - 1 - 1)*(ntree - 1.))));
518 fprintf(fout, "\nmean and S.E. of proportion of identical partitions\n");
519 fprintf(fout, "between the 1st and all the other %d trees ", ntree - 1);
520 fprintf(fout, "(ignore these if not revelant):\n %.4f +- %.4f\n", mp, vp);
521 fprintf(fout, "\nNumbers of times, out of %d, ", ntree - 1);
522 fprintf(fout, "interior branches of tree 1 are present");
523 fputs("\n(This may be bootstrap support for nodes in tree 1)\n", fout);
524 FOR(k, nib[0]) {
525 i = tree.branches[parti2B[k]][0] + 1; j = tree.branches[parti2B[k]][1] + 1;
526 fprintf(fout, "%3d (%2d..%-2d): %6d (%5.1f%%)\n",
527 k + 1, i, j, nIBsame[k], nIBsame[k] * 100. / (ntree - 1.));
528 }
529 }
530 free(partition); free(nodes); free(nib); fclose(ftree);
531 }
532
533
EigenQbase(double rates[],double pi[],double Root[],double U[],double V[],double Q[])534 int EigenQbase(double rates[], double pi[], double Root[], double U[], double V[], double Q[])
535 {
536 /* Construct the rate matrix Q[] for nucleotide model REV.
537 */
538 int i, j, k;
539 double mr, space[4];
540
541 zero(Q, 16);
542 for (i = 0, k = 0; i < 3; i++) for (j = i + 1; j < 4; j++)
543 if (i * 4 + j != 11) Q[i * 4 + j] = Q[j * 4 + i] = rates[k++];
544 for (i = 0, Q[3 * 4 + 2] = Q[2 * 4 + 3] = 1; i < 4; i++) FOR(j, 4) Q[i * 4 + j] *= pi[j];
545 for (i = 0, mr = 0; i < 4; i++)
546 {
547 Q[i * 4 + i] = 0; Q[i * 4 + i] = -sum(Q + i * 4, 4); mr -= pi[i] * Q[i * 4 + i];
548 }
549 abyx(1 / mr, Q, 16);
550
551 eigenQREV(Q, com.pi, 4, Root, U, V, space);
552 return (0);
553 }
554
555
556 static double freqK_NS = -1;
557
EigenQcodon(int getstats,double kappa,double omega,double pi[],double Root[],double U[],double V[],double Q[])558 int EigenQcodon(int getstats, double kappa, double omega, double pi[],
559 double Root[], double U[], double V[], double Q[])
560 {
561 /* Construct the rate matrix Q[].
562 64 codons are used, and stop codons have 0 freqs.
563 */
564 int n = com.ncode, i, j, k, c[2], ndiff, pos = 0, from[3], to[3];
565 double mr, space[64];
566
567 for (i = 0; i < n*n; i++) Q[i] = 0;
568 for (i = 0; i < n; i++) FOR(j, i) {
569 from[0] = i / 16; from[1] = (i / 4) % 4; from[2] = i % 4;
570 to[0] = j / 16; to[1] = (j / 4) % 4; to[2] = j % 4;
571 c[0] = GeneticCode[com.icode][i]; c[1] = GeneticCode[com.icode][j];
572 if (c[0] == -1 || c[1] == -1) continue;
573 for (k = 0, ndiff = 0; k < 3; k++) if (from[k] != to[k]) { ndiff++; pos = k; }
574 if (ndiff != 1) continue;
575 Q[i*n + j] = 1;
576 if ((from[pos] + to[pos] - 1)*(from[pos] + to[pos] - 5) == 0) Q[i*n + j] *= kappa;
577 if (c[0] != c[1]) Q[i*n + j] *= omega;
578 Q[j*n + i] = Q[i*n + j];
579 }
580 for (i = 0; i < n; i++) for (j = 0; j < n; j++)
581 Q[i*n + j] *= com.pi[j];
582 for (i = 0, mr = 0; i < n; i++) {
583 Q[i*n + i] = -sum(Q + i*n, n);
584 mr -= pi[i] * Q[i*n + i];
585 }
586
587 if (getstats)
588 Qfactor += freqK_NS * mr;
589 else {
590 if (com.ncatG == 0) FOR(i, n*n) Q[i] *= 1 / mr;
591 else FOR(i, n*n) Q[i] *= Qfactor; /* NSsites models */
592 eigenQREV(Q, com.pi, n, Root, U, V, space);
593 }
594 return (0);
595 }
596
597
598
EigenQaa(double pi[],double Root[],double U[],double V[],double Q[])599 int EigenQaa(double pi[], double Root[], double U[], double V[], double Q[])
600 {
601 /* Construct the rate matrix Q[]
602 */
603 int n = 20, i, j;
604 double mr, space[20];
605
606 FOR(i, n*n) Q[i] = 0;
607 switch (com.model) {
608 case (Poisson): case (EqualInput):
609 fillxc(Q, 1., n*n); break;
610 case (Empirical): case (Empirical_F):
611 FOR(i, n) FOR(j, i) Q[i*n + j] = Q[j*n + i] = com.daa[i*n + j] / 100;
612 break;
613 }
614 FOR(i, n) FOR(j, n) Q[i*n + j] *= com.pi[j];
615 for (i = 0, mr = 0; i < n; i++) {
616 Q[i*n + i] = 0; Q[i*n + i] = -sum(Q + i*n, n); mr -= com.pi[i] * Q[i*n + i];
617 }
618
619 eigenQREV(Q, com.pi, n, Root, U, V, space);
620 FOR(i, n) Root[i] = Root[i] / mr;
621
622 return (0);
623 }
624
625
GetDaa(FILE * fout,double daa[])626 int GetDaa(FILE* fout, double daa[])
627 {
628 /* Get the amino acid substitution rate matrix (grantham, dayhoff, jones, etc).
629 */
630 FILE * fdaa;
631 char aa3[4] = "";
632 int i, j, n = 20;
633
634 fdaa = gfopen(com.daafile, "r");
635 printf("\nReading rate matrix from %s\n", com.daafile);
636
637 for (i = 0; i < n; i++) for (j = 0, daa[i*n + i] = 0; j < i; j++) {
638 fscanf(fdaa, "%lf", &daa[i*n + j]);
639 daa[j*n + i] = daa[i*n + j];
640 }
641 if (com.model == Empirical) {
642 FOR(i, n) if (fscanf(fdaa, "%lf", &com.pi[i]) != 1) error2("err aaRatefile");
643 if (fabs(1 - sum(com.pi, 20)) > 1e-4) error2("\nSum of aa freq. != 1\n");
644 }
645 fclose(fdaa);
646
647 if (fout) {
648 fprintf(fout, "\n%s\n", com.daafile);
649 FOR(i, n) {
650 fprintf(fout, "\n%4s", getAAstr(aa3, i));
651 FOR(j, i) fprintf(fout, "%5.0f", daa[i*n + j]);
652 }
653 FPN(fout);
654 }
655
656 return (0);
657 }
658
659
660
661
MakeSeq(char * z,int ls)662 void MakeSeq(char*z, int ls)
663 {
664 /* generate a random sequence of nucleotides, codons, or amino acids by
665 sampling com.pi[], or read the ancestral sequence from the file RootSeq.txt
666 if the file exists.
667 */
668 int i, j, h, n = com.ncode, ch, n31 = (com.seqtype == 1 ? 3 : 1), lst;
669 double p[64], r, small = 5e-5;
670 unsigned char *pch = (com.seqtype == 2 ? AAs : BASEs);
671 unsigned char rootseqf[] = "RootSeq.txt", codon[4] = " ";
672 FILE *frootseq = (FILE*)fopen(rootseqf, "r");
673 static int times = 0;
674
675 if (frootseq) {
676 if (times++ == 0) printf("Reading sequence at the root from file.\n\n");
677 if (com.siterates && com.ncatG > 1)
678 error2("sequence for root doesn't work for site-class models");
679
680 for (lst = 0; ; ) {
681 for (i = 0; i < n31; i++) {
682 while ((ch = fgetc(frootseq)) != EOF && !isalpha(ch));
683 if (ch == EOF) error2("EOF when reading root sequence.");
684 if (isalpha(ch))
685 codon[i] = (unsigned char)(ch = CodeChara((char)ch, com.seqtype));
686 }
687 if (com.seqtype == 1) ch = codon[0] * 16 + codon[1] * 4 + codon[2];
688 if (ch<0 || ch>n - 1)
689 printf("error when reading site %d\n", lst + 1);
690 if (com.seqtype == 1 && com.pi[ch] == 0)
691 printf("you seem to have a stop codon in the root sequence\n");
692
693 z[lst++] = (unsigned char)ch;
694 if (lst == com.ls) break;
695 }
696 fclose(frootseq);
697 }
698 else {
699 for (j = 0; j < n; j++) p[j] = com.pi[j];
700 for (j = 1; j < n; j++) p[j] += p[j - 1];
701 if (fabs(p[n - 1] - 1) > small) {
702 printf("\nsum pi = %.6f != 1!\n", p[n - 1]);
703 exit(-1);
704 }
705 for (h = 0; h < com.ls; h++) {
706 for (j = 0, r = rndu(); j < n - 1; j++)
707 if (r < p[j]) break;
708 z[h] = (unsigned char)j;
709 }
710 }
711 }
712
713
714
Evolve(int inode)715 void Evolve(int inode)
716 {
717 /* evolve sequence com.z[tree.root] along the tree to generate com.z[],
718 using nodes[].branch, nodes[].omega, & com.model
719 Needs com.z[0,1,...,nnode-1], while com.z[0] -- com.z[ns-1] constitute
720 the data.
721 For codon sequences, com.siterates[] has w's for NSsites and NSbranchsite models.
722 */
723 int is, h, i, j, ison, from, n = com.ncode, longseq = 100000;
724 double t, rw;
725
726 for (is = 0; is < nodes[inode].nson; is++) {
727 ison = nodes[inode].sons[is];
728 memcpy(com.z[ison], com.z[inode], com.ls * sizeof(unsigned char));
729 t = nodes[ison].branch;
730
731 if (com.seqtype == 1 && com.model && com.NSsites) { /* branch-site models */
732 Qfactor = com.QfactorBS[ison];
733 for (h = 0; h < com.ls; h++)
734 com.siterates[h] = com.omegaBS[ison*com.ncatG + com.siteID[h]];
735 }
736
737 for (h = 0; h < com.ls; h++) {
738 /* decide whether to recalcualte PMat[]. */
739 if (h == 0 || (com.siterates && com.siterates[h] != com.siterates[h - 1])) {
740 rw = (com.siterates ? com.siterates[h] : 1);
741
742 switch (com.seqtype) {
743 case (BASEseq):
744 if (com.model <= TN93)
745 PMatTN93(PMat, t*Qfactor*rw*Qrates[0], t*Qfactor*rw*Qrates[1], t*Qfactor*rw, com.pi);
746 else if (com.model == REV)
747 PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
748 break;
749
750 case (CODONseq): /* Watch out for NSsites model */
751 if (com.model || com.NSsites) { /* no need to update UVRoot if M0 */
752 if (com.model && com.NSsites == 0) /* branch */
753 rw = nodes[ison].omega; /* should be equal to com.rK[nodes[].label] */
754
755 EigenQcodon(0, com.kappa, rw, com.pi, Root, U, V, PMat);
756 }
757 PMatUVRoot(PMat, t, com.ncode, U, V, Root);
758 break;
759
760 case (AAseq):
761 PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
762 break;
763 }
764 for (i = 0; i < n; i++)
765 for (j = 1; j < n; j++)
766 PMat[i*n + j] += PMat[i*n + j - 1];
767 }
768 for (j = 0, from = com.z[ison][h], rw = rndu(); j < n - 1; j++)
769 if (rw < PMat[from*n + j]) break;
770 com.z[ison][h] = j;
771 }
772
773 if (com.ls > longseq) printf("\r nodes %2d -> %2d, evolving . . ", inode + 1, ison + 1);
774
775 if (nodes[ison].nson) Evolve(ison);
776 } /* for (is) */
777
778 if (inode == tree.root && com.ls > longseq) printf("\r%s", strc(50, ' '));
779 }
780
781
782
Simulate(char * ctlf)783 void Simulate(char *ctlf)
784 {
785 /* simulate nr data sets of nucleotide, codon, or AA sequences.
786 ls: number of nucleotides, codons, or AAs in each sequence.
787 All 64 codons are used for codon sequences.
788 When com.alpha or com.ncatG>1, sites are randomized after sequences are
789 generated.
790 space[com.ls] is used to hold site marks.
791 format: 0: paml sites; 1: paml patterns; 2: paup nex; 3: paup JC69 format
792 */
793 char *ancf = "ancestral.txt", *siteIDf = "siterates.txt";
794 FILE *fin, *fseq, *ftree = NULL, *fanc = NULL, *fsiteID = NULL;
795 char *paupstart = "paupstart", *paupblock = "paupblock", *paupend = "paupend";
796 char line[32000];
797 int lline = 32000, i, j, k, ir, n, nr, fixtree = 1, sspace = 10000, rooted = 1;
798 int h = 0, format = 0, b[3] = { 0 }, nrate = 1, counts[NCATG];
799 int *siteorder = NULL;
800 char *tmpseq = NULL, *pc;
801 double birth = 0, death = 0, sample = 1, mut = 1, tlength, *space, *blengthBS;
802 double T, C, A, G, Y, R, Falias[NCATG];
803 int Lalias[NCATG];
804
805 noisy = 1;
806 printf("\nReading options from data file %s\n", ctlf);
807 com.ncode = n = (com.seqtype == 0 ? 4 : (com.seqtype == 1 ? 64 : 20));
808 fin = (FILE*)gfopen(ctlf, "r");
809 fscanf(fin, "%d", &format);
810 fgets(line, lline, fin);
811 printf("\nSimulated data will go into %s.\n", seqf[format]);
812 if (format == 2) printf("%s, %s, & %s will be appended if existent.\n", paupstart, paupblock, paupend);
813
814 fscanf(fin, "%d", &i);
815 fgets(line, lline, fin);
816 SetSeed(i, 1);
817 fscanf(fin, "%d%d%d", &com.ns, &com.ls, &nr);
818 fgets(line, lline, fin);
819 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
820 if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
821
822 if (com.ns > NS) error2("too many seqs?");
823 printf("\n%d seqs, %d sites, %d replicate(s)\n", com.ns, com.ls, nr);
824 k = (com.ns*com.ls* (com.seqtype == CODONseq ? 4 : 1) *nr) / 1000 + 1;
825 printf("Seq file will be about %dK bytes.\n", k);
826 for (i = 0; i < com.ns; i++) /* default spname */
827 sprintf(com.spname[i], "S%d", i + 1);
828
829 if (fixtree) {
830 fscanf(fin, "%lf", &tlength); fgets(line, lline, fin);
831 if (ReadTreeN(fin, &i, 1, 1)) /* might overwrite spname */
832 error2("err tree..");
833
834 if (i == 0) error2("use : to specify branch lengths in tree");
835 for (i = 0, T = 0; i < tree.nnode; i++)
836 if (i != tree.root) T += nodes[i].branch;
837 if (tlength > 0) {
838 for (i = 0; i < tree.nnode; i++)
839 if (i != tree.root) nodes[i].branch *= tlength / T;
840 }
841 printf("tree length = %.3f\n", (tlength > 0 ? tlength : T));
842 if (com.ns < 100) {
843 printf("\nModel tree & branch lengths:\n");
844 OutTreeN(F0, 1, 1); FPN(F0);
845 OutTreeN(F0, 0, 1); FPN(F0);
846 }
847 if (com.seqtype == CODONseq && com.model && !com.NSsites) { /* branch model */
848 FOR(i, tree.nnode) nodes[i].omega = nodes[i].label;
849 FPN(F0); OutTreeN(F0, 1, PrBranch | PrLabel); FPN(F0);
850 }
851 }
852 else { /* random trees, broken or need testing? */
853 printf("\nbirth rate, death rate, sampling fraction, mutation rate (tree height)?\n");
854 fscanf(fin, "%lf%lf%lf%lf", &birth, &death, &sample, &mut);
855 fgets(line, lline, fin);
856 printf("%9.4f %9.4f %9.4f %9.4f\n", birth, death, sample, mut);
857 }
858
859 if (com.seqtype == BASEseq) {
860 fscanf(fin, "%d", &com.model);
861 fgets(line, lline, fin);
862 if (com.model<0 || com.model>REV) error2("model err");
863 if (com.model == T92) error2("T92: please use HKY85 with T=A and C=G.");
864
865 printf("\nModel: %s\n", basemodels[com.model]);
866 if (com.model == REV) nrate = 5;
867 else if (com.model == TN93) nrate = 2;
868 FOR(i, nrate) fscanf(fin, "%lf", &Qrates[i]);
869 fgets(line, lline, fin);
870 if (nrate <= 2) FOR(i, nrate) printf("kappa %9.5f\n", Qrates[i]); FPN(F0);
871 if (nrate == 5) {
872 printf("a & b & c & d & e: ");
873 FOR(i, nrate) printf("%9.5f", Qrates[i]); FPN(F0);
874 }
875 if ((com.model == JC69 || com.model == F81) && Qrates[0] != 1)
876 error2("kappa should be 1 for this model");
877 }
878 else if (com.seqtype == CODONseq) {
879 for (i = 0; i < 64; i++)
880 getcodon(CODONs[i], i);
881 if (com.model == 0 && com.NSsites) { /* site model */
882 fscanf(fin, "%d", &com.ncatG); fgets(line, lline, fin);
883 if (com.ncatG > NCATG) error2("ncatG>NCATG");
884 FOR(i, com.ncatG) fscanf(fin, "%lf", &com.freqK[i]); fgets(line, lline, fin);
885 FOR(i, com.ncatG) fscanf(fin, "%lf", &com.rK[i]); fgets(line, lline, fin);
886 printf("\n\ndN/dS (w) for site classes (K=%d)", com.ncatG);
887 printf("\nf: "); FOR(i, com.ncatG) printf("%9.5f", com.freqK[i]);
888 printf("\nw: "); FOR(i, com.ncatG) printf("%9.5f", com.rK[i]); FPN(F0);
889 }
890 else if (com.model && com.NSsites) { /* branchsite model */
891 fscanf(fin, "%d", &com.ncatG); fgets(line, lline, fin);
892 if (com.ncatG > min2(NCATG, 127)) error2("ncatG too large");
893 FOR(i, com.ncatG) fscanf(fin, "%lf", &com.freqK[i]); fgets(line, lline, fin);
894 printf("\n%d site classes.\nFreqs: ", com.ncatG);
895 FOR(i, com.ncatG) printf("%9.5f", com.freqK[i]);
896
897 if ((com.omegaBS = (double*)malloc((com.ncatG + 2)*tree.nnode * sizeof(double))) == NULL)
898 error2("oom");
899 com.QfactorBS = com.omegaBS + com.ncatG*tree.nnode;
900 blengthBS = com.QfactorBS + tree.nnode;
901
902 for (i = 0; i < tree.nnode; i++)
903 blengthBS[i] = nodes[i].branch;
904 for (k = 0; k < com.ncatG; k++) {
905 ReadTreeN(fin, &i, 0, 1);
906 if (i) error2("do not include branch lengths except in the first tree.");
907 /* if (!j) error2("Use # to specify omega's for branches"); */
908 for (i = 0; i < tree.nnode; i++) com.omegaBS[i*com.ncatG + k] = nodes[i].label;
909 }
910 for (i = 0; i < tree.nnode; i++)
911 {
912 nodes[i].branch = blengthBS[i]; nodes[i].label = nodes[i].omega = 0;
913 }
914 for (i = 0; i < tree.nnode; i++) { /* print out omega as node labels. */
915 nodes[i].annotation = pc = (char*)malloc(20 * com.ncatG * sizeof(char));
916 sprintf(pc, "'[%.2f", com.omegaBS[i*com.ncatG + 0]);
917 for (k = 1, pc += strlen(pc); k < com.ncatG; k++, pc += strlen(pc))
918 sprintf(pc, ", %.2f", com.omegaBS[i*com.ncatG + k]);
919 sprintf(pc, "]'");
920 }
921 FPN(F0); OutTreeN(F0, 1, PrBranch | PrLabel); FPN(F0);
922 }
923 else if (com.model == 0) { /* M0 */
924 fscanf(fin, "%lf", &com.omega);
925 fgets(line, lline, fin);
926 printf("omega = %9.5f\n", com.omega);
927 for (i = 0; i < tree.nbranch; i++)
928 nodes[tree.branches[i][1]].omega = com.omega;
929 }
930
931 fscanf(fin, "%lf", &com.kappa); fgets(line, lline, fin);
932 printf("kappa = %9.5f\n", com.kappa);
933 }
934
935 if (com.seqtype == BASEseq || com.seqtype == AAseq) {
936 fscanf(fin, "%lf%d", &com.alpha, &com.ncatG);
937 fgets(line, lline, fin);
938 if (com.alpha)
939 printf("Gamma rates, alpha =%.4f (K=%d)\n", com.alpha, com.ncatG);
940 else {
941 com.ncatG = 0;
942 puts("Rates are constant over sites.");
943 }
944 }
945 if (com.alpha || com.ncatG) { /* this is used for codon NSsites as well. */
946 k = com.ls;
947 if (com.seqtype == 1 && com.model && com.NSsites) k *= tree.nnode;
948 if ((com.siterates = (double*)malloc(k * sizeof(double))) == NULL) error2("oom1");
949 if ((siteorder = (int*)malloc(com.ls * sizeof(int))) == NULL) error2("oom2");
950 }
951
952 if (com.seqtype == AAseq) { /* get aa substitution model and rate matrix */
953 fscanf(fin, "%d", &com.model);
954 printf("\nmodel: %s", aamodels[com.model]);
955 if (com.model >= 2) { fscanf(fin, "%s", com.daafile); GetDaa(NULL, com.daa); }
956 fgets(line, lline, fin);
957 }
958
959 /* get freqs com.pi[] */
960 if ((com.seqtype == BASEseq && com.model > K80) ||
961 com.seqtype == CODONseq ||
962 (com.seqtype == AAseq && (com.model == 1 || com.model == 3)))
963 for (k = 0; k < com.ncode; k++) fscanf(fin, "%lf", &com.pi[k]);
964 else if (com.model == 0 || (com.seqtype == BASEseq && com.model <= K80))
965 fillxc(com.pi, 1. / com.ncode, com.ncode);
966
967 printf("sum pi = 1 = %.6f:", sum(com.pi, com.ncode));
968 matout2(F0, com.pi, com.ncode / 4, 4, 9, 6);
969 if (com.seqtype == CODONseq) {
970 fscanf(fin, "%d", &com.icode); fgets(line, lline, fin);
971 printf("genetic code = %d\n", com.icode);
972 for (k = 0; k < com.ncode; k++)
973 if (GeneticCode[com.icode][k] == -1 && com.pi[k])
974 error2("stop codons should have frequency 0?");
975 }
976
977 if (com.seqtype == BASEseq) {
978 if (com.model < REV) {
979 T = com.pi[0]; C = com.pi[1]; A = com.pi[2]; G = com.pi[3]; Y = T + C; R = A + G;
980 if (com.model == F84) {
981 Qrates[1] = 1 + Qrates[0] / R; /* kappa2 */
982 Qrates[0] = 1 + Qrates[0] / Y; /* kappa1 */
983 }
984 else if (com.model <= HKY85) Qrates[1] = Qrates[0];
985 Qfactor = 1 / (2 * T*C*Qrates[0] + 2 * A*G*Qrates[1] + 2 * Y*R);
986 }
987 else
988 if (com.model == REV) EigenQbase(Qrates, com.pi, Root, U, V, PMat);
989 }
990
991 /* get Qfactor for NSsites & NSbranchsite models */
992 if (com.seqtype == CODONseq && com.NSsites) {
993 if (!com.model) { /* site models */
994 for (k = 0, Qfactor = 0; k < com.ncatG; k++) {
995 freqK_NS = com.freqK[k];
996 EigenQcodon(1, com.kappa, com.rK[k], com.pi, NULL, NULL, NULL, PMat);
997 }
998 Qfactor = 1 / Qfactor;
999 printf("Qfactor for NSsites model = %9.5f\n", Qfactor);
1000 }
1001 else { /* branch-site models */
1002 for (i = 0; i < tree.nnode; i++) {
1003 if (i == tree.root) { com.QfactorBS[i] = -1; continue; }
1004 for (k = 0, Qfactor = 0; k < com.ncatG; k++) {
1005 freqK_NS = com.freqK[k];
1006 EigenQcodon(1, com.kappa, com.omegaBS[i*com.ncatG + k], com.pi, NULL, NULL, NULL, PMat);
1007 }
1008 com.QfactorBS[i] = 1 / Qfactor; Qfactor = 0;
1009 printf("node %2d: Qfactor = %9.5f\n", i + 1, com.QfactorBS[i]);
1010 }
1011 }
1012 }
1013 if (com.seqtype == CODONseq && com.ncatG <= 1 && com.model == 0)
1014 EigenQcodon(0, com.kappa, com.omega, com.pi, Root, U, V, PMat);
1015 else if (com.seqtype == AAseq)
1016 EigenQaa(com.pi, Root, U, V, PMat);
1017
1018 puts("\nAll parameters are read. Ready to simulate\n");
1019 sspace = max2(sspace, 8000000);
1020 space = (double*)malloc(sspace);
1021 if (com.alpha || com.ncatG) tmpseq = (char*)space;
1022 if (space == NULL) {
1023 printf("oom for space, %d bytes needed.", sspace);
1024 exit(-1);
1025 }
1026
1027 fseq = gfopen(seqf[format], "w");
1028 if (format == 2 || format == 3) appendfile(fseq, paupstart);
1029
1030 fanc = (FILE*)gfopen(ancf, "w");
1031 if (fixtree) {
1032 fputs("\nAncestral sequences generated during simulation ", fanc);
1033 fprintf(fanc, "(check against %s)\n", seqf[format]);
1034 OutTreeN(fanc, 0, 0); FPN(fanc); OutTreeB(fanc); FPN(fanc);
1035 }
1036 if (com.alpha || com.NSsites) {
1037 fsiteID = (FILE*)gfopen(siteIDf, "w");
1038 if (com.seqtype == 1) fprintf(fsiteID, "\nSite class IDs\n");
1039 else fprintf(fsiteID, "\nRates for sites\n");
1040 if (com.seqtype == CODONseq && com.NSsites) {
1041 if (!com.model) matout(fsiteID, com.rK, 1, com.ncatG);
1042 if ((com.siteID = (char*)malloc(com.ls * sizeof(char))) == NULL)
1043 error2("oom siteID");
1044 }
1045 }
1046
1047 for (ir = 0; ir < nr; ir++) {
1048 for (j = 0; j < com.ns * 2 - 1; j++) {
1049 com.z[j] = (unsigned char*)realloc(com.z[j], com.ls * sizeof(unsigned char));
1050 if (com.z[j] == NULL) error2("memory problem for z[]");
1051 }
1052 if (!fixtree) { /* right now tree is fixed */
1053 RandomLHistory(rooted, space);
1054 if (rooted && com.ns < 10) j = GetIofLHistory();
1055 BranchLengthBD(1, birth, death, sample, mut);
1056 if (com.ns < 20) {
1057 printf("\ntree used: ");
1058 OutTreeN(F0, 1, 1);
1059 FPN(F0);
1060 }
1061 }
1062 MakeSeq(com.z[tree.root], com.ls);
1063
1064 if (com.alpha)
1065 Rates4Sites(com.siterates, com.alpha, com.ncatG, com.ls, 0, space);
1066 else if (com.seqtype == 1 && com.NSsites) { /* for NSsites */
1067 /* the table for the alias algorithm is the same, but ncatG is small. */
1068 MultiNomialAliasSetTable(com.ncatG, com.freqK, Falias, Lalias, space);
1069 MultiNomialAlias(com.ls, com.ncatG, Falias, Lalias, counts);
1070
1071 for (i = 0, h = 0; i < com.ncatG; i++)
1072 for (j = 0; j < counts[i]; j++) {
1073 com.siteID[h] = (char)i;
1074 com.siterates[h++] = com.rK[i]; /* overwritten later for branchsite */
1075 }
1076 }
1077
1078 Evolve(tree.root);
1079
1080 /* randomize sites for site-class model */
1081 if (com.siterates && com.ncatG > 1) {
1082 if (format == 1 && ir == 0)
1083 puts("\nrequested site pattern counts as output for site-class model.\n");
1084 randorder(siteorder, com.ls, (int*)space);
1085 for (j = 0; j < tree.nnode; j++) {
1086 memcpy(tmpseq, com.z[j], com.ls * sizeof(char));
1087 for (h = 0; h < com.ls; h++) com.z[j][h] = tmpseq[siteorder[h]];
1088 }
1089 if (com.alpha || com.ncatG > 1) {
1090 memcpy(space, com.siterates, com.ls * sizeof(double));
1091 for (h = 0; h < com.ls; h++) com.siterates[h] = space[siteorder[h]];
1092 }
1093 if (com.siteID) {
1094 memcpy((char*)space, com.siteID, com.ls * sizeof(char));
1095 for (h = 0; h < com.ls; h++) com.siteID[h] = *((char*)space + siteorder[h]);
1096 }
1097 }
1098
1099 /* print sequences*/
1100 if (format == 1 || format == 3) {
1101 for (i = 0; i < com.ns; i++) for (h = 0; h < com.ls; h++) com.z[i][h] ++; /* code as 1, 2, ... */
1102 PatternWeightSimple();
1103 for (i = 0; i < com.ns; i++) for (h = 0; h < com.npatt; h++) com.z[i][h] --; /* code as 0, 1, ... */
1104 if (format == 3)
1105 PatternWeightJC69like();
1106 }
1107 if (format == 2 || format == 3) fprintf(fseq, "\n\n[Replicate # %d]\n", ir + 1);
1108 printSeqs(fseq, com.z, com.spname, com.ns, com.ls, com.npatt, com.fpatt, NULL, NULL, format); /* printsma not usable as it codes into 0,1,...,60. */
1109 if ((format == 2 || format == 3) && !fixtree) {
1110 fprintf(fseq, "\nbegin tree;\n tree true_tree = [&U] ");
1111 OutTreeN(fseq, 1, 1); fputs(";\n", fseq);
1112 fprintf(fseq, "end;\n\n");
1113 }
1114 if (format == 2 || format == 3) appendfile(fseq, paupblock);
1115
1116 /* print ancestral seqs, rates for sites. */
1117 if (format != 1 && format != 3) { /* don't print ancestors if site patterns are printed. */
1118 j = (com.seqtype == CODONseq ? 3 * com.ls : com.ls);
1119 fprintf(fanc, "[replicate %d]\n", ir + 1);
1120
1121 if (!fixtree) {
1122 if (format < 2)
1123 {
1124 OutTreeN(fanc, 1, 1); FPN(fanc); FPN(fanc);
1125 }
1126 }
1127 else {
1128 fprintf(fanc, "%6d %6d\n", tree.nnode - com.ns, j);
1129 for (j = com.ns; j < tree.nnode; j++, FPN(fanc)) {
1130 fprintf(fanc, "node%-26d ", j + 1);
1131 print1seq(fanc, com.z[j], com.ls, NULL);
1132 }
1133 FPN(fanc);
1134
1135 if (fsiteID) {
1136 if (com.seqtype == CODONseq && com.NSsites && com.model == 0) { /* site model */
1137 k = 0;
1138 if (com.rK[com.ncatG - 1] > 1)
1139 FOR(h, com.ls) if (com.rK[com.siteID[h]] > 1) k++;
1140 fprintf(fsiteID, "\n[replicate %d: %2d]\n", ir + 1, k);
1141 if (k) for (h = 0, k = 0; h < com.ls; h++) {
1142 if (com.rK[com.siteID[h]] > 1) {
1143 fprintf(fsiteID, "%4d ", h + 1);
1144 if (++k % 15 == 0) FPN(fsiteID);
1145 }
1146 }
1147 FPN(fsiteID);
1148 }
1149 else if (com.seqtype == CODONseq && com.NSsites && com.model) { /* branchsite */
1150 fprintf(fsiteID, "\n[replicate %d]\n", ir + 1);
1151 for (h = 0; h < com.ls; h++) {
1152 fprintf(fsiteID, " %4d ", com.siteID[h] + 1);
1153 if (h == com.ls - 1 || (h + 1) % 15 == 0) FPN(fsiteID);
1154 }
1155 }
1156 else { /* gamma rates */
1157 fprintf(fsiteID, "\n[replicate %d]\n", ir + 1);
1158 for (h = 0; h < com.ls; h++) {
1159 fprintf(fsiteID, "%7.4f ", com.siterates[h]);
1160 if (h == com.ls - 1 || (h + 1) % 10 == 0) FPN(fsiteID);
1161 }
1162 }
1163 }
1164 }
1165 }
1166
1167 printf("\rdid data set %d %s", ir + 1, (com.ls > 100000 || nr < 100 ? "\n" : ""));
1168 } /* for (ir) */
1169 if (format == 2 || format == 3) appendfile(fseq, paupend);
1170
1171 fclose(fseq); if (!fixtree) fclose(fanc);
1172 if (com.alpha || com.NSsites) fclose(fsiteID);
1173 for (j = 0; j < com.ns * 2 - 1; j++) free(com.z[j]);
1174 free(space);
1175 if (com.model && com.NSsites) /* branch-site model */
1176 for (i = 0; i < tree.nnode; i++) free(nodes[i].annotation);
1177 free(nodes);
1178 if (com.alpha || com.ncatG) {
1179 free(com.siterates); com.siterates = NULL;
1180 free(siteorder);
1181 if (com.siteID) free(com.siteID); com.siteID = NULL;
1182 }
1183 if (com.seqtype == 1 && com.model && com.NSsites) free(com.omegaBS);
1184 com.omegaBS = NULL;
1185
1186 exit(0);
1187 }
1188
1189
GetSpnamesFromMB(FILE * fmb,char line[],int lline)1190 int GetSpnamesFromMB(FILE *fmb, char line[], int lline)
1191 {
1192 /* This reads species names from MrBayes output file fmb, like the following.
1193
1194 Taxon 1 -> 1_Arabidopsis_thaliana
1195 Taxon 2 -> 2_Taxus_baccata
1196 */
1197 int j, ispecies;
1198 char *p = NULL, *mbstr1 = "Taxon ", *mbstr2 = "->";
1199
1200 puts("Reading species names from mb output file.\n");
1201 rewind(fmb);
1202 for (ispecies = 0; ; ) {
1203 if (fgets(line, lline, fmb) == NULL) return(-1);
1204 if (strstr(line, mbstr1) && strstr(line, mbstr2)) {
1205 p = strstr(line, mbstr1) + 5;
1206 sscanf(p, "%d", &ispecies);
1207 p = strstr(line, mbstr2) + 3;
1208 if (com.spname[ispecies - 1][0])
1209 error2("species name already read?");
1210
1211 for (j = 0; isgraph(*p) && j < lline; ) com.spname[ispecies - 1][j++] = *p++;
1212 com.spname[ispecies - 1][j] = 0;
1213
1214 printf("\tTaxon %2d: %s\n", ispecies, com.spname[ispecies - 1]);
1215 }
1216 else if (ispecies)
1217 break;
1218 }
1219 com.ns = ispecies;
1220 rewind(fmb);
1221
1222 return(0);
1223 }
1224
GrepLine(FILE * fin,char * query,char * line,int lline)1225 char *GrepLine(FILE *fin, char *query, char *line, int lline)
1226 {
1227 /* This greps infile to search for query[], and returns NULL or line[].
1228 */
1229 char *p = NULL;
1230
1231 rewind(fin);
1232 for (; ; ) {
1233 if (fgets(line, lline, fin) == NULL) return(NULL);
1234 if (strstr(line, query)) return(line);
1235 }
1236 return(NULL);
1237 }
1238
1239
CladeMrBayesProbabilities(char treefile[])1240 void CladeMrBayesProbabilities(char treefile[])
1241 {
1242 /* This reads a tree from treefile and then scans a set of MrBayes output files
1243 (mbfiles) to retrieve posterior probabilities for every clade in that tree.
1244 It first scans the first mb output file to get the species names.
1245
1246 Sample mb output:
1247 6 -- ...........................************* 8001 1.000 0.005 (0.000)
1248 7 -- ....................******************** 8001 1.000 0.006 (0.000)
1249
1250 Note 4 Jan 2014: This uses parti2B[], and is broken after i rewrote Tree2Partition().
1251 */
1252 int lline = 100000, i, j, k, nib, inode, parti2B[NS];
1253 char line[100000], *partition, *p;
1254 char symbol[2] = ".*", cladestr[NS + 1] = { 0 };
1255 FILE *ftree, *fmb[20];
1256 double *Pclade, t;
1257 int nmbfiles = 2;
1258 char *mbfiles[] = { "mb-1e-4.out", "mb-1e-1.out" };
1259
1260 printf("tree file is %s\nmb output files:\n", treefile);
1261 ftree = gfopen(treefile, "r");
1262 for (k = 0; k < nmbfiles; k++)
1263 fmb[k] = gfopen(mbfiles[k], "r");
1264 for (k = 0; k < nmbfiles; k++) printf("\t%s\n", mbfiles[k]);
1265
1266 GetSpnamesFromMB(fmb[0], line, lline); /* read species names from mb output */
1267
1268 fscanf(ftree, "%d%d", &i, &k);
1269 if (i && i != com.ns) error2("do you mean to specify ns in the tree file?");
1270 i = (com.ns * 2 - 1) * sizeof(struct TREEN);
1271 if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
1272 ReadTreeN(ftree, &i, 0, 1);
1273
1274 FPN(F0); OutTreeN(F0, 0, 0); FPN(F0); FPN(F0);
1275 nib = tree.nbranch - com.ns;
1276 for (i = 0; i < tree.nnode; i++) {
1277 nodes[i].annotation = NULL;
1278 if (i > com.ns) nodes[i].annotation = (char*)malloc(100 * sizeof(char));
1279 }
1280
1281 partition = (char*)malloc(nib*com.ns * sizeof(char));
1282 if (partition == NULL) error2("oom");
1283 if ((Pclade = (double*)malloc(nib*nmbfiles * sizeof(double))) == NULL)
1284 error2("oom");
1285 for (i = 0; i < nib*nmbfiles; i++) Pclade[i] = 0;
1286
1287 Tree2Partition(partition);
1288
1289 for (i = 0; i < nib; i++) {
1290 inode = tree.branches[parti2B[i]][1];
1291 if (partition[i*com.ns + 0])
1292 for (j = 0; j < com.ns; j++) cladestr[j] = symbol[1 - partition[i*com.ns + j]];
1293 else
1294 for (j = 0; j < com.ns; j++) cladestr[j] = symbol[partition[i*com.ns + j]];
1295 printf("#%2d branch %2d node %2d %s", i + 1, parti2B[i], inode, cladestr);
1296
1297 for (k = 0; k < nmbfiles; k++) {
1298 if (GrepLine(fmb[k], cladestr, line, lline)) {
1299 p = strstr(line, cladestr);
1300 sscanf(p + com.ns, "%lf%lf", &t, &Pclade[i*nmbfiles + k]);
1301 }
1302 }
1303 for (k = 0; k < nmbfiles; k++) printf("%6.2f", Pclade[i*nmbfiles + k]);
1304 FPN(F0);
1305 for (k = 0, p = nodes[inode].annotation; k < nmbfiles; k++) {
1306 sprintf(p, "%3.0f%s", Pclade[i*nmbfiles + k] * 100, (k < nmbfiles - 1 ? "/" : ""));
1307 p += 4;
1308 }
1309 }
1310 FPN(F0); OutTreeN(F0, 1, PrLabel); FPN(F0);
1311
1312 for (i = 0; i < tree.nnode; i++) free(nodes[i].annotation);
1313 free(nodes); free(partition); free(Pclade);
1314 fclose(ftree);
1315 for (k = 0; k < nmbfiles; k++) fclose(fmb[k]);
1316 exit(0);
1317 }
1318