1 /* evolver.c
2    Copyright, Ziheng Yang, April 1995.
3 
4      cl -Ox evolver.c tools.c
5      cl -Ox -DCodonNSbranches    -FeevolverNSbranches.exe    evolver.c tools.c
6      cl -Ox -DCodonNSsites       -FeevolverNSsites.exe       evolver.c tools.c
7      cl -Ox -DCodonNSbranchsites -FeevolverNSbranchsites.exe evolver.c tools.c
8 
9      cc -O3 -o evolver evolver.c tools.c -lm
10      cc -O4 -DCodonNSbranches -o evolverNSbranches evolver.c tools.c -lm
11      cc -O4 -DCodonNSsites -o evolverNSsites evolver.c tools.c -lm
12      cc -O4 -DCodonNSbranchsites -o evolverNSbranchsites evolver.c tools.c -lm
13 
14      evolver
15      evolver 5 MCbase.dat
16      evolver 6 MCcodon.dat
17      evolver 7 MCaa.dat
18      evolver 9 <TreesFile> <MasterTreeFile>
19 */
20 
21 /*
22 #define CodonNSbranches
23 #define CodonNSsites
24 #define CodonNSbranchsites
25 */
26 
27 #include "paml.h"
28 
29 #define NS            5000
30 #define NBRANCH       (NS*2-2)
31 #define MAXNSONS      20
32 #define LSPNAME       96
33 #define NCODE         64
34 #define NCATG         40
35 
36 
37 struct CommonInfo {
38    unsigned char *z[2 * NS - 1];
39    unsigned char *spname[NS], daafile[512], cleandata, readpattern;
40    int ns, ls, npatt, np, ntime, ncode, clock, rooted, model, icode;
41    int seqtype, *pose, ncatG, NSsites;
42    int ngene, lgene[1], posG[1 + 1];  /* not used */
43    double piG[1][4], rgene[1];  /* not used */
44    double *fpatt, kappa, omega, alpha, pi[64], *conP, daa[20 * 20];
45    double freqK[NCATG], rK[NCATG];
46    char *siteID;        /* used if ncatG>1 */
47    double *siterates;   /* rates for gamma or omega for site or branch-site models */
48    double *omegaBS, *QfactorBS;     /* omega IDs for branch-site models */
49 }  com;
50 struct TREEB {
51    int nbranch, nnode, root, branches[NBRANCH][2];
52 }  tree;
53 struct TREEN {
54    int father, nson, sons[MAXNSONS], ibranch;
55    double branch, age, omega, label, label2, *conP;
56    char *annotation, fossil;
57 }  *nodes;
58 
59 extern char BASEs[];
60 extern int GeneticCode[][64], noisy;
61 int LASTROUND = 0; /* not used */
62 
63 #define EVOLVER
64 #define NODESTRUCTURE
65 #define BIRTHDEATH
66 #include "treesub.c"
67 #include "treespace.c"
68 
69 void TreeDistances(FILE* fout);
70 void Simulate(char *ctlf);
71 void MakeSeq(char *z, int ls);
72 int EigenQbase(double rates[], double pi[], double Root[], double U[], double V[], double Q[]);
73 int EigenQcodon(int getstats, double kappa, double omega, double pi[], double Root[], double U[], double V[], double Q[]);
74 int EigenQaa(double pi[], double Root[], double U[], double V[], double Q[]);
75 void CladeMrBayesProbabilities(char treefile[]);
76 int between_f_and_x(void);
77 void LabelClades(FILE *fout);
78 
79 char *MCctlf0[] = { "MCbase.dat","MCcodon.dat","MCaa.dat" };
80 char *seqf[] = { "mc.paml", "mc.paml", "mc.nex", "mc.nex" };
81 
82 enum { JC69, K80, F81, F84, HKY85, T92, TN93, REV } BaseModels;
83 char *basemodels[] = { "JC69","K80","F81","F84","HKY85","T92","TN93","REV" };
84 enum { Poisson, EqualInput, Empirical, Empirical_F } AAModels;
85 char *aamodels[] = { "Poisson", "EqualInput", "Empirical", "Empirical_F" };
86 
87 
88 double PMat[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], Root[NCODE];
89 static double Qfactor = -1, Qrates[5];  /* Qrates[] hold kappa's for nucleotides */
90 
91 
main(int argc,char * argv[])92 int main(int argc, char *argv[])
93 {
94    char *MCctlf = NULL, outf[512] = "evolver.out", treefile[512] = "mcmc.txt", mastertreefile[512] = "\0";
95    int i, option = -1, ntree = 1, rooted, BD = 0, gotoption = 0, pick1tree = 0;
96    double bfactor = 1, birth = -1, death = -1, sample = -1, mut = -1, *space;
97    FILE *fout = gfopen(outf, "w");
98 
99    printf("EVOLVER in %s\n", pamlVerStr);
100    com.alpha = 0; com.cleandata = 1; com.model = 0; com.NSsites = 0;
101 
102    if (argc > 2 && !strcmp(argv[argc - 1], "--stdout-no-buf"))
103       setvbuf(stdout, NULL, _IONBF, 0);
104    if (argc > 1) {
105       gotoption = 1;   sscanf(argv[1], "%d", &option);
106    }
107    if (argc == 1)
108       printf("Results for options 1-4 & 8 go into %s\n", outf);
109    else if (option != 5 && option != 6 && option != 7 && option != 9) {
110       puts("Usage: \n\tevolver \n\tevolver option datafile"); exit(-1);
111    }
112    if (option >= 4 && option <= 6)
113       MCctlf = argv[2];
114    else if (option == 9) {
115       strcpy(treefile, argv[2]);
116       if (argc > 3) strcpy(mastertreefile, argv[3]);
117       if (argc > 4) sscanf(argv[4], "%d", &pick1tree);
118    }
119 
120    for (i = 0; i < NS; i++) {
121       com.spname[i] = (char*)malloc(LSPNAME * sizeof(char*));
122       if (com.spname[i] == NULL) error2("oom");
123    }
124 
125 #if defined (CodonNSbranches)
126    option = 6;  com.model = 1;
127    MCctlf = (argc == 3 ? argv[2] : "MCcodonNSbranches.dat");
128    gotoption = 1;
129 #elif defined (CodonNSsites)
130    option = 6;  com.NSsites = 3;
131    MCctlf = (argc == 3 ? argv[2] : "MCcodonNSsites.dat");
132    gotoption = 1;
133 #elif defined (CodonNSbranchsites)
134    option = 6;  com.model = 1; com.NSsites = 3;
135    MCctlf = (argc == 3 ? argv[2] : "MCcodonNSbranchsites.dat");
136    gotoption = 1;
137 #endif
138 
139    if (!gotoption) {
140       for (; ;) {
141          fflush(fout);
142          printf("\n\t(1) Get random UNROOTED trees?\n");
143          printf("\t(2) Get random ROOTED trees?\n");
144          printf("\t(3) List all UNROOTED trees?\n");
145          printf("\t(4) List all ROOTED trees?\n");
146          printf("\t(5) Simulate nucleotide data sets (use %s)?\n", MCctlf0[0]);
147          printf("\t(6) Simulate codon data sets      (use %s)?\n", MCctlf0[1]);
148          printf("\t(7) Simulate amino acid data sets (use %s)?\n", MCctlf0[2]);
149          printf("\t(8) Calculate identical bi-partitions between trees?\n");
150          printf("\t(9) Calculate clade support values (evolver 9 treefile mastertreefile <pick1tree>)?\n");
151          printf("\t(11) Label clades?\n");
152          printf("\t(0) Quit?\n");
153 
154          option = 5;
155          scanf("%d", &option);
156 
157          if (option == 0) exit(0);
158          if (option >= 5 && option <= 7) break;
159          if (option < 5) {
160             printf("No. of species: ");
161             scanf("%d", &com.ns);
162          }
163          if (com.ns > NS) error2("Too many species.  Raise NS.");
164          if ((space = (double*)malloc(10000 * sizeof(double))) == NULL) error2("oom");
165          rooted = !(option % 2);
166          if (option < 3) {
167             printf("\nnumber of trees & random number seed? ");
168             scanf("%d%d", &ntree, &i);
169             SetSeed(i, 1);
170             printf("Want branch lengths from the birth-death process (0/1)? ");
171             scanf("%d", &BD);
172          }
173          if (option <= 4) {
174             if (com.ns < 3) error2("no need to do this?");
175             i = (com.ns * 2 - 1) * sizeof(struct TREEN);
176             if ((nodes = (struct TREEN*)malloc(i)) == NULL)
177                error2("oom");
178          }
179          switch (option) {
180          case(1):   /* random UNROOTED trees */
181          case(2):   /* random ROOTED trees */
182             /* default names */
183             if (com.ns <= 52)
184                for (i = 0; i < com.ns; i++)  sprintf(com.spname[i], "%c", (i < 26 ? 'A' + i : 'a' + i - 26));
185             else
186                for (i = 0; i < com.ns; i++)  sprintf(com.spname[i], "S%d", i + 1);
187 
188             if (BD) {
189                printf("\nbirth rate, death rate, sampling fraction, and ");
190                printf("mutation rate (tree height)?\n");
191                scanf("%lf%lf%lf%lf", &birth, &death, &sample, &mut);
192             }
193             for (i = 0; i < ntree; i++) {
194                RandomLHistory(rooted, space);
195                if (BD)
196                   BranchLengthBD(1, birth, death, sample, mut);
197                if (com.ns < 20 && ntree < 10) { OutTreeN(F0, 0, BD); puts("\n"); }
198                OutTreeN(fout, 1, BD);  FPN(fout);
199             }
200             /*
201             for (i=0; i<com.ns-2-!rooted; i++)
202                Ib[i] = (int)((3.+i)*rndu());
203             MakeTreeIb (com.ns, Ib, rooted);
204             */
205             break;
206          case(3):
207          case(4):
208             ListTrees(fout, com.ns, rooted);
209             break;
210          case(8):  TreeDistances(fout);  break;
211          case(9):
212             printf("tree file names? ");
213             scanf("%s%s", treefile, mastertreefile);
214             break;
215          case(10): between_f_and_x();    break;
216          case(11): LabelClades(fout);    break;
217          default:  exit(0);
218          }
219       }
220    }
221 
222    if (option >= 5 && option <= 7) {
223       com.seqtype = option - 5;  /* 0, 1, 2 for bases, codons, & amino acids */
224       Simulate(MCctlf ? MCctlf : MCctlf0[option - 5]);
225    }
226    else if (option == 9) {
227 
228       CladeSupport(fout, treefile, 1, mastertreefile, pick1tree);
229       /* CladeMrBayesProbabilities("\data\Lakner2008SB/M1510YY03.nex.t"); */
230    }
231    return(0);
232 }
233 
234 
between_f_and_x(void)235 int between_f_and_x(void)
236 {
237    /* this helps with the exponential transform for frequency parameters */
238    int i, n, fromf = 0;
239    double x[100];
240 
241    for (;;) {
242       printf("\ndirection (0:x=>f; 1:f=>x; -1:end)  &  #classes? ");
243       scanf("%d", &fromf);
244       if (fromf == -1) return(0);
245       scanf("%d", &n);  if (n > 100) error2("too many classes");
246       printf("input the first %d values for %s? ", n - 1, (fromf ? "f" : "x"));
247       FOR(i, n - 1) scanf("%lf", &x[i]);
248       x[n - 1] = (fromf ? 1 - sum(x, n - 1) : 0);
249       f_and_x(x, x, n, fromf, 1);
250       matout(F0, x, 1, n);
251    }
252 }
253 
254 
LabelClades(FILE * fout)255 void LabelClades(FILE *fout)
256 {
257 /* This reads in a tree and scan species names to check whether they form a
258    paraphyletic group and then label the clade.
259    It assumes that the tree is unrooted, and so goes through two rounds to check
260    whether the remaining seqs form a monophyletic clade.
261 */
262    FILE *ftree;
263    int unrooted = 1, iclade, sizeclade, mrca, paraphyl, is, imrca, i, j, k, lasts, haslength;
264    char key[96] = "A", treef[64] = "/A/F/flu/HA.all.prankcodon.tre", *p, chosen[NS], *endstr = "end";
265    int *anc[NS - 1], loc, bitmask, SI = sizeof(int) * 8;
266    int debug;
267 
268    printf("Tree file name? ");
269    scanf("%s", treef);
270    printf("Treat tree as unrooted (0 no, 1 yes)? ");
271    scanf("%d", &unrooted);
272 
273    ftree = gfopen(treef, "r");
274    fscanf(ftree, "%d%d", &com.ns, &j);
275    if (com.ns <= 0) error2("need ns in tree file");
276    debug = (com.ns < 20);
277 
278    i = (com.ns * 2 - 1) * sizeof(struct TREEN);
279    if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
280    for (i = 0; i < com.ns * 2 - 1; i++)  nodes[i].annotation = NULL;
281    for (i = 0; i < com.ns - 1; i++) {
282       anc[i] = (int*)malloc((com.ns / SI + 1) * sizeof(int));
283       if (anc[i] == NULL)  error2("oom");
284    }
285    ReadTreeN(ftree, &haslength, 1, 0);
286    fclose(ftree);
287    if (debug) { OutTreeN(F0, 1, PrNodeNum);  FPN(F0); }
288 
289    for (iclade = 0; iclade < com.ns - 1; iclade++) {
290       printf("\nString for selecting sequences (followed by non-digit) (end to end)? ");
291       scanf("%s", key);
292       if (strcmp(endstr, key) == 0)
293          break;
294       for (i = 0; i < com.ns; i++)
295          chosen[i] = '\0';
296 
297       k = strlen(key);
298       for (i = 0; i < com.ns; i++) {
299          if ((p = strstr(com.spname[i], key))
300             && !isdigit(p[k]))
301             chosen[i] = 1;
302       }
303 
304       /*
305       for(i=0; i<com.ns; i++)
306          if(strstr(com.spname[i], key)) chosen[i] = 1;
307       */
308 
309       /* look for MRCA, going through two rounds, assuming unrooted tree */
310       for (imrca = 0; imrca < 1 + unrooted; imrca++) {
311          if (imrca)
312             for (i = 0; i < com.ns; i++) chosen[i] = 1 - chosen[i];
313 
314          for (i = 0, sizeclade = 0; i < com.ns; i++)
315             if (chosen[i]) {
316                sizeclade++;
317                lasts = i;
318             }
319 
320          if (sizeclade <= 1 || sizeclade >= com.ns - 1) {
321             puts("unable to form a clade.  <2 seqs.");
322             break;
323          }
324          for (i = 0; i < com.ns - 1; i++) for (j = 0; j < com.ns / SI + 1; j++)
325             anc[i][j] = 0;
326          for (is = 0; is < com.ns; is++) {
327             if (chosen[is] == 0) continue;
328             loc = is / SI;  bitmask = 1 << (is%SI);
329             for (j = nodes[is].father; j != -1; j = nodes[j].father) {
330                anc[j - com.ns][loc] |= bitmask;
331                if (is == lasts) {
332                   for (i = 0, k = 0; i < com.ns; i++)
333                      if (anc[j - com.ns][i / SI] & (1 << (i%SI)))
334                         k++;
335                   if (k == sizeclade) {
336                      mrca = j;  break;
337                   }
338                }
339             }
340          }
341          if (imrca == 0 && mrca != tree.root) /* 1st round is enough */
342             break;
343       }
344 
345       if (sizeclade <= 1 || sizeclade >= com.ns - 1 || mrca == tree.root) {
346          printf("Unable to label.  Ignored.");
347          continue;
348       }
349 
350       if (debug)
351          for (is = 0; is < com.ns - 1; is++) {
352             printf("\nnode %4d: ", is + com.ns);
353             for (j = 0; j < com.ns; j++) {
354                loc = j / SI;  bitmask = 1 << (j%SI);
355                printf(" %d", (anc[is][loc] & bitmask) != 0);
356             }
357          }
358 
359       printf("\nClade #%d (%s): %d seqs selected, MRCA is %d\n", iclade + 1, key, sizeclade, mrca + 1);
360       for (is = 0, paraphyl = 0; is < com.ns; is++) {
361          if (chosen[is] == 0)
362             for (j = nodes[is].father; j != -1; j = nodes[j].father)
363                if (j == mrca) { paraphyl++;  break; }
364       }
365       if (paraphyl)
366          printf("\nThis clade is paraphyletic, & includes %d other sequences\n", paraphyl);
367 
368       nodes[mrca].label = iclade + 1;
369       if (debug) OutTreeN(F0, 1, haslength | PrLabel);
370    }
371 
372    for (i = 0; i < com.ns - 1; i++)  free(anc[i]);
373    OutTreeN(fout, 1, haslength | PrLabel);  FPN(fout);
374    printf("Printed final tree with labels in evolver.out\n");
375    exit(0);
376 }
377 
TreeDistanceDistribution(FILE * fout)378 void TreeDistanceDistribution(FILE* fout)
379 {
380    /* This calculates figure 3.7 of Yang (2006).
381       This reads the file of all trees (such as 7s.all.trees), and calculates the
382       distribution of partition distance in all pairwise comparisons.
383    */
384    int i, j, ntree, k, *nib, nsame, IBsame[NS], lpart = 0;
385    char treef[64] = "5s.all.trees", *partition;
386    FILE *ftree;
387    double mPD[NS], PD1[NS];  /* distribution of partition distances */
388 
389    puts("Tree file name?");
390    scanf("%s", treef);
391 
392    ftree = gfopen(treef, "r");
393    fscanf(ftree, "%d%d", &com.ns, &ntree);
394    printf("%2d sequences %2d trees.\n", com.ns, ntree);
395    i = (com.ns * 2 - 1) * sizeof(struct TREEN);
396    if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
397 
398    lpart = (com.ns - 1)*com.ns * sizeof(char);
399    i = ntree*lpart;
400    printf("\n%d bytes of space requested.\n", i);
401    partition = (char*)malloc(i);
402    nib = (int*)malloc(ntree * sizeof(int));
403    if (partition == NULL || nib == NULL) error2("out of memory");
404 
405    puts("\ntree #: mean prop of tree pairs with 0 1 2 ... shared bipartitions\n");
406    fputs("\ntree #: prop of tree pairs with 0 1 2 ... shared bipartitions\n", fout);
407    for (i = 0; i < ntree; i++) {
408       ReadTreeN(ftree, &j, 0, 1);
409       nib[i] = tree.nbranch - com.ns;
410       Tree2Partition(partition + i*lpart);
411    }
412    for (k = 0; k < com.ns - 3; k++) mPD[k] = 0;
413    for (i = 0; i < ntree; i++, FPN(fout)) {
414       for (k = 0; k < com.ns - 3; k++) PD1[k] = 0;
415       for (j = 0; j < ntree; j++) {
416          if (j == i) continue;
417          nsame = NSameBranch(partition + i*lpart, partition + j*lpart, nib[i], nib[j], IBsame);
418          PD1[nsame] ++;
419       }
420       for (k = 0; k < com.ns - 3; k++) PD1[k] /= (ntree - 1.);
421       for (k = 0; k < com.ns - 3; k++) mPD[k] = (mPD[k] * i + PD1[k]) / (i + 1.);
422       printf("%8d (%5.1f%%):", i + 1, (i + 1.) / ntree * 100);
423       for (k = 0; k < com.ns - 3; k++) printf(" %7.4f", mPD[k]);
424       fprintf(fout, "%8d:", i + 1);  for (k = 0; k < com.ns - 3; k++) fprintf(fout, " %7.4f", PD1[k]);
425       printf("%s", (com.ns < 8 || (i + 1) % 100 == 0 ? "\n" : "\r"));
426    }
427    free(partition); free(nodes); free(nib); fclose(ftree);
428    exit(0);
429 }
430 
431 
TreeDistances(FILE * fout)432 void TreeDistances(FILE* fout)
433 {
434    /* I think this is broken after i changed the routine Tree2Partition().
435    */
436    int i, j, ntree, k, *nib, parti2B[NS], nsame, IBsame[NS], nIBsame[NS], lpart = 0;
437    char treef[64] = "5s.all.trees", *partition;
438    FILE *ftree;
439    double psame, mp, vp;
440 
441    /*
442    TreeDistanceDistribution(fout);
443    */
444 
445    puts("\nNumber of identical bi-partitions between trees.\nTree file name?");
446    scanf("%s", treef);
447 
448    ftree = gfopen(treef, "r");
449    fscanf(ftree, "%d%d", &com.ns, &ntree);
450    printf("%2d sequences %2d trees.\n", com.ns, ntree);
451    i = (com.ns * 2 - 1) * sizeof(struct TREEN);
452    if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
453 
454    if (ntree < 2) error2("ntree");
455    printf("\n%d species, %d trees\n", com.ns, ntree);
456    puts("\n\t1: first vs. rest?\n\t2: all pairwise comparisons?\n");
457    k = 2;
458    scanf("%d", &k);
459 
460    lpart = (com.ns - 1)*com.ns * sizeof(char);
461    i = (k == 1 ? 2 : ntree)*lpart;
462    printf("\n%d bytes of space requested.\n", i);
463    partition = (char*)malloc(i);
464    nib = (int*)malloc(ntree * sizeof(int));
465    if (partition == NULL || nib == NULL) error2("out of memory");
466 
467    if (k == 2) {    /* pairwise comparisons */
468       fputs("Number of identical bi-partitions in pairwise comparisons\n", fout);
469       for (i = 0; i < ntree; i++) {
470          ReadTreeN(ftree, &j, 0, 1);
471          nib[i] = tree.nbranch - com.ns;
472          Tree2Partition(partition + i*lpart);
473       }
474       for (i = 0; i < ntree; i++, FPN(F0), FPN(fout)) {
475          printf("%2d (%2d):", i + 1, nib[i]);
476          fprintf(fout, "%2d (%2d):", i + 1, nib[i]);
477          for (j = 0; j < i; j++) {
478             nsame = NSameBranch(partition + i*lpart, partition + j*lpart, nib[i], nib[j], IBsame);
479             printf(" %2d", nsame);
480             fprintf(fout, " %2d", nsame);
481          }
482       }
483    }
484    else {  /* first vs. others */
485       ReadTreeN(ftree, &j, 0, 1);
486       nib[0] = tree.nbranch - com.ns;
487       if (nib[0] == 0) error2("1st tree is a star tree..");
488       Tree2Partition(partition);
489       fputs("Comparing the first tree with the others\nFirst tree:\n", fout);
490       OutTreeN(fout, 0, 0);  FPN(fout);  OutTreeB(fout);  FPN(fout);
491       fputs("\nInternal branches in the first tree:\n", fout);
492       FOR(i, nib[0]) {
493          k = parti2B[i];
494          fprintf(fout, "%3d (%2d..%-2d): ( ",
495             i + 1, tree.branches[k][0] + 1, tree.branches[k][1] + 1);
496          FOR(j, com.ns) if (partition[i*com.ns + j]) fprintf(fout, "%d ", j + 1);
497          fputs(")\n", fout);
498       }
499       if (nodes[tree.root].nson <= 2)
500          fputs("\nRooted tree, results may not be correct.\n", fout);
501       fputs("\nCorrect internal branches compared with the 1st tree:\n", fout);
502       FOR(k, nib[0]) nIBsame[k] = 0;
503       for (i = 1, mp = vp = 0; i < ntree; i++, FPN(fout)) {
504          ReadTreeN(ftree, &j, 0, 1);
505          nib[1] = tree.nbranch - com.ns;
506          Tree2Partition(partition + lpart);
507          nsame = NSameBranch(partition, partition + lpart, nib[0], nib[1], IBsame);
508 
509          psame = nsame / (double)nib[0];
510          FOR(k, nib[0]) nIBsame[k] += IBsame[k];
511          fprintf(fout, "1 vs. %3d: %4d: ", i + 1, nsame);
512          FOR(k, nib[0]) if (IBsame[k]) fprintf(fout, " %2d", k + 1);
513          printf("1 vs. %5d: %6d/%d  %10.4f\n", i + 1, nsame, nib[0], psame);
514          vp += square(psame - mp)*(i - 1.) / i;
515          mp = (mp*(i - 1.) + psame) / i;
516       }
517       vp = (ntree <= 2 ? 0 : sqrt(vp / ((ntree - 1 - 1)*(ntree - 1.))));
518       fprintf(fout, "\nmean and S.E. of proportion of identical partitions\n");
519       fprintf(fout, "between the 1st and all the other %d trees ", ntree - 1);
520       fprintf(fout, "(ignore these if not revelant):\n %.4f +- %.4f\n", mp, vp);
521       fprintf(fout, "\nNumbers of times, out of %d, ", ntree - 1);
522       fprintf(fout, "interior branches of tree 1 are present");
523       fputs("\n(This may be bootstrap support for nodes in tree 1)\n", fout);
524       FOR(k, nib[0]) {
525          i = tree.branches[parti2B[k]][0] + 1;  j = tree.branches[parti2B[k]][1] + 1;
526          fprintf(fout, "%3d (%2d..%-2d): %6d (%5.1f%%)\n",
527             k + 1, i, j, nIBsame[k], nIBsame[k] * 100. / (ntree - 1.));
528       }
529    }
530    free(partition);  free(nodes); free(nib);  fclose(ftree);
531 }
532 
533 
EigenQbase(double rates[],double pi[],double Root[],double U[],double V[],double Q[])534 int EigenQbase(double rates[], double pi[], double Root[], double U[], double V[], double Q[])
535 {
536    /* Construct the rate matrix Q[] for nucleotide model REV.
537    */
538    int i, j, k;
539    double mr, space[4];
540 
541    zero(Q, 16);
542    for (i = 0, k = 0; i < 3; i++) for (j = i + 1; j < 4; j++)
543       if (i * 4 + j != 11) Q[i * 4 + j] = Q[j * 4 + i] = rates[k++];
544    for (i = 0, Q[3 * 4 + 2] = Q[2 * 4 + 3] = 1; i < 4; i++) FOR(j, 4) Q[i * 4 + j] *= pi[j];
545    for (i = 0, mr = 0; i < 4; i++)
546    {
547       Q[i * 4 + i] = 0; Q[i * 4 + i] = -sum(Q + i * 4, 4); mr -= pi[i] * Q[i * 4 + i];
548    }
549    abyx(1 / mr, Q, 16);
550 
551    eigenQREV(Q, com.pi, 4, Root, U, V, space);
552    return (0);
553 }
554 
555 
556 static double freqK_NS = -1;
557 
EigenQcodon(int getstats,double kappa,double omega,double pi[],double Root[],double U[],double V[],double Q[])558 int EigenQcodon(int getstats, double kappa, double omega, double pi[],
559    double Root[], double U[], double V[], double Q[])
560 {
561    /* Construct the rate matrix Q[].
562       64 codons are used, and stop codons have 0 freqs.
563    */
564    int n = com.ncode, i, j, k, c[2], ndiff, pos = 0, from[3], to[3];
565    double mr, space[64];
566 
567    for (i = 0; i < n*n; i++) Q[i] = 0;
568    for (i = 0; i < n; i++) FOR(j, i) {
569       from[0] = i / 16; from[1] = (i / 4) % 4; from[2] = i % 4;
570       to[0] = j / 16;   to[1] = (j / 4) % 4;   to[2] = j % 4;
571       c[0] = GeneticCode[com.icode][i];   c[1] = GeneticCode[com.icode][j];
572       if (c[0] == -1 || c[1] == -1)  continue;
573       for (k = 0, ndiff = 0; k < 3; k++)  if (from[k] != to[k]) { ndiff++; pos = k; }
574       if (ndiff != 1)  continue;
575       Q[i*n + j] = 1;
576       if ((from[pos] + to[pos] - 1)*(from[pos] + to[pos] - 5) == 0)  Q[i*n + j] *= kappa;
577       if (c[0] != c[1])  Q[i*n + j] *= omega;
578       Q[j*n + i] = Q[i*n + j];
579    }
580    for (i = 0; i < n; i++) for (j = 0; j < n; j++)
581       Q[i*n + j] *= com.pi[j];
582    for (i = 0, mr = 0; i < n; i++) {
583       Q[i*n + i] = -sum(Q + i*n, n);
584       mr -= pi[i] * Q[i*n + i];
585    }
586 
587    if (getstats)
588       Qfactor += freqK_NS * mr;
589    else {
590       if (com.ncatG == 0) FOR(i, n*n) Q[i] *= 1 / mr;
591       else             FOR(i, n*n) Q[i] *= Qfactor;  /* NSsites models */
592       eigenQREV(Q, com.pi, n, Root, U, V, space);
593    }
594    return (0);
595 }
596 
597 
598 
EigenQaa(double pi[],double Root[],double U[],double V[],double Q[])599 int EigenQaa(double pi[], double Root[], double U[], double V[], double Q[])
600 {
601    /* Construct the rate matrix Q[]
602    */
603    int n = 20, i, j;
604    double mr, space[20];
605 
606    FOR(i, n*n) Q[i] = 0;
607    switch (com.model) {
608    case (Poisson): case (EqualInput):
609       fillxc(Q, 1., n*n);  break;
610    case (Empirical): case (Empirical_F):
611       FOR(i, n) FOR(j, i) Q[i*n + j] = Q[j*n + i] = com.daa[i*n + j] / 100;
612       break;
613    }
614    FOR(i, n) FOR(j, n) Q[i*n + j] *= com.pi[j];
615    for (i = 0, mr = 0; i < n; i++) {
616       Q[i*n + i] = 0; Q[i*n + i] = -sum(Q + i*n, n);  mr -= com.pi[i] * Q[i*n + i];
617    }
618 
619    eigenQREV(Q, com.pi, n, Root, U, V, space);
620    FOR(i, n)  Root[i] = Root[i] / mr;
621 
622    return (0);
623 }
624 
625 
GetDaa(FILE * fout,double daa[])626 int GetDaa(FILE* fout, double daa[])
627 {
628    /* Get the amino acid substitution rate matrix (grantham, dayhoff, jones, etc).
629    */
630    FILE * fdaa;
631    char aa3[4] = "";
632    int i, j, n = 20;
633 
634    fdaa = gfopen(com.daafile, "r");
635    printf("\nReading rate matrix from %s\n", com.daafile);
636 
637    for (i = 0; i < n; i++)  for (j = 0, daa[i*n + i] = 0; j < i; j++) {
638       fscanf(fdaa, "%lf", &daa[i*n + j]);
639       daa[j*n + i] = daa[i*n + j];
640    }
641    if (com.model == Empirical) {
642       FOR(i, n) if (fscanf(fdaa, "%lf", &com.pi[i]) != 1) error2("err aaRatefile");
643       if (fabs(1 - sum(com.pi, 20)) > 1e-4) error2("\nSum of aa freq. != 1\n");
644    }
645    fclose(fdaa);
646 
647    if (fout) {
648       fprintf(fout, "\n%s\n", com.daafile);
649       FOR(i, n) {
650          fprintf(fout, "\n%4s", getAAstr(aa3, i));
651          FOR(j, i)  fprintf(fout, "%5.0f", daa[i*n + j]);
652       }
653       FPN(fout);
654    }
655 
656    return (0);
657 }
658 
659 
660 
661 
MakeSeq(char * z,int ls)662 void MakeSeq(char*z, int ls)
663 {
664    /* generate a random sequence of nucleotides, codons, or amino acids by
665       sampling com.pi[], or read the ancestral sequence from the file RootSeq.txt
666       if the file exists.
667    */
668    int i, j, h, n = com.ncode, ch, n31 = (com.seqtype == 1 ? 3 : 1), lst;
669    double p[64], r, small = 5e-5;
670    unsigned char *pch = (com.seqtype == 2 ? AAs : BASEs);
671    unsigned char rootseqf[] = "RootSeq.txt", codon[4] = "   ";
672    FILE *frootseq = (FILE*)fopen(rootseqf, "r");
673    static int times = 0;
674 
675    if (frootseq) {
676       if (times++ == 0) printf("Reading sequence at the root from file.\n\n");
677       if (com.siterates && com.ncatG > 1)
678          error2("sequence for root doesn't work for site-class models");
679 
680       for (lst = 0; ; ) {
681          for (i = 0; i < n31; i++) {
682             while ((ch = fgetc(frootseq)) != EOF && !isalpha(ch));
683             if (ch == EOF) error2("EOF when reading root sequence.");
684             if (isalpha(ch))
685                codon[i] = (unsigned char)(ch = CodeChara((char)ch, com.seqtype));
686          }
687          if (com.seqtype == 1) ch = codon[0] * 16 + codon[1] * 4 + codon[2];
688          if (ch<0 || ch>n - 1)
689             printf("error when reading site %d\n", lst + 1);
690          if (com.seqtype == 1 && com.pi[ch] == 0)
691             printf("you seem to have a stop codon in the root sequence\n");
692 
693          z[lst++] = (unsigned char)ch;
694          if (lst == com.ls) break;
695       }
696       fclose(frootseq);
697    }
698    else {
699       for (j = 0; j < n; j++)  p[j] = com.pi[j];
700       for (j = 1; j < n; j++)  p[j] += p[j - 1];
701       if (fabs(p[n - 1] - 1) > small) {
702          printf("\nsum pi = %.6f != 1!\n", p[n - 1]);
703          exit(-1);
704       }
705       for (h = 0; h < com.ls; h++) {
706          for (j = 0, r = rndu(); j < n - 1; j++)
707             if (r < p[j]) break;
708          z[h] = (unsigned char)j;
709       }
710    }
711 }
712 
713 
714 
Evolve(int inode)715 void Evolve(int inode)
716 {
717    /* evolve sequence com.z[tree.root] along the tree to generate com.z[],
718       using nodes[].branch, nodes[].omega, & com.model
719       Needs com.z[0,1,...,nnode-1], while com.z[0] -- com.z[ns-1] constitute
720       the data.
721       For codon sequences, com.siterates[] has w's for NSsites and NSbranchsite models.
722    */
723    int is, h, i, j, ison, from, n = com.ncode, longseq = 100000;
724    double t, rw;
725 
726    for (is = 0; is < nodes[inode].nson; is++) {
727       ison = nodes[inode].sons[is];
728       memcpy(com.z[ison], com.z[inode], com.ls * sizeof(unsigned char));
729       t = nodes[ison].branch;
730 
731       if (com.seqtype == 1 && com.model && com.NSsites) { /* branch-site models */
732          Qfactor = com.QfactorBS[ison];
733          for (h = 0; h < com.ls; h++)
734             com.siterates[h] = com.omegaBS[ison*com.ncatG + com.siteID[h]];
735       }
736 
737       for (h = 0; h < com.ls; h++) {
738          /* decide whether to recalcualte PMat[]. */
739          if (h == 0 || (com.siterates && com.siterates[h] != com.siterates[h - 1])) {
740             rw = (com.siterates ? com.siterates[h] : 1);
741 
742             switch (com.seqtype) {
743             case (BASEseq):
744                if (com.model <= TN93)
745                   PMatTN93(PMat, t*Qfactor*rw*Qrates[0], t*Qfactor*rw*Qrates[1], t*Qfactor*rw, com.pi);
746                else if (com.model == REV)
747                   PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
748                break;
749 
750             case (CODONseq): /* Watch out for NSsites model */
751                if (com.model || com.NSsites) { /* no need to update UVRoot if M0 */
752                   if (com.model && com.NSsites == 0) /* branch */
753                      rw = nodes[ison].omega;  /* should be equal to com.rK[nodes[].label] */
754 
755                   EigenQcodon(0, com.kappa, rw, com.pi, Root, U, V, PMat);
756                }
757                PMatUVRoot(PMat, t, com.ncode, U, V, Root);
758                break;
759 
760             case (AAseq):
761                PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
762                break;
763             }
764             for (i = 0; i < n; i++)
765                for (j = 1; j < n; j++)
766                   PMat[i*n + j] += PMat[i*n + j - 1];
767          }
768          for (j = 0, from = com.z[ison][h], rw = rndu(); j < n - 1; j++)
769             if (rw < PMat[from*n + j]) break;
770          com.z[ison][h] = j;
771       }
772 
773       if (com.ls > longseq) printf("\r   nodes %2d -> %2d, evolving . .   ", inode + 1, ison + 1);
774 
775       if (nodes[ison].nson) Evolve(ison);
776    }  /* for (is) */
777 
778    if (inode == tree.root && com.ls > longseq)  printf("\r%s", strc(50, ' '));
779 }
780 
781 
782 
Simulate(char * ctlf)783 void Simulate(char *ctlf)
784 {
785    /* simulate nr data sets of nucleotide, codon, or AA sequences.
786       ls: number of nucleotides, codons, or AAs in each sequence.
787       All 64 codons are used for codon sequences.
788       When com.alpha or com.ncatG>1, sites are randomized after sequences are
789       generated.
790       space[com.ls] is used to hold site marks.
791       format:  0: paml sites; 1: paml patterns; 2: paup nex; 3: paup JC69 format
792     */
793    char *ancf = "ancestral.txt", *siteIDf = "siterates.txt";
794    FILE *fin, *fseq, *ftree = NULL, *fanc = NULL, *fsiteID = NULL;
795    char *paupstart = "paupstart", *paupblock = "paupblock", *paupend = "paupend";
796    char line[32000];
797    int lline = 32000, i, j, k, ir, n, nr, fixtree = 1, sspace = 10000, rooted = 1;
798    int h = 0, format = 0, b[3] = { 0 }, nrate = 1, counts[NCATG];
799    int *siteorder = NULL;
800    char *tmpseq = NULL, *pc;
801    double birth = 0, death = 0, sample = 1, mut = 1, tlength, *space, *blengthBS;
802    double T, C, A, G, Y, R, Falias[NCATG];
803    int    Lalias[NCATG];
804 
805    noisy = 1;
806    printf("\nReading options from data file %s\n", ctlf);
807    com.ncode = n = (com.seqtype == 0 ? 4 : (com.seqtype == 1 ? 64 : 20));
808    fin = (FILE*)gfopen(ctlf, "r");
809    fscanf(fin, "%d", &format);
810    fgets(line, lline, fin);
811    printf("\nSimulated data will go into %s.\n", seqf[format]);
812    if (format == 2) printf("%s, %s, & %s will be appended if existent.\n", paupstart, paupblock, paupend);
813 
814    fscanf(fin, "%d", &i);
815    fgets(line, lline, fin);
816    SetSeed(i, 1);
817    fscanf(fin, "%d%d%d", &com.ns, &com.ls, &nr);
818    fgets(line, lline, fin);
819    i = (com.ns * 2 - 1) * sizeof(struct TREEN);
820    if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
821 
822    if (com.ns > NS) error2("too many seqs?");
823    printf("\n%d seqs, %d sites, %d replicate(s)\n", com.ns, com.ls, nr);
824    k = (com.ns*com.ls* (com.seqtype == CODONseq ? 4 : 1) *nr) / 1000 + 1;
825    printf("Seq file will be about %dK bytes.\n", k);
826    for (i = 0; i < com.ns; i++)          /* default spname */
827       sprintf(com.spname[i], "S%d", i + 1);
828 
829    if (fixtree) {
830       fscanf(fin, "%lf", &tlength);   fgets(line, lline, fin);
831       if (ReadTreeN(fin, &i, 1, 1))  /* might overwrite spname */
832          error2("err tree..");
833 
834       if (i == 0) error2("use : to specify branch lengths in tree");
835       for (i = 0, T = 0; i < tree.nnode; i++)
836          if (i != tree.root) T += nodes[i].branch;
837       if (tlength > 0) {
838          for (i = 0; i < tree.nnode; i++)
839             if (i != tree.root) nodes[i].branch *= tlength / T;
840       }
841       printf("tree length = %.3f\n", (tlength > 0 ? tlength : T));
842       if (com.ns < 100) {
843          printf("\nModel tree & branch lengths:\n");
844          OutTreeN(F0, 1, 1); FPN(F0);
845          OutTreeN(F0, 0, 1); FPN(F0);
846       }
847       if (com.seqtype == CODONseq && com.model && !com.NSsites) { /* branch model */
848          FOR(i, tree.nnode) nodes[i].omega = nodes[i].label;
849          FPN(F0);  OutTreeN(F0, 1, PrBranch | PrLabel);  FPN(F0);
850       }
851    }
852    else {   /* random trees, broken or need testing? */
853       printf("\nbirth rate, death rate, sampling fraction, mutation rate (tree height)?\n");
854       fscanf(fin, "%lf%lf%lf%lf", &birth, &death, &sample, &mut);
855       fgets(line, lline, fin);
856       printf("%9.4f %9.4f %9.4f %9.4f\n", birth, death, sample, mut);
857    }
858 
859    if (com.seqtype == BASEseq) {
860       fscanf(fin, "%d", &com.model);
861       fgets(line, lline, fin);
862       if (com.model<0 || com.model>REV) error2("model err");
863       if (com.model == T92) error2("T92: please use HKY85 with T=A and C=G.");
864 
865       printf("\nModel: %s\n", basemodels[com.model]);
866       if (com.model == REV)        nrate = 5;
867       else if (com.model == TN93)  nrate = 2;
868       FOR(i, nrate) fscanf(fin, "%lf", &Qrates[i]);
869       fgets(line, lline, fin);
870       if (nrate <= 2) FOR(i, nrate) printf("kappa %9.5f\n", Qrates[i]); FPN(F0);
871       if (nrate == 5) {
872          printf("a & b & c & d & e: ");
873          FOR(i, nrate) printf("%9.5f", Qrates[i]); FPN(F0);
874       }
875       if ((com.model == JC69 || com.model == F81) && Qrates[0] != 1)
876          error2("kappa should be 1 for this model");
877    }
878    else if (com.seqtype == CODONseq) {
879       for (i = 0; i < 64; i++)
880          getcodon(CODONs[i], i);
881       if (com.model == 0 && com.NSsites) {  /* site model */
882          fscanf(fin, "%d", &com.ncatG);   fgets(line, lline, fin);
883          if (com.ncatG > NCATG) error2("ncatG>NCATG");
884          FOR(i, com.ncatG) fscanf(fin, "%lf", &com.freqK[i]);  fgets(line, lline, fin);
885          FOR(i, com.ncatG) fscanf(fin, "%lf", &com.rK[i]);     fgets(line, lline, fin);
886          printf("\n\ndN/dS (w) for site classes (K=%d)", com.ncatG);
887          printf("\nf: ");  FOR(i, com.ncatG) printf("%9.5f", com.freqK[i]);
888          printf("\nw: ");  FOR(i, com.ncatG) printf("%9.5f", com.rK[i]);  FPN(F0);
889       }
890       else if (com.model && com.NSsites) {  /* branchsite model */
891          fscanf(fin, "%d", &com.ncatG);   fgets(line, lline, fin);
892          if (com.ncatG > min2(NCATG, 127)) error2("ncatG too large");
893          FOR(i, com.ncatG) fscanf(fin, "%lf", &com.freqK[i]);  fgets(line, lline, fin);
894          printf("\n%d site classes.\nFreqs: ", com.ncatG);
895          FOR(i, com.ncatG) printf("%9.5f", com.freqK[i]);
896 
897          if ((com.omegaBS = (double*)malloc((com.ncatG + 2)*tree.nnode * sizeof(double))) == NULL)
898             error2("oom");
899          com.QfactorBS = com.omegaBS + com.ncatG*tree.nnode;
900          blengthBS = com.QfactorBS + tree.nnode;
901 
902          for (i = 0; i < tree.nnode; i++)
903             blengthBS[i] = nodes[i].branch;
904          for (k = 0; k < com.ncatG; k++) {
905             ReadTreeN(fin, &i, 0, 1);
906             if (i) error2("do not include branch lengths except in the first tree.");
907             /* if (!j) error2("Use # to specify omega's for branches"); */
908             for (i = 0; i < tree.nnode; i++)  com.omegaBS[i*com.ncatG + k] = nodes[i].label;
909          }
910          for (i = 0; i < tree.nnode; i++)
911          {
912             nodes[i].branch = blengthBS[i];  nodes[i].label = nodes[i].omega = 0;
913          }
914          for (i = 0; i < tree.nnode; i++) {  /* print out omega as node labels. */
915             nodes[i].annotation = pc = (char*)malloc(20 * com.ncatG * sizeof(char));
916             sprintf(pc, "'[%.2f", com.omegaBS[i*com.ncatG + 0]);
917             for (k = 1, pc += strlen(pc); k < com.ncatG; k++, pc += strlen(pc))
918                sprintf(pc, ", %.2f", com.omegaBS[i*com.ncatG + k]);
919             sprintf(pc, "]'");
920          }
921          FPN(F0);  OutTreeN(F0, 1, PrBranch | PrLabel);  FPN(F0);
922       }
923       else if (com.model == 0) {  /* M0 */
924          fscanf(fin, "%lf", &com.omega);
925          fgets(line, lline, fin);
926          printf("omega = %9.5f\n", com.omega);
927          for (i = 0; i < tree.nbranch; i++)
928             nodes[tree.branches[i][1]].omega = com.omega;
929       }
930 
931       fscanf(fin, "%lf", &com.kappa);   fgets(line, lline, fin);
932       printf("kappa = %9.5f\n", com.kappa);
933    }
934 
935    if (com.seqtype == BASEseq || com.seqtype == AAseq) {
936       fscanf(fin, "%lf%d", &com.alpha, &com.ncatG);
937       fgets(line, lline, fin);
938       if (com.alpha)
939          printf("Gamma rates, alpha =%.4f (K=%d)\n", com.alpha, com.ncatG);
940       else {
941          com.ncatG = 0;
942          puts("Rates are constant over sites.");
943       }
944    }
945    if (com.alpha || com.ncatG) { /* this is used for codon NSsites as well. */
946       k = com.ls;
947       if (com.seqtype == 1 && com.model && com.NSsites) k *= tree.nnode;
948       if ((com.siterates = (double*)malloc(k * sizeof(double))) == NULL) error2("oom1");
949       if ((siteorder = (int*)malloc(com.ls * sizeof(int))) == NULL) error2("oom2");
950    }
951 
952    if (com.seqtype == AAseq) { /* get aa substitution model and rate matrix */
953       fscanf(fin, "%d", &com.model);
954       printf("\nmodel: %s", aamodels[com.model]);
955       if (com.model >= 2) { fscanf(fin, "%s", com.daafile); GetDaa(NULL, com.daa); }
956       fgets(line, lline, fin);
957    }
958 
959    /* get freqs com.pi[] */
960    if ((com.seqtype == BASEseq && com.model > K80) ||
961       com.seqtype == CODONseq ||
962       (com.seqtype == AAseq && (com.model == 1 || com.model == 3)))
963       for (k = 0; k < com.ncode; k++) fscanf(fin, "%lf", &com.pi[k]);
964    else if (com.model == 0 || (com.seqtype == BASEseq && com.model <= K80))
965       fillxc(com.pi, 1. / com.ncode, com.ncode);
966 
967    printf("sum pi = 1 = %.6f:", sum(com.pi, com.ncode));
968    matout2(F0, com.pi, com.ncode / 4, 4, 9, 6);
969    if (com.seqtype == CODONseq) {
970       fscanf(fin, "%d", &com.icode);   fgets(line, lline, fin);
971       printf("genetic code = %d\n", com.icode);
972       for (k = 0; k < com.ncode; k++)
973          if (GeneticCode[com.icode][k] == -1 && com.pi[k])
974             error2("stop codons should have frequency 0?");
975    }
976 
977    if (com.seqtype == BASEseq) {
978       if (com.model < REV) {
979          T = com.pi[0]; C = com.pi[1]; A = com.pi[2]; G = com.pi[3]; Y = T + C; R = A + G;
980          if (com.model == F84) {
981             Qrates[1] = 1 + Qrates[0] / R;   /* kappa2 */
982             Qrates[0] = 1 + Qrates[0] / Y;   /* kappa1 */
983          }
984          else if (com.model <= HKY85) Qrates[1] = Qrates[0];
985          Qfactor = 1 / (2 * T*C*Qrates[0] + 2 * A*G*Qrates[1] + 2 * Y*R);
986       }
987       else
988          if (com.model == REV) EigenQbase(Qrates, com.pi, Root, U, V, PMat);
989    }
990 
991    /* get Qfactor for NSsites & NSbranchsite models */
992    if (com.seqtype == CODONseq && com.NSsites) {
993       if (!com.model) {  /* site models */
994          for (k = 0, Qfactor = 0; k < com.ncatG; k++) {
995             freqK_NS = com.freqK[k];
996             EigenQcodon(1, com.kappa, com.rK[k], com.pi, NULL, NULL, NULL, PMat);
997          }
998          Qfactor = 1 / Qfactor;
999          printf("Qfactor for NSsites model = %9.5f\n", Qfactor);
1000       }
1001       else {            /* branch-site models */
1002          for (i = 0; i < tree.nnode; i++) {
1003             if (i == tree.root) { com.QfactorBS[i] = -1; continue; }
1004             for (k = 0, Qfactor = 0; k < com.ncatG; k++) {
1005                freqK_NS = com.freqK[k];
1006                EigenQcodon(1, com.kappa, com.omegaBS[i*com.ncatG + k], com.pi, NULL, NULL, NULL, PMat);
1007             }
1008             com.QfactorBS[i] = 1 / Qfactor;  Qfactor = 0;
1009             printf("node %2d: Qfactor = %9.5f\n", i + 1, com.QfactorBS[i]);
1010          }
1011       }
1012    }
1013    if (com.seqtype == CODONseq && com.ncatG <= 1 && com.model == 0)
1014       EigenQcodon(0, com.kappa, com.omega, com.pi, Root, U, V, PMat);
1015    else if (com.seqtype == AAseq)
1016       EigenQaa(com.pi, Root, U, V, PMat);
1017 
1018    puts("\nAll parameters are read.  Ready to simulate\n");
1019    sspace = max2(sspace, 8000000);
1020    space = (double*)malloc(sspace);
1021    if (com.alpha || com.ncatG) tmpseq = (char*)space;
1022    if (space == NULL) {
1023       printf("oom for space, %d bytes needed.", sspace);
1024       exit(-1);
1025    }
1026 
1027    fseq = gfopen(seqf[format], "w");
1028    if (format == 2 || format == 3) appendfile(fseq, paupstart);
1029 
1030    fanc = (FILE*)gfopen(ancf, "w");
1031    if (fixtree) {
1032       fputs("\nAncestral sequences generated during simulation ", fanc);
1033       fprintf(fanc, "(check against %s)\n", seqf[format]);
1034       OutTreeN(fanc, 0, 0); FPN(fanc); OutTreeB(fanc); FPN(fanc);
1035    }
1036    if (com.alpha || com.NSsites) {
1037       fsiteID = (FILE*)gfopen(siteIDf, "w");
1038       if (com.seqtype == 1) fprintf(fsiteID, "\nSite class IDs\n");
1039       else               fprintf(fsiteID, "\nRates for sites\n");
1040       if (com.seqtype == CODONseq && com.NSsites) {
1041          if (!com.model) matout(fsiteID, com.rK, 1, com.ncatG);
1042          if ((com.siteID = (char*)malloc(com.ls * sizeof(char))) == NULL)
1043             error2("oom siteID");
1044       }
1045    }
1046 
1047    for (ir = 0; ir < nr; ir++) {
1048       for (j = 0; j < com.ns * 2 - 1; j++) {
1049          com.z[j] = (unsigned char*)realloc(com.z[j], com.ls * sizeof(unsigned char));
1050          if (com.z[j] == NULL) error2("memory problem for z[]");
1051       }
1052       if (!fixtree) {    /* right now tree is fixed */
1053          RandomLHistory(rooted, space);
1054          if (rooted && com.ns < 10) j = GetIofLHistory();
1055          BranchLengthBD(1, birth, death, sample, mut);
1056          if (com.ns < 20) {
1057             printf("\ntree used: ");
1058             OutTreeN(F0, 1, 1);
1059             FPN(F0);
1060          }
1061       }
1062       MakeSeq(com.z[tree.root], com.ls);
1063 
1064       if (com.alpha)
1065          Rates4Sites(com.siterates, com.alpha, com.ncatG, com.ls, 0, space);
1066       else if (com.seqtype == 1 && com.NSsites) { /* for NSsites */
1067          /* the table for the alias algorithm is the same, but ncatG is small. */
1068          MultiNomialAliasSetTable(com.ncatG, com.freqK, Falias, Lalias, space);
1069          MultiNomialAlias(com.ls, com.ncatG, Falias, Lalias, counts);
1070 
1071          for (i = 0, h = 0; i < com.ncatG; i++)
1072             for (j = 0; j < counts[i]; j++) {
1073                com.siteID[h] = (char)i;
1074                com.siterates[h++] = com.rK[i]; /* overwritten later for branchsite */
1075             }
1076       }
1077 
1078       Evolve(tree.root);
1079 
1080       /* randomize sites for site-class model */
1081       if (com.siterates && com.ncatG > 1) {
1082          if (format == 1 && ir == 0)
1083             puts("\nrequested site pattern counts as output for site-class model.\n");
1084          randorder(siteorder, com.ls, (int*)space);
1085          for (j = 0; j < tree.nnode; j++) {
1086             memcpy(tmpseq, com.z[j], com.ls * sizeof(char));
1087             for (h = 0; h < com.ls; h++) com.z[j][h] = tmpseq[siteorder[h]];
1088          }
1089          if (com.alpha || com.ncatG > 1) {
1090             memcpy(space, com.siterates, com.ls * sizeof(double));
1091             for (h = 0; h < com.ls; h++) com.siterates[h] = space[siteorder[h]];
1092          }
1093          if (com.siteID) {
1094             memcpy((char*)space, com.siteID, com.ls * sizeof(char));
1095             for (h = 0; h < com.ls; h++) com.siteID[h] = *((char*)space + siteorder[h]);
1096          }
1097       }
1098 
1099       /* print sequences*/
1100       if (format == 1 || format == 3) {
1101          for (i = 0; i < com.ns; i++) for (h = 0; h < com.ls; h++)    com.z[i][h] ++;  /* code as 1, 2, ... */
1102          PatternWeightSimple();
1103          for (i = 0; i < com.ns; i++) for (h = 0; h < com.npatt; h++) com.z[i][h] --;  /* code as 0, 1, ... */
1104          if (format == 3)
1105             PatternWeightJC69like();
1106       }
1107       if (format == 2 || format == 3) fprintf(fseq, "\n\n[Replicate # %d]\n", ir + 1);
1108       printSeqs(fseq, com.z, com.spname, com.ns, com.ls, com.npatt, com.fpatt, NULL, NULL, format); /* printsma not usable as it codes into 0,1,...,60. */
1109       if ((format == 2 || format == 3) && !fixtree) {
1110          fprintf(fseq, "\nbegin tree;\n   tree true_tree = [&U] ");
1111          OutTreeN(fseq, 1, 1); fputs(";\n", fseq);
1112          fprintf(fseq, "end;\n\n");
1113       }
1114       if (format == 2 || format == 3) appendfile(fseq, paupblock);
1115 
1116       /* print ancestral seqs, rates for sites. */
1117       if (format != 1 && format != 3) {  /* don't print ancestors if site patterns are printed. */
1118          j = (com.seqtype == CODONseq ? 3 * com.ls : com.ls);
1119          fprintf(fanc, "[replicate %d]\n", ir + 1);
1120 
1121          if (!fixtree) {
1122             if (format < 2)
1123             {
1124                OutTreeN(fanc, 1, 1); FPN(fanc); FPN(fanc);
1125             }
1126          }
1127          else {
1128             fprintf(fanc, "%6d %6d\n", tree.nnode - com.ns, j);
1129             for (j = com.ns; j < tree.nnode; j++, FPN(fanc)) {
1130                fprintf(fanc, "node%-26d  ", j + 1);
1131                print1seq(fanc, com.z[j], com.ls, NULL);
1132             }
1133             FPN(fanc);
1134 
1135             if (fsiteID) {
1136                if (com.seqtype == CODONseq && com.NSsites && com.model == 0) { /* site model */
1137                   k = 0;
1138                   if (com.rK[com.ncatG - 1] > 1)
1139                      FOR(h, com.ls) if (com.rK[com.siteID[h]] > 1) k++;
1140                   fprintf(fsiteID, "\n[replicate %d: %2d]\n", ir + 1, k);
1141                   if (k)  for (h = 0, k = 0; h < com.ls; h++) {
1142                      if (com.rK[com.siteID[h]] > 1) {
1143                         fprintf(fsiteID, "%4d ", h + 1);
1144                         if (++k % 15 == 0) FPN(fsiteID);
1145                      }
1146                   }
1147                   FPN(fsiteID);
1148                }
1149                else if (com.seqtype == CODONseq && com.NSsites && com.model) { /* branchsite */
1150                   fprintf(fsiteID, "\n[replicate %d]\n", ir + 1);
1151                   for (h = 0; h < com.ls; h++) {
1152                      fprintf(fsiteID, " %4d ", com.siteID[h] + 1);
1153                      if (h == com.ls - 1 || (h + 1) % 15 == 0) FPN(fsiteID);
1154                   }
1155                }
1156                else {       /* gamma rates */
1157                   fprintf(fsiteID, "\n[replicate %d]\n", ir + 1);
1158                   for (h = 0; h < com.ls; h++) {
1159                      fprintf(fsiteID, "%7.4f ", com.siterates[h]);
1160                      if (h == com.ls - 1 || (h + 1) % 10 == 0) FPN(fsiteID);
1161                   }
1162                }
1163             }
1164          }
1165       }
1166 
1167       printf("\rdid data set %d %s", ir + 1, (com.ls > 100000 || nr < 100 ? "\n" : ""));
1168    }   /* for (ir) */
1169    if (format == 2 || format == 3) appendfile(fseq, paupend);
1170 
1171    fclose(fseq);  if (!fixtree) fclose(fanc);
1172    if (com.alpha || com.NSsites) fclose(fsiteID);
1173    for (j = 0; j < com.ns * 2 - 1; j++) free(com.z[j]);
1174    free(space);
1175    if (com.model && com.NSsites) /* branch-site model */
1176       for (i = 0; i < tree.nnode; i++)  free(nodes[i].annotation);
1177    free(nodes);
1178    if (com.alpha || com.ncatG) {
1179       free(com.siterates);  com.siterates = NULL;
1180       free(siteorder);
1181       if (com.siteID) free(com.siteID);  com.siteID = NULL;
1182    }
1183    if (com.seqtype == 1 && com.model && com.NSsites) free(com.omegaBS);
1184    com.omegaBS = NULL;
1185 
1186    exit(0);
1187 }
1188 
1189 
GetSpnamesFromMB(FILE * fmb,char line[],int lline)1190 int GetSpnamesFromMB(FILE *fmb, char line[], int lline)
1191 {
1192    /* This reads species names from MrBayes output file fmb, like the following.
1193 
1194          Taxon  1 -> 1_Arabidopsis_thaliana
1195          Taxon  2 -> 2_Taxus_baccata
1196    */
1197    int j, ispecies;
1198    char *p = NULL, *mbstr1 = "Taxon ", *mbstr2 = "->";
1199 
1200    puts("Reading species names from mb output file.\n");
1201    rewind(fmb);
1202    for (ispecies = 0; ; ) {
1203       if (fgets(line, lline, fmb) == NULL) return(-1);
1204       if (strstr(line, mbstr1) && strstr(line, mbstr2)) {
1205          p = strstr(line, mbstr1) + 5;
1206          sscanf(p, "%d", &ispecies);
1207          p = strstr(line, mbstr2) + 3;
1208          if (com.spname[ispecies - 1][0])
1209             error2("species name already read?");
1210 
1211          for (j = 0; isgraph(*p) && j < lline; ) com.spname[ispecies - 1][j++] = *p++;
1212          com.spname[ispecies - 1][j] = 0;
1213 
1214          printf("\tTaxon %2d:  %s\n", ispecies, com.spname[ispecies - 1]);
1215       }
1216       else if (ispecies)
1217          break;
1218    }
1219    com.ns = ispecies;
1220    rewind(fmb);
1221 
1222    return(0);
1223 }
1224 
GrepLine(FILE * fin,char * query,char * line,int lline)1225 char *GrepLine(FILE *fin, char *query, char *line, int lline)
1226 {
1227 /* This greps infile to search for query[], and returns NULL or line[].
1228 */
1229    char *p = NULL;
1230 
1231    rewind(fin);
1232    for (; ; ) {
1233       if (fgets(line, lline, fin) == NULL) return(NULL);
1234       if (strstr(line, query)) return(line);
1235    }
1236    return(NULL);
1237 }
1238 
1239 
CladeMrBayesProbabilities(char treefile[])1240 void CladeMrBayesProbabilities(char treefile[])
1241 {
1242 /* This reads a tree from treefile and then scans a set of MrBayes output files
1243    (mbfiles) to retrieve posterior probabilities for every clade in that tree.
1244    It first scans the first mb output file to get the species names.
1245 
1246    Sample mb output:
1247    6 -- ...........................*************   8001 1.000 0.005 (0.000)
1248    7 -- ....................********************   8001 1.000 0.006 (0.000)
1249 
1250    Note 4 Jan 2014: This uses parti2B[], and is broken after i rewrote Tree2Partition().
1251 */
1252    int lline = 100000, i, j, k, nib, inode, parti2B[NS];
1253    char line[100000], *partition, *p;
1254    char symbol[2] = ".*", cladestr[NS + 1] = { 0 };
1255    FILE *ftree, *fmb[20];
1256    double *Pclade, t;
1257    int nmbfiles = 2;
1258    char *mbfiles[] = { "mb-1e-4.out", "mb-1e-1.out" };
1259 
1260    printf("tree file is %s\nmb output files:\n", treefile);
1261    ftree = gfopen(treefile, "r");
1262    for (k = 0; k < nmbfiles; k++)
1263       fmb[k] = gfopen(mbfiles[k], "r");
1264    for (k = 0; k < nmbfiles; k++) printf("\t%s\n", mbfiles[k]);
1265 
1266    GetSpnamesFromMB(fmb[0], line, lline);  /* read species names from mb output */
1267 
1268    fscanf(ftree, "%d%d", &i, &k);
1269    if (i && i != com.ns) error2("do you mean to specify ns in the tree file?");
1270    i = (com.ns * 2 - 1) * sizeof(struct TREEN);
1271    if ((nodes = (struct TREEN*)malloc(i)) == NULL) error2("oom");
1272    ReadTreeN(ftree, &i, 0, 1);
1273 
1274    FPN(F0);  OutTreeN(F0, 0, 0);  FPN(F0);  FPN(F0);
1275    nib = tree.nbranch - com.ns;
1276    for (i = 0; i < tree.nnode; i++) {
1277       nodes[i].annotation = NULL;
1278       if (i > com.ns) nodes[i].annotation = (char*)malloc(100 * sizeof(char));
1279    }
1280 
1281    partition = (char*)malloc(nib*com.ns * sizeof(char));
1282    if (partition == NULL) error2("oom");
1283    if ((Pclade = (double*)malloc(nib*nmbfiles * sizeof(double))) == NULL)
1284       error2("oom");
1285    for (i = 0; i < nib*nmbfiles; i++) Pclade[i] = 0;
1286 
1287    Tree2Partition(partition);
1288 
1289    for (i = 0; i < nib; i++) {
1290       inode = tree.branches[parti2B[i]][1];
1291       if (partition[i*com.ns + 0])
1292          for (j = 0; j < com.ns; j++) cladestr[j] = symbol[1 - partition[i*com.ns + j]];
1293       else
1294          for (j = 0; j < com.ns; j++) cladestr[j] = symbol[partition[i*com.ns + j]];
1295       printf("#%2d branch %2d node %2d  %s", i + 1, parti2B[i], inode, cladestr);
1296 
1297       for (k = 0; k < nmbfiles; k++) {
1298          if (GrepLine(fmb[k], cladestr, line, lline)) {
1299             p = strstr(line, cladestr);
1300             sscanf(p + com.ns, "%lf%lf", &t, &Pclade[i*nmbfiles + k]);
1301          }
1302       }
1303       for (k = 0; k < nmbfiles; k++) printf("%6.2f", Pclade[i*nmbfiles + k]);
1304       FPN(F0);
1305       for (k = 0, p = nodes[inode].annotation; k < nmbfiles; k++) {
1306          sprintf(p, "%3.0f%s", Pclade[i*nmbfiles + k] * 100, (k < nmbfiles - 1 ? "/" : ""));
1307          p += 4;
1308       }
1309    }
1310    FPN(F0);  OutTreeN(F0, 1, PrLabel);  FPN(F0);
1311 
1312    for (i = 0; i < tree.nnode; i++) free(nodes[i].annotation);
1313    free(nodes); free(partition);  free(Pclade);
1314    fclose(ftree);
1315    for (k = 0; k < nmbfiles; k++) fclose(fmb[k]);
1316    exit(0);
1317 }
1318