1 /* yn00.c
2    Pairwise estimation of dS and dN by the method of Yang & Nielsen
3    (2000 Mol. Biol. Evol. 17:32-43)
4 
5    Copyright, 1998, Ziheng Yang
6 
7                  cc -o yn00 -fast yn00.c tools.o -lm
8                  cl -O2 yn00.c tools.o
9                  yn00 <SequenceFileName>
10 
11   Codon sequences are encoded as 0,1,...,61, as in codeml.c.
12 */
13 #include "paml.h"
14 #define NS            1000
15 #define LSPNAME       30
16 #define NCODE         64
17 #define NGENE         2000
18 
19 int GetOptions (char *ctlf);
20 int EncodeSeqCodon(void);
21 int Statistics(FILE *fout, double space[]);
22 int DistanceMatLWL85 (FILE *fout);
23 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
24     double *SEdS, double *SEdN, double *t,double space[]);
25 int GetKappa (void);
26 int GetFreqs(int is1, int is2, double f3x4[], double pi[]);
27 int CountSites(char z[],double pi[],double*Stot,double*Ntot,
28     double fbS[],double fbN[]);
29 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[]);
30 int CountDiffs(char z1[],char z2[],
31                double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[]);
32 int DistanceF84(double n, double P, double Q, double pi[],
33 		          double*k_HKY, double*t, double*SEt);
34 double dsdnREV (int is, int js, double space[]);
35 
36 int ExpPattFreq(double t,double kappa,double omega,double pi[],double space[]);
37 int ConsistencyMC(void);
38 int InfiniteData(double t,double kappa,double omega,double f3x4_0[],
39     double space[]);
40 void SimulateData2s64(FILE* fout, double f3x4_0[], double space[]);
41 
42 struct common_info {
43    unsigned char *z[NS];
44    char *spname[NS], seqf[512],outf[512];
45    int ns,ls,npatt,codonf,icode,ncode,getSE,*pose,verbose, seqtype, readpattern;
46    int cleandata, fcommon,kcommon, weighting, ndata, print;
47    double *fpatt, pi[NCODE], f3x4s[NS][12], kappa, omega;
48    int ngene,posG[NGENE+1],lgene[NGENE],fix_rgene, model;
49    double rgene[NGENE],piG[NGENE][NCODE], alpha;
50 }  com;
51 
52 
53 int FROM61[64], FROM64[64], FourFold[4][4];
54 double PMat[NCODE*NCODE];
55 char *codonfreqs[]={"Fequal", "F1x4", "F3x4", "Fcodon"};
56 enum {Fequal, F1x4, F3x4, Fcodon} CodonFreqs;
57 
58 FILE *frst, *frst1, *frub;
59 extern char BASEs[], AAs[];
60 extern int noisy, GeneticCode[][64];
61 int Nsensecodon;
62 enum {NODEBUG, KAPPA, SITES, DIFF} DebugFunctions;
63 int debug=0;
64 
65 double omega_NG, dN_NG, dS_NG;  /* what are these for? */
66 
67 
68 #define YN00
69 #define REALSEQUENCE
70 #include "treesub.c"
71 
72 
main(int argc,char * argv[])73 int main(int argc, char *argv[])
74 {
75    char dsf[512]="2YN.dS", dnf[512]="2YN.dN", tf[512]="2YN.t";
76    FILE *fout, *fseq, *fds, *fdn, *ft;
77    char ctlf[96]="yn00.ctl", timestr[64];
78    int    n=com.ncode, is,js, j, idata, wname=20, sspace;
79    double t=0.4, dS=0.1,dN=0.1, S,N, SEdS, SEdN, f3x4[12], *space=NULL;
80 
81    /* ConsistencyMC(); */
82 
83    printf("YN00 in %s\n", pamlVerStr);
84    starttimer();
85    if (argc>1)  strcpy(ctlf, argv[1]);
86    com.seqtype=1;  com.cleandata=1;  /* works for clean data only? */
87    com.ndata=1;  com.print=0;
88    noisy=1; com.icode=0;  com.fcommon=0;  com.kcommon=1;
89    GetOptions(ctlf);
90    setmark_61_64 ();
91    fout = fopen (com.outf, "w");
92    frst = fopen("rst", "w");
93    frst1 = fopen("rst1", "w");
94    frub = fopen ("rub", "w");
95    if (fout==NULL || frst==NULL) error2("outfile creation err.");
96    fds = (FILE*)fopen(dsf, "w");
97    fdn = (FILE*)fopen(dnf, "w");
98    ft = (FILE*)fopen(tf, "w");
99    if(fds==NULL || fdn==NULL || ft==NULL) error2("file open error");
100 
101    if((fseq=fopen (com.seqf,"r"))==NULL) {
102       printf ("\n\nSequence file %s not found!\n", com.seqf);
103       exit(-1);
104    }
105    for (idata=0; idata<com.ndata; idata++) {
106       if (com.ndata>1) {
107          printf("\nData set %d\n", idata+1);
108          fprintf(fout, "\n\nData set %d\n", idata+1);
109          fprintf(frst, "\t%d", idata+1);
110       }
111 
112       ReadSeq((com.verbose?fout:NULL), fseq, com.cleandata, 0);
113       SetMapAmbiguity(com.seqtype, 0);
114 
115       sspace = max2(200000,64*com.ns*sizeof(double));
116       sspace = max2(sspace,64*64*5*sizeof(double));
117       if ((space=(double*)realloc(space,sspace))==NULL) error2("oom space");
118 
119       com.kappa = 4.6;
120       com.omega = 1;
121       fprintf(fout,"YN00 %15s", com.seqf);
122       Statistics(fout, space);
123 
124       if(noisy) printf("\n\n(A) Nei-Gojobori (1986) method\n");
125       fprintf(fout,"\n\n(A) Nei-Gojobori (1986) method\n");
126       DistanceMatNG86 (fout, NULL, NULL, NULL, 0);
127       fflush(fout);
128 
129       if(noisy) printf("\n\n(B) Yang & Nielsen (2000) method\n\n");
130       fprintf(fout,"\n\n(B) Yang & Nielsen (2000) method\n\n");
131       fprintf(fout,"Yang Z, Nielsen R (2000) Estimating synonymous and nonsynonymous substitution rates under realistic evolutionary models. Mol. Biol. Evol. 17:32-43\n");
132       if(!com.weighting) fputs("\n(equal weighting of pathways)\n",fout);
133 
134       if(com.fcommon)  GetFreqs(-1, -1, f3x4, com.pi);
135       if(com.kcommon) {
136          GetKappa();
137          printf("kappa = %.2f\n\n",com.kappa);
138          /* puts("kappa?"); scanf("%lf", &com.kappa); */
139       }
140 
141       fputs("\nseq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE\n\n",fout);
142       fprintf(fds,"%6d\n", com.ns);
143       fprintf(fdn,"%6d\n", com.ns);
144       fprintf(ft,"%6d\n", com.ns);
145       for(is=0; is<com.ns; is++) {
146          fprintf(fds,"%-*s ", wname,com.spname[is]);
147          fprintf(fdn,"%-*s ", wname,com.spname[is]);
148          fprintf(ft,"%-*s ", wname,com.spname[is]);
149          for(js=0; js<is; js++) {
150             if(noisy) printf("%3d vs. %3d\n", is+1, js+1);
151             fprintf(fout, " %3d  %3d ", is+1, js+1);
152 
153             if(!com.fcommon) GetFreqs(is, js, f3x4, com.pi);
154             if(!com.kcommon) GetKappa();
155             j = DistanceYN00(is, js, &S, &N, &dS,&dN, &SEdS, &SEdN, &t,space);
156 
157             fprintf(fout,"%7.1f %7.1f %8.4f %7.4f %7.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
158                S,N,t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
159             fprintf(frst," YN: %8.4f%8.4f%8.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
160                t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
161 
162             fprintf(fds," %7.4f",dS); fprintf(fdn," %7.4f",dN); fprintf(ft," %7.4f",t);
163          }    /* for (js) */
164          FPN(fds); FPN(fdn); FPN(ft);
165          fflush(fds); fflush(fdn); fflush(ft);
166       }       /* for (is) */
167       FPN(fds); FPN(fdn); FPN(ft);
168 
169       if(noisy) printf("\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
170       fprintf(fout,"\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
171       fprintf(fout,"Li W.-H., C.-I. Wu, Luo (1985) A new method for estimating synonymous and nonsynonymous rates of nucleotide substitutions considering the relative likelihood of nucleotide and codon changes. Mol. Biol. Evol. 2: 150-174.\n");
172       fprintf(fout,"Li W-H (1993) Unbiased estimation of the rates of synonymous and nonsynonymous substitution. J. Mol. Evol. 36:96-99\n");
173       fprintf(fout,"Pamilo P, Bianchi NO (1993) Evolution of the Zfx and Zfy genes - rates and interdependence between the genes. Mol. Biol. Evol. 10:271-281\n");
174       fprintf(fout,"Yang Z (2006) Computational Molecular Evolution. Oxford University Press, Oxford. Eqs. 2.12 & 2.13\n");
175 
176       DistanceMatLWL85(fout);
177 
178       fflush(frst);
179       if(noisy) printf("\nTime used: %s\n", printtime(timestr));
180    }
181    return (0);
182 }
183 
184 
185 
GetOptions(char * ctlf)186 int GetOptions (char *ctlf)
187 {
188    int i, nopt=9, lline=4096;
189    char line[4096], *pline, opt[20], comment='*';
190    char *optstr[]={"seqfile","outfile", "verbose", "noisy", "icode",
191         "weighting","commonkappa", "commonf3x4", "ndata"};
192    double t;
193    FILE *fctl;
194 
195    if((fctl=fopen(ctlf,"r"))==NULL) error2("\nctl file open error.\n");
196    printf ("\nReading options from %s..\n", ctlf);
197    for (;;) {
198       if (fgets (line, lline, fctl) == NULL) break;
199       for (i=0,t=0,pline=line; i<lline&&line[i]; i++)
200          if (isalnum(line[i]))  { t=1; break; }
201          else if (line[i]==comment) break;
202       if (t==0) continue;
203       sscanf (line, "%s%*s%lf", opt, &t);
204       if ((pline=strstr(line, "="))==NULL) error2("option file.");
205 
206       for (i=0; i<nopt; i++) {
207          if (strncmp(opt, optstr[i], 8)==0)  {
208             if (noisy>2)
209                printf ("\n%3d %15s | %-20s %6.2f", i+1,optstr[i],opt,t);
210             switch (i) {
211                case (0): sscanf(pline+2, "%s", com.seqf);    break;
212                case (1): sscanf(pline+2, "%s", com.outf);    break;
213                case (2): com.verbose=(int)t;     break;
214                case (3): noisy=(int)t;           break;
215                case (4): com.icode=(int)t;       break;
216                case (5): com.weighting=(int)t;   break;
217                case (6): com.kcommon=(int)t;     break;
218                case (7): com.fcommon=(int)t;     break;
219                case (8): com.ndata=(int)t;       break;
220             }
221             break;
222          }
223       }
224       if (i==nopt)
225          { printf ("\noption %s in %s\n", opt, ctlf);  exit (-1); }
226    }
227 
228    for (i=0,Nsensecodon=0; i<64; i++)
229       if (GeneticCode[com.icode][i]!=-1) Nsensecodon++;
230    com.ncode = Nsensecodon;
231    fclose (fctl);
232    FPN(F0);
233    return (0);
234 }
235 
DistanceYN00(int is,int js,double * S,double * N,double * dS,double * dN,double * SEdS,double * SEdN,double * t,double space[])236 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
237     double *SEdS, double *SEdN, double *t,double space[])
238 {
239 /* calculates dS, dN, w, t by weighting.
240    com.kappa & com.pi[] are calculated beforehand are not updated.
241 */
242    int j,k,ir,nround=10, status=0;
243    double fbS[4],fbN[4],fbSt[4],fbNt[4], St,Nt, Sdts,Sdtv,Ndts,Ndtv, kappaS,kappaN;
244    double w0=0,dS0=0,dN0=0, accu=5e-4, minomega=1e-5,maxomega=99;
245 
246    if(*t==0) *t=.5;
247    if(com.omega<=0) com.omega=1;
248    for(k=0; k<4; k++) fbS[k] = fbN[k] = 0;
249    if(debug) printf("\nCountSites\n");
250    if(noisy>3) printf("\n");
251    for(k=0,*S=*N=0; k<2; k++) {
252       CountSites(com.z[k==0?is:js], com.pi, &St, &Nt, fbSt, fbNt);
253       *S += St/2;
254       *N += Nt/2;
255       for(j=0; j<4; j++) {
256          fbS[j] += fbSt[j]/2;
257          fbN[j] += fbNt[j]/2;
258       }
259       if(noisy>3) printf("Seq. %d: S = %9.3f N=%9.3f\n",k+1,St,Nt);
260    }
261    if(noisy>3) {
262       printf("Ave.  : S = %9.3f N=%9.3f\n\n",*S,*N);
263       printf("Base freqs at syn & nonsyn sites\n%10s%10s%10s%10s\n", "T", "C", "A", "G");
264       for(k=0; k<4; k++) printf(" %9.6f", fbS[k]);  FPN(F0);
265       for(k=0; k<4; k++) printf(" %9.6f", fbN[k]);  FPN(F0);
266    }
267    if(noisy>3)
268       printf(" #    Sdts   Sdtv   Ndts   Ndtv |       t   kappa       w      dN      dS |   kappaS  kappaN\n");
269 
270    /* initial values? */
271    if(com.weighting) {
272       if(*t<0.001 || *t>5) *t=0.5;
273       if(com.omega<0.01 || com.omega>5) com.omega=.5;
274    }
275    for (ir=0; ir<(com.weighting?nround:1); ir++) {   /* weighting or iteration */
276       if(com.weighting)
277          GetPMatCodon(PMat,*t,com.kappa,com.omega,space);
278       else
279          for(j=0; j<com.ncode*com.ncode; j++)
280             PMat[j] = 1;
281 
282       CountDiffs(com.z[is], com.z[js], &Sdts, &Sdtv, &Ndts, &Ndtv, PMat);
283 
284       if(DistanceF84(*S, Sdts/ *S, Sdtv/ *S, fbS, &kappaS, dS, SEdS)) status=-1;
285       if(DistanceF84(*N, Ndts/ *N, Ndtv/ *N, fbN, &kappaN, dN, SEdN)) status=-1;
286 
287       if(*dS<1e-9) {
288          status = -1;
289          com.omega = maxomega;
290       }
291       else
292          com.omega= max2(minomega, *dN/ *dS);
293       *t = *dS * 3 * *S/(*S + *N) + *dN * 3 * *N/(*S + *N);
294       if(noisy>3) {
295          printf("%2d %7.2f%7.2f%7.2f%7.2f |", ir+1, Sdts,Sdtv,Ndts,Ndtv);
296          printf("%8.4f%8.4f%8.4f%8.4f%8.4f", *t, com.kappa,com.omega,*dN,*dS);
297          printf(" | %8.4f%8.4f\n", kappaS,kappaN);
298       }
299       if(fabs(*dS-dS0)<accu && fabs(*dN-dN0)<accu && fabs(com.omega-w0)<accu)
300          break;
301       dS0=*dS;  dN0=*dN;  w0=com.omega;
302    } /* for (ir) */
303    if(ir==nround) status=-2;
304    /* if(status) printf("\n\tstatus: %d\n", status); */
305    return(status);
306 }
307 
308 
309 
Statistics(FILE * fout,double space[])310 int Statistics(FILE *fout, double space[])
311 {
312 /* This calculates base frequencies, using npatt & fpatt[]
313 */
314    int h, is,j, c[3], wname=20;
315    double f3x4tot[12], *fb3tot=com.pi, *fb3s=space;
316 
317    if(fout) {
318       fprintf(fout, "\n\nns =%4d\tls =%4d", com.ns, com.ls);
319       fprintf(fout,"\n\nCodon position x base (3x4) table for each sequence.");
320    }
321    zero(f3x4tot,12);  zero(fb3s,64*com.ns);
322    for(is=0; is<com.ns; is++)  zero(com.f3x4s[is], 12);
323    for (is=0; is<com.ns; is++) {
324       for (h=0; h<com.npatt; h++) {
325          j = FROM61[com.z[is][h]];
326          c[0]=j/16; c[1]=(j%16)/4; c[2]=j%4;
327          fb3s[is*64+j] += com.fpatt[h];
328          for(j=0; j<3; j++)
329             com.f3x4s[is][j*4+c[j]] += com.fpatt[h]/com.ls;
330       }
331       for(j=0; j<12; j++) f3x4tot[j] += com.f3x4s[is][j]/com.ns;
332       if(fout) {
333          fprintf(fout,"\n\n%-*s", wname, com.spname[is]);
334          for(j=0; j<3; j++) {
335             fprintf (fout, "\nposition %2d:", j+1);
336             for(h=0; h<4; h++)
337                fprintf (fout,"%5c:%7.5f", BASEs[h], com.f3x4s[is][j*4+h]);
338          }
339       }
340    }
341    if(fout) {
342       fprintf (fout, "\n\nAverage");
343       for(j=0; j<3; j++) {
344          fprintf (fout, "\nposition %2d:", j+1);
345          for(h=0; h<4; h++)
346             fprintf (fout,"%5c:%7.5f", BASEs[h], f3x4tot[j*4+h]);
347       }
348       for(is=0,zero(fb3tot,64);is<com.ns;is++)
349          for(j=0; j<64; j++) fb3tot[j] += fb3s[is*64+j];
350       fprintf (fout, "\n\nCodon usage for each species\n");
351       printcums (fout, com.ns, fb3s, com.icode);
352       fprintf (fout, "\nSums\n");
353       printcums (fout, 1, fb3tot, com.icode);
354    }
355 
356    return(0);
357 }
358 
GetFreqs(int is1,int is2,double f3x4[],double pi[])359 int GetFreqs(int is1, int is2, double f3x4[], double pi[])
360 {
361 /* uses com.fcommon and com.f3x4s to calculate f3x4[] and pi[].
362    Codon frequencies pi[] are calculated from the f3x4 table.
363    The calculation is duplicated when com.fcommon=1.
364 */
365    int n=com.ncode, j, k, ic, b[3];
366 
367    if (com.fcommon)
368       for(j=0,zero(f3x4,12);j<com.ns;j++)
369          for(k=0; k<12; k++) f3x4[k]+=com.f3x4s[j][k]/com.ns;
370    else
371       for(k=0; k<12; k++)
372          f3x4[k] = (com.f3x4s[is1][k]+com.f3x4s[is2][k])/2;
373 
374    if (noisy>=9)
375       matout(F0, f3x4, 3, 4);
376    for(j=0; j<n; j++) {
377       ic=FROM61[j]; b[0]=ic/16; b[1]=(ic%16)/4; b[2]=ic%4;
378       pi[j] = f3x4[b[0]] * f3x4[4+b[1]] * f3x4[8+b[2]];
379    }
380    abyx(1/sum(pi,n), pi, n);
381 
382    return (0);
383 }
384 
385 
DistanceMatLWL85(FILE * fout)386 int DistanceMatLWL85 (FILE *fout)
387 {
388 /* This implements 3 methods: LWL85 (Li, Wu & Luo 1985), LPB (Li 1993,
389    Pamilo & Bianchi 1993), and LWL85m (equation 12 in book; check other refs).
390    alpha is not used.
391 */
392    int i,j,k, h, wname=15;
393    char *codon1, *codon2, str[4]="   ";
394    double L[3], sdiff[3], vdiff[3], Lt[3], sdifft[3], vdifft[3], A[3],B[3];
395    double P[3],Q[3], a,b, dS,dN, pS2, S,N, Sd,Nd;
396 
397    for(i=0; i<com.ns; i++) {
398       for(j=0; j<i; j++) {  /* pair i and j */
399          for(k=0; k<3; k++) L[k] = sdiff[k] = vdiff[k] = 0;
400 
401          for (h=0; h<com.npatt; h++)  {
402             codon1 = CODONs[com.z[i][h]];
403             codon2 = CODONs[com.z[j][h]];
404             difcodonLWL85(codon1, codon2, Lt, sdifft, vdifft, 0, com.icode);
405             for(k=0; k<3; k++) {
406                L[k]     += Lt[k]*com.fpatt[h];
407                sdiff[k] += sdifft[k]*com.fpatt[h];
408                vdiff[k] += vdifft[k]*com.fpatt[h];
409             }
410          }
411 
412          for(k=0; k<3; k++) {
413             P[k] = sdiff[k]/L[k];
414             Q[k] = vdiff[k]/L[k];
415             a = 1 - 2*P[k] - Q[k];
416             b = 1 - 2*Q[k];
417             A[k] = -log(a)/2 + log(b)/4;
418             B[k] = -log(b)/2;
419          }
420          if(fout) {
421             fprintf(fout, "\n%d (%s) vs. %d (%s)\n\n", i+1, com.spname[i], j+1, com.spname[j]);
422             fprintf(fout,"L(i):  %9.1f %9.1f %9.1f  sum=%9.1f\n", L[0],L[1],L[2],L[0]+L[1]+L[2]);
423             fprintf(fout,"Ns(i): %9.4f %9.4f %9.4f  sum=%9.4f\n", sdiff[0],sdiff[1],sdiff[2], sdiff[0]+sdiff[1]+sdiff[2]);
424             fprintf(fout,"Nv(i): %9.4f %9.4f %9.4f  sum=%9.4f\n", vdiff[0],vdiff[1],vdiff[2], vdiff[0]+vdiff[1]+vdiff[2]);
425             fprintf(fout,"A(i):  %9.4f %9.4f %9.4f\n", A[0],A[1],A[2]);
426             fprintf(fout,"B(i):  %9.4f %9.4f %9.4f\n", B[0],B[1],B[2]);
427 
428             Sd = L[1]*A[1] + L[2]*(A[2]+B[2]);
429             Nd = L[1]*B[1] + L[0]*(A[0]+B[0]);
430             pS2 = 1/3.;
431             S = L[1]*pS2 + L[2];
432             N = L[1]*(1-pS2) + L[0];
433             dS = Sd/S;
434             dN = Nd/N;
435             fprintf(fout,"LWL85:  dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f\n", dS,dN, dN/dS, S, N);
436             pS2 = A[2]/(A[2]+B[2]);
437             S = L[1]*pS2 + L[2];
438             N = L[1]*(1-pS2) + L[0];
439             dS = Sd/S;
440             dN = Nd/N;
441             fprintf(fout,"LWL85m: dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f (rho = %.3f)\n", dS,dN, dN/dS, S, N, pS2);
442 
443             dS = (L[1]*A[1]+L[2]*A[2])/(L[1]+L[2]) + B[2];
444             dN = (L[0]*B[0]+L[1]*B[1])/(L[0]+L[1]) + A[0];
445             fprintf(fout,"LPB93:  dS = %7.4f dN = %7.4f w =%7.4f\n", dS, dN, dN/dS);
446          }
447       }
448       if(noisy)  printf(" %3d",i+1);
449    }
450    if(noisy)  FPN(F0);
451    if(fout) FPN(fout);
452    return (0);
453 }
454 
455 
456 
GetKappa(void)457 int GetKappa(void)
458 {
459 /* This calculates mutational transition/transversion rate ratio kappa
460    using 4-fold degenerate sites from pairwise comparisons
461    under HKY85, weighting estimates by the numbers of sites
462 */
463    int is,js,j,k,h, i1,pos,c[2],aa[2],b[2][3],a,ndeg,by[3]={16,4,1}, status=0;
464    double ka[2], F[2][16],S[2],wk[2], t,P,Q,pi[4];
465                  /* F&S&wk [0]: non-degenerate; [1]:4-fold;  S:sites */
466    double kdefault=(com.kappa>0?com.kappa:(com.icode==1?10:2));
467    char str1[4]="   ",str2[4]="   ", *sitestr[2]={"non-degenerate","4-fold"};
468 
469    for(is=0,com.kappa=0;is<com.ns;is++) {
470       for(js=0; js<is; js++) {
471          if(noisy>=9) printf ("\n%4d vs. %3d", is+1, js+1);
472          for(k=0; k<2; k++) zero(F[k],16);
473          for(h=0; h<com.npatt; h++) {
474             c[0] = FROM61[com.z[is][h]];
475             c[1] = FROM61[com.z[js][h]];
476             for(k=0; k<2; k++) {
477                b[k][0] = c[k]/16;
478                b[k][1] = (c[k]%16)/4;
479                b[k][2] = c[k]%4;
480                aa[k] = GeneticCode[com.icode][c[k]];
481             }
482 
483             /* find non-degenerate sites */
484             for(pos=0; pos<3; pos++) {         /* check all positions */
485                for(k=0,ndeg=0;k<2;k++) {       /* two codons */
486                   for(i1=0; i1<4; i1++) {
487                      if(i1==b[k][pos]) continue;
488                      a = GeneticCode[com.icode][c[k]+(i1-b[k][pos])*by[pos]];
489                      if(a==aa[k]) break;
490                   }
491                   if(i1==4) ndeg++;
492                }
493                if(ndeg==2) {
494                   F[0][b[0][pos]*4+b[1][pos]] += .5*com.fpatt[h];
495                   F[0][b[1][pos]*4+b[0][pos]] += .5*com.fpatt[h];
496                }
497 
498             }
499             /* find 4-fold degenerate sites at 3rd positions */
500             for(k=0,ndeg=0;k<2;k++) {       /* two codons */
501                for(j=0,i1=c[k]-b[k][2]; j<4; j++)
502                   if(j!=b[k][2] && GeneticCode[com.icode][i1+j]!=aa[k]) break;
503                if(aa[0]==aa[1] && j==4) ndeg++;
504             }
505             if (ndeg<2) continue;
506             F[1][b[0][2]*4+b[1][2]] += .5*com.fpatt[h];
507             F[1][b[1][2]*4+b[0][2]] += .5*com.fpatt[h];
508          }  /* for (h) */
509          for(k=0; k<2; k++) {  /* two kinds of sites */
510             /*
511             if(noisy>3) printf("\n%s:\n",sitestr[k]);
512             */
513             S[k] = sum(F[k],16);
514             if(S[k]<=0) { wk[k]=0; continue; }
515             for(j=0; j<16; j++) F[k][j]/=S[k];
516             P = (F[k][0*4+1]+F[k][2*4+3])*2;
517             Q = 1-(F[k][0*4+0]+F[k][1*4+1]+F[k][2*4+2]+F[k][3*4+3]) - P;
518             for(j=0; j<4; j++)
519                pi[j] = sum(F[k]+j*4,4);
520             DistanceF84(S[k], P,Q,pi, &ka[k], &t, NULL);
521             wk[k] = (ka[k]>0?S[k]:0);
522 
523             /* matout(F0,F[k],4,4);  matout(F0,pi,1,4);  */
524             /*
525             if(noisy>3)
526                printf("\nSPQkt:%9.4f%9.5f%9.5f%9.4f%9.4f\n",S[k],P,Q,ka[k],t);
527             */
528          }
529          if(wk[0]+wk[1]==0) {
530             status = -1;
531             ka[0] = kdefault;
532             if(noisy>3) printf("\ngot no kappa! fix it at %.4f\n",ka[0]);
533          }
534          else
535              ka[0] = (ka[0]*wk[0]+ka[1]*wk[1])/(wk[0]+wk[1]);
536          com.kappa += ka[0]/(com.ns*(com.ns-1.)/2);
537       }  /* for(js) */
538    }     /* for(is) */
539 
540    return (status);
541 }
542 
543 
CountSites(char z[],double pi[],double * Stot,double * Ntot,double fbS[],double fbN[])544 int CountSites(char z[],double pi[],double*Stot,double*Ntot,double fbS[],double fbN[])
545 {
546 /* This calculates the total numbers of synonymous and nonsynonymous sites
547    (Stot & Ntot) in the sequence z[] using com.kappa and pi[].
548    It also count the base frequencies at the synonymous and nonsynonymous
549    sites.  Total number of sites is scaled to be equal to sequence length
550    even if some changes are to stop codons.  Since pi[] is scaled to sum
551    to one, rates to stop codons are not considered.
552    The counting goes through the sequence codon by codon, and so is different
553    from the counting in codeml, which uses pi[] to count the sites.
554 */
555    int h, j,k, c[2],aa[2], b[3], by[3]={16,4,1};
556    double r, S,N, kappa=com.kappa;
557 
558    *Stot = *Ntot = 0;
559    for(k=0; k<4; k++)
560       fbS[k] = fbN[k] = 0;
561    for (h=0; h<com.npatt; h++)  {
562       c[0] = FROM61[z[h]];
563       b[0] = c[0]/16; b[1]=(c[0]%16)/4; b[2]=c[0]%4;
564       aa[0] = GeneticCode[com.icode][c[0]];
565       if (aa[0]==-1)
566          error2("stop codon");
567       for (j=0,S=N=0; j<3; j++) {
568          for(k=0; k<4; k++) {    /* b[j] changes to k */
569             if (k==b[j]) continue;
570             c[1]  = c[0]+(k-b[j])*by[j];
571             aa[1] = GeneticCode[com.icode][c[1]];
572             if(aa[1] == -1) continue;
573             r = pi[FROM64[c[1]]];
574             if (k+b[j]==1 || k+b[j]==5)  r *= kappa; /* transition */
575             if (aa[0]==aa[1]) { S += r; fbS[b[j]] += r*com.fpatt[h]; }
576             else              { N += r; fbN[b[j]] += r*com.fpatt[h]; }
577          }
578       }
579       *Stot += com.fpatt[h]*S;
580       *Ntot += com.fpatt[h]*N;
581    }
582    r = 3*com.ls/(*Stot+*Ntot);  *Stot*=r;  *Ntot*=r;
583    r = sum(fbS,4);  for(k=0; k<4; k++) fbS[k] /= r;
584    r = sum(fbN,4);  for(k=0; k<4; k++) fbN[k] /= r;
585    return (0);
586 }
587 
588 
GetPMatCodon(double P[],double t,double kappa,double omega,double space[])589 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[])
590 {
591 /* Get PMat=exp(Q*t) for weighting pathways
592 */
593    int nterms=100, n=com.ncode, ic1, ic2, i,j,k, aa[2],ndiff,pos=0,from[3],to[3];
594    double *Q=P, *U=space+n*n, *V=U+n*n, *Root=V+n*n, mr, spacesqrt[NCODE];
595 
596    for(i=0; i<n*n; i++) Q[i] = 0;
597    for (i=0; i<n; i++) {
598       ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
599       for(j=0; j<i; j++) {
600          ic2=FROM61[j];   to[0]=ic2/16;   to[1]=(ic2/4)%4;   to[2]=ic2%4;
601          aa[0] = GeneticCode[com.icode][ic1];
602          aa[1] = GeneticCode[com.icode][ic2];
603          if (aa[0]==-1 || aa[1]==-1)  continue;
604          for (k=0,ndiff=0; k<3; k++)
605             if(from[k] != to[k]) { ndiff++; pos=k; }
606          if (ndiff!=1)  continue;
607          Q[i*n+j] = 1;
608          if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0)
609             Q[i*n+j] *= kappa;
610          if(aa[0] != aa[1])  Q[i*n+j] *= omega;
611          Q[j*n+i] = Q[i*n+j];
612       }
613    }
614 
615    for(i=0; i<n; i++) for(j=0; j<n; j++)
616       Q[i*n+j] *= com.pi[j];
617 
618    for (i=0,mr=0; i<n; i++) {
619       Q[i*n+i] = -sum(Q+i*n,n);
620       mr -= com.pi[i]*Q[i*n+i];
621    }
622 
623    eigenQREV(Q, com.pi, n, Root, U, V, spacesqrt);
624    for(i=0; i<n; i++) Root[i] /= mr;
625    PMatUVRoot(P, t, n, U, V, Root);
626    /*
627    testTransP(PMat, n);
628    fprintf(frub,"\a\nP(%.5f)\n", t);
629    for(i=0; i<n; i++,FPN(frub)) for(j=0; j<n; j++)
630    fprintf(frub, " %9.5g", PMat[i*n+j]);
631    fflush(frub);
632    */
633    return (0);
634 }
635 
636 
637 
CountDiffs(char z1[],char z2[],double * Sdts,double * Sdtv,double * Ndts,double * Ndtv,double PMat[])638 int CountDiffs(char z1[],char z2[], double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[])
639 {
640 /* Count the numbers of synonymous and nonsynonymous differences between
641    sequences z1 and z2, weighting pathways with PMat. No weighting if PMat=NULL
642    Modified from difcodon()
643    dmark[i] (=0,1,2) is the i_th different codon position (i=0,1,ndiff).
644    step[j] (=0,1,2) is the codon position to be changed at step j (j=0,1,ndiff).
645    b[i][j] (=0,1,2,3) is the nucleotide at position j (0,1,2) in codon i (0,1)
646    sts,stv,nts,ntv are syn ts & tv and nonsyn ts & tv at a codon site.
647    stspath[k] stvpath[k] ntspath[k] ntvpath[k] are syn ts & tv and
648    nonsyn ts & tv differences on path k (k=2,6).
649 */
650    char str[4]="   ";
651    int n=com.ncode, h,i1,i2,i,k, transi, c[2],ct[2],aa[2], by[3]={16,4,1};
652    int dmark[3], step[3], b[2][3], bt1[3], bt2[3];
653    int ndiff, npath, nstop, stspath[6],stvpath[6],ntspath[6],ntvpath[6];
654    double sts,stv,nts,ntv; /* syn ts & tv, nonsyn ts & tv for 2 codons */
655    double ppath[6], sump,p;
656 
657    *Sdts = *Sdtv = *Ndts = *Ndtv = 0;
658    for (h=0; h<com.npatt; h++)  {
659       c[0] = FROM61[z1[h]];
660       c[1] = FROM61[z2[h]];
661       if (c[0]==c[1]) continue;
662       for(i=0; i<2; i++) {
663          b[i][0]=c[i]/16; b[i][1]=(c[i]%16)/4; b[i][2]=c[i]%4;
664          aa[i] = GeneticCode[com.icode][c[i]];
665       }
666       if (aa[0]==-1 || aa[1]==-1)
667          error2("stop codon in sequence.");
668       ndiff=0;  sts=stv=nts=ntv=0;
669       for(k=0; k<3; k++) dmark[k] = -1;
670       for(k=0; k<3; k++) if(b[0][k] != b[1][k]) dmark[ndiff++] = k;
671       npath=1;
672       if(ndiff>1) npath = (ndiff==2 ? 2 : 6);
673       if (ndiff==1) {
674          transi = b[0][dmark[0]]+b[1][dmark[0]];
675          transi = (transi==1 || transi==5);
676          if (aa[0]==aa[1])  { if (transi) sts++; else stv++; }
677          else               { if (transi) nts++; else ntv++; }
678       }
679       else {   /* ndiff=2 or 3 */
680          if(debug==DIFF) {
681             printf("\n\nh=%d %s (%c) .. ", h+1,getcodon(str,c[0]),AAs[aa[0]]);
682             printf("%s (%c): ", getcodon(str,c[1]), AAs[aa[1]]);
683          }
684          nstop=0;
685          for(k=0; k<npath; k++) {
686             if(debug==DIFF) printf("\npath %d: ", k+1);
687 
688             for(i1=0; i1<3; i1++)  step[i1] = -1;
689             if (ndiff==2) {
690                step[0] = dmark[k];
691                step[1] = dmark[1-k];
692             }
693             else {
694                step[0] = k/2;
695                step[1] = k%2;
696                if (step[0]<=step[1]) step[1]++;
697                step[2] = 3-step[0]-step[1];
698             }
699             for(i1=0; i1<3; i1++) bt1[i1] = bt2[i1]=b[0][i1];
700             stspath[k] = stvpath[k] = ntspath[k] = ntvpath[k] = 0;
701             /* mutations along each path */
702             for (i1=0,ppath[k]=1; i1<ndiff; i1++) {
703                bt2[step[i1]] = b[1][step[i1]];
704                for (i2=0,ct[0]=ct[1]=0; i2<3; i2++) {
705                   ct[0] += bt1[i2]*by[i2];
706                   ct[1] += bt2[i2]*by[i2];
707                }
708                ppath[k] *= PMat[ FROM64[ct[0]]*n + FROM64[ct[1]] ];
709                for(i2=0; i2<2; i2++) aa[i2] = GeneticCode[com.icode][ct[i2]];
710 
711                if(debug==DIFF) printf("%s (%c) %.5f: ", getcodon(str,ct[1]),AAs[aa[1]],PMat[ct[0]*n+ct[1]]);
712 
713                if (aa[1]==-1) {
714                   nstop++;  ppath[k]=0; break;
715                }
716                transi = b[0][step[i1]]+b[1][step[i1]];
717                transi = (transi==1 || transi==5);  /* transition? */
718 
719                if(aa[0]==aa[1]) { if(transi) stspath[k]++; else stvpath[k]++; }
720                else             { if(transi) ntspath[k]++; else ntvpath[k]++; }
721                for(i2=0; i2<3; i2++) bt1[i2] = bt2[i2];
722             }
723 
724             if(debug==DIFF) printf("  p =%.9f", ppath[k]);
725 
726          }  /* for(k,npath) */
727          if (npath==nstop) {  /* all paths through stop codons */
728             puts ("all paths through stop codons..");
729             if (ndiff==2) { nts=.5; ntv=1.5; }
730             else          { nts=.5; ntv=2.5; }
731          }
732          else {
733             sump = sum(ppath,npath);
734             if(sump<1e-20) {
735                printf("\nsump=0, npath=%4d\nh=%2d ", npath, h+1);
736                printf("(%s ", getcodon(str,c[0]));
737                printf("%s)", getcodon(str,c[1]));
738                for(k=0; k<npath; k++) printf(" %9.6g", ppath[k]); FPN(F0);
739                matout(frub, PMat, n, n);
740                exit(-1);
741 
742                /*
743                sump=1; FOR(k,npath) if(ppath[k]) ppath[k]=1./(npath-nstop);
744                */
745             }
746             for(k=0; k<npath; k++) {
747                p = ppath[k]/sump;
748                sts += stspath[k]*p;
749                stv += stvpath[k]*p;
750                nts += ntspath[k]*p;
751                ntv += ntvpath[k]*p;
752             }
753 
754             if(debug==DIFF) {
755                for(k=0; k<npath; k++) printf("\n p =%.5f", ppath[k]/sump);  FPN(F0);
756                printf(" syn ts & tv, nonsyn ts & tv:%9.5f%9.5f%9.5f%9.5f\n",sts,stv,nts,ntv);
757             }
758          }
759 
760          if(debug==DIFF) getchar();
761 
762       }     /* if (ndiff) */
763       *Sdts += com.fpatt[h]*sts;
764       *Sdtv += com.fpatt[h]*stv;
765       *Ndts += com.fpatt[h]*nts;
766       *Ndtv += com.fpatt[h]*ntv;
767    }  /* for (h) */
768    return (0);
769 }
770 
771 
DistanceF84(double n,double P,double Q,double pi[],double * k_HKY,double * t,double * SEt)772 int DistanceF84(double n, double P, double Q, double pi[],
773     double*k_HKY, double*t, double*SEt)
774 {
775 /* This calculates kappa and d from P (proportion of transitions) & Q
776    (proportion of transversions) & pi under F84.
777    When F84 fails, we try to use K80.  When K80 fails, we try
778    to use JC69.  When JC69 fails, we set distance t to maxt.
779    Variance formula under F84 is from Tateno et al. (1994), and briefly
780    checked against simulated data sets.
781 */
782    int failF84=0,failK80=0,failJC69=0;
783    double tc,ag, Y,R, a=0,b=0, A=-1,B=-1,C=-1, k_F84;
784    double Qsmall=min2(1e-10,0.1/n), maxkappa=999,maxt=99;
785 
786    *k_HKY=-1;
787    Y=pi[0]+pi[1];  R=pi[2]+pi[3];  tc=pi[0]*pi[1];  ag=pi[2]*pi[3];
788    if (P+Q>1) { *t=maxt; *k_HKY=1; return(3); }
789    if (P<-1e-10 || Q<-1e-10 || fabs(Y+R-1)>1e-8) {
790       printf("\nPQYR & pi[]: %9.5f%9.5f%9.5f%9.5f",P,Q,Y,R);
791       matout(F0,pi,1,4);
792       error2("DistanceF84: input err.");
793    }
794    if(Q<Qsmall)  failF84=failK80=1;
795    else if(Y<=0 || R<=0 || (tc<=0 && ag<=0)) failF84=1;
796    else {
797       A=tc/Y+ag/R; B=tc+ag; C=Y*R;
798       a=(2*B+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*C)) - P) / (2*A);
799       b=1-Q/(2*C);
800       if (a<=0 || b<=0) failF84=1;
801    }
802    if (!failF84) {
803       a=-.5*log(a); b=-.5*log(b);
804       if(b<=0) failF84=1;
805       else {
806          k_F84 = a/b-1;
807          *t = 4*b*(tc*(1+ k_F84/Y) + ag*(1+ k_F84/R)+C);
808          *k_HKY = (B + (tc/Y+ag/R)* k_F84)/B; /* k_F84=>k_HKY85 */
809          if(SEt) {
810             a = A*C/(A*C-C*P/2-(A-B)*Q/2);
811             b = A*(A-B)/(A*C-C*P/2-(A-B)*Q/2) - (A-B-C)/(C-Q/2);
812             *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
813          }
814       }
815    }
816    if(failF84 && !failK80) {  /* try K80 */
817       if (noisy>=9) printf("\na=%.5f  b=%.5f, use K80\n", a,b);
818       a=1-2*P-Q;  b=1-2*Q;
819       if (a<=0 || b<=0) failK80=1;
820       else {
821          a=-log(a); b=-log(b);
822          if(b<=0)  failK80=1;
823          else {
824             *k_HKY=(.5*a-.25*b)/(.25*b);
825             *t = .5*a+.25*b;
826          }
827          if(SEt) {
828             a=1/(1-2*P-Q); b=(a+1/(1-2*Q))/2;
829             *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
830          }
831       }
832    }
833    if(failK80) {
834       if((P+=Q)>=.75) { failJC69=1; P=.75*(n-1.)/n; }
835       *t = -.75*log(1-P*4/3.);
836       if(*t>maxt) *t=maxt;
837       if(SEt) {
838          *SEt = sqrt(9*P*(1-P)/n) / (3-4*P);
839       }
840    }
841    if(*k_HKY>maxkappa) *k_HKY=maxkappa;
842 
843    return(failF84 + failK80 + failJC69);
844 }
845 
846 
847 
848 #if 0
849 
850 double dsdnREV (int is, int js, double space[])
851 {
852 /* This calculates ds and dn by recovering the Q*t matrix using the equation
853       F(t) = PI * P(t) = PI * exp(Q*t)
854    This is found not to work well and is not published.
855    space[64*64*5]
856    The code here is broken since I changed the coding.  Codons are now coded 0, 1, ..., 60.
857 */
858    int n=com.ncode, i,j, h;
859    double *F=PMat, *Qt=F;
860    double *Root=space+n*n,*pi=Root+n, *U=pi+n,*V=U+n*n;
861    double *T1=V+n*n,*T2=T1+n*n, t, small=1e-6;
862 
863    fprintf(frst,"\npi in model\n");
864    matout(frst,com.pi,1,n);
865    FOR(i,n*n) F[i]=0;
866    FOR (h,com.npatt) {
867       F[com.z[is][h]*n+com.z[js][h]]+=com.fpatt[h]/(2*com.ls);
868       F[com.z[js][h]*n+com.z[is][h]]+=com.fpatt[h]/(2*com.ls);
869    }
870    if(fabs(1-sum(F,n*n))>1e-6) error2("Sum F != 1 in dsdnREV");
871 
872    FOR (i,n) {
873       pi[i]=sum(F+i*n, n);
874 /*
875       if (F[i*n+i]<=small || F[i*n+i]<pi[i]/4)
876 */
877       if (F[i*n+i]<=small)  F[i*n+i]=1-pi[i]+F[i*n+i];
878       else                  abyx(1/pi[i], F+i*n, n);
879    }
880    if (eigen (1, F, n, Root, T1, U, V, T2)) error2 ("eigen jgl");
881    xtoy (U, V, n*n);
882    matinv (V, n, n, T1);
883 
884 fprintf(frst,"\npi in data\n");
885 matout (frst, pi, 1, n);   FPN(F0);
886 matout (frst, Root, 1, n);
887 
888    FOR (i,n) {
889       if (Root[i]<=0)
890          printf ("  Root %d:%10.4f", i+1, Root[i]);
891       Root[i]=log(Root[i]);
892    }
893    FOR (i,n) FOR (j,n) T1[i*n+j]=U[i*n+j]*Root[j];
894    matby (T1, V, Qt, n, n, n);
895    for (i=0,t=0; i<n; i++) t-=pi[i]*Qt[i*n+i];
896    if (t<=0) puts ("err: dsdnREV");
897 
898    FOR(i,n*n) Qt[i]+=1e-8;  /* remove negative numbers from rounding errors */
899 
900    matout(frst,Qt,n,n);
901 printf("\nt = %.5f\n", t);
902 
903    return (0);
904 }
905 
906 
907 #endif
908