1 /* yn00.c
2 Pairwise estimation of dS and dN by the method of Yang & Nielsen
3 (2000 Mol. Biol. Evol. 17:32-43)
4
5 Copyright, 1998, Ziheng Yang
6
7 cc -o yn00 -fast yn00.c tools.o -lm
8 cl -O2 yn00.c tools.o
9 yn00 <SequenceFileName>
10
11 Codon sequences are encoded as 0,1,...,61, as in codeml.c.
12 */
13 #include "paml.h"
14 #define NS 1000
15 #define LSPNAME 30
16 #define NCODE 64
17 #define NGENE 2000
18
19 int GetOptions (char *ctlf);
20 int EncodeSeqCodon(void);
21 int Statistics(FILE *fout, double space[]);
22 int DistanceMatLWL85 (FILE *fout);
23 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
24 double *SEdS, double *SEdN, double *t,double space[]);
25 int GetKappa (void);
26 int GetFreqs(int is1, int is2, double f3x4[], double pi[]);
27 int CountSites(char z[],double pi[],double*Stot,double*Ntot,
28 double fbS[],double fbN[]);
29 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[]);
30 int CountDiffs(char z1[],char z2[],
31 double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[]);
32 int DistanceF84(double n, double P, double Q, double pi[],
33 double*k_HKY, double*t, double*SEt);
34 double dsdnREV (int is, int js, double space[]);
35
36 int ExpPattFreq(double t,double kappa,double omega,double pi[],double space[]);
37 int ConsistencyMC(void);
38 int InfiniteData(double t,double kappa,double omega,double f3x4_0[],
39 double space[]);
40 void SimulateData2s64(FILE* fout, double f3x4_0[], double space[]);
41
42 struct common_info {
43 unsigned char *z[NS];
44 char *spname[NS], seqf[512],outf[512];
45 int ns,ls,npatt,codonf,icode,ncode,getSE,*pose,verbose, seqtype, readpattern;
46 int cleandata, fcommon,kcommon, weighting, ndata, print;
47 double *fpatt, pi[NCODE], f3x4s[NS][12], kappa, omega;
48 int ngene,posG[NGENE+1],lgene[NGENE],fix_rgene, model;
49 double rgene[NGENE],piG[NGENE][NCODE], alpha;
50 } com;
51
52
53 int FROM61[64], FROM64[64], FourFold[4][4];
54 double PMat[NCODE*NCODE];
55 char *codonfreqs[]={"Fequal", "F1x4", "F3x4", "Fcodon"};
56 enum {Fequal, F1x4, F3x4, Fcodon} CodonFreqs;
57
58 FILE *frst, *frst1, *frub;
59 extern char BASEs[], AAs[];
60 extern int noisy, GeneticCode[][64];
61 int Nsensecodon;
62 enum {NODEBUG, KAPPA, SITES, DIFF} DebugFunctions;
63 int debug=0;
64
65 double omega_NG, dN_NG, dS_NG; /* what are these for? */
66
67
68 #define YN00
69 #define REALSEQUENCE
70 #include "treesub.c"
71
72
main(int argc,char * argv[])73 int main(int argc, char *argv[])
74 {
75 char dsf[512]="2YN.dS", dnf[512]="2YN.dN", tf[512]="2YN.t";
76 FILE *fout, *fseq, *fds, *fdn, *ft;
77 char ctlf[96]="yn00.ctl", timestr[64];
78 int n=com.ncode, is,js, j, idata, wname=20, sspace;
79 double t=0.4, dS=0.1,dN=0.1, S,N, SEdS, SEdN, f3x4[12], *space=NULL;
80
81 /* ConsistencyMC(); */
82
83 printf("YN00 in %s\n", pamlVerStr);
84 starttimer();
85 if (argc>1) strcpy(ctlf, argv[1]);
86 com.seqtype=1; com.cleandata=1; /* works for clean data only? */
87 com.ndata=1; com.print=0;
88 noisy=1; com.icode=0; com.fcommon=0; com.kcommon=1;
89 GetOptions(ctlf);
90 setmark_61_64 ();
91 fout = fopen (com.outf, "w");
92 frst = fopen("rst", "w");
93 frst1 = fopen("rst1", "w");
94 frub = fopen ("rub", "w");
95 if (fout==NULL || frst==NULL) error2("outfile creation err.");
96 fds = (FILE*)fopen(dsf, "w");
97 fdn = (FILE*)fopen(dnf, "w");
98 ft = (FILE*)fopen(tf, "w");
99 if(fds==NULL || fdn==NULL || ft==NULL) error2("file open error");
100
101 if((fseq=fopen (com.seqf,"r"))==NULL) {
102 printf ("\n\nSequence file %s not found!\n", com.seqf);
103 exit(-1);
104 }
105 for (idata=0; idata<com.ndata; idata++) {
106 if (com.ndata>1) {
107 printf("\nData set %d\n", idata+1);
108 fprintf(fout, "\n\nData set %d\n", idata+1);
109 fprintf(frst, "\t%d", idata+1);
110 }
111
112 ReadSeq((com.verbose?fout:NULL), fseq, com.cleandata, 0);
113 SetMapAmbiguity(com.seqtype, 0);
114
115 sspace = max2(200000,64*com.ns*sizeof(double));
116 sspace = max2(sspace,64*64*5*sizeof(double));
117 if ((space=(double*)realloc(space,sspace))==NULL) error2("oom space");
118
119 com.kappa = 4.6;
120 com.omega = 1;
121 fprintf(fout,"YN00 %15s", com.seqf);
122 Statistics(fout, space);
123
124 if(noisy) printf("\n\n(A) Nei-Gojobori (1986) method\n");
125 fprintf(fout,"\n\n(A) Nei-Gojobori (1986) method\n");
126 DistanceMatNG86 (fout, NULL, NULL, NULL, 0);
127 fflush(fout);
128
129 if(noisy) printf("\n\n(B) Yang & Nielsen (2000) method\n\n");
130 fprintf(fout,"\n\n(B) Yang & Nielsen (2000) method\n\n");
131 fprintf(fout,"Yang Z, Nielsen R (2000) Estimating synonymous and nonsynonymous substitution rates under realistic evolutionary models. Mol. Biol. Evol. 17:32-43\n");
132 if(!com.weighting) fputs("\n(equal weighting of pathways)\n",fout);
133
134 if(com.fcommon) GetFreqs(-1, -1, f3x4, com.pi);
135 if(com.kcommon) {
136 GetKappa();
137 printf("kappa = %.2f\n\n",com.kappa);
138 /* puts("kappa?"); scanf("%lf", &com.kappa); */
139 }
140
141 fputs("\nseq. seq. S N t kappa omega dN +- SE dS +- SE\n\n",fout);
142 fprintf(fds,"%6d\n", com.ns);
143 fprintf(fdn,"%6d\n", com.ns);
144 fprintf(ft,"%6d\n", com.ns);
145 for(is=0; is<com.ns; is++) {
146 fprintf(fds,"%-*s ", wname,com.spname[is]);
147 fprintf(fdn,"%-*s ", wname,com.spname[is]);
148 fprintf(ft,"%-*s ", wname,com.spname[is]);
149 for(js=0; js<is; js++) {
150 if(noisy) printf("%3d vs. %3d\n", is+1, js+1);
151 fprintf(fout, " %3d %3d ", is+1, js+1);
152
153 if(!com.fcommon) GetFreqs(is, js, f3x4, com.pi);
154 if(!com.kcommon) GetKappa();
155 j = DistanceYN00(is, js, &S, &N, &dS,&dN, &SEdS, &SEdN, &t,space);
156
157 fprintf(fout,"%7.1f %7.1f %8.4f %7.4f %7.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
158 S,N,t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
159 fprintf(frst," YN: %8.4f%8.4f%8.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
160 t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
161
162 fprintf(fds," %7.4f",dS); fprintf(fdn," %7.4f",dN); fprintf(ft," %7.4f",t);
163 } /* for (js) */
164 FPN(fds); FPN(fdn); FPN(ft);
165 fflush(fds); fflush(fdn); fflush(ft);
166 } /* for (is) */
167 FPN(fds); FPN(fdn); FPN(ft);
168
169 if(noisy) printf("\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
170 fprintf(fout,"\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
171 fprintf(fout,"Li W.-H., C.-I. Wu, Luo (1985) A new method for estimating synonymous and nonsynonymous rates of nucleotide substitutions considering the relative likelihood of nucleotide and codon changes. Mol. Biol. Evol. 2: 150-174.\n");
172 fprintf(fout,"Li W-H (1993) Unbiased estimation of the rates of synonymous and nonsynonymous substitution. J. Mol. Evol. 36:96-99\n");
173 fprintf(fout,"Pamilo P, Bianchi NO (1993) Evolution of the Zfx and Zfy genes - rates and interdependence between the genes. Mol. Biol. Evol. 10:271-281\n");
174 fprintf(fout,"Yang Z (2006) Computational Molecular Evolution. Oxford University Press, Oxford. Eqs. 2.12 & 2.13\n");
175
176 DistanceMatLWL85(fout);
177
178 fflush(frst);
179 if(noisy) printf("\nTime used: %s\n", printtime(timestr));
180 }
181 return (0);
182 }
183
184
185
GetOptions(char * ctlf)186 int GetOptions (char *ctlf)
187 {
188 int i, nopt=9, lline=4096;
189 char line[4096], *pline, opt[20], comment='*';
190 char *optstr[]={"seqfile","outfile", "verbose", "noisy", "icode",
191 "weighting","commonkappa", "commonf3x4", "ndata"};
192 double t;
193 FILE *fctl;
194
195 if((fctl=fopen(ctlf,"r"))==NULL) error2("\nctl file open error.\n");
196 printf ("\nReading options from %s..\n", ctlf);
197 for (;;) {
198 if (fgets (line, lline, fctl) == NULL) break;
199 for (i=0,t=0,pline=line; i<lline&&line[i]; i++)
200 if (isalnum(line[i])) { t=1; break; }
201 else if (line[i]==comment) break;
202 if (t==0) continue;
203 sscanf (line, "%s%*s%lf", opt, &t);
204 if ((pline=strstr(line, "="))==NULL) error2("option file.");
205
206 for (i=0; i<nopt; i++) {
207 if (strncmp(opt, optstr[i], 8)==0) {
208 if (noisy>2)
209 printf ("\n%3d %15s | %-20s %6.2f", i+1,optstr[i],opt,t);
210 switch (i) {
211 case (0): sscanf(pline+2, "%s", com.seqf); break;
212 case (1): sscanf(pline+2, "%s", com.outf); break;
213 case (2): com.verbose=(int)t; break;
214 case (3): noisy=(int)t; break;
215 case (4): com.icode=(int)t; break;
216 case (5): com.weighting=(int)t; break;
217 case (6): com.kcommon=(int)t; break;
218 case (7): com.fcommon=(int)t; break;
219 case (8): com.ndata=(int)t; break;
220 }
221 break;
222 }
223 }
224 if (i==nopt)
225 { printf ("\noption %s in %s\n", opt, ctlf); exit (-1); }
226 }
227
228 for (i=0,Nsensecodon=0; i<64; i++)
229 if (GeneticCode[com.icode][i]!=-1) Nsensecodon++;
230 com.ncode = Nsensecodon;
231 fclose (fctl);
232 FPN(F0);
233 return (0);
234 }
235
DistanceYN00(int is,int js,double * S,double * N,double * dS,double * dN,double * SEdS,double * SEdN,double * t,double space[])236 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
237 double *SEdS, double *SEdN, double *t,double space[])
238 {
239 /* calculates dS, dN, w, t by weighting.
240 com.kappa & com.pi[] are calculated beforehand are not updated.
241 */
242 int j,k,ir,nround=10, status=0;
243 double fbS[4],fbN[4],fbSt[4],fbNt[4], St,Nt, Sdts,Sdtv,Ndts,Ndtv, kappaS,kappaN;
244 double w0=0,dS0=0,dN0=0, accu=5e-4, minomega=1e-5,maxomega=99;
245
246 if(*t==0) *t=.5;
247 if(com.omega<=0) com.omega=1;
248 for(k=0; k<4; k++) fbS[k] = fbN[k] = 0;
249 if(debug) printf("\nCountSites\n");
250 if(noisy>3) printf("\n");
251 for(k=0,*S=*N=0; k<2; k++) {
252 CountSites(com.z[k==0?is:js], com.pi, &St, &Nt, fbSt, fbNt);
253 *S += St/2;
254 *N += Nt/2;
255 for(j=0; j<4; j++) {
256 fbS[j] += fbSt[j]/2;
257 fbN[j] += fbNt[j]/2;
258 }
259 if(noisy>3) printf("Seq. %d: S = %9.3f N=%9.3f\n",k+1,St,Nt);
260 }
261 if(noisy>3) {
262 printf("Ave. : S = %9.3f N=%9.3f\n\n",*S,*N);
263 printf("Base freqs at syn & nonsyn sites\n%10s%10s%10s%10s\n", "T", "C", "A", "G");
264 for(k=0; k<4; k++) printf(" %9.6f", fbS[k]); FPN(F0);
265 for(k=0; k<4; k++) printf(" %9.6f", fbN[k]); FPN(F0);
266 }
267 if(noisy>3)
268 printf(" # Sdts Sdtv Ndts Ndtv | t kappa w dN dS | kappaS kappaN\n");
269
270 /* initial values? */
271 if(com.weighting) {
272 if(*t<0.001 || *t>5) *t=0.5;
273 if(com.omega<0.01 || com.omega>5) com.omega=.5;
274 }
275 for (ir=0; ir<(com.weighting?nround:1); ir++) { /* weighting or iteration */
276 if(com.weighting)
277 GetPMatCodon(PMat,*t,com.kappa,com.omega,space);
278 else
279 for(j=0; j<com.ncode*com.ncode; j++)
280 PMat[j] = 1;
281
282 CountDiffs(com.z[is], com.z[js], &Sdts, &Sdtv, &Ndts, &Ndtv, PMat);
283
284 if(DistanceF84(*S, Sdts/ *S, Sdtv/ *S, fbS, &kappaS, dS, SEdS)) status=-1;
285 if(DistanceF84(*N, Ndts/ *N, Ndtv/ *N, fbN, &kappaN, dN, SEdN)) status=-1;
286
287 if(*dS<1e-9) {
288 status = -1;
289 com.omega = maxomega;
290 }
291 else
292 com.omega= max2(minomega, *dN/ *dS);
293 *t = *dS * 3 * *S/(*S + *N) + *dN * 3 * *N/(*S + *N);
294 if(noisy>3) {
295 printf("%2d %7.2f%7.2f%7.2f%7.2f |", ir+1, Sdts,Sdtv,Ndts,Ndtv);
296 printf("%8.4f%8.4f%8.4f%8.4f%8.4f", *t, com.kappa,com.omega,*dN,*dS);
297 printf(" | %8.4f%8.4f\n", kappaS,kappaN);
298 }
299 if(fabs(*dS-dS0)<accu && fabs(*dN-dN0)<accu && fabs(com.omega-w0)<accu)
300 break;
301 dS0=*dS; dN0=*dN; w0=com.omega;
302 } /* for (ir) */
303 if(ir==nround) status=-2;
304 /* if(status) printf("\n\tstatus: %d\n", status); */
305 return(status);
306 }
307
308
309
Statistics(FILE * fout,double space[])310 int Statistics(FILE *fout, double space[])
311 {
312 /* This calculates base frequencies, using npatt & fpatt[]
313 */
314 int h, is,j, c[3], wname=20;
315 double f3x4tot[12], *fb3tot=com.pi, *fb3s=space;
316
317 if(fout) {
318 fprintf(fout, "\n\nns =%4d\tls =%4d", com.ns, com.ls);
319 fprintf(fout,"\n\nCodon position x base (3x4) table for each sequence.");
320 }
321 zero(f3x4tot,12); zero(fb3s,64*com.ns);
322 for(is=0; is<com.ns; is++) zero(com.f3x4s[is], 12);
323 for (is=0; is<com.ns; is++) {
324 for (h=0; h<com.npatt; h++) {
325 j = FROM61[com.z[is][h]];
326 c[0]=j/16; c[1]=(j%16)/4; c[2]=j%4;
327 fb3s[is*64+j] += com.fpatt[h];
328 for(j=0; j<3; j++)
329 com.f3x4s[is][j*4+c[j]] += com.fpatt[h]/com.ls;
330 }
331 for(j=0; j<12; j++) f3x4tot[j] += com.f3x4s[is][j]/com.ns;
332 if(fout) {
333 fprintf(fout,"\n\n%-*s", wname, com.spname[is]);
334 for(j=0; j<3; j++) {
335 fprintf (fout, "\nposition %2d:", j+1);
336 for(h=0; h<4; h++)
337 fprintf (fout,"%5c:%7.5f", BASEs[h], com.f3x4s[is][j*4+h]);
338 }
339 }
340 }
341 if(fout) {
342 fprintf (fout, "\n\nAverage");
343 for(j=0; j<3; j++) {
344 fprintf (fout, "\nposition %2d:", j+1);
345 for(h=0; h<4; h++)
346 fprintf (fout,"%5c:%7.5f", BASEs[h], f3x4tot[j*4+h]);
347 }
348 for(is=0,zero(fb3tot,64);is<com.ns;is++)
349 for(j=0; j<64; j++) fb3tot[j] += fb3s[is*64+j];
350 fprintf (fout, "\n\nCodon usage for each species\n");
351 printcums (fout, com.ns, fb3s, com.icode);
352 fprintf (fout, "\nSums\n");
353 printcums (fout, 1, fb3tot, com.icode);
354 }
355
356 return(0);
357 }
358
GetFreqs(int is1,int is2,double f3x4[],double pi[])359 int GetFreqs(int is1, int is2, double f3x4[], double pi[])
360 {
361 /* uses com.fcommon and com.f3x4s to calculate f3x4[] and pi[].
362 Codon frequencies pi[] are calculated from the f3x4 table.
363 The calculation is duplicated when com.fcommon=1.
364 */
365 int n=com.ncode, j, k, ic, b[3];
366
367 if (com.fcommon)
368 for(j=0,zero(f3x4,12);j<com.ns;j++)
369 for(k=0; k<12; k++) f3x4[k]+=com.f3x4s[j][k]/com.ns;
370 else
371 for(k=0; k<12; k++)
372 f3x4[k] = (com.f3x4s[is1][k]+com.f3x4s[is2][k])/2;
373
374 if (noisy>=9)
375 matout(F0, f3x4, 3, 4);
376 for(j=0; j<n; j++) {
377 ic=FROM61[j]; b[0]=ic/16; b[1]=(ic%16)/4; b[2]=ic%4;
378 pi[j] = f3x4[b[0]] * f3x4[4+b[1]] * f3x4[8+b[2]];
379 }
380 abyx(1/sum(pi,n), pi, n);
381
382 return (0);
383 }
384
385
DistanceMatLWL85(FILE * fout)386 int DistanceMatLWL85 (FILE *fout)
387 {
388 /* This implements 3 methods: LWL85 (Li, Wu & Luo 1985), LPB (Li 1993,
389 Pamilo & Bianchi 1993), and LWL85m (equation 12 in book; check other refs).
390 alpha is not used.
391 */
392 int i,j,k, h, wname=15;
393 char *codon1, *codon2, str[4]=" ";
394 double L[3], sdiff[3], vdiff[3], Lt[3], sdifft[3], vdifft[3], A[3],B[3];
395 double P[3],Q[3], a,b, dS,dN, pS2, S,N, Sd,Nd;
396
397 for(i=0; i<com.ns; i++) {
398 for(j=0; j<i; j++) { /* pair i and j */
399 for(k=0; k<3; k++) L[k] = sdiff[k] = vdiff[k] = 0;
400
401 for (h=0; h<com.npatt; h++) {
402 codon1 = CODONs[com.z[i][h]];
403 codon2 = CODONs[com.z[j][h]];
404 difcodonLWL85(codon1, codon2, Lt, sdifft, vdifft, 0, com.icode);
405 for(k=0; k<3; k++) {
406 L[k] += Lt[k]*com.fpatt[h];
407 sdiff[k] += sdifft[k]*com.fpatt[h];
408 vdiff[k] += vdifft[k]*com.fpatt[h];
409 }
410 }
411
412 for(k=0; k<3; k++) {
413 P[k] = sdiff[k]/L[k];
414 Q[k] = vdiff[k]/L[k];
415 a = 1 - 2*P[k] - Q[k];
416 b = 1 - 2*Q[k];
417 A[k] = -log(a)/2 + log(b)/4;
418 B[k] = -log(b)/2;
419 }
420 if(fout) {
421 fprintf(fout, "\n%d (%s) vs. %d (%s)\n\n", i+1, com.spname[i], j+1, com.spname[j]);
422 fprintf(fout,"L(i): %9.1f %9.1f %9.1f sum=%9.1f\n", L[0],L[1],L[2],L[0]+L[1]+L[2]);
423 fprintf(fout,"Ns(i): %9.4f %9.4f %9.4f sum=%9.4f\n", sdiff[0],sdiff[1],sdiff[2], sdiff[0]+sdiff[1]+sdiff[2]);
424 fprintf(fout,"Nv(i): %9.4f %9.4f %9.4f sum=%9.4f\n", vdiff[0],vdiff[1],vdiff[2], vdiff[0]+vdiff[1]+vdiff[2]);
425 fprintf(fout,"A(i): %9.4f %9.4f %9.4f\n", A[0],A[1],A[2]);
426 fprintf(fout,"B(i): %9.4f %9.4f %9.4f\n", B[0],B[1],B[2]);
427
428 Sd = L[1]*A[1] + L[2]*(A[2]+B[2]);
429 Nd = L[1]*B[1] + L[0]*(A[0]+B[0]);
430 pS2 = 1/3.;
431 S = L[1]*pS2 + L[2];
432 N = L[1]*(1-pS2) + L[0];
433 dS = Sd/S;
434 dN = Nd/N;
435 fprintf(fout,"LWL85: dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f\n", dS,dN, dN/dS, S, N);
436 pS2 = A[2]/(A[2]+B[2]);
437 S = L[1]*pS2 + L[2];
438 N = L[1]*(1-pS2) + L[0];
439 dS = Sd/S;
440 dN = Nd/N;
441 fprintf(fout,"LWL85m: dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f (rho = %.3f)\n", dS,dN, dN/dS, S, N, pS2);
442
443 dS = (L[1]*A[1]+L[2]*A[2])/(L[1]+L[2]) + B[2];
444 dN = (L[0]*B[0]+L[1]*B[1])/(L[0]+L[1]) + A[0];
445 fprintf(fout,"LPB93: dS = %7.4f dN = %7.4f w =%7.4f\n", dS, dN, dN/dS);
446 }
447 }
448 if(noisy) printf(" %3d",i+1);
449 }
450 if(noisy) FPN(F0);
451 if(fout) FPN(fout);
452 return (0);
453 }
454
455
456
GetKappa(void)457 int GetKappa(void)
458 {
459 /* This calculates mutational transition/transversion rate ratio kappa
460 using 4-fold degenerate sites from pairwise comparisons
461 under HKY85, weighting estimates by the numbers of sites
462 */
463 int is,js,j,k,h, i1,pos,c[2],aa[2],b[2][3],a,ndeg,by[3]={16,4,1}, status=0;
464 double ka[2], F[2][16],S[2],wk[2], t,P,Q,pi[4];
465 /* F&S&wk [0]: non-degenerate; [1]:4-fold; S:sites */
466 double kdefault=(com.kappa>0?com.kappa:(com.icode==1?10:2));
467 char str1[4]=" ",str2[4]=" ", *sitestr[2]={"non-degenerate","4-fold"};
468
469 for(is=0,com.kappa=0;is<com.ns;is++) {
470 for(js=0; js<is; js++) {
471 if(noisy>=9) printf ("\n%4d vs. %3d", is+1, js+1);
472 for(k=0; k<2; k++) zero(F[k],16);
473 for(h=0; h<com.npatt; h++) {
474 c[0] = FROM61[com.z[is][h]];
475 c[1] = FROM61[com.z[js][h]];
476 for(k=0; k<2; k++) {
477 b[k][0] = c[k]/16;
478 b[k][1] = (c[k]%16)/4;
479 b[k][2] = c[k]%4;
480 aa[k] = GeneticCode[com.icode][c[k]];
481 }
482
483 /* find non-degenerate sites */
484 for(pos=0; pos<3; pos++) { /* check all positions */
485 for(k=0,ndeg=0;k<2;k++) { /* two codons */
486 for(i1=0; i1<4; i1++) {
487 if(i1==b[k][pos]) continue;
488 a = GeneticCode[com.icode][c[k]+(i1-b[k][pos])*by[pos]];
489 if(a==aa[k]) break;
490 }
491 if(i1==4) ndeg++;
492 }
493 if(ndeg==2) {
494 F[0][b[0][pos]*4+b[1][pos]] += .5*com.fpatt[h];
495 F[0][b[1][pos]*4+b[0][pos]] += .5*com.fpatt[h];
496 }
497
498 }
499 /* find 4-fold degenerate sites at 3rd positions */
500 for(k=0,ndeg=0;k<2;k++) { /* two codons */
501 for(j=0,i1=c[k]-b[k][2]; j<4; j++)
502 if(j!=b[k][2] && GeneticCode[com.icode][i1+j]!=aa[k]) break;
503 if(aa[0]==aa[1] && j==4) ndeg++;
504 }
505 if (ndeg<2) continue;
506 F[1][b[0][2]*4+b[1][2]] += .5*com.fpatt[h];
507 F[1][b[1][2]*4+b[0][2]] += .5*com.fpatt[h];
508 } /* for (h) */
509 for(k=0; k<2; k++) { /* two kinds of sites */
510 /*
511 if(noisy>3) printf("\n%s:\n",sitestr[k]);
512 */
513 S[k] = sum(F[k],16);
514 if(S[k]<=0) { wk[k]=0; continue; }
515 for(j=0; j<16; j++) F[k][j]/=S[k];
516 P = (F[k][0*4+1]+F[k][2*4+3])*2;
517 Q = 1-(F[k][0*4+0]+F[k][1*4+1]+F[k][2*4+2]+F[k][3*4+3]) - P;
518 for(j=0; j<4; j++)
519 pi[j] = sum(F[k]+j*4,4);
520 DistanceF84(S[k], P,Q,pi, &ka[k], &t, NULL);
521 wk[k] = (ka[k]>0?S[k]:0);
522
523 /* matout(F0,F[k],4,4); matout(F0,pi,1,4); */
524 /*
525 if(noisy>3)
526 printf("\nSPQkt:%9.4f%9.5f%9.5f%9.4f%9.4f\n",S[k],P,Q,ka[k],t);
527 */
528 }
529 if(wk[0]+wk[1]==0) {
530 status = -1;
531 ka[0] = kdefault;
532 if(noisy>3) printf("\ngot no kappa! fix it at %.4f\n",ka[0]);
533 }
534 else
535 ka[0] = (ka[0]*wk[0]+ka[1]*wk[1])/(wk[0]+wk[1]);
536 com.kappa += ka[0]/(com.ns*(com.ns-1.)/2);
537 } /* for(js) */
538 } /* for(is) */
539
540 return (status);
541 }
542
543
CountSites(char z[],double pi[],double * Stot,double * Ntot,double fbS[],double fbN[])544 int CountSites(char z[],double pi[],double*Stot,double*Ntot,double fbS[],double fbN[])
545 {
546 /* This calculates the total numbers of synonymous and nonsynonymous sites
547 (Stot & Ntot) in the sequence z[] using com.kappa and pi[].
548 It also count the base frequencies at the synonymous and nonsynonymous
549 sites. Total number of sites is scaled to be equal to sequence length
550 even if some changes are to stop codons. Since pi[] is scaled to sum
551 to one, rates to stop codons are not considered.
552 The counting goes through the sequence codon by codon, and so is different
553 from the counting in codeml, which uses pi[] to count the sites.
554 */
555 int h, j,k, c[2],aa[2], b[3], by[3]={16,4,1};
556 double r, S,N, kappa=com.kappa;
557
558 *Stot = *Ntot = 0;
559 for(k=0; k<4; k++)
560 fbS[k] = fbN[k] = 0;
561 for (h=0; h<com.npatt; h++) {
562 c[0] = FROM61[z[h]];
563 b[0] = c[0]/16; b[1]=(c[0]%16)/4; b[2]=c[0]%4;
564 aa[0] = GeneticCode[com.icode][c[0]];
565 if (aa[0]==-1)
566 error2("stop codon");
567 for (j=0,S=N=0; j<3; j++) {
568 for(k=0; k<4; k++) { /* b[j] changes to k */
569 if (k==b[j]) continue;
570 c[1] = c[0]+(k-b[j])*by[j];
571 aa[1] = GeneticCode[com.icode][c[1]];
572 if(aa[1] == -1) continue;
573 r = pi[FROM64[c[1]]];
574 if (k+b[j]==1 || k+b[j]==5) r *= kappa; /* transition */
575 if (aa[0]==aa[1]) { S += r; fbS[b[j]] += r*com.fpatt[h]; }
576 else { N += r; fbN[b[j]] += r*com.fpatt[h]; }
577 }
578 }
579 *Stot += com.fpatt[h]*S;
580 *Ntot += com.fpatt[h]*N;
581 }
582 r = 3*com.ls/(*Stot+*Ntot); *Stot*=r; *Ntot*=r;
583 r = sum(fbS,4); for(k=0; k<4; k++) fbS[k] /= r;
584 r = sum(fbN,4); for(k=0; k<4; k++) fbN[k] /= r;
585 return (0);
586 }
587
588
GetPMatCodon(double P[],double t,double kappa,double omega,double space[])589 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[])
590 {
591 /* Get PMat=exp(Q*t) for weighting pathways
592 */
593 int nterms=100, n=com.ncode, ic1, ic2, i,j,k, aa[2],ndiff,pos=0,from[3],to[3];
594 double *Q=P, *U=space+n*n, *V=U+n*n, *Root=V+n*n, mr, spacesqrt[NCODE];
595
596 for(i=0; i<n*n; i++) Q[i] = 0;
597 for (i=0; i<n; i++) {
598 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
599 for(j=0; j<i; j++) {
600 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
601 aa[0] = GeneticCode[com.icode][ic1];
602 aa[1] = GeneticCode[com.icode][ic2];
603 if (aa[0]==-1 || aa[1]==-1) continue;
604 for (k=0,ndiff=0; k<3; k++)
605 if(from[k] != to[k]) { ndiff++; pos=k; }
606 if (ndiff!=1) continue;
607 Q[i*n+j] = 1;
608 if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0)
609 Q[i*n+j] *= kappa;
610 if(aa[0] != aa[1]) Q[i*n+j] *= omega;
611 Q[j*n+i] = Q[i*n+j];
612 }
613 }
614
615 for(i=0; i<n; i++) for(j=0; j<n; j++)
616 Q[i*n+j] *= com.pi[j];
617
618 for (i=0,mr=0; i<n; i++) {
619 Q[i*n+i] = -sum(Q+i*n,n);
620 mr -= com.pi[i]*Q[i*n+i];
621 }
622
623 eigenQREV(Q, com.pi, n, Root, U, V, spacesqrt);
624 for(i=0; i<n; i++) Root[i] /= mr;
625 PMatUVRoot(P, t, n, U, V, Root);
626 /*
627 testTransP(PMat, n);
628 fprintf(frub,"\a\nP(%.5f)\n", t);
629 for(i=0; i<n; i++,FPN(frub)) for(j=0; j<n; j++)
630 fprintf(frub, " %9.5g", PMat[i*n+j]);
631 fflush(frub);
632 */
633 return (0);
634 }
635
636
637
CountDiffs(char z1[],char z2[],double * Sdts,double * Sdtv,double * Ndts,double * Ndtv,double PMat[])638 int CountDiffs(char z1[],char z2[], double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[])
639 {
640 /* Count the numbers of synonymous and nonsynonymous differences between
641 sequences z1 and z2, weighting pathways with PMat. No weighting if PMat=NULL
642 Modified from difcodon()
643 dmark[i] (=0,1,2) is the i_th different codon position (i=0,1,ndiff).
644 step[j] (=0,1,2) is the codon position to be changed at step j (j=0,1,ndiff).
645 b[i][j] (=0,1,2,3) is the nucleotide at position j (0,1,2) in codon i (0,1)
646 sts,stv,nts,ntv are syn ts & tv and nonsyn ts & tv at a codon site.
647 stspath[k] stvpath[k] ntspath[k] ntvpath[k] are syn ts & tv and
648 nonsyn ts & tv differences on path k (k=2,6).
649 */
650 char str[4]=" ";
651 int n=com.ncode, h,i1,i2,i,k, transi, c[2],ct[2],aa[2], by[3]={16,4,1};
652 int dmark[3], step[3], b[2][3], bt1[3], bt2[3];
653 int ndiff, npath, nstop, stspath[6],stvpath[6],ntspath[6],ntvpath[6];
654 double sts,stv,nts,ntv; /* syn ts & tv, nonsyn ts & tv for 2 codons */
655 double ppath[6], sump,p;
656
657 *Sdts = *Sdtv = *Ndts = *Ndtv = 0;
658 for (h=0; h<com.npatt; h++) {
659 c[0] = FROM61[z1[h]];
660 c[1] = FROM61[z2[h]];
661 if (c[0]==c[1]) continue;
662 for(i=0; i<2; i++) {
663 b[i][0]=c[i]/16; b[i][1]=(c[i]%16)/4; b[i][2]=c[i]%4;
664 aa[i] = GeneticCode[com.icode][c[i]];
665 }
666 if (aa[0]==-1 || aa[1]==-1)
667 error2("stop codon in sequence.");
668 ndiff=0; sts=stv=nts=ntv=0;
669 for(k=0; k<3; k++) dmark[k] = -1;
670 for(k=0; k<3; k++) if(b[0][k] != b[1][k]) dmark[ndiff++] = k;
671 npath=1;
672 if(ndiff>1) npath = (ndiff==2 ? 2 : 6);
673 if (ndiff==1) {
674 transi = b[0][dmark[0]]+b[1][dmark[0]];
675 transi = (transi==1 || transi==5);
676 if (aa[0]==aa[1]) { if (transi) sts++; else stv++; }
677 else { if (transi) nts++; else ntv++; }
678 }
679 else { /* ndiff=2 or 3 */
680 if(debug==DIFF) {
681 printf("\n\nh=%d %s (%c) .. ", h+1,getcodon(str,c[0]),AAs[aa[0]]);
682 printf("%s (%c): ", getcodon(str,c[1]), AAs[aa[1]]);
683 }
684 nstop=0;
685 for(k=0; k<npath; k++) {
686 if(debug==DIFF) printf("\npath %d: ", k+1);
687
688 for(i1=0; i1<3; i1++) step[i1] = -1;
689 if (ndiff==2) {
690 step[0] = dmark[k];
691 step[1] = dmark[1-k];
692 }
693 else {
694 step[0] = k/2;
695 step[1] = k%2;
696 if (step[0]<=step[1]) step[1]++;
697 step[2] = 3-step[0]-step[1];
698 }
699 for(i1=0; i1<3; i1++) bt1[i1] = bt2[i1]=b[0][i1];
700 stspath[k] = stvpath[k] = ntspath[k] = ntvpath[k] = 0;
701 /* mutations along each path */
702 for (i1=0,ppath[k]=1; i1<ndiff; i1++) {
703 bt2[step[i1]] = b[1][step[i1]];
704 for (i2=0,ct[0]=ct[1]=0; i2<3; i2++) {
705 ct[0] += bt1[i2]*by[i2];
706 ct[1] += bt2[i2]*by[i2];
707 }
708 ppath[k] *= PMat[ FROM64[ct[0]]*n + FROM64[ct[1]] ];
709 for(i2=0; i2<2; i2++) aa[i2] = GeneticCode[com.icode][ct[i2]];
710
711 if(debug==DIFF) printf("%s (%c) %.5f: ", getcodon(str,ct[1]),AAs[aa[1]],PMat[ct[0]*n+ct[1]]);
712
713 if (aa[1]==-1) {
714 nstop++; ppath[k]=0; break;
715 }
716 transi = b[0][step[i1]]+b[1][step[i1]];
717 transi = (transi==1 || transi==5); /* transition? */
718
719 if(aa[0]==aa[1]) { if(transi) stspath[k]++; else stvpath[k]++; }
720 else { if(transi) ntspath[k]++; else ntvpath[k]++; }
721 for(i2=0; i2<3; i2++) bt1[i2] = bt2[i2];
722 }
723
724 if(debug==DIFF) printf(" p =%.9f", ppath[k]);
725
726 } /* for(k,npath) */
727 if (npath==nstop) { /* all paths through stop codons */
728 puts ("all paths through stop codons..");
729 if (ndiff==2) { nts=.5; ntv=1.5; }
730 else { nts=.5; ntv=2.5; }
731 }
732 else {
733 sump = sum(ppath,npath);
734 if(sump<1e-20) {
735 printf("\nsump=0, npath=%4d\nh=%2d ", npath, h+1);
736 printf("(%s ", getcodon(str,c[0]));
737 printf("%s)", getcodon(str,c[1]));
738 for(k=0; k<npath; k++) printf(" %9.6g", ppath[k]); FPN(F0);
739 matout(frub, PMat, n, n);
740 exit(-1);
741
742 /*
743 sump=1; FOR(k,npath) if(ppath[k]) ppath[k]=1./(npath-nstop);
744 */
745 }
746 for(k=0; k<npath; k++) {
747 p = ppath[k]/sump;
748 sts += stspath[k]*p;
749 stv += stvpath[k]*p;
750 nts += ntspath[k]*p;
751 ntv += ntvpath[k]*p;
752 }
753
754 if(debug==DIFF) {
755 for(k=0; k<npath; k++) printf("\n p =%.5f", ppath[k]/sump); FPN(F0);
756 printf(" syn ts & tv, nonsyn ts & tv:%9.5f%9.5f%9.5f%9.5f\n",sts,stv,nts,ntv);
757 }
758 }
759
760 if(debug==DIFF) getchar();
761
762 } /* if (ndiff) */
763 *Sdts += com.fpatt[h]*sts;
764 *Sdtv += com.fpatt[h]*stv;
765 *Ndts += com.fpatt[h]*nts;
766 *Ndtv += com.fpatt[h]*ntv;
767 } /* for (h) */
768 return (0);
769 }
770
771
DistanceF84(double n,double P,double Q,double pi[],double * k_HKY,double * t,double * SEt)772 int DistanceF84(double n, double P, double Q, double pi[],
773 double*k_HKY, double*t, double*SEt)
774 {
775 /* This calculates kappa and d from P (proportion of transitions) & Q
776 (proportion of transversions) & pi under F84.
777 When F84 fails, we try to use K80. When K80 fails, we try
778 to use JC69. When JC69 fails, we set distance t to maxt.
779 Variance formula under F84 is from Tateno et al. (1994), and briefly
780 checked against simulated data sets.
781 */
782 int failF84=0,failK80=0,failJC69=0;
783 double tc,ag, Y,R, a=0,b=0, A=-1,B=-1,C=-1, k_F84;
784 double Qsmall=min2(1e-10,0.1/n), maxkappa=999,maxt=99;
785
786 *k_HKY=-1;
787 Y=pi[0]+pi[1]; R=pi[2]+pi[3]; tc=pi[0]*pi[1]; ag=pi[2]*pi[3];
788 if (P+Q>1) { *t=maxt; *k_HKY=1; return(3); }
789 if (P<-1e-10 || Q<-1e-10 || fabs(Y+R-1)>1e-8) {
790 printf("\nPQYR & pi[]: %9.5f%9.5f%9.5f%9.5f",P,Q,Y,R);
791 matout(F0,pi,1,4);
792 error2("DistanceF84: input err.");
793 }
794 if(Q<Qsmall) failF84=failK80=1;
795 else if(Y<=0 || R<=0 || (tc<=0 && ag<=0)) failF84=1;
796 else {
797 A=tc/Y+ag/R; B=tc+ag; C=Y*R;
798 a=(2*B+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*C)) - P) / (2*A);
799 b=1-Q/(2*C);
800 if (a<=0 || b<=0) failF84=1;
801 }
802 if (!failF84) {
803 a=-.5*log(a); b=-.5*log(b);
804 if(b<=0) failF84=1;
805 else {
806 k_F84 = a/b-1;
807 *t = 4*b*(tc*(1+ k_F84/Y) + ag*(1+ k_F84/R)+C);
808 *k_HKY = (B + (tc/Y+ag/R)* k_F84)/B; /* k_F84=>k_HKY85 */
809 if(SEt) {
810 a = A*C/(A*C-C*P/2-(A-B)*Q/2);
811 b = A*(A-B)/(A*C-C*P/2-(A-B)*Q/2) - (A-B-C)/(C-Q/2);
812 *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
813 }
814 }
815 }
816 if(failF84 && !failK80) { /* try K80 */
817 if (noisy>=9) printf("\na=%.5f b=%.5f, use K80\n", a,b);
818 a=1-2*P-Q; b=1-2*Q;
819 if (a<=0 || b<=0) failK80=1;
820 else {
821 a=-log(a); b=-log(b);
822 if(b<=0) failK80=1;
823 else {
824 *k_HKY=(.5*a-.25*b)/(.25*b);
825 *t = .5*a+.25*b;
826 }
827 if(SEt) {
828 a=1/(1-2*P-Q); b=(a+1/(1-2*Q))/2;
829 *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
830 }
831 }
832 }
833 if(failK80) {
834 if((P+=Q)>=.75) { failJC69=1; P=.75*(n-1.)/n; }
835 *t = -.75*log(1-P*4/3.);
836 if(*t>maxt) *t=maxt;
837 if(SEt) {
838 *SEt = sqrt(9*P*(1-P)/n) / (3-4*P);
839 }
840 }
841 if(*k_HKY>maxkappa) *k_HKY=maxkappa;
842
843 return(failF84 + failK80 + failJC69);
844 }
845
846
847
848 #if 0
849
850 double dsdnREV (int is, int js, double space[])
851 {
852 /* This calculates ds and dn by recovering the Q*t matrix using the equation
853 F(t) = PI * P(t) = PI * exp(Q*t)
854 This is found not to work well and is not published.
855 space[64*64*5]
856 The code here is broken since I changed the coding. Codons are now coded 0, 1, ..., 60.
857 */
858 int n=com.ncode, i,j, h;
859 double *F=PMat, *Qt=F;
860 double *Root=space+n*n,*pi=Root+n, *U=pi+n,*V=U+n*n;
861 double *T1=V+n*n,*T2=T1+n*n, t, small=1e-6;
862
863 fprintf(frst,"\npi in model\n");
864 matout(frst,com.pi,1,n);
865 FOR(i,n*n) F[i]=0;
866 FOR (h,com.npatt) {
867 F[com.z[is][h]*n+com.z[js][h]]+=com.fpatt[h]/(2*com.ls);
868 F[com.z[js][h]*n+com.z[is][h]]+=com.fpatt[h]/(2*com.ls);
869 }
870 if(fabs(1-sum(F,n*n))>1e-6) error2("Sum F != 1 in dsdnREV");
871
872 FOR (i,n) {
873 pi[i]=sum(F+i*n, n);
874 /*
875 if (F[i*n+i]<=small || F[i*n+i]<pi[i]/4)
876 */
877 if (F[i*n+i]<=small) F[i*n+i]=1-pi[i]+F[i*n+i];
878 else abyx(1/pi[i], F+i*n, n);
879 }
880 if (eigen (1, F, n, Root, T1, U, V, T2)) error2 ("eigen jgl");
881 xtoy (U, V, n*n);
882 matinv (V, n, n, T1);
883
884 fprintf(frst,"\npi in data\n");
885 matout (frst, pi, 1, n); FPN(F0);
886 matout (frst, Root, 1, n);
887
888 FOR (i,n) {
889 if (Root[i]<=0)
890 printf (" Root %d:%10.4f", i+1, Root[i]);
891 Root[i]=log(Root[i]);
892 }
893 FOR (i,n) FOR (j,n) T1[i*n+j]=U[i*n+j]*Root[j];
894 matby (T1, V, Qt, n, n, n);
895 for (i=0,t=0; i<n; i++) t-=pi[i]*Qt[i*n+i];
896 if (t<=0) puts ("err: dsdnREV");
897
898 FOR(i,n*n) Qt[i]+=1e-8; /* remove negative numbers from rounding errors */
899
900 matout(frst,Qt,n,n);
901 printf("\nt = %.5f\n", t);
902
903 return (0);
904 }
905
906
907 #endif
908