1 /* calculate stats from results file using scalesws.c
2    */
3 
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <math.h>
8 
9 #define max(a,b) ((a) > (b) ? (a) : (b))
10 #define min(a,b) ((a) < (b) ? (a) : (b))
11 
12 #define MAXBEST 50000
13 #define BIGNUM 1000000000
14 
15 #ifdef BIGMEM
16 #define MAX_LLEN 200
17 #else
18 #define MAX_LLEN 100
19 #endif
20 #define LN_FACT 10.0
21 
22 struct beststr {
23   int score;	/* smith-waterman score */
24   int sscore;	/* duplicate for compatibility with fasta */
25   float zscore;
26   float escore;
27   int n1;
28   long lseek;	/* position in library file */
29   int cont;	/* offset into sequence */
30   int frame;
31   int lib;
32   char libstr[13];
33 } *bbp, *bestptr, **bptr, *best;
34 
35 FILE *outfd;
36 
37 int nbest;	/* number of sequences better than bestcut in best */
38 int bestcut=1; 	/* cut off for getting into MAXBEST */
39 int bestfull;
40 
41 int nlib, onlib, ntt;
42 int dohist = 0;
43 int zsflag = 1;
44 int histflg = 1;
45 int outtty=1;
46 int llen=40;
47 
48 long *hist;		/* histogram of all score */
49 int histint, min_hist, max_hist, maxh;
50 extern long num_db_entries;
51 extern int *llen_hist;
52 extern float *score_sums, *score2_sums;
53 float zs_to_E(), zs_to_Ec(), find_z(), find_zm();
54 extern float ks_dev;
55 extern int ks_df;
56 
57 int nshow=20, mshow=50, ashow= -1;
58 float e_cut=10.0;
59 int dnaseq = 0;
60 
main(argc,argv)61 main(argc, argv)
62      int argc; char **argv;
63 {
64   FILE *fin;
65   char line[512];
66   int max, icol, iarg, i, qsfnum, lsfnum, n0, n1, s[3];
67   char libstr[20], *bp;
68   char bin_file[80];
69   FILE *bout=NULL;
70   float zscor, mu, var;
71 
72   outtty = isatty(1);
73 
74   if (argc < 2 ) {
75     fprintf(stderr," useage - res_stats -c col -r bin_file file\n");
76     exit(1);
77   }
78 
79   bin_file[0]='\0';
80   icol = 1;
81   iarg = 1;
82   while (1) {
83     if (argv[iarg][0]=='-' && argv[iarg][1]=='c') {
84       sscanf(argv[iarg+1],"%d",&icol);
85       iarg += 2;
86     }
87     else if (argv[iarg][0]=='-' && argv[iarg][1]=='r') {
88       strncpy(bin_file,argv[iarg+1],sizeof(bin_file));
89       iarg += 2;
90     }
91     else break;
92   }
93 
94   icol--;
95 
96   if ((fin=fopen(argv[iarg],"r"))==NULL) {
97     fprintf(stderr," cannot open %s\n",argv[1]);
98     exit(1);
99   }
100 
101   if (bin_file[0]!='\0' && ((bout=fopen(bin_file,"w"))==NULL)) {
102     fprintf(stderr,"cannot open %s for output\n",bin_file);
103   }
104 
105   initbest(MAXBEST+1);	/* +1 required for select() */
106   for (nbest=0; nbest<MAXBEST+1; nbest++)
107     bptr[nbest] = &best[nbest];
108   bptr++; best++;
109   best[-1].score= BIGNUM;
110 
111   nlib =  0;
112   ntt = 0l;
113   nbest = 0;
114 
115   /* read the best scores from the results file */
116 
117   while (fgets(line,sizeof(line),fin)!=NULL) {
118     if (line[0]=='/' && line[1]=='*') {
119       fputs(line,stdout);
120       continue;
121     }
122     if (line[0]==';') {
123       if ((bp=strchr(line,'|'))!=NULL) qsfnum = atoi(bp+1);
124       else continue;
125       if ((bp=strchr(line,'('))!=NULL) n0 = atoi(bp+1);
126       else {
127 	fprintf(stderr, "cannot find n0:\n %s\n",line);
128 	continue;
129       }
130     }
131     else {
132 	sscanf(line,"%s %d %d %d %d %d",
133 	       libstr,&lsfnum,&n1,&s[0],&s[1],&s[2]);
134 	if (lsfnum==0 && n1==0) {
135 	  fputs(line,stderr);
136 	  continue;
137 	}
138 	nlib++;
139 	ntt += n1;
140 
141 	if (dohist) addhistz(zscor=find_zm(s[icol],n1),n1);
142 	else zscor = (float)s[icol];
143 
144 	if (nbest >= MAXBEST) {
145 	  if (!dohist) {
146 	    do_bout(bout,bptr,nbest);
147 	    process_hist(n0,bptr,nbest);
148 	    addhistz(zscor=find_zm(s[icol],n1),n1);
149 	    dohist = 1;
150 	  }
151 	  bestfull = nbest-MAXBEST/4;
152 	  selectz(bestfull-1,nbest);
153 	  bestcut = (int)(bptr[bestfull-1]->zscore+0.5);
154 	  nbest = bestfull;
155 	}
156 	bestptr = bptr[nbest];
157 	bestptr->score = s[icol];
158 	bestptr->sscore = s[icol];
159 	bestptr->n1 = n1;
160 	bestptr->lib = lsfnum;
161 	bestptr->zscore = zscor;
162 	strncpy(bestptr->libstr,libstr,12);
163 	bestptr->libstr[12]='\0';
164 	nbest++;
165     }
166   }	/* done with reading results */
167 
168   if (!dohist) {
169     if (nbest < 20) {
170       zsflag = 0;
171       histflg = 0;
172     }
173     else {
174       do_bout(bout,bptr,nbest);
175       process_hist(n0,bptr,nbest);
176     }
177   }
178 
179   prhist(stdout);		/* print histogram, statistics */
180 
181   if (!zsflag) sortbest();
182   else {
183     sortbestz(bptr,nbest);
184     for (i=0; i<nbest; i++)
185       bptr[i]->escore = zs_to_E(bptr[i]->zscore,bptr[i]->n1);
186   }
187 
188   outfd = stdout;
189   showbest();	/* display best matches */
190 }
191 
initbest(nbest)192 initbest(nbest)		/* allocate arrays for best sort */
193      int nbest;
194 {
195 
196   if ((best=(struct beststr *)calloc((size_t)nbest,sizeof(struct beststr)))
197       == NULL) {fprintf(stderr,"cannot allocate best struct\n"); exit(1);}
198   if ((bptr=(struct beststr **)calloc((size_t)nbest,sizeof(struct beststr *)))
199       == NULL) {fprintf(stderr,"cannot allocate bptr\n"); exit(1);}
200 }
201 
prhist(fd)202 prhist(fd)
203 	FILE *fd;
204 {
205   int i,j;
206   long hl,hll, el, ell, ev, maxval, maxvalt;
207   char hline[80], pch;
208   int mh1, mht;
209   int dotsiz, ddotsiz,doinset;
210   float cur_e, prev_e, f_int;
211   float max_dev, x_tmp;
212   int n_chi_sq, cum_hl, max_i;
213 
214   mh1 = maxh-1;
215   mht = (3*maxh-3)/4 - 1;
216 
217   fprintf(fd,"\n");
218 
219   if (nbest < 20) {
220     fprintf(fd,
221 	    "%7ld residues in %5ld sequences\n",
222 	    ntt,nlib);
223     return;
224   }
225   if (histflg && mh1 > 0) {
226     for (i=maxval=maxvalt=0; i<maxh; i++) {
227       if (hist[i] > maxval) maxval = hist[i];
228       if (i >= mht &&  hist[i]>maxvalt) maxvalt = hist[i];
229     }
230     max_dev = 0.0;
231     n_chi_sq = 0;
232     cum_hl = -hist[0];
233     dotsiz = (maxval-1)/60+1;
234     ddotsiz = (maxvalt-1)/50+1;
235     doinset = (ddotsiz < dotsiz && dotsiz > 2);
236 
237     fprintf(fd,"\n one = represents %d library sequences\n",dotsiz);
238     if (doinset) fprintf(fd," for inset = represents %d library sequences\n",ddotsiz);
239     if (zsflag)
240       fprintf(fd,"\n       opt      E()\n");
241     else
242       fprintf(fd,"\n     opt\n");
243 
244     prev_e =  0.0;
245     for (i=0; i<=mh1; i++) {
246       pch = (i==mh1) ? '>' : ' ';
247       pch = (i==0) ? '<' : pch;
248       hll = hl = hist[i];
249       if (zsflag) {
250 	cum_hl += hl;
251 	f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
252 	cur_e = zs_to_Ec(f_int);
253 	ev = el = ell = (long)(cur_e - prev_e + 0.5);
254 	if (hl > 0  && i > 5 && i < (90-min_hist)/histint) {
255 	  x_tmp  = fabs((double)cum_hl - cur_e);
256 	  if (x_tmp > max_dev) {
257 	    max_dev = x_tmp;
258 	    max_i = i;
259 	  }
260 	  n_chi_sq++;
261 	}
262 	if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
263 	if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
264 	fprintf(fd,"%c%3d %5d %5d:",
265 		pch,(i<mh1)?(i)*histint+min_hist :
266 		mh1*histint+min_hist,hl,ev);
267       }
268       else fprintf(fd,"%c%3d %5d :",
269 		   pch,(i<mh1)?(i)*histint+min_hist :
270 		   mh1*histint+min_hist,hl);
271 
272       if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
273       if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
274       for (j=0; j<hl; j++) hline[j]='=';
275       if (zsflag) {
276 	if (el <= hl ) {
277 	  if (el > 0) hline[el-1]='*';
278 	  hline[hl]='\0';
279 	}
280 	else {
281 	  for (j = hl; j < el; j++) hline[j]=' ';
282 	  hline[el-1]='*';
283 	  hline[el]='\0';
284 	}
285       }
286       else hline[hl] = 0;
287 
288       if (i >= mht&& doinset ) {
289 	for (j = max(el,hl); j < 10; j++) hline[j]=' ';
290 	hline[10]=':';
291 	for (j = 11; j<11+hll; j++) hline[j]='=';
292 	hline[11+hll]='\0';
293 	if (zsflag) {
294 	  if (ell <= hll) hline[10+ell]='*';
295 	  else {
296 	    for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
297 	    hline[10+ell] = '*';
298 	    hline[11+ell] = '\0';
299 	  }
300 	}
301       }
302 
303       fprintf(fd,"%s\n",hline);
304       prev_e = cur_e;
305     }
306   }
307   fprintf(fd,
308 	  "%7ld residues in %5ld sequences\n",ntt,nlib);
309   if (zsflag) {
310     fprintf(fd," statistics extrapolated from %d to %ld sequences\n",
311 	    min(MAXBEST,nlib),num_db_entries);
312     if (histflg)
313       fprintf(fd," Kolmogorov-Smirnov  statistic: %6.4f (N=%d) at %3d\n",
314 	    max_dev/(float)cum_hl, n_chi_sq,max_i*histint+min_hist);
315   }
316   fprintf(fd," %4d scores better than %d saved\n",nbest,bestcut);
317 
318   fflush(fd);
319 }
320 
showbest()321 showbest()
322   {
323     int ib, istart, istop;
324     char bline[200], fmt[40], pad[200];
325     char rline[20];
326     int ntmp;
327     int lcont, ccont, loff;
328     int hcutoff;
329 
330     sprintf(fmt,"%%-%ds (%%3d)",llen-10);
331 
332     nshow = min(20,nbest);
333     mshow = min(20,nbest);
334 
335     if (outtty) {
336       printf(" How many scores would you like to see? [%d] ",nshow);
337       fflush(stdout);
338       if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
339       if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow);
340       if (nshow<=0) nshow = min(20,nbest);
341     }
342     else nshow=mshow;
343 
344     memset(pad,' ',llen-10);
345     pad[llen-31]='\0';
346     if (zsflag)
347       fprintf(outfd,"The best scores are:%s s-w Z-score E(%ld)\n",pad,nlib);
348     else
349       fprintf(outfd,"The best scores are:%s s-w\n",pad);
350 
351     if (outfd != stdout)
352       if (zsflag)
353 	fprintf(stdout,"The best scores are:%s s-w Z-score E(%ld)\n",pad,nlib);
354       else
355 	fprintf(stdout,"The best scores are:%s s-w\n",pad);
356 
357     istart = 0;
358   l1:	istop = min(nbest,nshow);
359   for (ib=istart; ib<istop; ib++) {
360     bbp = bptr[ib];
361 
362     if (!outtty && zsflag && bbp->escore > e_cut) {
363       nshow = ib;
364       goto done;
365     }
366 
367     sprintf(bline,"%-12s %d",bbp->libstr,bbp->lib);
368     bline[13]='\0';
369 
370     fprintf(outfd,fmt,bline,bbp->n1);
371 
372     if (zsflag)
373       fprintf(outfd,"%4d %4.1f %6.2g\n",
374 	      bbp->score,bbp->zscore,
375 	      bbp->escore);
376     else
377       fprintf(outfd,"%4d\n",bbp->score);
378 
379     if (outfd!=stdout) {
380       fprintf(stdout,fmt,bline,bbp->n1);
381       if (zsflag)
382 	printf("%4d %4.1f %6.2g\n",
383 	       bbp->score,bbp->zscore,
384 	       bbp->escore);
385       else
386 	printf("%4d\n",bbp->score);
387     }
388   }
389 
390   fflush(outfd); if (outfd!=stdout) fflush(stdout);
391 
392   if (outtty) {
393     printf(" More scores? [0] ");
394     fflush(stdout);
395     if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
396     ntmp = 0;
397     if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp);
398     if (ntmp<=0) ntmp = 0;
399     if (ntmp>0) {
400       istart = istop;
401       nshow += ntmp;
402       mshow += ntmp;
403       goto l1;
404     }
405   }
406   else if (zsflag && bbp->escore < e_cut) {
407     istart=istop;
408     nshow += 10;
409     goto l1;
410   }
411 
412   done:
413   if (outfd!=stdout) fprintf(outfd,"\n");
414 }
415 
selectz(k,n)416 selectz(k,n)	/* k is rank in array */
417      int k,n;
418 {
419   int t, i, j, l, r;
420   float v;
421   struct beststr *tmptr;
422 
423   l=0; r=n-1;
424 
425   while ( r > l ) {
426     i = l-1;
427     j = r;
428     v = bptr[r]->zscore;
429     do {
430       while (bptr[++i]->zscore > v ) ;
431       while (bptr[--j]->zscore < v ) ;
432       tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
433     } while (j > i);
434     bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
435     if (i>=k) r = i-1;
436     if (i<=k) l = i+1;
437   }
438 }
439 
sortbest()440 sortbest()
441 {
442   int cmps(), cmp1(), cmpa(), cmpz();
443   ksort(bptr,nbest,cmps);
444 }
445 
sortbeste()446 sortbeste()
447 {
448   int cmpe();
449   ksort(bptr,nbest,cmpe);
450 }
451 
sortbestz()452 sortbestz()
453 {
454   int cmpz();
455   ksort(bptr,nbest,cmpz);
456 }
457 
458 cmps(ptr1,ptr2)
459      struct beststr *ptr1, *ptr2;
460 {
461   if (ptr1->score < ptr2->score) return (1);
462   else if (ptr1->score > ptr2->score) return (-1);
463   else return (0);
464 }
465 
466 cmpe(ptr1,ptr2)
467      struct beststr *ptr1, *ptr2;
468 {
469   if (ptr1->escore < ptr2->escore) return (-1);
470   else if (ptr1->escore > ptr2->escore) return (1);
471   else return (0);
472 }
473 
474 cmpz(ptr1,ptr2)
475      struct beststr *ptr1, *ptr2;
476 {
477   if (ptr1->zscore < ptr2->zscore) return (1);
478   else if (ptr1->zscore > ptr2->zscore) return (-1);
479   else return (0);
480 }
481 
ksort(v,n,comp)482 ksort(v,n,comp)
483      char *v[]; int n, (*comp)();
484 {
485   int gap, i, j;
486   char *tmp;
487 
488   for (gap=n/2; gap>0; gap/=2)
489     for (i=gap; i<n; i++)
490       for (j=i-gap; j>=0; j -= gap) {
491 	if ((*comp)(v[j],v[j+gap]) <=0)
492 	  break;
493 	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
494       }
495 }
496 
do_bout(FILE * bout,struct beststr ** bptr,int nbest)497 do_bout(FILE *bout,struct beststr **bptr, int nbest)
498 {
499   int i, min_hist, max_hist;
500   float mu, var;
501 
502   if (bout==NULL) return;
503 
504   inithist();
505   for (i = 0; i<nbest; i++)
506     addhist(bptr[i]->sscore,bptr[i]->n1);
507 
508   for (i=0; i<MAX_LLEN; i++)
509     if (llen_hist[i]>0) {
510       min_hist=i;
511       break;
512     }
513 
514   for (i=MAX_LLEN-1; i>=0; i--)
515     if (llen_hist[i]>0) {
516       max_hist=i;
517       break;
518     }
519 
520   for (i=min_hist; i<=max_hist; i++) {
521     mu=(float)score_sums[i]/(float)llen_hist[i];
522     if (llen_hist[i]>1) {
523       var = ((float)score2_sums[i]-(float)llen_hist[i]*mu*mu)/
524 	(float)(llen_hist[i]-1);
525 
526       fprintf(bout,"%d\t%d\t%.1f\t%.1f\t%.1f\t%.4f\t%.4f\n",
527 	      i,llen_hist[i],exp(((double)(i))/LN_FACT),
528 	      score_sums[i],score2_sums[i],mu,var);
529     }
530   }
531   free_hist();
532   fclose(bout);
533 }
534