1 /* calculate stats from results file using scalesws.c
2 */
3
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <math.h>
8
9 #define max(a,b) ((a) > (b) ? (a) : (b))
10 #define min(a,b) ((a) < (b) ? (a) : (b))
11
12 #define MAXBEST 50000
13 #define BIGNUM 1000000000
14
15 #ifdef BIGMEM
16 #define MAX_LLEN 200
17 #else
18 #define MAX_LLEN 100
19 #endif
20 #define LN_FACT 10.0
21
22 struct beststr {
23 int score; /* smith-waterman score */
24 int sscore; /* duplicate for compatibility with fasta */
25 float zscore;
26 float escore;
27 int n1;
28 long lseek; /* position in library file */
29 int cont; /* offset into sequence */
30 int frame;
31 int lib;
32 char libstr[13];
33 } *bbp, *bestptr, **bptr, *best;
34
35 FILE *outfd;
36
37 int nbest; /* number of sequences better than bestcut in best */
38 int bestcut=1; /* cut off for getting into MAXBEST */
39 int bestfull;
40
41 int nlib, onlib, ntt;
42 int dohist = 0;
43 int zsflag = 1;
44 int histflg = 1;
45 int outtty=1;
46 int llen=40;
47
48 long *hist; /* histogram of all score */
49 int histint, min_hist, max_hist, maxh;
50 extern long num_db_entries;
51 extern int *llen_hist;
52 extern float *score_sums, *score2_sums;
53 float zs_to_E(), zs_to_Ec(), find_z(), find_zm();
54 extern float ks_dev;
55 extern int ks_df;
56
57 int nshow=20, mshow=50, ashow= -1;
58 float e_cut=10.0;
59 int dnaseq = 0;
60
main(argc,argv)61 main(argc, argv)
62 int argc; char **argv;
63 {
64 FILE *fin;
65 char line[512];
66 int max, icol, iarg, i, qsfnum, lsfnum, n0, n1, s[3];
67 char libstr[20], *bp;
68 char bin_file[80];
69 FILE *bout=NULL;
70 float zscor, mu, var;
71
72 outtty = isatty(1);
73
74 if (argc < 2 ) {
75 fprintf(stderr," useage - res_stats -c col -r bin_file file\n");
76 exit(1);
77 }
78
79 bin_file[0]='\0';
80 icol = 1;
81 iarg = 1;
82 while (1) {
83 if (argv[iarg][0]=='-' && argv[iarg][1]=='c') {
84 sscanf(argv[iarg+1],"%d",&icol);
85 iarg += 2;
86 }
87 else if (argv[iarg][0]=='-' && argv[iarg][1]=='r') {
88 strncpy(bin_file,argv[iarg+1],sizeof(bin_file));
89 iarg += 2;
90 }
91 else break;
92 }
93
94 icol--;
95
96 if ((fin=fopen(argv[iarg],"r"))==NULL) {
97 fprintf(stderr," cannot open %s\n",argv[1]);
98 exit(1);
99 }
100
101 if (bin_file[0]!='\0' && ((bout=fopen(bin_file,"w"))==NULL)) {
102 fprintf(stderr,"cannot open %s for output\n",bin_file);
103 }
104
105 initbest(MAXBEST+1); /* +1 required for select() */
106 for (nbest=0; nbest<MAXBEST+1; nbest++)
107 bptr[nbest] = &best[nbest];
108 bptr++; best++;
109 best[-1].score= BIGNUM;
110
111 nlib = 0;
112 ntt = 0l;
113 nbest = 0;
114
115 /* read the best scores from the results file */
116
117 while (fgets(line,sizeof(line),fin)!=NULL) {
118 if (line[0]=='/' && line[1]=='*') {
119 fputs(line,stdout);
120 continue;
121 }
122 if (line[0]==';') {
123 if ((bp=strchr(line,'|'))!=NULL) qsfnum = atoi(bp+1);
124 else continue;
125 if ((bp=strchr(line,'('))!=NULL) n0 = atoi(bp+1);
126 else {
127 fprintf(stderr, "cannot find n0:\n %s\n",line);
128 continue;
129 }
130 }
131 else {
132 sscanf(line,"%s %d %d %d %d %d",
133 libstr,&lsfnum,&n1,&s[0],&s[1],&s[2]);
134 if (lsfnum==0 && n1==0) {
135 fputs(line,stderr);
136 continue;
137 }
138 nlib++;
139 ntt += n1;
140
141 if (dohist) addhistz(zscor=find_zm(s[icol],n1),n1);
142 else zscor = (float)s[icol];
143
144 if (nbest >= MAXBEST) {
145 if (!dohist) {
146 do_bout(bout,bptr,nbest);
147 process_hist(n0,bptr,nbest);
148 addhistz(zscor=find_zm(s[icol],n1),n1);
149 dohist = 1;
150 }
151 bestfull = nbest-MAXBEST/4;
152 selectz(bestfull-1,nbest);
153 bestcut = (int)(bptr[bestfull-1]->zscore+0.5);
154 nbest = bestfull;
155 }
156 bestptr = bptr[nbest];
157 bestptr->score = s[icol];
158 bestptr->sscore = s[icol];
159 bestptr->n1 = n1;
160 bestptr->lib = lsfnum;
161 bestptr->zscore = zscor;
162 strncpy(bestptr->libstr,libstr,12);
163 bestptr->libstr[12]='\0';
164 nbest++;
165 }
166 } /* done with reading results */
167
168 if (!dohist) {
169 if (nbest < 20) {
170 zsflag = 0;
171 histflg = 0;
172 }
173 else {
174 do_bout(bout,bptr,nbest);
175 process_hist(n0,bptr,nbest);
176 }
177 }
178
179 prhist(stdout); /* print histogram, statistics */
180
181 if (!zsflag) sortbest();
182 else {
183 sortbestz(bptr,nbest);
184 for (i=0; i<nbest; i++)
185 bptr[i]->escore = zs_to_E(bptr[i]->zscore,bptr[i]->n1);
186 }
187
188 outfd = stdout;
189 showbest(); /* display best matches */
190 }
191
initbest(nbest)192 initbest(nbest) /* allocate arrays for best sort */
193 int nbest;
194 {
195
196 if ((best=(struct beststr *)calloc((size_t)nbest,sizeof(struct beststr)))
197 == NULL) {fprintf(stderr,"cannot allocate best struct\n"); exit(1);}
198 if ((bptr=(struct beststr **)calloc((size_t)nbest,sizeof(struct beststr *)))
199 == NULL) {fprintf(stderr,"cannot allocate bptr\n"); exit(1);}
200 }
201
prhist(fd)202 prhist(fd)
203 FILE *fd;
204 {
205 int i,j;
206 long hl,hll, el, ell, ev, maxval, maxvalt;
207 char hline[80], pch;
208 int mh1, mht;
209 int dotsiz, ddotsiz,doinset;
210 float cur_e, prev_e, f_int;
211 float max_dev, x_tmp;
212 int n_chi_sq, cum_hl, max_i;
213
214 mh1 = maxh-1;
215 mht = (3*maxh-3)/4 - 1;
216
217 fprintf(fd,"\n");
218
219 if (nbest < 20) {
220 fprintf(fd,
221 "%7ld residues in %5ld sequences\n",
222 ntt,nlib);
223 return;
224 }
225 if (histflg && mh1 > 0) {
226 for (i=maxval=maxvalt=0; i<maxh; i++) {
227 if (hist[i] > maxval) maxval = hist[i];
228 if (i >= mht && hist[i]>maxvalt) maxvalt = hist[i];
229 }
230 max_dev = 0.0;
231 n_chi_sq = 0;
232 cum_hl = -hist[0];
233 dotsiz = (maxval-1)/60+1;
234 ddotsiz = (maxvalt-1)/50+1;
235 doinset = (ddotsiz < dotsiz && dotsiz > 2);
236
237 fprintf(fd,"\n one = represents %d library sequences\n",dotsiz);
238 if (doinset) fprintf(fd," for inset = represents %d library sequences\n",ddotsiz);
239 if (zsflag)
240 fprintf(fd,"\n opt E()\n");
241 else
242 fprintf(fd,"\n opt\n");
243
244 prev_e = 0.0;
245 for (i=0; i<=mh1; i++) {
246 pch = (i==mh1) ? '>' : ' ';
247 pch = (i==0) ? '<' : pch;
248 hll = hl = hist[i];
249 if (zsflag) {
250 cum_hl += hl;
251 f_int = (float)(i*histint + min_hist) + (float)histint/2.0;
252 cur_e = zs_to_Ec(f_int);
253 ev = el = ell = (long)(cur_e - prev_e + 0.5);
254 if (hl > 0 && i > 5 && i < (90-min_hist)/histint) {
255 x_tmp = fabs((double)cum_hl - cur_e);
256 if (x_tmp > max_dev) {
257 max_dev = x_tmp;
258 max_i = i;
259 }
260 n_chi_sq++;
261 }
262 if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
263 if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
264 fprintf(fd,"%c%3d %5d %5d:",
265 pch,(i<mh1)?(i)*histint+min_hist :
266 mh1*histint+min_hist,hl,ev);
267 }
268 else fprintf(fd,"%c%3d %5d :",
269 pch,(i<mh1)?(i)*histint+min_hist :
270 mh1*histint+min_hist,hl);
271
272 if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
273 if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
274 for (j=0; j<hl; j++) hline[j]='=';
275 if (zsflag) {
276 if (el <= hl ) {
277 if (el > 0) hline[el-1]='*';
278 hline[hl]='\0';
279 }
280 else {
281 for (j = hl; j < el; j++) hline[j]=' ';
282 hline[el-1]='*';
283 hline[el]='\0';
284 }
285 }
286 else hline[hl] = 0;
287
288 if (i >= mht&& doinset ) {
289 for (j = max(el,hl); j < 10; j++) hline[j]=' ';
290 hline[10]=':';
291 for (j = 11; j<11+hll; j++) hline[j]='=';
292 hline[11+hll]='\0';
293 if (zsflag) {
294 if (ell <= hll) hline[10+ell]='*';
295 else {
296 for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
297 hline[10+ell] = '*';
298 hline[11+ell] = '\0';
299 }
300 }
301 }
302
303 fprintf(fd,"%s\n",hline);
304 prev_e = cur_e;
305 }
306 }
307 fprintf(fd,
308 "%7ld residues in %5ld sequences\n",ntt,nlib);
309 if (zsflag) {
310 fprintf(fd," statistics extrapolated from %d to %ld sequences\n",
311 min(MAXBEST,nlib),num_db_entries);
312 if (histflg)
313 fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
314 max_dev/(float)cum_hl, n_chi_sq,max_i*histint+min_hist);
315 }
316 fprintf(fd," %4d scores better than %d saved\n",nbest,bestcut);
317
318 fflush(fd);
319 }
320
showbest()321 showbest()
322 {
323 int ib, istart, istop;
324 char bline[200], fmt[40], pad[200];
325 char rline[20];
326 int ntmp;
327 int lcont, ccont, loff;
328 int hcutoff;
329
330 sprintf(fmt,"%%-%ds (%%3d)",llen-10);
331
332 nshow = min(20,nbest);
333 mshow = min(20,nbest);
334
335 if (outtty) {
336 printf(" How many scores would you like to see? [%d] ",nshow);
337 fflush(stdout);
338 if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
339 if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow);
340 if (nshow<=0) nshow = min(20,nbest);
341 }
342 else nshow=mshow;
343
344 memset(pad,' ',llen-10);
345 pad[llen-31]='\0';
346 if (zsflag)
347 fprintf(outfd,"The best scores are:%s s-w Z-score E(%ld)\n",pad,nlib);
348 else
349 fprintf(outfd,"The best scores are:%s s-w\n",pad);
350
351 if (outfd != stdout)
352 if (zsflag)
353 fprintf(stdout,"The best scores are:%s s-w Z-score E(%ld)\n",pad,nlib);
354 else
355 fprintf(stdout,"The best scores are:%s s-w\n",pad);
356
357 istart = 0;
358 l1: istop = min(nbest,nshow);
359 for (ib=istart; ib<istop; ib++) {
360 bbp = bptr[ib];
361
362 if (!outtty && zsflag && bbp->escore > e_cut) {
363 nshow = ib;
364 goto done;
365 }
366
367 sprintf(bline,"%-12s %d",bbp->libstr,bbp->lib);
368 bline[13]='\0';
369
370 fprintf(outfd,fmt,bline,bbp->n1);
371
372 if (zsflag)
373 fprintf(outfd,"%4d %4.1f %6.2g\n",
374 bbp->score,bbp->zscore,
375 bbp->escore);
376 else
377 fprintf(outfd,"%4d\n",bbp->score);
378
379 if (outfd!=stdout) {
380 fprintf(stdout,fmt,bline,bbp->n1);
381 if (zsflag)
382 printf("%4d %4.1f %6.2g\n",
383 bbp->score,bbp->zscore,
384 bbp->escore);
385 else
386 printf("%4d\n",bbp->score);
387 }
388 }
389
390 fflush(outfd); if (outfd!=stdout) fflush(stdout);
391
392 if (outtty) {
393 printf(" More scores? [0] ");
394 fflush(stdout);
395 if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
396 ntmp = 0;
397 if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp);
398 if (ntmp<=0) ntmp = 0;
399 if (ntmp>0) {
400 istart = istop;
401 nshow += ntmp;
402 mshow += ntmp;
403 goto l1;
404 }
405 }
406 else if (zsflag && bbp->escore < e_cut) {
407 istart=istop;
408 nshow += 10;
409 goto l1;
410 }
411
412 done:
413 if (outfd!=stdout) fprintf(outfd,"\n");
414 }
415
selectz(k,n)416 selectz(k,n) /* k is rank in array */
417 int k,n;
418 {
419 int t, i, j, l, r;
420 float v;
421 struct beststr *tmptr;
422
423 l=0; r=n-1;
424
425 while ( r > l ) {
426 i = l-1;
427 j = r;
428 v = bptr[r]->zscore;
429 do {
430 while (bptr[++i]->zscore > v ) ;
431 while (bptr[--j]->zscore < v ) ;
432 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
433 } while (j > i);
434 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
435 if (i>=k) r = i-1;
436 if (i<=k) l = i+1;
437 }
438 }
439
sortbest()440 sortbest()
441 {
442 int cmps(), cmp1(), cmpa(), cmpz();
443 ksort(bptr,nbest,cmps);
444 }
445
sortbeste()446 sortbeste()
447 {
448 int cmpe();
449 ksort(bptr,nbest,cmpe);
450 }
451
sortbestz()452 sortbestz()
453 {
454 int cmpz();
455 ksort(bptr,nbest,cmpz);
456 }
457
458 cmps(ptr1,ptr2)
459 struct beststr *ptr1, *ptr2;
460 {
461 if (ptr1->score < ptr2->score) return (1);
462 else if (ptr1->score > ptr2->score) return (-1);
463 else return (0);
464 }
465
466 cmpe(ptr1,ptr2)
467 struct beststr *ptr1, *ptr2;
468 {
469 if (ptr1->escore < ptr2->escore) return (-1);
470 else if (ptr1->escore > ptr2->escore) return (1);
471 else return (0);
472 }
473
474 cmpz(ptr1,ptr2)
475 struct beststr *ptr1, *ptr2;
476 {
477 if (ptr1->zscore < ptr2->zscore) return (1);
478 else if (ptr1->zscore > ptr2->zscore) return (-1);
479 else return (0);
480 }
481
ksort(v,n,comp)482 ksort(v,n,comp)
483 char *v[]; int n, (*comp)();
484 {
485 int gap, i, j;
486 char *tmp;
487
488 for (gap=n/2; gap>0; gap/=2)
489 for (i=gap; i<n; i++)
490 for (j=i-gap; j>=0; j -= gap) {
491 if ((*comp)(v[j],v[j+gap]) <=0)
492 break;
493 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
494 }
495 }
496
do_bout(FILE * bout,struct beststr ** bptr,int nbest)497 do_bout(FILE *bout,struct beststr **bptr, int nbest)
498 {
499 int i, min_hist, max_hist;
500 float mu, var;
501
502 if (bout==NULL) return;
503
504 inithist();
505 for (i = 0; i<nbest; i++)
506 addhist(bptr[i]->sscore,bptr[i]->n1);
507
508 for (i=0; i<MAX_LLEN; i++)
509 if (llen_hist[i]>0) {
510 min_hist=i;
511 break;
512 }
513
514 for (i=MAX_LLEN-1; i>=0; i--)
515 if (llen_hist[i]>0) {
516 max_hist=i;
517 break;
518 }
519
520 for (i=min_hist; i<=max_hist; i++) {
521 mu=(float)score_sums[i]/(float)llen_hist[i];
522 if (llen_hist[i]>1) {
523 var = ((float)score2_sums[i]-(float)llen_hist[i]*mu*mu)/
524 (float)(llen_hist[i]-1);
525
526 fprintf(bout,"%d\t%d\t%.1f\t%.1f\t%.1f\t%.4f\t%.4f\n",
527 i,llen_hist[i],exp(((double)(i))/LN_FACT),
528 score_sums[i],score2_sums[i],mu,var);
529 }
530 }
531 free_hist();
532 fclose(bout);
533 }
534