1 /* urss.c Sept, 1991
2 copyright (c) 1984,1987,1988,1991 William R. Pearson
3
4 urss.c urss.c is a version of urdf.c that works with the
5 Smith-Waterman algorithm
6
7 Nov, 1991 include -n option to force DNA sequences
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ctype.h>
14
15 char *refstr="\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448;\n T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197\n";
16 char *verstr="version 2.0u64, May, 1998";
17
18 #ifdef __MWERKS__
19 #include <Types.h>
20 #include <StandardFile.h>
21 StandardFileReply freply;
22 Point wpos;
23 int tval;
24 char prompt[256];
25 #include <sioux.h>
26 #define getenv mgetenv
27 #endif
28
29 #define NO 0
30
31 #ifndef BIGMEM
32 #define QFILE_SIZE 40
33 #define MAXTST 2000 /* longest test sequence */
34 #define MAXLIB 10000
35 #define MAXDIAG (MAXTST+MAXLIB)
36 #else
37 #define QFILE_SIZE 256
38 #define MAXTST 10000
39 #define MAXLIB 50000
40 #define MAXDIAG (MAXTST+MAXLIB)
41 #endif
42
43 #define MAXHIST 201 /* number of histogram divisions */
44
45 FILE *outfd,*fdata; /* fd for output file */
46 int smark[4];
47 long sq0off=1, sq1off=1;
48 long loffset = 0l; /* offset into sequence */
49
50
51 /* globals for matching */
52
53 long lmark; /* position in library file from ftell() */
54 int nlib;
55 long ntt; /* number of library sequences, number of
56 residues scanned*/
57 char libstr[21]; /* partial title from library sequence */
58
59 char *aa0, *aa10, *aa1; /* amino acid sequence data */
60 int maxn; /* max space for lib sequence (MAXDIAG-n0) */
61 int n0, n1, nd, noff; /* length of aa0, length of aa1, n0+n1,
62 diagonal offset */
63
64 int *sscor;
65
66 double K_s, Lambda_s;
67 double find_Evalue();
68 double find_Pvalue();
69
70 int nbest; /* number of sequences better than bestcut in best */
71 int bestcut; /* cut off for getting into MAXBEST */
72
73 int histint=2;
74 int bestscale=200;
75 int bkfact=5;
76 int scfact=4;
77 int bktup=2;
78 int ktmax=2;
79 int bestmax=50;
80 int bestoff=27; /* values for calculating bestcut */
81 int pamfact = 1; /* flag for using pam values for kfact */
82 int dnaseq = 0; /* true if DNA query sequence */
83 int histflag=1;
84 int dataflg=0;
85 char tmpfname[40];
86
87 int nsav, lowscor; /* number of saved runs, worst saved
88 run, score of worst saved run */
89
90 int hist[MAXHIST]; /* histogram of all score */
91 int lmax, lmin;
92 int nmean; /* number of scores averaged in mean */
93
94 int nshow; char rline[20],sline[20];
95 char resfile[QFILE_SIZE];
96
97 int showall;
98 long tstart, tscan, tdone, sstime();
99
100 #include "upam.gbl" /* includes pam array */
101
102 extern int optind;
103 char *getenv(), *smptr, *cptr; /* scoring matrix env */
104
105 extern int outtty;
106 char smstr[QFILE_SIZE];
107 int wflag = -1;
108 int wsiz;
109
110 #ifdef __MWERKS__
111 /* short ouvRef, q0vRef, q1vRef; */
112 FSSpec ouSpec, q0Spec, q1Spec;
113 OSErr error;
114 #define IntroDID 400 /* LFASTA */
115 #endif
116 char *iprompt0=" PRSS compares a test sequence to a shuffled sequence\n";
117 char *iprompt1=" test sequence file name: ";
118 char *iprompt2=" sequence to shuffle: ";
119
main(argc,argv)120 main(argc, argv)
121 int argc; char **argv;
122 {
123 char tname[QFILE_SIZE], lname[QFILE_SIZE], qline[20];
124 char ttitle[60], ltitle[60];
125 char *bp;
126 int score, score0; /* scores calculated */
127 int icnt, rcnt; /* number of random shuffles */
128 int i;
129
130 #ifdef UNIX
131 outtty=isatty(1);
132 #endif
133
134 #ifdef __MWERKS__
135 SIOUXSettings.asktosaveonclose=TRUE;
136 SIOUXSettings.showstatusline=FALSE;
137 SIOUXSettings.autocloseonquit=TRUE;
138
139 argc = ccommand(&argv);
140 if (GetResource('DLOG',IntroDID)==0L && OpenResFile("\pFASTA.rsrc")<0) {
141 SysBeep(100); fprintf(stderr," WARNING FASTA.rsrc file could not be found\n");
142 }
143 InitEvent();
144 /* GetVol((unsigned char *)prompt,&ouvRef); */
145 error=HGetVol(NULL,&ouSpec.vRefNum, &ouSpec.parID);
146 if (error != noErr) {
147 fprintf(stderr," cannot get current directory\n");
148 exit(1);
149 }
150 wpos.h=50; wpos.v=100;
151 #endif
152
153 if ((aa0=calloc(MAXTST+MAXLIB,sizeof(char)))==0) {
154 printf(" cannot allocate sequence array\n");
155 exit(1);
156 }
157 maxn = MAXTST+MAXLIB;
158 if ((aa10=calloc(MAXLIB,sizeof(char)))==0) {
159 printf(" cannot allocate sequence array\n");
160 exit(1);
161 }
162
163 initenv(argc,argv);
164
165 if (argc-optind < 3) {
166 #ifndef __MWERKS__
167 fputs(iprompt0,stdout);
168 fprintf(stdout," %s%s\n",verstr,refstr);
169 l1: fputs(iprompt1,stdout);
170 fflush(stdout);
171 if (fgets(tname,sizeof(tname),stdin)==NULL) exit(0);
172 if (tname[strlen(tname)-1]=='\n') tname[strlen(tname)-1]='\0';
173 if (tname[0]=='\0') goto l1;
174 #else
175 NIntroDlog(IntroDID,iprompt0,verstr,refstr,"\0");
176
177 l1: SFileDlog(iprompt1,&freply);
178 if (freply.sfGood==TRUE) {
179 PtoCstr((unsigned char *)freply.sfFile.name);
180 strcpy(tname,(char *)freply.sfFile.name);
181 /* q0vRef=freply.vRefNum;
182 SetVol(NULL,q0vRef);
183 */
184 q1Spec.vRefNum = q0Spec.vRefNum = freply.sfFile.vRefNum;
185 q1Spec.parID = q0Spec.parID = freply.sfFile.parID;
186 HSetVol(NULL,q0Spec.vRefNum,q0Spec.parID);
187 }
188 else exit(0);
189 #endif
190 if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
191 printf(" %s : sequence not found\n",tname);
192 goto l1;
193 }
194 resetp(dnaseq);
195
196 #ifndef __MWERKS__
197 l2: fputs(iprompt2,stdout);
198 fflush(stdout);
199 if (fgets(lname,sizeof(lname),stdin)==NULL) exit(1);
200 if (lname[strlen(lname)-1]=='\n') lname[strlen(lname)-1]='\0';
201 if (*lname==0) goto l2;
202 #else
203 l2: SFileDlog(iprompt2,&freply);
204 if (freply.sfGood==TRUE) {
205 PtoCstr((unsigned char *)freply.sfFile.name);
206 strcpy(lname,(char *)freply.sfFile.name);
207 /* q1vRef=freply.vRefNum;
208 SetVol(NULL,q1vRef);
209 */
210 q1Spec.vRefNum = freply.sfFile.vRefNum;
211 q1Spec.parID = freply.sfFile.parID;
212 HSetVol(NULL,q1Spec.vRefNum,q1Spec.parID);
213 }
214 else exit(0);
215 #endif
216
217 printf(" number of random shuffles? [500] ");
218 fgets(qline,sizeof(qline),stdin);
219 rcnt = 500;
220 if (qline[0]!='\0' && qline[0]!='\n') {
221 sscanf(qline,"%d",&rcnt);
222 if (rcnt < 50) rcnt = 100;
223 }
224
225 if (wflag<=0) {
226 printf(" local (window) (w) or uniform (u) shuffle [u]? ");
227 fgets(qline,sizeof(qline),stdin);
228 if ((bp=strchr(qline,'\n'))!=NULL) *bp='\0';
229 }
230 else qline[0]='\0';
231 if (tolower(qline[0])=='w' || wflag==1) {
232 wflag = 1;
233 wsiz = 10;
234 printf(" local shuffle window size [10] ");
235 fgets(qline,sizeof(qline),stdin);
236 if (qline[0]!='\0' && qline[0]!='\n') {
237 sscanf(qline,"%d",&wsiz);
238 if (wsiz <= 2) wsiz = 10;
239 }
240 }
241 else wflag=0;
242 }
243 else {
244 fputs(iprompt0,stdout);
245 fprintf(stdout," %s%s\n",verstr,refstr);
246 strncpy(tname,argv[optind+1],sizeof(tname));
247 if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
248 printf(" %s : sequence not found\n",tname);
249 exit(1);
250 }
251 resetp(dnaseq);
252 strncpy(lname,argv[optind+2],sizeof(lname));
253 rcnt = 100;
254 if (argc-optind>3) sscanf(argv[optind+3],"%d",&rcnt);
255 }
256
257 if ((sscor=(int *)calloc(rcnt, sizeof (int)))==NULL) {
258 fprintf(stderr," cannot allocate storage %d\n",rcnt);
259 exit(1);
260 }
261
262 printf(" %s : %4d %s\n",tname, n0, sqnam);
263 gettitle(tname,ttitle,50);
264 printf("%s\n",ttitle);
265
266 aa1 = aa0 + n0 + 2;
267 maxn -= n0 + 3;
268 if (maxn > MAXLIB) maxn = MAXLIB;
269
270 if ((n1=getseq(lname,aa10,maxn,&dnaseq))==0) {
271 printf(" %s : %s sequence not found\n",lname,sqtype);
272 exit(1);
273 }
274 printf(" %s : %4d %s\n",lname, n1, sqnam);
275 strncpy(libstr,lname,12);
276 gettitle(lname,ltitle,50);
277 printf("%s\n",ltitle);
278
279 initpam2(); /* convert 1-d pam to 2-d pam2 */
280
281 initparm();
282
283 tstart = sstime();
284
285 if (dataflg==1 && (fdata = fopen(tmpfname,"w"))!=NULL)
286 fprintf(fdata,"%.50s\n",ltitle);
287
288 inithist(); /* initialize histogram, mean, sd */
289 if (wsiz > n1) wsiz = n1;
290
291 for (i=0; i<n1; i++) aa1[i]=aa10[i];
292 aa1[n1]= -1;
293
294 score0 = smatch(aa0,n0,aa1,n1,NO);
295 if (fdata) {
296 fprintf(fdata,"%-12s %4d %6d %4d %4d %4d %8ld\n",
297 libstr,0,n1,score0,0,0,0);
298 fflush(fdata);
299 }
300
301 nlib = 0; /* counts number of sequences */
302 ntt = 0; /* counts number of residues */
303
304 irand(); /* seed the random number generator */
305
306 while (rcnt-- > 0) {
307 #ifdef __MWERKS__
308 ChkEvent();
309 #endif
310 if (wflag==1) wshuffle(aa10,aa1,n1,wsiz);
311 else shuffle(aa1,n1);
312 ntt += n1;
313 nlib++;
314 score=smatch(aa0,n0,aa1,n1,NO);
315 addhist(score);
316 if (fdata) {
317 fprintf(fdata,"%-12s %4d %6d %4d %4d %4d %8ld\n",
318 libstr,0,n1,score,0,0,0);
319 fflush(fdata);
320 }
321 }
322
323 tscan = sstime();
324
325 initialize_hist(lmax);
326 est_lambda_K(n0,n1,sscor,nmean,&K_s, &Lambda_s);
327 free_hist();
328
329 prhist(stdout,score0,n0,n1); /* print histogram, statistics */
330
331 outfd = stdout;
332
333 #ifdef __MWERKS__
334 HSetVol(NULL,ouSpec.vRefNum,ouSpec.parID);
335 /* SetVol(NULL,ouvRef); */
336 #endif
337 if (outtty && resfile[0]=='\0') {
338 printf(" Enter filename for results : ");
339 fgets(rline,sizeof(rline),stdin);
340 if ((bp=strchr(rline,'\n'))!=NULL) *bp='\0';
341 if (rline[0]!='\0') strncpy(resfile,rline,sizeof(resfile));
342 }
343 if (resfile[0]!='\0') {
344 if ((outfd=fopen(resfile,"w"))==NULL) exit(1);
345 fprintf(outfd," %s, %d %s vs %s\n",tname, n0, sqnam, lname);
346 prhist(outfd,score0,n0,n1);
347 #ifdef __MWERKS__
348 SIOUXSettings.asktosaveonclose=FALSE;
349 SIOUXSettings.autocloseonquit=TRUE;
350 #endif
351 }
352 }
353
354 extern int *sascii, nascii[], aascii[];
355
initenv(argc,argv)356 initenv(argc,argv)
357 int argc; char **argv;
358 {
359 char *cptr, *getenv();
360 int copt, getopt();
361 extern char *optarg;
362
363 sascii = aascii;
364 strncpy(smstr,"BLOSUM50",sizeof(smstr));
365 smptr = smstr;
366 pam = abl50;
367 sq = aa;
368 hsq = haa;
369 nsq = naa;
370 dnaseq = 0;
371
372 while ((copt=getopt(argc,argv,"qQf:g:nO:s:w:hr:"))!=EOF)
373 switch(copt) {
374 case 'q':
375 case 'Q': outtty=0; break;
376 case 'f': sscanf(optarg,"%d",&gdelval); del_set = 1; break;
377 case 'g': sscanf(optarg,"%d",&ggapval); gap_set = 1; break;
378 case 'n': dnaseq=1;
379 sascii = nascii;
380 strncpy(smstr,"DNA",sizeof(smstr));
381 smptr=smstr;
382 sq = nt;
383 nsq = nnt;
384 hsq = hnt;
385 pam = npam;
386 strcpy(sqnam,"nt");
387 strcpy(sqtype,"DNA");
388 resetp(dnaseq);
389 break;
390 case 'O': strncpy(resfile,optarg,sizeof(resfile));
391 break;
392 case 'h': histflag = 0; break;
393 case 'r': dataflg=1;
394 strncpy(tmpfname,optarg,sizeof(tmpfname));
395 break;
396 case 's': strncpy(smstr,optarg,sizeof(smstr));
397 smptr=smstr;
398 if (initpam(smptr)) dnaseq= -1;
399 histint = 1;
400 break;
401 case 'w': wflag = 1; sscanf(optarg,"%d",&wsiz);
402 break;
403 default : fprintf(stderr," illegal option -%c\n",copt);
404 }
405 optind--;
406
407 if (dnaseq>=0) {
408 if ((smptr=getenv("SMATRIX"))!=NULL && initpam(smptr)) dnaseq = -1;
409 else smptr=smstr;
410 }
411
412 if (dnaseq<0 && strlen(smptr)>0)
413 fprintf(stderr," matrix file reset to %s\n",smptr);
414 }
415
resetp(dnaseq)416 resetp(dnaseq)
417 int dnaseq;
418 {
419 if (dnaseq==1) {
420 fprintf(stderr," resetting to DNA\n");
421 if (!del_set) gdelval = -16;
422 if (!gap_set) ggapval = -4;
423 histint=4;
424 bestscale=80;
425 bkfact=5;
426 scfact=1;
427 bktup=6;
428 ktmax=6;
429 bestmax=80;
430 bestoff=45;
431 pam = npam;
432 smptr = "DNA";
433 if (pamfact>=0) pamfact = 0;
434 }
435 }
436
initparm()437 initparm()
438 {
439 char *getenv(), *cptr;
440 int itemp;
441
442 }
443
444
initpam2()445 initpam2()
446 {
447 int i, j, k;
448
449 k=0;
450 for (i=0; i<naa; i++)
451 for (j=0; j<=i; j++)
452 pam2[j][i] = pam2[i][j] = pam[k++];
453 }
454
inithist()455 inithist()
456 {
457 int i;
458
459 for (i=0; i<MAXHIST; i++) hist[i]=0;
460 nmean = lmax = 0;
461 lmin = MAXHIST;
462 initialize_hist(MAXHIST);
463 }
464
prhist(fd,score0,q_length,l_length)465 prhist(fd,score0, q_length, l_length)
466 FILE *fd; int score0;
467 {
468 int i,j,hl;
469 char hline[80], pch;
470 int maxval, dotsiz;
471 int gl,hl0;
472 int mh1, mn;
473 float lsd;
474 double prev_cum, cum;
475 int cp;
476
477 if (histflag) {
478 mh1 = lmax/histint + 4;
479 if (mh1 >= MAXHIST) mh1 = MAXHIST-1;
480 mn = lmin/histint - 3;
481 if (mn < 0) mn = 0;
482
483 for (i=0,maxval=0; i<mh1; i++) {
484 if (hist[i] > maxval) maxval = hist[i];
485 }
486 dotsiz = (maxval-1)/50+1;
487
488 prev_cum = find_Evalue(mn*histint, q_length, l_length, K_s, Lambda_s);
489 cum = 0;
490 fprintf(fd,"\n s-w est\n");
491 for (i=mn; i<=mh1; i++) {
492 cum = find_Evalue((i+1)*histint, q_length, l_length,K_s,Lambda_s);
493
494 cp = (int)(prev_cum - cum + 0.5);
495 prev_cum = cum;
496 pch = (i==mh1) ? '>' : ' ';
497 pch = (i==mn) ? '<' : pch;
498 fprintf(fd,"%c%3d %4d %4d:",pch,(i<mh1?(i)*histint: mh1*histint),
499 hist[i],cp);
500 hl = hist[i];
501 if ((hl=(hl+dotsiz-1)/dotsiz) > 50) hl = 50;
502 for (j=0; j<hl; j++) hline[j]='=';
503 if ((cp=(cp+dotsiz-1)/dotsiz) >= 50) cp = 50;
504 if (cp > 0) {
505 if (cp < hl) hline[cp-1]='*';
506 else {
507 for (; hl<cp && hl<50; hl++) hline[hl]=' ';
508 hline[hl-1]='*';
509 }
510 }
511 hline[hl]=0;
512 fprintf(fd,"%s",hline);
513 if ((score0 >= (i)*histint && score0 < (i+1)*histint) ||
514 (score0 >= mh1*histint && i==mh1)) fprintf(fd," O");
515 fprintf(fd,"\n");
516 }
517 }
518
519 fprintf(fd,"%7ld residues in %5d sequences,\n",ntt,nlib);
520 fprintf(fd," %s matrix, ",smptr);
521 fprintf(fd,"gap penalties: %d,%d\n",gdelval,ggapval);
522
523 if (wflag==1) fprintf(fd," local shuffle, window size: %d\n",wsiz);
524
525 fprintf(fd, " unshuffled s-w score: %d; shuffled score range: %d - %d\n",
526 score0,lmin,lmax);
527
528 cum = find_Pvalue(score0,n0,n1,K_s,Lambda_s);
529 fprintf(fd,"Lambda: %5.5g K: %5.5g; P(%d)= %5.5g\n",
530 Lambda_s,K_s,score0,cum);
531
532 fprintf(fd,"For %d sequences, a score >=%d is expected",nlib,score0);
533 cum = find_Evalue(score0,n0,n1,K_s,Lambda_s);
534 if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
535 else fprintf(fd," %5.3g times\n",cum);
536 }
537
addhist(score)538 addhist(score)
539 int score;
540 {
541 if (score > lmax) lmax = score;
542 if (score < lmin) lmin = score;
543 sscor[nmean++]=score;
544
545 score = (score-1)/histint;
546 if (score < 0) score=0;
547 else if (score >= MAXHIST) score = MAXHIST-1;
548 hist[score]++;
549 }
550
ssort(v,n)551 void ssort(v,n)
552 int *v; int n;
553 {
554 int gap, i, j;
555 int tmp;
556
557 for (gap=n/2; gap>0; gap/=2)
558 for (i=gap; i<n; i++)
559 for (j=i-gap; j>=0; j -= gap) {
560 if (v[j]<=v[j+gap]) break;
561 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
562 }
563 }
564
565 int ieven = 1;
wshuffle(from,to,n,wsiz)566 wshuffle(from,to,n,wsiz) /* copies from from to from shuffling */
567 char *from, *to; int n, wsiz;
568 {
569 int i,j, k, mm; char tmp, *top;
570
571 memcpy(to,from,n);
572
573 mm = n%wsiz;
574
575 if (ieven) {
576 for (k=0; k<(n-wsiz); k += wsiz) {
577 top = &to[k];
578 for (i=wsiz; i>1; i--) {
579 j = nrand(i);
580 tmp = top[j];
581 top[j] = top[i-1];
582 top[i-1] = tmp;
583 }
584 }
585 top = &to[n-mm];
586 for (i=mm; i>1; i--) {
587 j = nrand(i);
588 tmp = top[j];
589 top[j] = top[i-1];
590 top[i-1] = tmp;
591 }
592 ieven = 0;
593 }
594 else {
595 for (k=n; k>=wsiz; k -= wsiz) {
596 top = &to[k-wsiz];
597 for (i=wsiz; i>1; i--) {
598 j = nrand(i);
599 tmp = top[j];
600 top[j] = top[i-1];
601 top[i-1] = tmp;
602 }
603 }
604 top = &to[0];
605 for (i=mm; i>1; i--) {
606 j = nrand(i);
607 tmp = top[j];
608 top[j] = top[i-1];
609 top[i-1] = tmp;
610 }
611 ieven = 1;
612 }
613 to[n] = -1;
614 }
615
shuffle(from,n)616 shuffle(from,n) /* copies from from to from shuffling */
617 char *from; int n;
618 {
619 int i,j; char tmp;
620
621 for (i=n; i>1; i--) {
622 j = nrand(i);
623 tmp = from[j];
624 from[j] = from[i-1];
625 from[i-1] = tmp;
626 }
627 from[n] = -1;
628 }
629
cmpi(val1,val2)630 cmpi(val1,val2)
631 int val1, val2;
632 {
633 if (val1 < val2) return (-1);
634 else if (val1 > val2) return 1;
635 else return 0;
636 }
637
ksort(v,n,comp)638 ksort(v,n,comp)
639 char *v[]; int n, (*comp)();
640 {
641 int gap, i, j;
642 char *tmp;
643
644 for (gap=n/2; gap>0; gap/=2)
645 for (i=gap; i<n; i++)
646 for (j=i-gap; j>=0; j -= gap) {
647 if ((*comp)(v[j],v[j+gap]) <=0)
648 break;
649 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
650 }
651 }
652
653 /* stubs for linking */
654 int llen;
655
aancpy()656 aancpy()
657 {}
658
659 int markx;
disgraph()660 disgraph()
661 {}
662
ALIGN()663 ALIGN()
664 {}
665
discons()666 discons()
667 {}
668