1 /*      urss.c         Sept, 1991
2         copyright (c) 1984,1987,1988,1991    William R. Pearson
3 
4 	urss.c		urss.c is a version of urdf.c that works with the
5 			Smith-Waterman algorithm
6 
7 	Nov, 1991	include -n option to force DNA sequences
8 */
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ctype.h>
14 
15 char *refstr="\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448;\n T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197\n";
16 char *verstr="version 2.0u64, May, 1998";
17 
18 #ifdef __MWERKS__
19 #include <Types.h>
20 #include <StandardFile.h>
21 StandardFileReply freply;
22 Point wpos;
23 int tval;
24 char prompt[256];
25 #include <sioux.h>
26 #define getenv mgetenv
27 #endif
28 
29 #define NO 0
30 
31 #ifndef BIGMEM
32 #define QFILE_SIZE 40
33 #define MAXTST 2000	/* longest test sequence */
34 #define MAXLIB 10000
35 #define MAXDIAG (MAXTST+MAXLIB)
36 #else
37 #define QFILE_SIZE 256
38 #define MAXTST 10000
39 #define MAXLIB 50000
40 #define MAXDIAG (MAXTST+MAXLIB)
41 #endif
42 
43 #define MAXHIST 201	/* number of histogram divisions */
44 
45 FILE *outfd,*fdata;		/* fd for output file */
46 int smark[4];
47 long sq0off=1, sq1off=1;
48 long loffset = 0l;		/* offset into sequence */
49 
50 
51 /* globals for matching */
52 
53 long lmark;		/* position in library file from ftell() */
54 int nlib;
55 long ntt;		/* number of library sequences, number of
56 				residues scanned*/
57 char libstr[21];	/* partial title from library sequence */
58 
59 char *aa0, *aa10, *aa1;	/* amino acid sequence data */
60 int maxn;		/* max space for lib sequence (MAXDIAG-n0) */
61 int n0, n1, nd, noff;	/* length of aa0, length of aa1, n0+n1,
62 				diagonal offset */
63 
64 int *sscor;
65 
66 double K_s, Lambda_s;
67 double find_Evalue();
68 double find_Pvalue();
69 
70 int nbest;	/* number of sequences better than bestcut in best */
71 int bestcut; 	/* cut off for getting into MAXBEST */
72 
73 int histint=2;
74 int bestscale=200;
75 int bkfact=5;
76 int scfact=4;
77 int bktup=2;
78 int ktmax=2;
79 int bestmax=50;
80 int bestoff=27;	/* values for calculating bestcut */
81 int pamfact = 1;	/* flag for using pam values for kfact */
82 int dnaseq = 0;	/* true if DNA query sequence */
83 int histflag=1;
84 int dataflg=0;
85 char tmpfname[40];
86 
87 int nsav, lowscor;	/* number of saved runs, worst saved
88 				run, score of worst saved run */
89 
90 int hist[MAXHIST];		/* histogram of all score */
91 int lmax, lmin;
92 int nmean;			/* number of scores averaged in mean */
93 
94 int nshow; char rline[20],sline[20];
95 char resfile[QFILE_SIZE];
96 
97 int showall;
98 long tstart, tscan, tdone, sstime();
99 
100 #include "upam.gbl"		/* includes pam array */
101 
102 extern int optind;
103 char *getenv(), *smptr, *cptr;		/* scoring matrix env */
104 
105 extern int outtty;
106 char smstr[QFILE_SIZE];
107 int wflag = -1;
108 int wsiz;
109 
110 #ifdef __MWERKS__
111 /* short ouvRef, q0vRef, q1vRef; */
112 FSSpec ouSpec, q0Spec, q1Spec;
113 OSErr error;
114 #define IntroDID 400	/* LFASTA */
115 #endif
116 char *iprompt0=" PRSS compares a test sequence to a shuffled sequence\n";
117 char *iprompt1=" test sequence file name: ";
118 char *iprompt2=" sequence to shuffle: ";
119 
main(argc,argv)120 main(argc, argv)
121      int argc; char **argv;
122 {
123   char tname[QFILE_SIZE], lname[QFILE_SIZE], qline[20];
124   char ttitle[60], ltitle[60];
125   char *bp;
126   int score, score0;	/* scores calculated */
127   int icnt, rcnt;			/* number of random shuffles */
128   int i;
129 
130 #ifdef UNIX
131   outtty=isatty(1);
132 #endif
133 
134 #ifdef __MWERKS__
135   SIOUXSettings.asktosaveonclose=TRUE;
136   SIOUXSettings.showstatusline=FALSE;
137   SIOUXSettings.autocloseonquit=TRUE;
138 
139   argc = ccommand(&argv);
140   if (GetResource('DLOG',IntroDID)==0L && OpenResFile("\pFASTA.rsrc")<0) {
141     SysBeep(100); fprintf(stderr," WARNING FASTA.rsrc file could not be found\n");
142   }
143   InitEvent();
144   /* GetVol((unsigned char *)prompt,&ouvRef); */
145   error=HGetVol(NULL,&ouSpec.vRefNum, &ouSpec.parID);
146   if (error != noErr) {
147   	fprintf(stderr," cannot get current directory\n");
148   	exit(1);
149   	}
150   wpos.h=50; wpos.v=100;
151 #endif
152 
153   if ((aa0=calloc(MAXTST+MAXLIB,sizeof(char)))==0) {
154     printf(" cannot allocate sequence array\n");
155     exit(1);
156   }
157   maxn = MAXTST+MAXLIB;
158   if ((aa10=calloc(MAXLIB,sizeof(char)))==0) {
159     printf(" cannot allocate sequence array\n");
160     exit(1);
161   }
162 
163   initenv(argc,argv);
164 
165   if (argc-optind < 3) {
166 #ifndef __MWERKS__
167     fputs(iprompt0,stdout);
168     fprintf(stdout," %s%s\n",verstr,refstr);
169   l1:	fputs(iprompt1,stdout);
170     fflush(stdout);
171     if (fgets(tname,sizeof(tname),stdin)==NULL) exit(0);
172     if (tname[strlen(tname)-1]=='\n') tname[strlen(tname)-1]='\0';
173     if (tname[0]=='\0') goto l1;
174 #else
175     NIntroDlog(IntroDID,iprompt0,verstr,refstr,"\0");
176 
177   l1:	SFileDlog(iprompt1,&freply);
178     if (freply.sfGood==TRUE) {
179       PtoCstr((unsigned char *)freply.sfFile.name);
180       strcpy(tname,(char *)freply.sfFile.name);
181 /*      q0vRef=freply.vRefNum;
182       SetVol(NULL,q0vRef);
183       	*/
184 	  q1Spec.vRefNum = q0Spec.vRefNum = freply.sfFile.vRefNum;
185 	  q1Spec.parID = q0Spec.parID = freply.sfFile.parID;
186 	  HSetVol(NULL,q0Spec.vRefNum,q0Spec.parID);
187     }
188     else exit(0);
189 #endif
190     if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
191       printf(" %s : sequence not found\n",tname);
192       goto l1;
193     }
194     resetp(dnaseq);
195 
196 #ifndef __MWERKS__
197   l2:	fputs(iprompt2,stdout);
198     fflush(stdout);
199     if (fgets(lname,sizeof(lname),stdin)==NULL) exit(1);
200     if (lname[strlen(lname)-1]=='\n') lname[strlen(lname)-1]='\0';
201     if (*lname==0) goto l2;
202 #else
203   l2:	SFileDlog(iprompt2,&freply);
204     if (freply.sfGood==TRUE) {
205       PtoCstr((unsigned char *)freply.sfFile.name);
206       strcpy(lname,(char *)freply.sfFile.name);
207 /*      q1vRef=freply.vRefNum;
208       SetVol(NULL,q1vRef);
209 */
210 	  q1Spec.vRefNum = freply.sfFile.vRefNum;
211 	  q1Spec.parID = freply.sfFile.parID;
212 	  HSetVol(NULL,q1Spec.vRefNum,q1Spec.parID);
213     }
214     else exit(0);
215 #endif
216 
217     printf(" number of random shuffles? [500] ");
218     fgets(qline,sizeof(qline),stdin);
219     rcnt = 500;
220     if (qline[0]!='\0' && qline[0]!='\n') {
221       sscanf(qline,"%d",&rcnt);
222       if (rcnt < 50) rcnt = 100;
223     }
224 
225     if (wflag<=0) {
226       printf(" local (window) (w) or uniform (u) shuffle [u]? ");
227       fgets(qline,sizeof(qline),stdin);
228       if ((bp=strchr(qline,'\n'))!=NULL) *bp='\0';
229     }
230     else qline[0]='\0';
231     if (tolower(qline[0])=='w' || wflag==1) {
232       wflag = 1;
233       wsiz = 10;
234       printf(" local shuffle window size [10] ");
235       fgets(qline,sizeof(qline),stdin);
236       if (qline[0]!='\0' && qline[0]!='\n') {
237 	sscanf(qline,"%d",&wsiz);
238 	if (wsiz <= 2) wsiz = 10;
239       }
240     }
241     else wflag=0;
242   }
243   else {
244     fputs(iprompt0,stdout);
245     fprintf(stdout," %s%s\n",verstr,refstr);
246     strncpy(tname,argv[optind+1],sizeof(tname));
247     if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
248       printf(" %s : sequence not found\n",tname);
249       exit(1);
250     }
251     resetp(dnaseq);
252     strncpy(lname,argv[optind+2],sizeof(lname));
253     rcnt = 100;
254     if (argc-optind>3) sscanf(argv[optind+3],"%d",&rcnt);
255   }
256 
257   if ((sscor=(int *)calloc(rcnt, sizeof (int)))==NULL) {
258     fprintf(stderr," cannot allocate storage %d\n",rcnt);
259     exit(1);
260   }
261 
262   printf(" %s : %4d %s\n",tname, n0, sqnam);
263   gettitle(tname,ttitle,50);
264   printf("%s\n",ttitle);
265 
266   aa1 = aa0 + n0 + 2;
267   maxn -= n0 + 3;
268   if (maxn > MAXLIB) maxn = MAXLIB;
269 
270   if ((n1=getseq(lname,aa10,maxn,&dnaseq))==0) {
271     printf(" %s : %s sequence not found\n",lname,sqtype);
272     exit(1);
273   }
274   printf(" %s : %4d %s\n",lname, n1, sqnam);
275   strncpy(libstr,lname,12);
276   gettitle(lname,ltitle,50);
277   printf("%s\n",ltitle);
278 
279   initpam2();		/* convert 1-d pam to 2-d pam2 */
280 
281   initparm();
282 
283   tstart = sstime();
284 
285   if (dataflg==1 && (fdata = fopen(tmpfname,"w"))!=NULL)
286         fprintf(fdata,"%.50s\n",ltitle);
287 
288   inithist();		/* initialize histogram, mean, sd */
289   if (wsiz > n1) wsiz = n1;
290 
291   for (i=0; i<n1; i++) aa1[i]=aa10[i];
292   aa1[n1]= -1;
293 
294   score0 = smatch(aa0,n0,aa1,n1,NO);
295   if (fdata) {
296     fprintf(fdata,"%-12s %4d %6d %4d %4d %4d %8ld\n",
297 	    libstr,0,n1,score0,0,0,0);
298     fflush(fdata);
299   }
300 
301   nlib = 0;		/* counts number of sequences */
302   ntt = 0;		/* counts number of residues */
303 
304   irand();	/* seed the random number generator */
305 
306   while (rcnt-- > 0) {
307 #ifdef __MWERKS__
308     ChkEvent();
309 #endif
310     if (wflag==1) wshuffle(aa10,aa1,n1,wsiz);
311     else shuffle(aa1,n1);
312     ntt += n1;
313     nlib++;
314     score=smatch(aa0,n0,aa1,n1,NO);
315     addhist(score);
316     if (fdata) {
317       fprintf(fdata,"%-12s %4d %6d %4d %4d %4d %8ld\n",
318 	      libstr,0,n1,score,0,0,0);
319       fflush(fdata);
320     }
321   }
322 
323   tscan = sstime();
324 
325   initialize_hist(lmax);
326   est_lambda_K(n0,n1,sscor,nmean,&K_s, &Lambda_s);
327   free_hist();
328 
329   prhist(stdout,score0,n0,n1);	/* print histogram, statistics */
330 
331   outfd = stdout;
332 
333 #ifdef __MWERKS__
334 	HSetVol(NULL,ouSpec.vRefNum,ouSpec.parID);
335 /*    SetVol(NULL,ouvRef);   */
336 #endif
337   if (outtty && resfile[0]=='\0') {
338     printf(" Enter filename for results : ");
339     fgets(rline,sizeof(rline),stdin);
340     if ((bp=strchr(rline,'\n'))!=NULL) *bp='\0';
341     if (rline[0]!='\0') strncpy(resfile,rline,sizeof(resfile));
342   }
343   if (resfile[0]!='\0') {
344     if ((outfd=fopen(resfile,"w"))==NULL) exit(1);
345     fprintf(outfd," %s, %d %s vs %s\n",tname, n0, sqnam, lname);
346     prhist(outfd,score0,n0,n1);
347 #ifdef __MWERKS__
348     SIOUXSettings.asktosaveonclose=FALSE;
349     SIOUXSettings.autocloseonquit=TRUE;
350 #endif
351   }
352 }
353 
354 extern int *sascii, nascii[], aascii[];
355 
initenv(argc,argv)356 initenv(argc,argv)
357      int argc; char **argv;
358 {
359   char *cptr, *getenv();
360   int copt, getopt();
361   extern char *optarg;
362 
363   sascii = aascii;
364   strncpy(smstr,"BLOSUM50",sizeof(smstr));
365   smptr = smstr;
366   pam = abl50;
367   sq = aa;
368   hsq = haa;
369   nsq = naa;
370   dnaseq = 0;
371 
372   while ((copt=getopt(argc,argv,"qQf:g:nO:s:w:hr:"))!=EOF)
373     switch(copt) {
374     case 'q':
375     case 'Q': outtty=0; break;
376     case 'f': sscanf(optarg,"%d",&gdelval); del_set = 1; break;
377     case 'g': sscanf(optarg,"%d",&ggapval); gap_set = 1; break;
378     case 'n': dnaseq=1;
379       sascii = nascii;
380       strncpy(smstr,"DNA",sizeof(smstr));
381       smptr=smstr;
382       sq = nt;
383       nsq = nnt;
384       hsq = hnt;
385       pam = npam;
386       strcpy(sqnam,"nt");
387       strcpy(sqtype,"DNA");
388       resetp(dnaseq);
389       break;
390     case 'O': strncpy(resfile,optarg,sizeof(resfile));
391       break;
392     case 'h': histflag = 0; break;
393     case 'r': dataflg=1;
394       strncpy(tmpfname,optarg,sizeof(tmpfname));
395       break;
396     case 's': strncpy(smstr,optarg,sizeof(smstr));
397       smptr=smstr;
398       if (initpam(smptr)) dnaseq= -1;
399       histint = 1;
400       break;
401     case 'w': wflag = 1; sscanf(optarg,"%d",&wsiz);
402       break;
403     default : fprintf(stderr," illegal option -%c\n",copt);
404     }
405   optind--;
406 
407   if (dnaseq>=0) {
408     if ((smptr=getenv("SMATRIX"))!=NULL && initpam(smptr)) dnaseq = -1;
409     else smptr=smstr;
410   }
411 
412   if (dnaseq<0 && strlen(smptr)>0)
413     fprintf(stderr," matrix file reset to %s\n",smptr);
414 }
415 
resetp(dnaseq)416 resetp(dnaseq)
417      int dnaseq;
418 {
419   if (dnaseq==1) {
420     fprintf(stderr," resetting to DNA\n");
421     if (!del_set) gdelval = -16;
422     if (!gap_set) ggapval = -4;
423     histint=4;
424     bestscale=80;
425     bkfact=5;
426     scfact=1;
427     bktup=6;
428     ktmax=6;
429     bestmax=80;
430     bestoff=45;
431     pam = npam;
432     smptr = "DNA";
433     if (pamfact>=0) pamfact = 0;
434   }
435 }
436 
initparm()437 initparm()
438 {
439   char *getenv(), *cptr;
440   int itemp;
441 
442 }
443 
444 
initpam2()445 initpam2()
446 {
447 	int i, j, k;
448 
449 	k=0;
450 	for (i=0; i<naa; i++)
451 		for (j=0; j<=i; j++)
452 			pam2[j][i] = pam2[i][j] = pam[k++];
453 	}
454 
inithist()455 inithist()
456 {
457 	int i;
458 
459 	for (i=0; i<MAXHIST; i++) hist[i]=0;
460 	nmean = lmax = 0;
461 	lmin = MAXHIST;
462 	initialize_hist(MAXHIST);
463 	}
464 
prhist(fd,score0,q_length,l_length)465 prhist(fd,score0, q_length, l_length)
466      FILE *fd; int score0;
467 {
468   int i,j,hl;
469   char hline[80], pch;
470   int maxval, dotsiz;
471   int gl,hl0;
472   int mh1, mn;
473   float lsd;
474   double prev_cum, cum;
475   int cp;
476 
477   if (histflag) {
478     mh1 = lmax/histint + 4;
479     if (mh1 >= MAXHIST) mh1 = MAXHIST-1;
480     mn = lmin/histint - 3;
481     if (mn < 0) mn = 0;
482 
483     for (i=0,maxval=0; i<mh1; i++) {
484       if (hist[i] > maxval) maxval = hist[i];
485     }
486     dotsiz = (maxval-1)/50+1;
487 
488     prev_cum = find_Evalue(mn*histint, q_length, l_length, K_s, Lambda_s);
489     cum = 0;
490     fprintf(fd,"\n       s-w  est\n");
491     for (i=mn; i<=mh1; i++) {
492       cum = find_Evalue((i+1)*histint, q_length, l_length,K_s,Lambda_s);
493 
494       cp = (int)(prev_cum - cum + 0.5);
495       prev_cum = cum;
496       pch = (i==mh1) ? '>' : ' ';
497       pch = (i==mn) ? '<' : pch;
498       fprintf(fd,"%c%3d %4d %4d:",pch,(i<mh1?(i)*histint: mh1*histint),
499 	      hist[i],cp);
500       hl = hist[i];
501       if ((hl=(hl+dotsiz-1)/dotsiz) > 50) hl = 50;
502       for (j=0; j<hl; j++) hline[j]='=';
503       if ((cp=(cp+dotsiz-1)/dotsiz) >= 50) cp = 50;
504       if (cp > 0) {
505 	if (cp < hl) hline[cp-1]='*';
506 	else {
507 	  for (; hl<cp && hl<50; hl++) hline[hl]=' ';
508 	  hline[hl-1]='*';
509 	}
510       }
511       hline[hl]=0;
512       fprintf(fd,"%s",hline);
513       if ((score0 >= (i)*histint && score0 < (i+1)*histint) ||
514 	  (score0 >= mh1*histint && i==mh1)) fprintf(fd," O");
515       fprintf(fd,"\n");
516     }
517   }
518 
519   fprintf(fd,"%7ld residues in %5d sequences,\n",ntt,nlib);
520   fprintf(fd," %s matrix, ",smptr);
521   fprintf(fd,"gap penalties: %d,%d\n",gdelval,ggapval);
522 
523   if (wflag==1) fprintf(fd," local shuffle, window size: %d\n",wsiz);
524 
525   fprintf(fd, " unshuffled s-w score: %d; shuffled score range: %d - %d\n",
526 	  score0,lmin,lmax);
527 
528   cum = find_Pvalue(score0,n0,n1,K_s,Lambda_s);
529   fprintf(fd,"Lambda: %5.5g K: %5.5g; P(%d)= %5.5g\n",
530 	  Lambda_s,K_s,score0,cum);
531 
532   fprintf(fd,"For %d sequences, a score >=%d is expected",nlib,score0);
533   cum = find_Evalue(score0,n0,n1,K_s,Lambda_s);
534   if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
535   else fprintf(fd," %5.3g times\n",cum);
536 }
537 
addhist(score)538 addhist(score)
539 	int score;
540 {
541   if (score > lmax) lmax = score;
542   if (score < lmin) lmin = score;
543   sscor[nmean++]=score;
544 
545   score = (score-1)/histint;
546   if (score < 0) score=0;
547   else if (score >= MAXHIST) score = MAXHIST-1;
548   hist[score]++;
549 }
550 
ssort(v,n)551 void ssort(v,n)
552 	int *v; int n;
553 {
554 	int gap, i, j;
555 	int tmp;
556 
557 	for (gap=n/2; gap>0; gap/=2)
558 		for (i=gap; i<n; i++)
559 			for (j=i-gap; j>=0; j -= gap) {
560 				if (v[j]<=v[j+gap]) break;
561 				tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
562 				}
563 	}
564 
565 int ieven = 1;
wshuffle(from,to,n,wsiz)566 wshuffle(from,to,n,wsiz)	/* copies from from to from shuffling */
567 	char  *from, *to; int n, wsiz;
568 {
569 	int i,j, k, mm; char tmp, *top;
570 
571 	memcpy(to,from,n);
572 
573 	mm = n%wsiz;
574 
575 	if (ieven) {
576 	    for (k=0; k<(n-wsiz); k += wsiz) {
577 		top = &to[k];
578 		for (i=wsiz; i>1; i--) {
579 			j = nrand(i);
580 			tmp = top[j];
581 			top[j] = top[i-1];
582 			top[i-1] = tmp;
583 			}
584 		}
585 		top = &to[n-mm];
586 		for (i=mm; i>1; i--) {
587 			j = nrand(i);
588 			tmp = top[j];
589 			top[j] = top[i-1];
590 			top[i-1] = tmp;
591 			}
592 	    ieven = 0;
593 	    }
594 	else {
595 	    for (k=n; k>=wsiz; k -= wsiz) {
596 		top = &to[k-wsiz];
597 		for (i=wsiz; i>1; i--) {
598 			j = nrand(i);
599 			tmp = top[j];
600 			top[j] = top[i-1];
601 			top[i-1] = tmp;
602 			}
603 		}
604 		top = &to[0];
605 		for (i=mm; i>1; i--) {
606 			j = nrand(i);
607 			tmp = top[j];
608 			top[j] = top[i-1];
609 			top[i-1] = tmp;
610 			}
611 	    ieven = 1;
612 	    }
613 	to[n] = -1;
614 	}
615 
shuffle(from,n)616 shuffle(from,n)	/* copies from from to from shuffling */
617 	char  *from; int n;
618 {
619 	int i,j; char tmp;
620 
621 	for (i=n; i>1; i--) {
622 		j = nrand(i);
623 		tmp = from[j];
624 		from[j] = from[i-1];
625 		from[i-1] = tmp;
626 		}
627 	from[n] = -1;
628 	}
629 
cmpi(val1,val2)630 cmpi(val1,val2)
631      int val1, val2;
632 {
633   if (val1 < val2) return (-1);
634   else if (val1 > val2) return 1;
635   else return 0;
636 }
637 
ksort(v,n,comp)638 ksort(v,n,comp)
639 	char *v[]; int n, (*comp)();
640 {
641 	int gap, i, j;
642 	char *tmp;
643 
644 	for (gap=n/2; gap>0; gap/=2)
645 		for (i=gap; i<n; i++)
646 			for (j=i-gap; j>=0; j -= gap) {
647 				if ((*comp)(v[j],v[j+gap]) <=0)
648 					break;
649 				tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
650 				}
651 	}
652 
653 /*  stubs for linking */
654 int llen;
655 
aancpy()656 aancpy()
657 {}
658 
659 int markx;
disgraph()660 disgraph()
661 {}
662 
ALIGN()663 ALIGN()
664 {}
665 
discons()666 discons()
667 {}
668