1 /*      prdf.c         12-Feb-1984, 22-Sept-85, 17-Oct-1985
2         copyright (c) 1984,1987,1988      William R. Pearson and David Lipman
3 
4 	prdf.c - version of udf.c which scrambles sequences for testing
5 	significance
6 
7 	modified for argc or queries
8 
9 	9-May-85	change shuffling algorithm
10 
11 	22-Sept-85	display both initial and optimized histogram
12 	17-Oct-85	correct bigscore bug, savemax bug
13 	27-Dec-85	correct savemax bug, put 32 wide window in ngdist
14 	22-June-86	added universal matrix
15 	 4-Apr-88	update to new kfact calculation
16 	Nov, 1991	include -n option to force DNA sequences
17 	Nov 8, 1992	fix pamfact for DNA sequences
18 	May 14, 1993	fix local shuffle to stay local
19 */
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 char *refstr="\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448;\n";
25 char *verstr="version 2.0u1, September, 1995";
26 
27 #ifdef __MWERKS__
28 #include <Types.h>
29 #include <StandardFile.h>
30 StandardFileReply freply;
31 Point wpos;
32 int tval;
33 char prompt[256];
34 #include <sioux.h>
35 #define getenv mgetenv
36 #endif
37 
38 #define NO 0
39 
40 #define max(a,b) (((a)>(b))?(a):(b))
41 #define min(a,b) (((a)<(b))?(a):(b))
42 
43 #ifndef BIGMEM
44 #define MAXTST 2000	/* longest test sequence */
45 #define MAXLIB 10000
46 #define MAXDIAG (MAXTST+MAXLIB)
47 #define QFILE_SIZE 40
48 #else
49 #define MAXTST 10000
50 #define MAXLIB 50000
51 #define MAXDIAG (MAXTST+MAXLIB)
52 #define QFILE_SIZE 256
53 #endif
54 
55 #ifndef MAXSAV
56 #define MAXSAV 20	/* number of best diagonals saved */
57 #endif
58 #define MAXHIST 51	/* number of histogram divisions */
59 /*#define HISTSIZE 2	/* size of histogram division */
60 
61 FILE *outfd;		/* fd for output file */
62 int smark[4];
63 long sq0off=1, sq1off=1;
64 long loffset = 0l;		/* offset into sequence */
65 
66 
67 /* globals for matching */
68 
69 long lmark;		/* position in library file from ftell() */
70 int nlib;
71 long ntt;		/* number of library sequences, number of
72 				residues scanned*/
73 char libstr[21];	/* partial title from library sequence */
74 
75 char *aa0, *aa10, *aa1;	/* amino acid sequence data */
76 int maxn;		/* max space for lib sequence (MAXDIAG-n0) */
77 int n0, n1, nd, noff;	/* length of aa0, length of aa1, n0+n1,
78 				diagonal offset */
79 
80 struct dstruct {	/* diagonal structure for saving current run */
81         int start;	/* start of current match */
82         int stop;	/* end of current match */
83         int score;	/* hash score of current match */
84 	struct beststr *dmax;	/* location in vmax[] where best score data saved */
85         } *diag;
86 
87 
88 struct beststr {
89   int score;			/* pam score with segment opt */
90   int score0;			/* pam score of best single segment */
91   int gscore;			/* score from global match */
92   long lseek;			/* position in library file */
93   int dp;			/* diagonal of match */
94   int start;			/* start of match in lib seq */
95   int stop;			/* end of match in lib seq */
96 }
97 vmax[MAXSAV],	/* best matches saved for one sequence */
98 *vptr[MAXSAV];
99 
100 int *sscor, *sscor0, *gsscor;
101 
102 int optwid=32, optwidf=0;	/* width for band optimization */
103 				/* changed in initparm() */
104 
105 double K_n, K_0, K_g;
106 double Lambda_n, Lambda_0, Lambda_g;
107 
108 int cgap;	/* gap threshold */
109 int pgap;	/* gap penalty for optimized alignment of diagonals */
110 
111 int nbest;	/* number of sequences better than bestcut in best */
112 int bestcut; 	/* cut off for getting into MAXBEST */
113 
114 int histint=2;
115 int bestoff=36;	/* values for calculating bestcut */
116 int bestscale=300;
117 int bkfact=6;
118 int scfact=3;
119 int bktup=2;
120 int ktmax=2;
121 int bestmax=50;
122 int pamfact = 1;	/* flag for using pam values for kfact */
123 int dnaseq = 0;	/* true if DNA query sequence */
124 int lnsflg=0;
125 int histflg=1;
126 
127 int nsav, lowscor;	/* number of saved runs, worst saved
128 				run, score of worst saved run */
129 struct beststr *lowmax;
130 
131 int hist[MAXHIST];		/* histogram of all score */
132 int lmax, lmin;
133 int nmean;			/* number of scores averaged in mean */
134 
135 int hist0[MAXHIST];		/* histogram of init0 scores */
136 int lmax0, lmin0;
137 int nmean0;			/* number of init0 scores */
138 
139 int ghist[MAXHIST];		/* histogram of optimized score */
140 int glmax, glmin;
141 int gnmean;			/* number of optimized scores averaged */
142 
143 int fact, gm;                   /* scoring factors */
144 
145 static int hmask, hmax;		/* hash constants */
146 static int *pamh2;				/* pam based kfact array */
147 static int *link, *harr;		/* hash arrays */
148 static int ktup, kshft, kt1;		/* ktuple constants */
149 
150 int nshow; char rline[20],sline[20];
151 char resfile[QFILE_SIZE];
152 
153 int showall;
154 long tstart, tscan, tdone, sstime();
155 
156 #include "upam.gbl"		/* includes pam array */
157 
158 extern int optind;
159 extern int outtty;
160 char *getenv(), *smptr, *cptr;		/* scoring matrix env */
161 char smstr[QFILE_SIZE];
162 
163 int wflag = -1;
164 int wsiz;
165 
166 #ifdef __MWERKS__
167 /* short ouvRef, q0vRef, q1vRef; */
168 FSSpec ouSpec, q0Spec, q1Spec;
169 OSErr error;
170 #define IntroDID 400	/* LFASTA */
171 #endif
172 char *iprompt0=" PRDF compares a test sequence to a shuffled sequence\n";
173 char *iprompt1=" test sequence file name: ";
174 char *iprompt2=" sequence to shuffle: ";
175 
main(argc,argv)176 main(argc, argv)
177      int argc; char **argv;
178 {
179   char tname[QFILE_SIZE], lname[QFILE_SIZE], qline[QFILE_SIZE];
180   char *bp;
181   int score, gscore, score0, gscore0;	/* scores calculated */
182   int i0score, i0score0;
183   int rcnt;			/* number of random shuffles */
184   int i;
185 
186 #ifdef __MWERKS__
187   SIOUXSettings.asktosaveonclose=TRUE;
188   SIOUXSettings.showstatusline=FALSE;
189   SIOUXSettings.autocloseonquit=TRUE;
190 
191   argc = ccommand(&argv);
192   if (GetResource('DLOG',IntroDID)==0L && OpenResFile("\pFASTA.rsrc")<0) {
193     SysBeep(100); fprintf(stderr," WARNING FASTA.rsrc file could not be found\n");
194   }
195   InitEvent();
196   /* GetVol((unsigned char *)prompt,&ouvRef); */
197   error=HGetVol(NULL,&ouSpec.vRefNum, &ouSpec.parID);
198   if (error != noErr) {
199   	fprintf(stderr," cannot get current directory\n");
200   	exit(1);
201   	}
202   wpos.h=50; wpos.v=100;
203 #endif
204 
205   if ((aa0=calloc((size_t)MAXTST+MAXLIB,sizeof(char)))==0) {
206     printf(" cannot allocate sequence array\n");
207     exit(1);
208   }
209   maxn = MAXTST+MAXLIB;
210   if ((aa10=calloc((size_t)MAXLIB,sizeof(char)))==0) {
211     printf(" cannot allocate sequence array\n");
212     exit(1);
213   }
214 
215   initenv(argc,argv);
216 
217   if (argc-optind < 3) {
218 #ifndef __MWERKS__
219     fputs(iprompt0,stdout);
220     fprintf(stdout," %s%s\n",verstr,refstr);
221   l1: fputs(iprompt1,stdout);
222     fflush(stdout);
223     if (fgets(tname,sizeof(tname),stdin)==NULL) exit(0);
224     if (tname[strlen(tname)-1]=='\n') tname[strlen(tname)-1]='\0';
225     if (tname[0]=='\0') goto l1;
226 #else
227     NIntroDlog(IntroDID,iprompt0,verstr,refstr,"\0");
228 
229   l1:	SFileDlog(iprompt1,&freply);
230     if (freply.sfGood==TRUE) {
231       PtoCstr((unsigned char *)freply.sfFile.name);
232       strcpy(tname,(char *)freply.sfFile.name);
233 /*      q0vRef=freply.vRefNum;
234       SetVol(NULL,q0vRef);
235       	*/
236 	  q1Spec.vRefNum = q0Spec.vRefNum = freply.sfFile.vRefNum;
237 	  q1Spec.parID = q0Spec.parID = freply.sfFile.parID;
238 	  HSetVol(NULL,q0Spec.vRefNum,q0Spec.parID);
239     }
240     else exit(0);
241 #endif
242     if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
243       printf(" %s : sequence not found\n",tname);
244       goto l1;
245     }
246     resetp(dnaseq);
247 
248 #ifndef __MWERKS__
249   l2:	fputs(iprompt2,stdout);
250     fflush(stdout);
251     if (fgets(lname,sizeof(lname),stdin)==NULL) exit(1);
252     if (lname[strlen(lname)-1]=='\n') lname[strlen(lname)-1]='\0';
253     if (*lname==0) goto l2;
254 #else
255   l2:	SFileDlog(iprompt2,&freply);
256     if (freply.sfGood==TRUE) {
257       PtoCstr((unsigned char *)freply.sfFile.name);
258       strcpy(lname,(char *)freply.sfFile.name);
259 /*      q1vRef=freply.vRefNum;
260       SetVol(NULL,q1vRef);
261 */
262 	  q1Spec.vRefNum = freply.sfFile.vRefNum;
263 	  q1Spec.parID = freply.sfFile.parID;
264 	  HSetVol(NULL,q1Spec.vRefNum,q1Spec.parID);
265     }
266     else exit(0);
267 #endif
268 
269     printf(" ktup? (1 to %d) [%d] ",ktmax,ktmax);
270     fgets(qline,sizeof(qline),stdin);
271     ktup = ktmax;
272     if (qline[0]!='\0' && qline[0]!='\n') {
273       sscanf(qline,"%d",&ktup);
274       if (ktup < 1 || ktup>ktmax) ktup = ktmax;
275     }
276     printf(" number of random shuffles? [100] ");
277     fgets(qline,sizeof(qline),stdin);
278     rcnt = 100;
279     if (qline[0]!='\0' && qline[0]!='\n') {
280       sscanf(qline,"%d",&rcnt);
281       if (rcnt < 50) rcnt = 100;
282     }
283 
284     if (wflag<=0) {
285       printf(" local (window) (w) or uniform (u) shuffle [u]? ");
286       fgets(qline,sizeof(qline),stdin);
287       if ((bp=strchr(qline,'\n'))!=NULL) *bp='\0';
288     }
289     else qline[0]='\0';
290     if (tolower(qline[0])=='w' || wflag==1) {
291       wflag = 1;
292       wsiz = 10;
293       printf(" local shuffle window size [10] ");
294       fgets(qline,sizeof(qline),stdin);
295       if (qline[0]!='\0' && qline[0]!='\n') {
296 	sscanf(qline,"%d",&wsiz);
297 	if (wsiz < 2) wsiz = 10;
298       }
299     }
300     else wflag=0;
301   }
302   else {
303     fputs(iprompt0,stdout);
304     fprintf(stdout," %s%s\n",verstr,refstr);
305     strncpy(tname,argv[optind+1],sizeof(tname));
306     if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
307       printf(" %s : sequence not found\n",tname);
308       exit(1);
309     }
310     resetp(dnaseq);
311     strncpy(lname,argv[optind+2],sizeof(lname));
312     if (argc-optind>3) sscanf(argv[optind+3],"%d",&ktup);
313     else ktup=ktmax;
314     rcnt = 100;
315     if (argc-optind>4) sscanf(argv[optind+4],"%d",&rcnt);
316   }
317 
318   sscor=(int *)calloc((size_t)rcnt, sizeof (int));
319   sscor0=(int *)calloc((size_t)rcnt, sizeof (int));
320   gsscor=(int *)calloc((size_t)rcnt, sizeof (int));
321 
322   if (sscor == NULL || sscor0 == NULL || gsscor == NULL) {
323     fprintf(stderr," cannot allocate shuffled score arrays (%d)",rcnt);
324     exit(1);
325   }
326 
327   printf(" %s : %4d %s\n",tname, n0, sqnam);
328 
329   aa1 = aa0 + n0 + 2;
330   maxn -= n0 + 3;
331   if (maxn > MAXLIB) maxn = MAXLIB;
332 
333   if ((n1=getseq(lname,aa10,maxn,&dnaseq))==0) {
334     printf(" %s : %s sequence not found\n",lname,sqtype);
335     exit(1);
336   }
337   printf(" %s : %4d %s\n",lname, n1, sqnam);
338 
339   initpam2();		/* convert 1-d pam to 2-d pam2 */
340 
341   initparm();
342 
343   tstart = sstime();
344 
345   fact = ktup*scfact;
346   hashaa(aa0,n0,ktup);	/* hash test sequence */
347 
348 #ifndef ALLOCN0
349   allocdiag(MAXDIAG);
350 #else
351   allocdiag(n0);
352 #endif
353   inithist();		/* initialize histogram, mean, sd */
354 
355   if (wsiz > n1) wsiz=n1;
356 
357   for (i=0; i<n1; i++) aa1[i]=aa10[i];
358   aa1[n1]= -1;
359 
360   score0 = dhash(&i0score0,&gscore0);
361 
362   nlib = 0;		/* counts number of sequences */
363   ntt = 0;		/* counts number of residues */
364 
365   irand();	/* seed the random number generator */
366 
367   while (rcnt-- > 0) {
368 #ifdef __MWERKS__
369     ChkEvent();
370 #endif
371     if (wflag==1) wshuffle(aa10,aa1,n1,wsiz);
372     else shuffle(aa1,n1);
373     score=dhash(&i0score,&gscore); /* do the hash */
374     addhist(score);
375     addhist0(i0score);
376     addhistg(gscore);
377   }
378 
379   tscan = sstime();
380 
381   initialize_hist(lmax);
382   est_lambda_K(n0,n1,sscor,nmean, &K_n, &Lambda_n);
383   free_hist();
384 
385   initialize_hist(lmax0);
386   est_lambda_K(n0,n1,sscor0,nmean, &K_0, &Lambda_0);
387   free_hist();
388 
389   initialize_hist(glmax);
390   est_lambda_K(n0,n1,gsscor,nmean,&K_g, &Lambda_g);
391   free_hist();
392 
393   /* print histogram, statistics */
394   prhist(stdout,score0,i0score0,gscore0);
395 
396   outfd = stdout;
397 
398 #ifdef __MWERKS__
399 	HSetVol(NULL,ouSpec.vRefNum,ouSpec.parID);
400 /*    SetVol(NULL,ouvRef);   */
401 #endif
402   if (outtty && resfile[0]=='\0') {
403     printf(" Enter filename for results : ");
404     fgets(rline,sizeof(rline),stdin);
405     if ((bp=strchr(rline,'\n'))!=NULL) *bp='\0';
406     if (rline[0]!='\0') strncpy(resfile,rline,sizeof(resfile));
407   }
408   if (resfile[0]!='\0') {
409     if ((outfd=fopen(resfile,"w"))==NULL) exit(1);
410     fprintf(outfd," %s, %d %s vs %s\n",tname, n0, sqnam, lname);
411     prhist(outfd,score0,i0score0,gscore0);
412 #ifdef __MWERKS__
413     SIOUXSettings.asktosaveonclose=FALSE;
414     SIOUXSettings.autocloseonquit=TRUE;
415 #endif
416   }
417 }
418 
419 extern int *sascii, nascii[], aascii[];
420 
initenv(argc,argv)421 initenv(argc,argv)
422      int argc; char **argv;
423 {
424   char *cptr, *getenv();
425   int copt, getopt();
426   extern char *optarg;
427 
428   sascii = aascii;
429   strncpy(smstr,"BLOSUM50",sizeof(smstr));
430   smptr = smptr;
431   pam = abl50;
432   sq = aa;
433   hsq = haa;
434   nsq = naa;
435   dnaseq = 0;
436 
437   if ((cptr=getenv("GAPPEN"))!=NULL) sscanf(cptr,"%d",&cgap);
438   else cgap=0;
439 
440   if ((cptr=getenv("CUTOFF"))!=NULL) sscanf(cptr,"%d",&bestcut);
441   else bestcut = 0;
442 
443   if ((cptr=getenv("PAMFACT"))!=NULL) {
444     sscanf(cptr,"%d",&pamfact);
445     if (pamfact==1) pamfact = -2; else pamfact=0;
446   }
447 
448   while ((copt=getopt(argc,argv,"Qqc:f:g:hk:nO:p:s:y:w:"))!=EOF)
449     switch(copt) {
450     case 'q':
451     case 'Q': outtty=0; break;
452     case 'c': sscanf(optarg,"%d",&bestcut); break;
453     case 'f': sscanf(optarg,"%d",&gdelval); del_set=1; break;
454     case 'g': sscanf(optarg,"%d",&ggapval); gap_set=1; break;
455     case 'h': histflg = 0; break;
456     case 'n': dnaseq=1;
457       sascii = nascii;
458       strncpy(smstr,"DNA",sizeof(smstr));
459       smptr = smstr;
460       sq = nt;
461       nsq = nnt;
462       hsq = hnt;
463       pam = npam;
464       strcpy(sqnam,"nt");
465       strcpy(sqtype,"DNA");
466       resetp(dnaseq);
467       break;
468     case 'O': strncpy(resfile,optarg,sizeof(resfile));
469       break;
470     case 'k': sscanf(optarg,"%d",&cgap); break;
471     case 's': strncpy(smstr,optarg,sizeof(smstr));
472       smptr=smstr;
473       if (initpam(smptr)) dnaseq= -1;
474       break;
475     case 'w': wflag = 1;
476       sscanf(optarg,"%d",&wsiz);
477       break;
478     case 'y': sscanf(optarg,"%d",&optwid);
479       optwidf = 1;
480       break;
481     default : fprintf(stderr," illegal option -%c\n",copt);
482     }
483   optind--;
484 
485   if (dnaseq>=0) {
486     if ((smptr=getenv("SMATRIX"))!=NULL && initpam(smptr)) dnaseq = -1;
487     else smptr = smstr;
488   }
489 
490   ktmax = bktup;
491   if (dnaseq<0 && strlen(smptr)>0)
492     fprintf(stderr," matrix file reset to %s\n",smptr);
493 }
494 
resetp(dnaseq)495 resetp(dnaseq)
496      int dnaseq;
497 {
498   if (dnaseq==1) {
499     fprintf(stderr," resetting to DNA\n");
500     if (!del_set) gdelval = -16;
501     if (!gap_set) ggapval = -4;
502     histint=4;
503     bestscale=80;
504     bkfact=5;
505     scfact=1;
506     bktup=6;
507     ktmax=6;
508     bestmax=80;
509     bestoff=45;
510     pam = npam;
511     smptr = "DNA";
512     if (pamfact>=0) pamfact = 0;
513   }
514 }
515 
initparm()516 initparm()
517 {
518 	char *getenv(), *cptr;
519 	int itemp, btemp;
520 
521 	if (!optwidf) {
522 	  if (!dnaseq && ktup==1) optwid = 32;
523 	  else optwid = 16;
524 	}
525 
526 	btemp = 2*bestoff/3 + n0/bestscale + bkfact*(bktup-ktup);
527 	if (btemp>bestmax) btemp = bestmax;
528 	if (btemp > 3*n0) btemp = 3*shscore(aa0,n0)/5;
529 
530 	if (cgap<=0) cgap=btemp+bestoff/3;
531 	pgap = gdelval + ggapval;
532 	}
533 
534 /*	hashaa - hash sequence 0 for rapid lookup of seq 1 (library) */
535 
hashaa(aa0,n0,ktup)536 hashaa(aa0, n0, ktup)
537 	char *aa0; int n0, ktup;
538 {
539 	int mhv;
540  	int i0, hv, phv;
541 
542 	if (pamfact == -1) pamfact=0;
543 	else if (pamfact== -2) pamfact=1;
544 
545 	for (i0=0, mhv= -1; i0<naa; i0++)
546 		if (hsq[i0]>mhv) mhv=hsq[i0];
547 
548 	if (mhv<=0) {
549 		printf(" maximum hsq <=0 %d\n",mhv);
550 		exit(1);
551 		}
552 
553 	for (kshft=0; mhv>0; mhv/=2) {
554 		kshft++;
555 	}
556 
557 /*      kshft = 2;	*/
558         kt1 = ktup-1;
559 
560 	hv = 1;
561 	for (i0 = 0; i0<ktup; i0++)
562 		hv = hv<<kshft;
563 	hmax = hv;
564 	hmask = (hmax>>kshft)-1;
565 
566 	allochash(n0,hmax);
567 
568         for (i0=0; i0<hmax; i0++) harr[i0]= -1;
569 	for (i0=0; i0<n0; i0++) link[i0]= -1;
570 
571         /* encode the aa0 array */
572 
573 	hv=phv=0;
574 	for (i0=0; i0<kt1; i0++) {
575 		hv= (hv<<kshft)+hsq[aa0[i0]];
576 		phv += pam2[aa0[i0]][aa0[i0]]*ktup;
577 		}
578 
579 	for (; i0<n0; i0++) {
580 		hv=((hv&hmask)<<kshft) + hsq[aa0[i0]];
581 		link[i0]=harr[hv];
582 		harr[hv]=i0;
583 		if (pamfact) {
584 			pamh2[hv] = (phv += pam2[aa0[i0]][aa0[i0]]*ktup);
585 			phv -= pam2[aa0[i0-kt1]][aa0[i0-kt1]]*ktup;
586 			}
587 		else pamh2[hv]=fact*ktup;
588 		}
589 	if (pamfact)
590 		for (i0=0; i0<nsq; i0++) pamh1[i0]=pam2[i0][i0]*ktup;
591 	else
592 		for (i0=0; i0<nsq; i0++) pamh1[i0]=fact;
593 	}
594 
allochash(n0,hmax)595 allochash(n0, hmax)
596 	int n0, hmax;
597 {
598 
599 	if ((harr=(int *)calloc((size_t)hmax,sizeof(int)))== NULL) {
600 		printf(" cannot allocate hash array: %d\n",hmax);
601 		exit(1);
602 		}
603 	if ((pamh2=(int *)calloc((size_t)hmax,sizeof(int)))==NULL) {
604 		fprintf(stderr," cannot allocate pamh2 array: %d\n",hmax);
605 		exit(1);
606 		}
607 	if ((link=(int *)calloc((size_t)n0,sizeof(int)))== NULL) {
608 		printf(" cannot allocate hash link array: %d",n0);
609 		exit(1);
610 		}
611 	}
612 
613 /*      this is the main loop. First zero the diagonal arrays,
614         then go through the sequence ktup at a time updating the
615         diagonals appropriately.  Finally, scan the diagonals,
616         looking for the max score, and use the pam matrix
617 */
618 
dhash(i0score,gscore)619 dhash(i0score,gscore)
620 	int *i0score,*gscore;
621 {
622         int nd, ndo;                 /* diagonal array size */
623         int lhval;
624         int kfact;
625         register struct dstruct *dptr;
626 	register int tscor;
627 #ifndef ALLOCN0
628 	register struct dstruct *diagp;
629 #else
630 	register int dpos;
631 	int lposn0;
632 #endif
633 	struct dstruct *dpmax;
634         register int lpos;
635 	int tpos;
636 	struct beststr *vmptr;
637 	int scor;
638 	int im, ib, nsave;
639 	int cmps();			/* comparison routine for ksort */
640 	char *aa1ptr;
641 	double lnscale;
642 	int  lcont, ocont, loff;	/* lcont is returned by getlib to
643 					indicate there is more sequence
644 					remaining.  ocont is the previous
645 					value of lcont, for going back later.
646 					loff corrects maxn for the modified
647 					size of aa1 for continued sequences
648 					*/
649 
650 #ifdef FKFACT
651     kfact = ktup*fact;
652 #endif
653 	noff = n0-1;
654 	ndo = 0;
655 #ifdef ALLOCN0
656 	nd = n0;
657 #endif
658 	ntt += n1;
659 	nlib++;
660 
661 #ifndef ALLOCN0
662 		nd = n0+n1;
663 #endif
664 		if (lnsflg) lnscale = (log((double)n0)/log((double)n1));
665 
666 	   	dpmax = &diag[nd];
667 		for (dptr= &diag[ndo]; dptr<dpmax;)  {
668                         dptr->stop = -1;
669 			dptr->dmax = NULL;
670 			dptr++->score = 0;
671                         }
672 
673 		for (vmptr=vmax; vmptr<&vmax[MAXSAV]; vmptr++)
674 			vmptr->score = 0;
675 		lowmax = vmax;
676 		lowscor = 0;
677 
678 
679         /* start hashing */
680         lhval = 0;
681         for (lpos=0; lpos<kt1;)
682                 lhval= ((lhval&hmask)<<kshft)+hsq[aa1[lpos++]];
683 
684 #ifndef ALLOCN0
685 	diagp = &diag[noff + kt1];
686         for ( ; lpos<n1; lpos++,diagp++) {
687                 lhval = ((lhval&hmask)<<kshft) + hsq[aa1[lpos]];
688                 for (tpos=harr[lhval]; tpos>=0; tpos=link[tpos]) {
689 		    if ((tscor = (dptr = &diagp[-tpos])->stop)>=0) {
690 #else
691 	lposn0 = noff + lpos;
692         for ( ; lpos<n1; lpos++,lposn0++) {
693                 lhval = ((lhval&hmask)<<kshft) + hsq[aa1[lpos]];
694                 for (tpos=harr[lhval]; tpos>=0; tpos=link[tpos]) {
695 		    dpos = lposn0 - tpos;
696 		    if ((tscor = (dptr = &diag[dpos%nd])->stop)>=0) {
697 #endif
698 			tscor += ktup;
699 			if ((tscor -=lpos)<=0) {
700 			    scor = dptr->score;
701 #ifdef FKFACT
702 			    if ((tscor += kfact)<0 && lowscor<scor)
703 #else
704 			    if ((tscor += (kfact=pamh2[lhval]))<0
705 				    && lowscor < scor)
706 #endif
707 #ifdef ALLOCN0
708 				savemax(dptr,dpos);
709 #else
710 				savemax(dptr);
711 #endif
712 			    if ((tscor += scor)>=kfact) {
713  				dptr->score = tscor;
714  				dptr->stop=lpos;
715  				}
716  			    else {
717  				dptr->score = kfact;
718  				dptr->start = (dptr->stop = lpos) - kt1;
719 				}
720 			    }
721 			else {
722 #ifdef FKFACT
723 		    	    dptr->score += fact;
724 #else
725 		    	    dptr->score += pamh1[aa0[tpos]];
726 #endif
727 		    	    dptr->stop = lpos;
728 			    }
729 			}
730 		    else {
731 #ifdef FKFACT
732 		    	dptr->score = kfact;
733 #else
734 		    	dptr->score = pamh2[lhval];
735 #endif
736 		    	dptr->start = (dptr->stop=lpos) - kt1;
737 			}
738 		    }       /* end tpos */
739 #ifdef ALLOCN0
740 		/* reinitialize diag structure */
741 
742 		if ((dptr= &diag[lpos%nd])->score>lowscor)
743 			savemax(dptr,lpos);
744 		dptr->stop = -1;
745 		dptr->dmax = NULL;
746 		dptr->score = 0;
747 #endif
748                 }       /* end lpos */
749 
750 #ifdef ALLOCN0
751 	for (tpos=0, dpos = noff+n1-1; tpos < n0; tpos++,dpos--) {
752 		if ((dptr= &diag[dpos%nd])->score>lowscor) savemax(dptr,dpos);
753 		}
754 #else
755 	for (dptr=diag; dptr < dpmax; ) {
756 		if (dptr->score>lowscor) savemax(dptr);
757 		dptr->stop = -1;
758 		dptr->dmax = NULL;
759 		dptr++->score = 0;
760 		}
761 	ndo = nd;
762 #endif
763 
764 /*
765         at this point all of the elements of aa1[lpos]
766         have been searched for elements of aa0[tpos]
767         with the results in diag[dpos]
768 */
769 	for (ib= nsave =0,vmptr=vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
770 		if (vmptr->score>0) {
771 			vmptr->score=spam(vmptr);
772 			vptr[ib++]= vmptr;
773 			nsave++;
774 			}
775 		}
776 
777 	if (nsave>0) {
778 	  scor = sconn(vptr,nsave);
779 	  ksort(vptr,nsave,cmps);
780 	  scor = max(scor,vptr[0]->score);
781 	  if (lnsflg) {
782 	    *i0score = (int)((double)(vptr[0]->score)*lnscale+0.5);
783 	    *gscore = dmatch(noff-vptr[0]->dp,NO);
784 	    *gscore = (int)((double)(*gscore)*lnscale+0.5);
785 	    return (int)((double)scor*lnscale+0.5);
786 	  }
787 	  else {
788 	    *i0score = vptr[0]->score;
789 	    *gscore = dmatch(noff-vptr[0]->dp,NO);
790 	    return scor;
791 	  }
792 	}
793 	else { *i0score=0; *gscore=0; return 0;}
794 	}
795 
796 #ifdef ALLOCN0
savemax(dptr,dpos)797 savemax(dptr,dpos)
798 	register struct dstruct *dptr; int dpos;
799 {
800 	register struct beststr *vmptr;
801 	register int i;
802 
803 #else
804 savemax(dptr)
805 	register struct dstruct *dptr;
806 {
807 	register int dpos;
808 	register struct beststr *vmptr;
809 	register int i;
810 #ifndef I86BUG
811 	dpos = (int)(dptr-diag);
812 #else
813 	dpos = ((unsigned)dptr-(unsigned)diag)>>L2DSTR;
814 #endif
815 #endif
816 /* check to see if this is the continuation of a run that is already saved */
817 
818 	if ((vmptr=dptr->dmax)!=NULL && vmptr->dp== dpos &&
819 		vmptr->start==dptr->start) {
820 		vmptr->stop = dptr->stop;
821 		if ((i=dptr->score)<=vmptr->score) return;
822 		vmptr->score = i;
823 		if (vmptr!=lowmax) return;
824 		}
825 	else {
826 		i=lowmax->score = dptr->score;
827 		lowmax->dp = dpos;
828 		lowmax->start = dptr->start;
829 		lowmax->stop = dptr->stop;
830 		dptr->dmax = lowmax;
831 		}
832 
833 	for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
834 		if (vmptr->score < i) {
835 			i = vmptr->score;
836 			lowmax = vmptr;
837 			}
838 	lowscor = i;
839 	}
840 
initpam2()841 initpam2()
842 {
843 	int i, j, k;
844 
845 	k=0;
846 	for (i=0; i<naa; i++)
847 		for (j=0; j<=i; j++)
848 			pam2[j][i] = pam2[i][j] = pam[k++];
849 	}
850 
851 spam(dmax)
852 	struct beststr *dmax;
853 {
854 	int lpos;
855 	int tot, mtot;
856 	struct {int start, stop, score;} curv, maxv;
857 	register char *aa0p, *aa1p;
858 
859 	aa1p= &aa1[lpos = dmax->start];
860 	aa0p= &aa0[lpos - dmax->dp + noff];
861 	curv.start = lpos;
862 
863 	tot = curv.score = maxv.score = 0;
864 	for  ( ; lpos <= dmax->stop; lpos++) {
865 		tot += pam2[*aa0p++][*aa1p++];
866 		if (tot > curv.score) {
867 			curv.stop = lpos;
868 			curv.score = tot;
869 			}
870 		else if (tot < 0) {
871 			if (curv.score > maxv.score) {
872 				maxv.start = curv.start;
873 				maxv.stop = curv.stop;
874 				maxv.score = curv.score;
875 				}
876 			tot = curv.score = 0;
877 			curv.start = lpos;
878 			}
879 		}
880 
881 	if (curv.score > maxv.score) {
882 		maxv.start = curv.start;
883 		maxv.stop = curv.stop;
884 		maxv.score = curv.score;
885 		}
886 
887 /*	if (maxv.start != dmax->start || maxv.stop != dmax->stop)
888 		printf(" new region: %3d %3d %3d %3d\n",maxv.start,
889 			dmax->start,maxv.stop,dmax->stop);
890 */
891 	dmax->start = maxv.start;
892 	dmax->stop = maxv.stop;
893 
894 	return maxv.score;
895 	}
896 
897 sconn(v,n)
898 	struct beststr *v[];
899 	int n;
900 {
901 	int i,si,cmpp();
902 	struct slink {
903 		int score;
904 		struct beststr *vp;
905 		struct slink *next;
906 		} *start, *sl, *sj, *so, sarr[MAXSAV];
907 	int lstart, tstart, plstop, ptstop;
908 
909 /*	sort the score left to right in lib pos */
910 
911 	ksort(v,n,cmpp);
912 
913 	start = NULL;
914 
915 /*	for the remaining runs, see if they fit */
916 
917 	for (i=0,si=0; i<n; i++) {
918 
919 /*	if the score is less than the gap penalty, it never helps */
920 		if (v[i]->score < cgap) continue;
921 		lstart=v[i]->start;
922 		tstart=lstart-v[i]->dp+noff;
923 
924 /*	put the run in the group */
925 		sarr[si].vp = v[i];
926 		sarr[si].score = v[i]->score;
927 		sarr[si].next = NULL;
928 
929 /* 	if it fits, then increase the score */
930 		for (sl=start; sl!= NULL; sl = sl->next) {
931 			plstop = sl->vp->stop;
932 			ptstop = plstop - sl->vp->dp + noff;
933 			if (plstop<lstart && ptstop<tstart) {
934 				sarr[si].score = sl->score+v[i]->score+pgap;
935 				break;
936 				}
937 			}
938 
939 /*	now recalculate where the score fits */
940 		if (start==NULL) start= &sarr[si];
941 		else for (sj=start, so=NULL; sj!=NULL; sj = sj->next) {
942 			if (sarr[si].score>sj->score) {
943 				sarr[si].next = sj;
944 				if (so!=NULL) so->next= &sarr[si];
945 				else start= &sarr[si];
946 				break;
947 				}
948 			so=sj;
949 			}
950 		si++;
951 		}
952 
953 	if (start!=NULL) return (start->score);
954 	else return (0);
955 	}
956 
shscore(aa0,n0)957 shscore(aa0,n0)	/* calculate the 100% identical score */
958 	char *aa0; int n0;
959 {
960 	int i, sum;
961 	for (i=0,sum=0; i<n0; i++)
962 		sum += pam2[aa0[i]][aa0[i]];
963 	return sum;
964 	}
965 
inithist()966 inithist()
967 {
968 	int i;
969 
970 	for (i=0; i<MAXHIST; i++) {
971 		hist[i]=0;
972 		ghist[i]=0;
973 		hist0[i]=0;
974 		}
975 	nmean = lmax = 0;
976 	gnmean = glmax = 0;
977 	nmean0 = lmax0 = 0;
978 	lmin = glmin = lmin0 = MAXHIST*histint;
979 	}
980 
prhist(fd,score0,i0score0,gscore0)981 prhist(fd,score0, i0score0, gscore0)
982      FILE *fd; int score0, i0score0, gscore0;
983 {
984   int i,j,hl;
985   char hline[80], pch;
986   int gl,hl0;
987   int mn, mh1;
988   double cum, find_Evalue(), find_Pvalue();
989 
990   mn = lmin/histint-3;
991   if (mn < 0) mn = 0;
992   mh1 = glmax/histint+4;
993   if (mh1 >= MAXHIST) mh1 = MAXHIST-1;
994 
995   if (histflg) {
996     fprintf(fd,"\n     initn   init0   opt\n");
997     for (i=mn; i<=mh1; i++) {
998       pch = (i==mh1) ? '>' : ' ';
999       pch = (i==mn) ? '<' : pch;
1000       fprintf(fd,"%c%3d %5d %5d %5d:",
1001 	      pch,(i<mh1)?(i)*histint : mh1*histint,
1002 	      hist[i],hist0[i],ghist[i]);
1003       hl = hist[i];
1004       gl = ghist[i];
1005       if (hl > 50) hl = 50;
1006       if (gl > 50) gl = 50;
1007       for (j=0; j<hl; j++) hline[j]='i';
1008       hline[hl]=0;
1009       if (gl>hl) {for (j=hl; j<gl; j++) hline[j]='o'; hline[gl]=0;}
1010       if (gl==hl) for (j=0; j<hl; j++) hline[j]='b';
1011       if (gl<hl) for (j=0; j<gl; j++) hline[j]='o';
1012       fprintf(fd,"%s",hline);
1013       if ((score0 >= (i)*histint && score0 < (i+1)*histint) ||
1014 	  (score0 >= mh1*histint && i==mh1)) fprintf(fd," I");
1015       if ((gscore0 >= (i)*histint && gscore0 < (i+1)*histint) ||
1016 	  (gscore0 >= mh1*histint && i==mh1)) fprintf(fd," O");
1017       fprintf(fd,"\n");
1018     }
1019   }
1020 
1021   fprintf(fd,"%7ld residues in %5d sequences,\n",ntt,nlib);
1022   fprintf(fd," %s matrix, ",smptr);
1023   fprintf(fd,"gap penalties: %d,%d\n",gdelval,ggapval);
1024   if (wflag==1) fprintf(fd," local shuffle, window size: %d\n",wsiz);
1025 
1026   fprintf(fd, "\n unshuffled initn score: %d; shuffled score range: %d - %d\n",
1027 	  score0,lmin,lmax);
1028   cum = find_Pvalue(score0,n0,n1,K_n,Lambda_n);
1029   fprintf(fd," initn Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1030 	  Lambda_n,K_n,score0,cum);
1031   fprintf(fd," For %d sequences, an initn score >=%d is expected",nlib,score0);
1032   cum = find_Evalue(score0,n0,n1,K_n,Lambda_n);  if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1033   else fprintf(fd," %5.3g times\n",cum);
1034 
1035   fprintf(fd, "\n unshuffled init0 score: %d; shuffled score range: %d - %d\n",
1036 	  i0score0,lmin0,lmax0);
1037   cum = find_Pvalue(i0score0,n0,n1,K_0,Lambda_0);
1038   fprintf(fd," init0 Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1039 	  Lambda_n,K_n,i0score0,cum);
1040   fprintf(fd," For %d sequences, an init0 score >=%d is expected",
1041 	  nlib,i0score0);
1042   cum = find_Evalue(i0score0,n0,n1,K_0,Lambda_0);
1043   if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1044   else fprintf(fd," %5.3g times\n",cum);
1045 
1046   fprintf(fd, "\n unshuffled opt score: %d; shuffled score range: %d - %d\n",
1047 	  gscore0,glmin,glmax);
1048   cum = find_Pvalue(gscore0,n0,n1,K_g,Lambda_g);
1049   fprintf(fd," opt Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1050 	  Lambda_g,K_g,gscore0,cum);
1051   fprintf(fd," For %d sequences, an opt score >=%d is expected",
1052 	  nlib,gscore0);
1053   cum = find_Evalue(gscore0,n0,n1,K_g,Lambda_g);
1054   if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1055   else fprintf(fd," %5.3g times\n",cum);
1056 
1057   fprintf(fd,"\n ktup: %d, fact: %d",ktup, fact);
1058   if (lnsflg) fprintf(fd,"; ln scaling");
1059   fprintf(fd,"  scan time: "); ptime(fd,tscan-tstart); fprintf(fd,"\n");
1060 }
1061 
addhist(score)1062 addhist(score)
1063 	int score;
1064 {
1065   if (score>lmax) lmax = score;
1066   if (score < lmin) lmin = score;
1067   sscor[nmean++] = score;
1068 
1069   score = (score-1)/histint;
1070   if (score < 0) score=0;
1071   else if (score >= MAXHIST) score = MAXHIST-1;
1072   hist[score]++;
1073 }
1074 
addhistg(score)1075 addhistg(score)
1076      int score;
1077 {
1078   if (score>glmax) glmax = score;
1079   if (score<glmin) glmin = score;
1080   gsscor[gnmean++] = score;
1081 
1082   score = (score-1)/histint;
1083   if (score < 0) score=0;
1084   else if (score >= MAXHIST) score = MAXHIST-1;
1085   ghist[score]++;
1086 }
1087 
addhist0(score)1088 addhist0(score)
1089      int score;
1090 {
1091   if (score>lmax0) lmax0 = score;
1092   if (score<lmin0) lmin0 = score;
1093   sscor0[nmean0++] = score;;
1094 
1095   score = (score-1)/histint;
1096   if (score < 0) score=0;
1097   else if (score >= MAXHIST) score = MAXHIST-1;
1098   hist0[score]++;
1099 }
1100 
allocdiag(dsize)1101 allocdiag(dsize)	/* allocates diagonal structures */
1102      int dsize;
1103 {
1104   diag = (struct dstruct *)calloc((size_t)dsize,sizeof(struct dstruct));
1105 
1106   if (diag==NULL) {
1107     printf(" cannot allocate diagonal arrays\n");
1108     exit(1);
1109   }
1110 }
1111 
1112 cmps(ptr1,ptr2)
1113      struct beststr *ptr1, *ptr2;
1114 {
1115   if (ptr1->score < ptr2->score) return (1);
1116   else if (ptr1->score > ptr2->score) return (-1);
1117   else return (0);
1118 }
1119 
1120 cmpp(ptr1,ptr2)
1121      struct beststr *ptr1, *ptr2;
1122 {
1123   if (ptr1->start < ptr2->start) return (-1);
1124   else if (ptr1->start > ptr2->start) return (1);
1125   else return (0);
1126 }
1127 
cmpi(val1,val2)1128 int cmpi(val1, val2)
1129      int *val1, *val2;
1130 {
1131   if (*val1 < *val2) return (-1);
1132   else if (*val1 > *val2) return 1;
1133   else return 0;
1134 }
1135 
ksort(v,n,comp)1136 ksort(v,n,comp)
1137 	void *v[]; int n, (*comp)();
1138 {
1139   int gap, i, j;
1140   char *tmp;
1141 
1142   for (gap=n/2; gap>0; gap/=2)
1143     for (i=gap; i<n; i++)
1144       for (j=i-gap; j>=0; j -= gap) {
1145 	if ((*comp)(v[j],v[j+gap]) <=0)
1146 	  break;
1147 	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1148       }
1149 }
1150 
ssort(v,n)1151 void ssort(v,n)
1152      int *v; int n;
1153 {
1154   int gap, i, j;
1155   int tmp;
1156 
1157   for (gap=n/2; gap>0; gap/=2)
1158     for (i=gap; i<n; i++)
1159       for (j=i-gap; j>=0; j -= gap) {
1160 	if (v[j]<=v[j+gap]) break;
1161 	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1162       }
1163 }
1164 
1165 int ieven = 1;
wshuffle(from,to,n,wsiz)1166 wshuffle(from,to,n,wsiz)	/* copies from from to from shuffling */
1167 	char  *from, *to; int n, wsiz;
1168 {
1169   int i,j, k, mm; char tmp, *top;
1170 
1171   memcpy(to,from,n);
1172 
1173   mm = n%wsiz;
1174 
1175   if (ieven) {
1176     for (k=0; k<(n-wsiz); k += wsiz) {
1177       top = &to[k];
1178       for (i=wsiz; i>1; i--) {
1179 	j = nrand(i);
1180 	tmp = top[j];
1181 	top[j] = top[i-1];
1182 	top[i-1] = tmp;
1183       }
1184     }
1185     top = &to[n-mm];
1186     for (i=mm; i>1; i--) {
1187       j = nrand(i);
1188       tmp = top[j];
1189       top[j] = top[i-1];
1190       top[i-1] = tmp;
1191     }
1192     ieven = 0;
1193   }
1194   else {
1195     for (k=n; k>=wsiz; k -= wsiz) {
1196       top = &to[k-wsiz];
1197       for (i=wsiz; i>1; i--) {
1198 	j = nrand(i);
1199 	tmp = top[j];
1200 	top[j] = top[i-1];
1201 	top[i-1] = tmp;
1202       }
1203     }
1204     top = &to[0];
1205     for (i=mm; i>1; i--) {
1206       j = nrand(i);
1207       tmp = top[j];
1208       top[j] = top[i-1];
1209       top[i-1] = tmp;
1210     }
1211     ieven = 1;
1212   }
1213   to[n] = -1;
1214 }
1215 
shuffle(from,n)1216 shuffle(from,n)	/* copies from from to from shuffling */
1217 	char  *from; int n;
1218 {
1219   int i,j; char tmp;
1220 
1221   for (i=n; i>1; i--) {
1222     j = nrand(i);
1223     tmp = from[j];
1224     from[j] = from[i-1];
1225     from[i-1] = tmp;
1226   }
1227   from[n] = -1;
1228 }
1229 
1230 /* ckalloc - allocate space; check for success */
ckalloc(amount)1231 char *ckalloc(amount)
1232 int amount;
1233 {
1234   char *p;
1235 
1236   if ((p = malloc((size_t)amount)) == NULL) {
1237     fprintf(stderr,"Ran out of memory.");
1238     exit(1);
1239   }
1240   return(p);
1241 }
1242 
1243 /*  stubs for linking */
1244 int llen;
1245 
aancpy()1246 aancpy()
1247 {}
1248 
1249 int markx;
disgraph()1250 disgraph()
1251 {}
1252 
B_ALIGN()1253 B_ALIGN() {}
1254 
ALIGN()1255 ALIGN() {}
1256 
LOCAL_ALIGN()1257 LOCAL_ALIGN() {}
1258 
discons()1259 discons()
1260 {}
1261