1 /* prdf.c 12-Feb-1984, 22-Sept-85, 17-Oct-1985
2 copyright (c) 1984,1987,1988 William R. Pearson and David Lipman
3
4 prdf.c - version of udf.c which scrambles sequences for testing
5 significance
6
7 modified for argc or queries
8
9 9-May-85 change shuffling algorithm
10
11 22-Sept-85 display both initial and optimized histogram
12 17-Oct-85 correct bigscore bug, savemax bug
13 27-Dec-85 correct savemax bug, put 32 wide window in ngdist
14 22-June-86 added universal matrix
15 4-Apr-88 update to new kfact calculation
16 Nov, 1991 include -n option to force DNA sequences
17 Nov 8, 1992 fix pamfact for DNA sequences
18 May 14, 1993 fix local shuffle to stay local
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 char *refstr="\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448;\n";
25 char *verstr="version 2.0u1, September, 1995";
26
27 #ifdef __MWERKS__
28 #include <Types.h>
29 #include <StandardFile.h>
30 StandardFileReply freply;
31 Point wpos;
32 int tval;
33 char prompt[256];
34 #include <sioux.h>
35 #define getenv mgetenv
36 #endif
37
38 #define NO 0
39
40 #define max(a,b) (((a)>(b))?(a):(b))
41 #define min(a,b) (((a)<(b))?(a):(b))
42
43 #ifndef BIGMEM
44 #define MAXTST 2000 /* longest test sequence */
45 #define MAXLIB 10000
46 #define MAXDIAG (MAXTST+MAXLIB)
47 #define QFILE_SIZE 40
48 #else
49 #define MAXTST 10000
50 #define MAXLIB 50000
51 #define MAXDIAG (MAXTST+MAXLIB)
52 #define QFILE_SIZE 256
53 #endif
54
55 #ifndef MAXSAV
56 #define MAXSAV 20 /* number of best diagonals saved */
57 #endif
58 #define MAXHIST 51 /* number of histogram divisions */
59 /*#define HISTSIZE 2 /* size of histogram division */
60
61 FILE *outfd; /* fd for output file */
62 int smark[4];
63 long sq0off=1, sq1off=1;
64 long loffset = 0l; /* offset into sequence */
65
66
67 /* globals for matching */
68
69 long lmark; /* position in library file from ftell() */
70 int nlib;
71 long ntt; /* number of library sequences, number of
72 residues scanned*/
73 char libstr[21]; /* partial title from library sequence */
74
75 char *aa0, *aa10, *aa1; /* amino acid sequence data */
76 int maxn; /* max space for lib sequence (MAXDIAG-n0) */
77 int n0, n1, nd, noff; /* length of aa0, length of aa1, n0+n1,
78 diagonal offset */
79
80 struct dstruct { /* diagonal structure for saving current run */
81 int start; /* start of current match */
82 int stop; /* end of current match */
83 int score; /* hash score of current match */
84 struct beststr *dmax; /* location in vmax[] where best score data saved */
85 } *diag;
86
87
88 struct beststr {
89 int score; /* pam score with segment opt */
90 int score0; /* pam score of best single segment */
91 int gscore; /* score from global match */
92 long lseek; /* position in library file */
93 int dp; /* diagonal of match */
94 int start; /* start of match in lib seq */
95 int stop; /* end of match in lib seq */
96 }
97 vmax[MAXSAV], /* best matches saved for one sequence */
98 *vptr[MAXSAV];
99
100 int *sscor, *sscor0, *gsscor;
101
102 int optwid=32, optwidf=0; /* width for band optimization */
103 /* changed in initparm() */
104
105 double K_n, K_0, K_g;
106 double Lambda_n, Lambda_0, Lambda_g;
107
108 int cgap; /* gap threshold */
109 int pgap; /* gap penalty for optimized alignment of diagonals */
110
111 int nbest; /* number of sequences better than bestcut in best */
112 int bestcut; /* cut off for getting into MAXBEST */
113
114 int histint=2;
115 int bestoff=36; /* values for calculating bestcut */
116 int bestscale=300;
117 int bkfact=6;
118 int scfact=3;
119 int bktup=2;
120 int ktmax=2;
121 int bestmax=50;
122 int pamfact = 1; /* flag for using pam values for kfact */
123 int dnaseq = 0; /* true if DNA query sequence */
124 int lnsflg=0;
125 int histflg=1;
126
127 int nsav, lowscor; /* number of saved runs, worst saved
128 run, score of worst saved run */
129 struct beststr *lowmax;
130
131 int hist[MAXHIST]; /* histogram of all score */
132 int lmax, lmin;
133 int nmean; /* number of scores averaged in mean */
134
135 int hist0[MAXHIST]; /* histogram of init0 scores */
136 int lmax0, lmin0;
137 int nmean0; /* number of init0 scores */
138
139 int ghist[MAXHIST]; /* histogram of optimized score */
140 int glmax, glmin;
141 int gnmean; /* number of optimized scores averaged */
142
143 int fact, gm; /* scoring factors */
144
145 static int hmask, hmax; /* hash constants */
146 static int *pamh2; /* pam based kfact array */
147 static int *link, *harr; /* hash arrays */
148 static int ktup, kshft, kt1; /* ktuple constants */
149
150 int nshow; char rline[20],sline[20];
151 char resfile[QFILE_SIZE];
152
153 int showall;
154 long tstart, tscan, tdone, sstime();
155
156 #include "upam.gbl" /* includes pam array */
157
158 extern int optind;
159 extern int outtty;
160 char *getenv(), *smptr, *cptr; /* scoring matrix env */
161 char smstr[QFILE_SIZE];
162
163 int wflag = -1;
164 int wsiz;
165
166 #ifdef __MWERKS__
167 /* short ouvRef, q0vRef, q1vRef; */
168 FSSpec ouSpec, q0Spec, q1Spec;
169 OSErr error;
170 #define IntroDID 400 /* LFASTA */
171 #endif
172 char *iprompt0=" PRDF compares a test sequence to a shuffled sequence\n";
173 char *iprompt1=" test sequence file name: ";
174 char *iprompt2=" sequence to shuffle: ";
175
main(argc,argv)176 main(argc, argv)
177 int argc; char **argv;
178 {
179 char tname[QFILE_SIZE], lname[QFILE_SIZE], qline[QFILE_SIZE];
180 char *bp;
181 int score, gscore, score0, gscore0; /* scores calculated */
182 int i0score, i0score0;
183 int rcnt; /* number of random shuffles */
184 int i;
185
186 #ifdef __MWERKS__
187 SIOUXSettings.asktosaveonclose=TRUE;
188 SIOUXSettings.showstatusline=FALSE;
189 SIOUXSettings.autocloseonquit=TRUE;
190
191 argc = ccommand(&argv);
192 if (GetResource('DLOG',IntroDID)==0L && OpenResFile("\pFASTA.rsrc")<0) {
193 SysBeep(100); fprintf(stderr," WARNING FASTA.rsrc file could not be found\n");
194 }
195 InitEvent();
196 /* GetVol((unsigned char *)prompt,&ouvRef); */
197 error=HGetVol(NULL,&ouSpec.vRefNum, &ouSpec.parID);
198 if (error != noErr) {
199 fprintf(stderr," cannot get current directory\n");
200 exit(1);
201 }
202 wpos.h=50; wpos.v=100;
203 #endif
204
205 if ((aa0=calloc((size_t)MAXTST+MAXLIB,sizeof(char)))==0) {
206 printf(" cannot allocate sequence array\n");
207 exit(1);
208 }
209 maxn = MAXTST+MAXLIB;
210 if ((aa10=calloc((size_t)MAXLIB,sizeof(char)))==0) {
211 printf(" cannot allocate sequence array\n");
212 exit(1);
213 }
214
215 initenv(argc,argv);
216
217 if (argc-optind < 3) {
218 #ifndef __MWERKS__
219 fputs(iprompt0,stdout);
220 fprintf(stdout," %s%s\n",verstr,refstr);
221 l1: fputs(iprompt1,stdout);
222 fflush(stdout);
223 if (fgets(tname,sizeof(tname),stdin)==NULL) exit(0);
224 if (tname[strlen(tname)-1]=='\n') tname[strlen(tname)-1]='\0';
225 if (tname[0]=='\0') goto l1;
226 #else
227 NIntroDlog(IntroDID,iprompt0,verstr,refstr,"\0");
228
229 l1: SFileDlog(iprompt1,&freply);
230 if (freply.sfGood==TRUE) {
231 PtoCstr((unsigned char *)freply.sfFile.name);
232 strcpy(tname,(char *)freply.sfFile.name);
233 /* q0vRef=freply.vRefNum;
234 SetVol(NULL,q0vRef);
235 */
236 q1Spec.vRefNum = q0Spec.vRefNum = freply.sfFile.vRefNum;
237 q1Spec.parID = q0Spec.parID = freply.sfFile.parID;
238 HSetVol(NULL,q0Spec.vRefNum,q0Spec.parID);
239 }
240 else exit(0);
241 #endif
242 if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
243 printf(" %s : sequence not found\n",tname);
244 goto l1;
245 }
246 resetp(dnaseq);
247
248 #ifndef __MWERKS__
249 l2: fputs(iprompt2,stdout);
250 fflush(stdout);
251 if (fgets(lname,sizeof(lname),stdin)==NULL) exit(1);
252 if (lname[strlen(lname)-1]=='\n') lname[strlen(lname)-1]='\0';
253 if (*lname==0) goto l2;
254 #else
255 l2: SFileDlog(iprompt2,&freply);
256 if (freply.sfGood==TRUE) {
257 PtoCstr((unsigned char *)freply.sfFile.name);
258 strcpy(lname,(char *)freply.sfFile.name);
259 /* q1vRef=freply.vRefNum;
260 SetVol(NULL,q1vRef);
261 */
262 q1Spec.vRefNum = freply.sfFile.vRefNum;
263 q1Spec.parID = freply.sfFile.parID;
264 HSetVol(NULL,q1Spec.vRefNum,q1Spec.parID);
265 }
266 else exit(0);
267 #endif
268
269 printf(" ktup? (1 to %d) [%d] ",ktmax,ktmax);
270 fgets(qline,sizeof(qline),stdin);
271 ktup = ktmax;
272 if (qline[0]!='\0' && qline[0]!='\n') {
273 sscanf(qline,"%d",&ktup);
274 if (ktup < 1 || ktup>ktmax) ktup = ktmax;
275 }
276 printf(" number of random shuffles? [100] ");
277 fgets(qline,sizeof(qline),stdin);
278 rcnt = 100;
279 if (qline[0]!='\0' && qline[0]!='\n') {
280 sscanf(qline,"%d",&rcnt);
281 if (rcnt < 50) rcnt = 100;
282 }
283
284 if (wflag<=0) {
285 printf(" local (window) (w) or uniform (u) shuffle [u]? ");
286 fgets(qline,sizeof(qline),stdin);
287 if ((bp=strchr(qline,'\n'))!=NULL) *bp='\0';
288 }
289 else qline[0]='\0';
290 if (tolower(qline[0])=='w' || wflag==1) {
291 wflag = 1;
292 wsiz = 10;
293 printf(" local shuffle window size [10] ");
294 fgets(qline,sizeof(qline),stdin);
295 if (qline[0]!='\0' && qline[0]!='\n') {
296 sscanf(qline,"%d",&wsiz);
297 if (wsiz < 2) wsiz = 10;
298 }
299 }
300 else wflag=0;
301 }
302 else {
303 fputs(iprompt0,stdout);
304 fprintf(stdout," %s%s\n",verstr,refstr);
305 strncpy(tname,argv[optind+1],sizeof(tname));
306 if ((n0=getseq(tname,aa0,maxn,&dnaseq))==0) {
307 printf(" %s : sequence not found\n",tname);
308 exit(1);
309 }
310 resetp(dnaseq);
311 strncpy(lname,argv[optind+2],sizeof(lname));
312 if (argc-optind>3) sscanf(argv[optind+3],"%d",&ktup);
313 else ktup=ktmax;
314 rcnt = 100;
315 if (argc-optind>4) sscanf(argv[optind+4],"%d",&rcnt);
316 }
317
318 sscor=(int *)calloc((size_t)rcnt, sizeof (int));
319 sscor0=(int *)calloc((size_t)rcnt, sizeof (int));
320 gsscor=(int *)calloc((size_t)rcnt, sizeof (int));
321
322 if (sscor == NULL || sscor0 == NULL || gsscor == NULL) {
323 fprintf(stderr," cannot allocate shuffled score arrays (%d)",rcnt);
324 exit(1);
325 }
326
327 printf(" %s : %4d %s\n",tname, n0, sqnam);
328
329 aa1 = aa0 + n0 + 2;
330 maxn -= n0 + 3;
331 if (maxn > MAXLIB) maxn = MAXLIB;
332
333 if ((n1=getseq(lname,aa10,maxn,&dnaseq))==0) {
334 printf(" %s : %s sequence not found\n",lname,sqtype);
335 exit(1);
336 }
337 printf(" %s : %4d %s\n",lname, n1, sqnam);
338
339 initpam2(); /* convert 1-d pam to 2-d pam2 */
340
341 initparm();
342
343 tstart = sstime();
344
345 fact = ktup*scfact;
346 hashaa(aa0,n0,ktup); /* hash test sequence */
347
348 #ifndef ALLOCN0
349 allocdiag(MAXDIAG);
350 #else
351 allocdiag(n0);
352 #endif
353 inithist(); /* initialize histogram, mean, sd */
354
355 if (wsiz > n1) wsiz=n1;
356
357 for (i=0; i<n1; i++) aa1[i]=aa10[i];
358 aa1[n1]= -1;
359
360 score0 = dhash(&i0score0,&gscore0);
361
362 nlib = 0; /* counts number of sequences */
363 ntt = 0; /* counts number of residues */
364
365 irand(); /* seed the random number generator */
366
367 while (rcnt-- > 0) {
368 #ifdef __MWERKS__
369 ChkEvent();
370 #endif
371 if (wflag==1) wshuffle(aa10,aa1,n1,wsiz);
372 else shuffle(aa1,n1);
373 score=dhash(&i0score,&gscore); /* do the hash */
374 addhist(score);
375 addhist0(i0score);
376 addhistg(gscore);
377 }
378
379 tscan = sstime();
380
381 initialize_hist(lmax);
382 est_lambda_K(n0,n1,sscor,nmean, &K_n, &Lambda_n);
383 free_hist();
384
385 initialize_hist(lmax0);
386 est_lambda_K(n0,n1,sscor0,nmean, &K_0, &Lambda_0);
387 free_hist();
388
389 initialize_hist(glmax);
390 est_lambda_K(n0,n1,gsscor,nmean,&K_g, &Lambda_g);
391 free_hist();
392
393 /* print histogram, statistics */
394 prhist(stdout,score0,i0score0,gscore0);
395
396 outfd = stdout;
397
398 #ifdef __MWERKS__
399 HSetVol(NULL,ouSpec.vRefNum,ouSpec.parID);
400 /* SetVol(NULL,ouvRef); */
401 #endif
402 if (outtty && resfile[0]=='\0') {
403 printf(" Enter filename for results : ");
404 fgets(rline,sizeof(rline),stdin);
405 if ((bp=strchr(rline,'\n'))!=NULL) *bp='\0';
406 if (rline[0]!='\0') strncpy(resfile,rline,sizeof(resfile));
407 }
408 if (resfile[0]!='\0') {
409 if ((outfd=fopen(resfile,"w"))==NULL) exit(1);
410 fprintf(outfd," %s, %d %s vs %s\n",tname, n0, sqnam, lname);
411 prhist(outfd,score0,i0score0,gscore0);
412 #ifdef __MWERKS__
413 SIOUXSettings.asktosaveonclose=FALSE;
414 SIOUXSettings.autocloseonquit=TRUE;
415 #endif
416 }
417 }
418
419 extern int *sascii, nascii[], aascii[];
420
initenv(argc,argv)421 initenv(argc,argv)
422 int argc; char **argv;
423 {
424 char *cptr, *getenv();
425 int copt, getopt();
426 extern char *optarg;
427
428 sascii = aascii;
429 strncpy(smstr,"BLOSUM50",sizeof(smstr));
430 smptr = smptr;
431 pam = abl50;
432 sq = aa;
433 hsq = haa;
434 nsq = naa;
435 dnaseq = 0;
436
437 if ((cptr=getenv("GAPPEN"))!=NULL) sscanf(cptr,"%d",&cgap);
438 else cgap=0;
439
440 if ((cptr=getenv("CUTOFF"))!=NULL) sscanf(cptr,"%d",&bestcut);
441 else bestcut = 0;
442
443 if ((cptr=getenv("PAMFACT"))!=NULL) {
444 sscanf(cptr,"%d",&pamfact);
445 if (pamfact==1) pamfact = -2; else pamfact=0;
446 }
447
448 while ((copt=getopt(argc,argv,"Qqc:f:g:hk:nO:p:s:y:w:"))!=EOF)
449 switch(copt) {
450 case 'q':
451 case 'Q': outtty=0; break;
452 case 'c': sscanf(optarg,"%d",&bestcut); break;
453 case 'f': sscanf(optarg,"%d",&gdelval); del_set=1; break;
454 case 'g': sscanf(optarg,"%d",&ggapval); gap_set=1; break;
455 case 'h': histflg = 0; break;
456 case 'n': dnaseq=1;
457 sascii = nascii;
458 strncpy(smstr,"DNA",sizeof(smstr));
459 smptr = smstr;
460 sq = nt;
461 nsq = nnt;
462 hsq = hnt;
463 pam = npam;
464 strcpy(sqnam,"nt");
465 strcpy(sqtype,"DNA");
466 resetp(dnaseq);
467 break;
468 case 'O': strncpy(resfile,optarg,sizeof(resfile));
469 break;
470 case 'k': sscanf(optarg,"%d",&cgap); break;
471 case 's': strncpy(smstr,optarg,sizeof(smstr));
472 smptr=smstr;
473 if (initpam(smptr)) dnaseq= -1;
474 break;
475 case 'w': wflag = 1;
476 sscanf(optarg,"%d",&wsiz);
477 break;
478 case 'y': sscanf(optarg,"%d",&optwid);
479 optwidf = 1;
480 break;
481 default : fprintf(stderr," illegal option -%c\n",copt);
482 }
483 optind--;
484
485 if (dnaseq>=0) {
486 if ((smptr=getenv("SMATRIX"))!=NULL && initpam(smptr)) dnaseq = -1;
487 else smptr = smstr;
488 }
489
490 ktmax = bktup;
491 if (dnaseq<0 && strlen(smptr)>0)
492 fprintf(stderr," matrix file reset to %s\n",smptr);
493 }
494
resetp(dnaseq)495 resetp(dnaseq)
496 int dnaseq;
497 {
498 if (dnaseq==1) {
499 fprintf(stderr," resetting to DNA\n");
500 if (!del_set) gdelval = -16;
501 if (!gap_set) ggapval = -4;
502 histint=4;
503 bestscale=80;
504 bkfact=5;
505 scfact=1;
506 bktup=6;
507 ktmax=6;
508 bestmax=80;
509 bestoff=45;
510 pam = npam;
511 smptr = "DNA";
512 if (pamfact>=0) pamfact = 0;
513 }
514 }
515
initparm()516 initparm()
517 {
518 char *getenv(), *cptr;
519 int itemp, btemp;
520
521 if (!optwidf) {
522 if (!dnaseq && ktup==1) optwid = 32;
523 else optwid = 16;
524 }
525
526 btemp = 2*bestoff/3 + n0/bestscale + bkfact*(bktup-ktup);
527 if (btemp>bestmax) btemp = bestmax;
528 if (btemp > 3*n0) btemp = 3*shscore(aa0,n0)/5;
529
530 if (cgap<=0) cgap=btemp+bestoff/3;
531 pgap = gdelval + ggapval;
532 }
533
534 /* hashaa - hash sequence 0 for rapid lookup of seq 1 (library) */
535
hashaa(aa0,n0,ktup)536 hashaa(aa0, n0, ktup)
537 char *aa0; int n0, ktup;
538 {
539 int mhv;
540 int i0, hv, phv;
541
542 if (pamfact == -1) pamfact=0;
543 else if (pamfact== -2) pamfact=1;
544
545 for (i0=0, mhv= -1; i0<naa; i0++)
546 if (hsq[i0]>mhv) mhv=hsq[i0];
547
548 if (mhv<=0) {
549 printf(" maximum hsq <=0 %d\n",mhv);
550 exit(1);
551 }
552
553 for (kshft=0; mhv>0; mhv/=2) {
554 kshft++;
555 }
556
557 /* kshft = 2; */
558 kt1 = ktup-1;
559
560 hv = 1;
561 for (i0 = 0; i0<ktup; i0++)
562 hv = hv<<kshft;
563 hmax = hv;
564 hmask = (hmax>>kshft)-1;
565
566 allochash(n0,hmax);
567
568 for (i0=0; i0<hmax; i0++) harr[i0]= -1;
569 for (i0=0; i0<n0; i0++) link[i0]= -1;
570
571 /* encode the aa0 array */
572
573 hv=phv=0;
574 for (i0=0; i0<kt1; i0++) {
575 hv= (hv<<kshft)+hsq[aa0[i0]];
576 phv += pam2[aa0[i0]][aa0[i0]]*ktup;
577 }
578
579 for (; i0<n0; i0++) {
580 hv=((hv&hmask)<<kshft) + hsq[aa0[i0]];
581 link[i0]=harr[hv];
582 harr[hv]=i0;
583 if (pamfact) {
584 pamh2[hv] = (phv += pam2[aa0[i0]][aa0[i0]]*ktup);
585 phv -= pam2[aa0[i0-kt1]][aa0[i0-kt1]]*ktup;
586 }
587 else pamh2[hv]=fact*ktup;
588 }
589 if (pamfact)
590 for (i0=0; i0<nsq; i0++) pamh1[i0]=pam2[i0][i0]*ktup;
591 else
592 for (i0=0; i0<nsq; i0++) pamh1[i0]=fact;
593 }
594
allochash(n0,hmax)595 allochash(n0, hmax)
596 int n0, hmax;
597 {
598
599 if ((harr=(int *)calloc((size_t)hmax,sizeof(int)))== NULL) {
600 printf(" cannot allocate hash array: %d\n",hmax);
601 exit(1);
602 }
603 if ((pamh2=(int *)calloc((size_t)hmax,sizeof(int)))==NULL) {
604 fprintf(stderr," cannot allocate pamh2 array: %d\n",hmax);
605 exit(1);
606 }
607 if ((link=(int *)calloc((size_t)n0,sizeof(int)))== NULL) {
608 printf(" cannot allocate hash link array: %d",n0);
609 exit(1);
610 }
611 }
612
613 /* this is the main loop. First zero the diagonal arrays,
614 then go through the sequence ktup at a time updating the
615 diagonals appropriately. Finally, scan the diagonals,
616 looking for the max score, and use the pam matrix
617 */
618
dhash(i0score,gscore)619 dhash(i0score,gscore)
620 int *i0score,*gscore;
621 {
622 int nd, ndo; /* diagonal array size */
623 int lhval;
624 int kfact;
625 register struct dstruct *dptr;
626 register int tscor;
627 #ifndef ALLOCN0
628 register struct dstruct *diagp;
629 #else
630 register int dpos;
631 int lposn0;
632 #endif
633 struct dstruct *dpmax;
634 register int lpos;
635 int tpos;
636 struct beststr *vmptr;
637 int scor;
638 int im, ib, nsave;
639 int cmps(); /* comparison routine for ksort */
640 char *aa1ptr;
641 double lnscale;
642 int lcont, ocont, loff; /* lcont is returned by getlib to
643 indicate there is more sequence
644 remaining. ocont is the previous
645 value of lcont, for going back later.
646 loff corrects maxn for the modified
647 size of aa1 for continued sequences
648 */
649
650 #ifdef FKFACT
651 kfact = ktup*fact;
652 #endif
653 noff = n0-1;
654 ndo = 0;
655 #ifdef ALLOCN0
656 nd = n0;
657 #endif
658 ntt += n1;
659 nlib++;
660
661 #ifndef ALLOCN0
662 nd = n0+n1;
663 #endif
664 if (lnsflg) lnscale = (log((double)n0)/log((double)n1));
665
666 dpmax = &diag[nd];
667 for (dptr= &diag[ndo]; dptr<dpmax;) {
668 dptr->stop = -1;
669 dptr->dmax = NULL;
670 dptr++->score = 0;
671 }
672
673 for (vmptr=vmax; vmptr<&vmax[MAXSAV]; vmptr++)
674 vmptr->score = 0;
675 lowmax = vmax;
676 lowscor = 0;
677
678
679 /* start hashing */
680 lhval = 0;
681 for (lpos=0; lpos<kt1;)
682 lhval= ((lhval&hmask)<<kshft)+hsq[aa1[lpos++]];
683
684 #ifndef ALLOCN0
685 diagp = &diag[noff + kt1];
686 for ( ; lpos<n1; lpos++,diagp++) {
687 lhval = ((lhval&hmask)<<kshft) + hsq[aa1[lpos]];
688 for (tpos=harr[lhval]; tpos>=0; tpos=link[tpos]) {
689 if ((tscor = (dptr = &diagp[-tpos])->stop)>=0) {
690 #else
691 lposn0 = noff + lpos;
692 for ( ; lpos<n1; lpos++,lposn0++) {
693 lhval = ((lhval&hmask)<<kshft) + hsq[aa1[lpos]];
694 for (tpos=harr[lhval]; tpos>=0; tpos=link[tpos]) {
695 dpos = lposn0 - tpos;
696 if ((tscor = (dptr = &diag[dpos%nd])->stop)>=0) {
697 #endif
698 tscor += ktup;
699 if ((tscor -=lpos)<=0) {
700 scor = dptr->score;
701 #ifdef FKFACT
702 if ((tscor += kfact)<0 && lowscor<scor)
703 #else
704 if ((tscor += (kfact=pamh2[lhval]))<0
705 && lowscor < scor)
706 #endif
707 #ifdef ALLOCN0
708 savemax(dptr,dpos);
709 #else
710 savemax(dptr);
711 #endif
712 if ((tscor += scor)>=kfact) {
713 dptr->score = tscor;
714 dptr->stop=lpos;
715 }
716 else {
717 dptr->score = kfact;
718 dptr->start = (dptr->stop = lpos) - kt1;
719 }
720 }
721 else {
722 #ifdef FKFACT
723 dptr->score += fact;
724 #else
725 dptr->score += pamh1[aa0[tpos]];
726 #endif
727 dptr->stop = lpos;
728 }
729 }
730 else {
731 #ifdef FKFACT
732 dptr->score = kfact;
733 #else
734 dptr->score = pamh2[lhval];
735 #endif
736 dptr->start = (dptr->stop=lpos) - kt1;
737 }
738 } /* end tpos */
739 #ifdef ALLOCN0
740 /* reinitialize diag structure */
741
742 if ((dptr= &diag[lpos%nd])->score>lowscor)
743 savemax(dptr,lpos);
744 dptr->stop = -1;
745 dptr->dmax = NULL;
746 dptr->score = 0;
747 #endif
748 } /* end lpos */
749
750 #ifdef ALLOCN0
751 for (tpos=0, dpos = noff+n1-1; tpos < n0; tpos++,dpos--) {
752 if ((dptr= &diag[dpos%nd])->score>lowscor) savemax(dptr,dpos);
753 }
754 #else
755 for (dptr=diag; dptr < dpmax; ) {
756 if (dptr->score>lowscor) savemax(dptr);
757 dptr->stop = -1;
758 dptr->dmax = NULL;
759 dptr++->score = 0;
760 }
761 ndo = nd;
762 #endif
763
764 /*
765 at this point all of the elements of aa1[lpos]
766 have been searched for elements of aa0[tpos]
767 with the results in diag[dpos]
768 */
769 for (ib= nsave =0,vmptr=vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
770 if (vmptr->score>0) {
771 vmptr->score=spam(vmptr);
772 vptr[ib++]= vmptr;
773 nsave++;
774 }
775 }
776
777 if (nsave>0) {
778 scor = sconn(vptr,nsave);
779 ksort(vptr,nsave,cmps);
780 scor = max(scor,vptr[0]->score);
781 if (lnsflg) {
782 *i0score = (int)((double)(vptr[0]->score)*lnscale+0.5);
783 *gscore = dmatch(noff-vptr[0]->dp,NO);
784 *gscore = (int)((double)(*gscore)*lnscale+0.5);
785 return (int)((double)scor*lnscale+0.5);
786 }
787 else {
788 *i0score = vptr[0]->score;
789 *gscore = dmatch(noff-vptr[0]->dp,NO);
790 return scor;
791 }
792 }
793 else { *i0score=0; *gscore=0; return 0;}
794 }
795
796 #ifdef ALLOCN0
savemax(dptr,dpos)797 savemax(dptr,dpos)
798 register struct dstruct *dptr; int dpos;
799 {
800 register struct beststr *vmptr;
801 register int i;
802
803 #else
804 savemax(dptr)
805 register struct dstruct *dptr;
806 {
807 register int dpos;
808 register struct beststr *vmptr;
809 register int i;
810 #ifndef I86BUG
811 dpos = (int)(dptr-diag);
812 #else
813 dpos = ((unsigned)dptr-(unsigned)diag)>>L2DSTR;
814 #endif
815 #endif
816 /* check to see if this is the continuation of a run that is already saved */
817
818 if ((vmptr=dptr->dmax)!=NULL && vmptr->dp== dpos &&
819 vmptr->start==dptr->start) {
820 vmptr->stop = dptr->stop;
821 if ((i=dptr->score)<=vmptr->score) return;
822 vmptr->score = i;
823 if (vmptr!=lowmax) return;
824 }
825 else {
826 i=lowmax->score = dptr->score;
827 lowmax->dp = dpos;
828 lowmax->start = dptr->start;
829 lowmax->stop = dptr->stop;
830 dptr->dmax = lowmax;
831 }
832
833 for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
834 if (vmptr->score < i) {
835 i = vmptr->score;
836 lowmax = vmptr;
837 }
838 lowscor = i;
839 }
840
initpam2()841 initpam2()
842 {
843 int i, j, k;
844
845 k=0;
846 for (i=0; i<naa; i++)
847 for (j=0; j<=i; j++)
848 pam2[j][i] = pam2[i][j] = pam[k++];
849 }
850
851 spam(dmax)
852 struct beststr *dmax;
853 {
854 int lpos;
855 int tot, mtot;
856 struct {int start, stop, score;} curv, maxv;
857 register char *aa0p, *aa1p;
858
859 aa1p= &aa1[lpos = dmax->start];
860 aa0p= &aa0[lpos - dmax->dp + noff];
861 curv.start = lpos;
862
863 tot = curv.score = maxv.score = 0;
864 for ( ; lpos <= dmax->stop; lpos++) {
865 tot += pam2[*aa0p++][*aa1p++];
866 if (tot > curv.score) {
867 curv.stop = lpos;
868 curv.score = tot;
869 }
870 else if (tot < 0) {
871 if (curv.score > maxv.score) {
872 maxv.start = curv.start;
873 maxv.stop = curv.stop;
874 maxv.score = curv.score;
875 }
876 tot = curv.score = 0;
877 curv.start = lpos;
878 }
879 }
880
881 if (curv.score > maxv.score) {
882 maxv.start = curv.start;
883 maxv.stop = curv.stop;
884 maxv.score = curv.score;
885 }
886
887 /* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
888 printf(" new region: %3d %3d %3d %3d\n",maxv.start,
889 dmax->start,maxv.stop,dmax->stop);
890 */
891 dmax->start = maxv.start;
892 dmax->stop = maxv.stop;
893
894 return maxv.score;
895 }
896
897 sconn(v,n)
898 struct beststr *v[];
899 int n;
900 {
901 int i,si,cmpp();
902 struct slink {
903 int score;
904 struct beststr *vp;
905 struct slink *next;
906 } *start, *sl, *sj, *so, sarr[MAXSAV];
907 int lstart, tstart, plstop, ptstop;
908
909 /* sort the score left to right in lib pos */
910
911 ksort(v,n,cmpp);
912
913 start = NULL;
914
915 /* for the remaining runs, see if they fit */
916
917 for (i=0,si=0; i<n; i++) {
918
919 /* if the score is less than the gap penalty, it never helps */
920 if (v[i]->score < cgap) continue;
921 lstart=v[i]->start;
922 tstart=lstart-v[i]->dp+noff;
923
924 /* put the run in the group */
925 sarr[si].vp = v[i];
926 sarr[si].score = v[i]->score;
927 sarr[si].next = NULL;
928
929 /* if it fits, then increase the score */
930 for (sl=start; sl!= NULL; sl = sl->next) {
931 plstop = sl->vp->stop;
932 ptstop = plstop - sl->vp->dp + noff;
933 if (plstop<lstart && ptstop<tstart) {
934 sarr[si].score = sl->score+v[i]->score+pgap;
935 break;
936 }
937 }
938
939 /* now recalculate where the score fits */
940 if (start==NULL) start= &sarr[si];
941 else for (sj=start, so=NULL; sj!=NULL; sj = sj->next) {
942 if (sarr[si].score>sj->score) {
943 sarr[si].next = sj;
944 if (so!=NULL) so->next= &sarr[si];
945 else start= &sarr[si];
946 break;
947 }
948 so=sj;
949 }
950 si++;
951 }
952
953 if (start!=NULL) return (start->score);
954 else return (0);
955 }
956
shscore(aa0,n0)957 shscore(aa0,n0) /* calculate the 100% identical score */
958 char *aa0; int n0;
959 {
960 int i, sum;
961 for (i=0,sum=0; i<n0; i++)
962 sum += pam2[aa0[i]][aa0[i]];
963 return sum;
964 }
965
inithist()966 inithist()
967 {
968 int i;
969
970 for (i=0; i<MAXHIST; i++) {
971 hist[i]=0;
972 ghist[i]=0;
973 hist0[i]=0;
974 }
975 nmean = lmax = 0;
976 gnmean = glmax = 0;
977 nmean0 = lmax0 = 0;
978 lmin = glmin = lmin0 = MAXHIST*histint;
979 }
980
prhist(fd,score0,i0score0,gscore0)981 prhist(fd,score0, i0score0, gscore0)
982 FILE *fd; int score0, i0score0, gscore0;
983 {
984 int i,j,hl;
985 char hline[80], pch;
986 int gl,hl0;
987 int mn, mh1;
988 double cum, find_Evalue(), find_Pvalue();
989
990 mn = lmin/histint-3;
991 if (mn < 0) mn = 0;
992 mh1 = glmax/histint+4;
993 if (mh1 >= MAXHIST) mh1 = MAXHIST-1;
994
995 if (histflg) {
996 fprintf(fd,"\n initn init0 opt\n");
997 for (i=mn; i<=mh1; i++) {
998 pch = (i==mh1) ? '>' : ' ';
999 pch = (i==mn) ? '<' : pch;
1000 fprintf(fd,"%c%3d %5d %5d %5d:",
1001 pch,(i<mh1)?(i)*histint : mh1*histint,
1002 hist[i],hist0[i],ghist[i]);
1003 hl = hist[i];
1004 gl = ghist[i];
1005 if (hl > 50) hl = 50;
1006 if (gl > 50) gl = 50;
1007 for (j=0; j<hl; j++) hline[j]='i';
1008 hline[hl]=0;
1009 if (gl>hl) {for (j=hl; j<gl; j++) hline[j]='o'; hline[gl]=0;}
1010 if (gl==hl) for (j=0; j<hl; j++) hline[j]='b';
1011 if (gl<hl) for (j=0; j<gl; j++) hline[j]='o';
1012 fprintf(fd,"%s",hline);
1013 if ((score0 >= (i)*histint && score0 < (i+1)*histint) ||
1014 (score0 >= mh1*histint && i==mh1)) fprintf(fd," I");
1015 if ((gscore0 >= (i)*histint && gscore0 < (i+1)*histint) ||
1016 (gscore0 >= mh1*histint && i==mh1)) fprintf(fd," O");
1017 fprintf(fd,"\n");
1018 }
1019 }
1020
1021 fprintf(fd,"%7ld residues in %5d sequences,\n",ntt,nlib);
1022 fprintf(fd," %s matrix, ",smptr);
1023 fprintf(fd,"gap penalties: %d,%d\n",gdelval,ggapval);
1024 if (wflag==1) fprintf(fd," local shuffle, window size: %d\n",wsiz);
1025
1026 fprintf(fd, "\n unshuffled initn score: %d; shuffled score range: %d - %d\n",
1027 score0,lmin,lmax);
1028 cum = find_Pvalue(score0,n0,n1,K_n,Lambda_n);
1029 fprintf(fd," initn Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1030 Lambda_n,K_n,score0,cum);
1031 fprintf(fd," For %d sequences, an initn score >=%d is expected",nlib,score0);
1032 cum = find_Evalue(score0,n0,n1,K_n,Lambda_n); if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1033 else fprintf(fd," %5.3g times\n",cum);
1034
1035 fprintf(fd, "\n unshuffled init0 score: %d; shuffled score range: %d - %d\n",
1036 i0score0,lmin0,lmax0);
1037 cum = find_Pvalue(i0score0,n0,n1,K_0,Lambda_0);
1038 fprintf(fd," init0 Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1039 Lambda_n,K_n,i0score0,cum);
1040 fprintf(fd," For %d sequences, an init0 score >=%d is expected",
1041 nlib,i0score0);
1042 cum = find_Evalue(i0score0,n0,n1,K_0,Lambda_0);
1043 if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1044 else fprintf(fd," %5.3g times\n",cum);
1045
1046 fprintf(fd, "\n unshuffled opt score: %d; shuffled score range: %d - %d\n",
1047 gscore0,glmin,glmax);
1048 cum = find_Pvalue(gscore0,n0,n1,K_g,Lambda_g);
1049 fprintf(fd," opt Lambda: %5.5g K: %5.5g; P(%d) = %5.5g\n",
1050 Lambda_g,K_g,gscore0,cum);
1051 fprintf(fd," For %d sequences, an opt score >=%d is expected",
1052 nlib,gscore0);
1053 cum = find_Evalue(gscore0,n0,n1,K_g,Lambda_g);
1054 if (cum > 5.0) fprintf(fd," %d times\n",(int)(cum + 0.5));
1055 else fprintf(fd," %5.3g times\n",cum);
1056
1057 fprintf(fd,"\n ktup: %d, fact: %d",ktup, fact);
1058 if (lnsflg) fprintf(fd,"; ln scaling");
1059 fprintf(fd," scan time: "); ptime(fd,tscan-tstart); fprintf(fd,"\n");
1060 }
1061
addhist(score)1062 addhist(score)
1063 int score;
1064 {
1065 if (score>lmax) lmax = score;
1066 if (score < lmin) lmin = score;
1067 sscor[nmean++] = score;
1068
1069 score = (score-1)/histint;
1070 if (score < 0) score=0;
1071 else if (score >= MAXHIST) score = MAXHIST-1;
1072 hist[score]++;
1073 }
1074
addhistg(score)1075 addhistg(score)
1076 int score;
1077 {
1078 if (score>glmax) glmax = score;
1079 if (score<glmin) glmin = score;
1080 gsscor[gnmean++] = score;
1081
1082 score = (score-1)/histint;
1083 if (score < 0) score=0;
1084 else if (score >= MAXHIST) score = MAXHIST-1;
1085 ghist[score]++;
1086 }
1087
addhist0(score)1088 addhist0(score)
1089 int score;
1090 {
1091 if (score>lmax0) lmax0 = score;
1092 if (score<lmin0) lmin0 = score;
1093 sscor0[nmean0++] = score;;
1094
1095 score = (score-1)/histint;
1096 if (score < 0) score=0;
1097 else if (score >= MAXHIST) score = MAXHIST-1;
1098 hist0[score]++;
1099 }
1100
allocdiag(dsize)1101 allocdiag(dsize) /* allocates diagonal structures */
1102 int dsize;
1103 {
1104 diag = (struct dstruct *)calloc((size_t)dsize,sizeof(struct dstruct));
1105
1106 if (diag==NULL) {
1107 printf(" cannot allocate diagonal arrays\n");
1108 exit(1);
1109 }
1110 }
1111
1112 cmps(ptr1,ptr2)
1113 struct beststr *ptr1, *ptr2;
1114 {
1115 if (ptr1->score < ptr2->score) return (1);
1116 else if (ptr1->score > ptr2->score) return (-1);
1117 else return (0);
1118 }
1119
1120 cmpp(ptr1,ptr2)
1121 struct beststr *ptr1, *ptr2;
1122 {
1123 if (ptr1->start < ptr2->start) return (-1);
1124 else if (ptr1->start > ptr2->start) return (1);
1125 else return (0);
1126 }
1127
cmpi(val1,val2)1128 int cmpi(val1, val2)
1129 int *val1, *val2;
1130 {
1131 if (*val1 < *val2) return (-1);
1132 else if (*val1 > *val2) return 1;
1133 else return 0;
1134 }
1135
ksort(v,n,comp)1136 ksort(v,n,comp)
1137 void *v[]; int n, (*comp)();
1138 {
1139 int gap, i, j;
1140 char *tmp;
1141
1142 for (gap=n/2; gap>0; gap/=2)
1143 for (i=gap; i<n; i++)
1144 for (j=i-gap; j>=0; j -= gap) {
1145 if ((*comp)(v[j],v[j+gap]) <=0)
1146 break;
1147 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1148 }
1149 }
1150
ssort(v,n)1151 void ssort(v,n)
1152 int *v; int n;
1153 {
1154 int gap, i, j;
1155 int tmp;
1156
1157 for (gap=n/2; gap>0; gap/=2)
1158 for (i=gap; i<n; i++)
1159 for (j=i-gap; j>=0; j -= gap) {
1160 if (v[j]<=v[j+gap]) break;
1161 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1162 }
1163 }
1164
1165 int ieven = 1;
wshuffle(from,to,n,wsiz)1166 wshuffle(from,to,n,wsiz) /* copies from from to from shuffling */
1167 char *from, *to; int n, wsiz;
1168 {
1169 int i,j, k, mm; char tmp, *top;
1170
1171 memcpy(to,from,n);
1172
1173 mm = n%wsiz;
1174
1175 if (ieven) {
1176 for (k=0; k<(n-wsiz); k += wsiz) {
1177 top = &to[k];
1178 for (i=wsiz; i>1; i--) {
1179 j = nrand(i);
1180 tmp = top[j];
1181 top[j] = top[i-1];
1182 top[i-1] = tmp;
1183 }
1184 }
1185 top = &to[n-mm];
1186 for (i=mm; i>1; i--) {
1187 j = nrand(i);
1188 tmp = top[j];
1189 top[j] = top[i-1];
1190 top[i-1] = tmp;
1191 }
1192 ieven = 0;
1193 }
1194 else {
1195 for (k=n; k>=wsiz; k -= wsiz) {
1196 top = &to[k-wsiz];
1197 for (i=wsiz; i>1; i--) {
1198 j = nrand(i);
1199 tmp = top[j];
1200 top[j] = top[i-1];
1201 top[i-1] = tmp;
1202 }
1203 }
1204 top = &to[0];
1205 for (i=mm; i>1; i--) {
1206 j = nrand(i);
1207 tmp = top[j];
1208 top[j] = top[i-1];
1209 top[i-1] = tmp;
1210 }
1211 ieven = 1;
1212 }
1213 to[n] = -1;
1214 }
1215
shuffle(from,n)1216 shuffle(from,n) /* copies from from to from shuffling */
1217 char *from; int n;
1218 {
1219 int i,j; char tmp;
1220
1221 for (i=n; i>1; i--) {
1222 j = nrand(i);
1223 tmp = from[j];
1224 from[j] = from[i-1];
1225 from[i-1] = tmp;
1226 }
1227 from[n] = -1;
1228 }
1229
1230 /* ckalloc - allocate space; check for success */
ckalloc(amount)1231 char *ckalloc(amount)
1232 int amount;
1233 {
1234 char *p;
1235
1236 if ((p = malloc((size_t)amount)) == NULL) {
1237 fprintf(stderr,"Ran out of memory.");
1238 exit(1);
1239 }
1240 return(p);
1241 }
1242
1243 /* stubs for linking */
1244 int llen;
1245
aancpy()1246 aancpy()
1247 {}
1248
1249 int markx;
disgraph()1250 disgraph()
1251 {}
1252
B_ALIGN()1253 B_ALIGN() {}
1254
ALIGN()1255 ALIGN() {}
1256
LOCAL_ALIGN()1257 LOCAL_ALIGN() {}
1258
discons()1259 discons()
1260 {}
1261