1 /*
2 	zzlgmata.c Needleman-Wunsch nucleotide mapper to find overlaps
3 
4 copyright (c) 1983,1986,1987 William R. Pearson
5 
6 	August, 1994 - corrected error in alignments for ssearch.
7 
8 	July, 1994 - improved running time of smatch() by 30%
9 
10 	Aug, 1991 - completely dchanged match() to use Miller/Chao linear
11 space in band algorithm.
12 
13 	July 27, 1988 - improved output from consens, so that some context
14 of the match is shown.  Put in showall == -1 briefly, then removed it.
15 
16 	June 29, 1988 - fixed bug in call to initmat();
17 
18 	September 24, 1987 - combined zzlgmata.c and zggmata.c with
19 -DGLOBAL
20 
21 */
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <math.h>
26 #include <string.h>
27 
28 #ifndef max
29 #define max(a,b) (((a)>(b))?(a):(b))
30 #define min(a,b) (((a)<(b))?(a):(b))
31 #endif
32 
33 #include "zzgmata.gbl"
34 #define XTERNAL
35 #include "upam.gbl"
36 
37 extern int n0, n1;		/* length of sequences */
38 extern int dnaseq;
39 extern char *aa0, *aa1;		/* sequence arrays */
40 extern FILE *outfd;
41 extern int showall;	/* show complete sequences, not just overlaps */
42 extern int llen;
43 extern int lnsflg;
44 extern int optwid;
45 
46 int smin0, smin1, smins;	/* set bounds for discons */
47 extern int markx;
48 
49 #ifdef LFASTA
50 extern int oneseq;
51 extern int lcrc0[];
52 extern int lcrc1[];
53 extern int ncrc;
54 extern int maxcrc;
55 extern int iscore, gscore;
56 int pmirror;
57 #endif
58 
59 int window;
60 int nident;
61 static int kcount=0;
62 static int *res=NULL;
63 static int nres;
64 
65 
66 #ifndef SMATCH
67 #ifdef LFASTA
dmatch(s0,s1,display)68 dmatch(s0,s1,display)
69      int s0,s1,display;
70 {
71   int crctmp;
72 #else
73 dmatch(hoff,display)
74   int hoff, display;
75 {
76   int s0, s1;
77 #endif
78   int nc, ns;
79   float percent;
80   unsigned int i,j;
81   int low, up;
82   int score;
83 
84   window=min(n1,optwid);
85 
86 #ifndef LFASTA
87   low = -window/2-hoff;
88   up = low+window;
89 
90   if (low > up) {
91     fprintf(stderr," low: %d, up: %d/ window: %d, hoff: %d\n",
92 	    low, up,window,hoff);
93     return 0;
94   }
95 
96   if (!display) {
97     score = FLOCAL_ALIGN(aa0-1,aa1-1,n0,n1, low, up,
98 			 pam2,-(gdelval-ggapval),-ggapval,optwid);
99     if (score < 0) return 0;
100     if (lnsflg)
101       return (int)((double)score*log((double)n0)/log((double)n1)+0.5);
102     else return score;
103   }
104 
105   score=LOCAL_ALIGN(aa0-1,aa1-1,n0,n1, low, up,
106 		    pam2,-(gdelval-ggapval),-ggapval,1,
107 		    &min0,&min1,&max0,&max1,optwid);
108 
109   if (score <=0) return 0;
110 
111   if (showall==1) {
112     if (hoff>0) maxc=optwid+((n0-hoff>=n1) ? n0 : n1+hoff);
113     else maxc=optwid+((n1+hoff>=n0) ? n1 : n0-hoff);
114   }
115   else maxc = min(n0,n1)+2+4*optwid+2*llen;
116   initres(n0+2+4*optwid);
117 #else
118   low = - window/2 - (s0-s1);
119   up = low + window;
120 
121   /*	fprintf(stderr," starting at: %3d %3d (%3d %3d)\n",
122 	s0, s1, low, up);
123 	*/
124   RLOCAL_ALIGN(aa0-1,aa1-1,s0+1,s1+1, low, up, pam2,
125 	       -(gdelval-ggapval), -ggapval,
126 	       &min0, &min1, &max0, &max1,optwid);
127 
128   /*	fprintf(stderr," starting from: %3d %3d\n",
129 	min0, min1);
130 	*/
131   maxv = LLOCAL_ALIGN(aa0-1+min0-1,aa1-1+min1-1,n0-min0+1,n1-min1+1,
132 		      low-(min1-min0), up-(min1-min0),
133 		      pam2,-(gdelval-ggapval),-ggapval,0,
134 		      &min0,&min1,&max0,&max1,optwid);
135 
136   max0 += min0-1;
137   max1 += min1-1;
138 
139   /*	fprintf(stderr," ending at: %3d %3d\n",
140 	max0, max1);
141 	*/
142   maxc = n0+2+4*window;
143   initres(maxc);
144 #endif
145   /* fprintf(stderr," ALIGN: start0: %d start1: %d stop0: %d stop1: %d, bot: %d top: %d, win: %d MX %d\n",
146      min0-1,min1-1,max0-min0+1,max1-min1+1,low-(min1-min0),up-(min1-min0),
147      optwid,n0);
148      */
149   B_ALIGN(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
150 	  low-(min1-min0),up-(min1-min0),
151 	  pam2,-(gdelval-ggapval),-ggapval,res,optwid,n0);
152 
153   /* 	DISPLAY(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
154 	res,min0,min1);
155 	*/
156 #ifdef LFASTA
157   if (!salloc) initseq(maxc);
158 #else
159   initseq(maxc);
160 #endif
161 
162   ns = calcons(aa0,n0,aa1,n1,res,&nc);
163   percent = (float)nident*100.0/nc;
164 
165 #ifndef LFASTA
166   if (markx == 4 || markx == 5 || markx == 6)
167     disgraph(n0,n1,percent,score,min0,min1,max0,max1);
168   else if (markx < 10) {
169     fprintf(outfd," %6.3f%% identity in %d %s overlap\n",
170 	    percent,nc,sqnam);
171     discons(seqc0,seqc1,ns);
172   }
173   else if (markx == 10) {
174     fprintf(outfd,"; fa_ident: %5.3f\n",percent/100.0);
175     fprintf(outfd,"; fa_overlap: %d\n",nc);
176     discons(seqc0,seqc1,ns);
177   }
178   freeseq();
179 #else
180   if ((crctmp=crcknew(seqc0,seqc1,ns,maxcrc))== -1) {
181     fprintf(stderr," too many alignments\n");
182     return -1;
183   }
184   else if (crctmp==1) {
185     kcount++;
186     if (display) {
187       if (markx==10) {
188 	fprintf(outfd,">>#%d\n",kcount);
189 	fprintf(outfd,"; lfa_init: %d\n",iscore);
190 	fprintf(outfd,"; lfa_opt: %d\n",maxv);
191 	fprintf(outfd,"; lfa_ident: %5.3f\n",percent/100.0);
192 	fprintf(outfd,"; lfa_overlap: %d\n",nc);
193       }
194       else
195 	fprintf(outfd,
196 		"\n %6.3f%% identity in %d %s overlap; init: %4d, opt: %4d\n",
197 		percent,nc,sqnam,iscore,maxv);
198     }
199     gscore = maxv;
200     opnline((long)smin0,(long)smin1,gscore);
201     discons(seqc0,seqc1,ns);
202     clsline((long)smin0,(long)smin1,gscore);
203 #ifdef TPLOT
204     if (oneseq) {
205       pmirror = 1;
206       i = smin0;
207       smin0 = smin1;
208       smin1 = i;
209       opnline((long)smin0,(long)smin1,gscore);
210       discons(seqc1,seqc0,ns);
211       clsline((long)smin0,(long)smin1,gscore);
212       pmirror = 0;
213     }
214 #endif
215   }
216   else maxv = -1;
217 #endif
218   return maxv;
219 }
220 
221 #endif /* SMATCH */
222 
223 static struct swstr {
224   int H;
225   int E;
226 } *ss;
227 
smatch(aa0,n0,aa1,n1,flag)228 smatch(aa0,n0,aa1,n1,flag)
229      char *aa0, *aa1;
230      int n0, n1;
231      int flag;
232 {
233   int i, j;
234   int e, f, h, p;
235   int q, r, m;
236   int score, I, J, cost, K, L;
237   int nc, minc, maxc, lc;
238   float percent;
239   register int *waa1;
240   register char *aa0p;
241 
242   /* allocate space for the scoring arrays */
243   if (ss==NULL) {
244     if ((ss=(struct swstr *)calloc((size_t)n0+1,sizeof(struct swstr)))==NULL) {
245       fprintf(stderr,"cannot allocate ss struct %3d\n",n0);
246       exit(1);
247     }
248     ss++;
249   }
250 
251   q = -(gdelval - ggapval);
252   r = -ggapval;
253   m = q + r;
254 
255   /* initialize 0th row */
256 
257   score = I = J = 0;
258   for (j=0; j<n0; j++) {
259     ss[j].H = 0;
260     ss[j].E = -q;
261   }
262 
263   for (i=0; i<n1; i++) {
264     h = p = 0;
265     f = -q;
266     waa1=pam2[aa1[i]];
267     for (j=0,aa0p=aa0; j<n0; j++,aa0p++) {
268       if ((h =   h     - m) > (f =   f     - r)) f = h;
269       if ((h = ss[j].H - m) > (e = ss[j].E - r)) e = h;
270       h = p + waa1[*aa0p];
271       if (h < 0) h = 0;
272       if (h < f) h = f;
273       if (h < e) h = e;
274       p = ss[j].H;
275       ss[j].H = h;
276       ss[j].E = e;
277       if (h > score) {
278 	score = h;
279 	I = i;
280 	J = j;
281       }
282     }
283   } /* done with forward pass */
284 
285   if (flag==NO) return score;
286   if (score <= 0 ) return 0;
287 
288   /* to get the start point, go backwards */
289 
290   cost = K = L = 0;
291   for (j=J; j>=0; j--) ss[j].H= ss[j].E= -1;
292 
293   for (i=I; i>=0; i--) {
294     h = f = -1;
295     p = (i == I) ? 0 : -1;
296     for (j=J; j>=0; j--) {
297       f = max (f,h-q)-r;
298       ss[j].E=max(ss[j].E,ss[j].H-q)-r;
299       h = max(max(ss[j].E,f),p+pam2[aa0[j]][aa1[i]]);
300       p = ss[j].H;
301       ss[j].H=h;
302       if (h > cost) {
303 	cost = h;
304 	K = i;
305 	L = j;
306 	if (cost >= score) goto found;
307       }
308     }
309   }
310 
311 found:
312 
313 /* printf(" %d: L: %3d-%3d/%3d; K: %3d-%3d/%3d\n",score,L,J,n0,K,I,n1); */
314 
315  /* allocate consensus arrays */
316 
317   initres(n0*3/2);
318 
319   max0 = J+1; min0 = L+1; max1 = I+1; min1 = K+1;
320 
321   ALIGN(&aa0[min0-2],&aa1[min1-2],max0-min0+1,max1-min1+1,pam2,q,r,res,&nres);
322 
323 /*
324   DISPLAY(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
325 		res,min0,min1);
326 */
327 
328 /*  fprintf(stderr,"old: %d, nres: %d\n",min(n0,n1)*5/4, nres); */
329 
330   if (showall==1) {
331     maxc = nres + max(min0,min1) + max((n0-max0),(n1-max1)) + 3;
332     initseq(maxc);
333   }
334   else {
335     maxc = nres + 4*llen;
336     initseq(maxc);
337   }
338 
339   nc=calcons(aa0,n0,aa1,n1,res,&lc);
340   percent = (100.0*(float)nident)/(float)lc;
341 
342   if (markx == 4)
343     disgraph(n0,n1,percent,score,min0,min1,max0,max1);
344   else if (markx < 4 || markx==5) {
345     fprintf(outfd,
346            "Smith-Waterman score: %d;   %6.3f%% identity in %d %s overlap\n",
347 	    score, percent,lc,sqnam);
348     if (markx==5) {
349       fputc('\n',outfd);
350       disgraph(n0, n1, percent, score, min0, min1, max0, max1);
351     }
352     discons(seqc0,seqc1,nc);
353     }
354   else if (markx==10) {
355 #ifndef SMATCH
356     fprintf(outfd,"; sw_score: %d\n",score);
357 #endif
358     fprintf(outfd,"; sw_ident: %5.3f\n",percent/100.0);
359     fprintf(outfd,"; sw_overlap: %d\n",lc);
360     discons(seqc0,seqc1,nc);
361   }
362   freeseq();
363 
364   return score;
365 }
366 
initres(rsiz)367 initres(rsiz)		/* initialize results array */
368      int rsiz;
369 {
370   if (res==NULL) res = (int *)calloc((size_t)rsiz,sizeof(int));
371   if (res==NULL)
372     {fprintf(stderr,"cannot allocate alignment results array %d\n",rsiz);
373      exit(1);}
374 }
375 
initseq(seqsiz)376 initseq(seqsiz)		/* initialize arrays */
377 	int seqsiz;
378 {
379   seqc0=calloc((size_t)seqsiz,sizeof(char));
380   seqc1=calloc((size_t)seqsiz,sizeof(char));
381   if (seqc0==NULL || seqc1==NULL)
382     {fprintf(stderr,"cannot allocate consensus arrays %d\n",seqsiz);
383      exit(1);}
384   salloc = 1;
385 }
386 
freeseq()387 freeseq()
388 {
389 	free(seqc0); free(seqc1);
390 	}
391 
392 /*
393 	this function builds a consensus sequence in place by
394 	going to the maximum match and moving left and up
395 */
396 
397 
calcons(aa0,n0,aa1,n1,res,nc)398 calcons(aa0,n0,aa1,n1,res,nc)
399      char *aa0, *aa1;
400      int n0, n1;
401      int *res;
402      int *nc;
403 {
404   int i0, i1;
405   int op, lenc, nd, ns, itmp;
406   char *sp0, *sp1;
407   int *rp;
408 
409   /* first fill in the ends */
410   min0--; min1--;
411 
412 #ifndef LFASTA
413   if (min(min0,min1)<llen || showall==1)     /* will we show all the start ?*/
414     if (min0>=min1) {                        /* aa0 extends more to left */
415       smins=0;
416       if (showall==1) mins=min0;
417       else mins = min(min0,llen/2);
418       aancpy(seqc0,aa0+min0-mins,mins);
419       smin0 = min0-mins;
420       if ((mins-min1)>0) {
421 	memset(seqc1,' ',mins-min1);
422 	aancpy(seqc1+mins-min1,aa1,min1);
423 	smin1 = 0;
424       }
425       else {
426 	aancpy(seqc1,aa1+min1-mins,mins);
427 	smin1 = min1-mins;
428       }
429     }
430     else {
431       smins=0;
432       if (showall == 1) mins=min1;
433       else mins = min(min1,llen/2);
434       aancpy(seqc1,aa1+min1-mins,mins);
435       smin1 = min1-mins;
436       if ((mins-min0)>0) {
437 	memset(seqc0,' ',mins-min0);
438 	aancpy(seqc0+mins-min0,aa0,min0);
439 	smin0 = 0;
440       }
441       else {
442 	aancpy(seqc0,aa0+min0-mins,mins);
443 	smin0 = min0-mins;
444       }
445     }
446   else {
447     mins= min(llen/2,min(min0,min1));
448     smins=mins;
449     smin0=min0;
450     smin1=min1;
451     aancpy(seqc0,aa0+min0-mins,mins);
452     aancpy(seqc1,aa1+min1-mins,mins);
453   }
454 #else
455   smin0 = min0;
456   smin1 = min1;
457   smins = mins = 0;
458 #endif
459 
460 /* now get the middle */
461 
462   sp0 = seqc0+mins;
463   sp1 = seqc1+mins;
464   rp = res;
465   lenc = nident = op = 0;
466   i0 = min0;
467   i1 = min1;
468 
469   while (i0 < max0 || i1 < max1) {
470     if (op == 0 && *rp == 0) {
471       op = *rp++;
472       *sp0 = sq[aa0[i0++]];
473       *sp1 = sq[aa1[i1++]];
474       lenc++;
475       if (*sp0 == *sp1) nident++;
476       else if (dnaseq && ((*sp0 == 'T' && *sp1 == 'U') ||
477 			  (*sp0=='U' && *sp1=='T')))
478 	nident++;
479       sp0++; sp1++;
480     }
481     else {
482       if (op==0) op = *rp++;
483       if (op>0) {
484 	*sp0++ = '-';
485 	*sp1++ = sq[aa1[i1++]];
486 	op--;
487 	lenc++;
488       }
489       else {
490 	*sp0++ = sq[aa0[i0++]];
491 	*sp1++ = '-';
492 	op++;
493 	lenc++;
494       }
495     }
496   }
497 
498   *nc = lenc;
499 /*	now we have the middle, get the right end */
500 
501 #ifndef LFASTA
502   ns = mins + lenc + llen;
503   ns -= (itmp = ns %llen);
504   if (itmp>llen/2) ns += llen;
505   nd = ns - (mins+lenc);
506   if (nd > max(n0-max0,n1-max1)) nd = max(n0-max0,n1-max1);
507 
508   if (showall==1) {
509     nd = max(n0-max0,n1-max1);		/* reset for showall=1 */
510     /* get right end */
511     aancpy(seqc0+mins+lenc,aa0+max0,n0-max0);
512     aancpy(seqc1+mins+lenc,aa1+max1,n1-max1);
513     /* fill with blanks - this is required to use one 'nc' */
514     memset(seqc0+mins+lenc+n0-max0,' ',nd-(n0-max0));
515     memset(seqc1+mins+lenc+n1-max1,' ',nd-(n1-max1));
516   }
517   else {
518     aancpy(seqc0+mins+lenc,aa0+max0,nd);
519     aancpy(seqc1+mins+lenc,aa1+max1,nd);
520     if ((nd-(n0-max0))>0)
521       memset(seqc0+mins+lenc+n0-max0,' ',nd-(n0-max0));
522     if ((nd-(n1-max1))>0)
523       memset(seqc1+mins+lenc+n1-max1,' ',nd-(n1-max1));
524   }
525 
526 #else	/* LFASTA */
527   nd = 0;
528 #endif
529   return mins+lenc+nd;
530 }
531 
532 #ifdef LFASTA
crcknew(seqc0,seqc1,nc,maxcrc)533 crcknew(seqc0,seqc1,nc,maxcrc)
534 	char *seqc0, *seqc1; int nc; int maxcrc;
535 {
536 	int crc0, crc1, ii;
537 
538 	crc0 = crck(seqc0,nc);
539 	crc1 = crck(seqc1,nc);
540 
541 	for (ii=0; ii<ncrc; ii++)
542 		if (lcrc0[ii]==crc0 && lcrc1[ii]==crc1) return 0;
543 
544 	if (ncrc >= maxcrc) return -1;
545 	lcrc0[ncrc] = crc0;
546 	lcrc1[ncrc] = crc1;
547 	ncrc++;
548 	return 1;
549 	}
550 #endif
551