1 /*
2 zzlgmata.c Needleman-Wunsch nucleotide mapper to find overlaps
3
4 copyright (c) 1983,1986,1987 William R. Pearson
5
6 August, 1994 - corrected error in alignments for ssearch.
7
8 July, 1994 - improved running time of smatch() by 30%
9
10 Aug, 1991 - completely dchanged match() to use Miller/Chao linear
11 space in band algorithm.
12
13 July 27, 1988 - improved output from consens, so that some context
14 of the match is shown. Put in showall == -1 briefly, then removed it.
15
16 June 29, 1988 - fixed bug in call to initmat();
17
18 September 24, 1987 - combined zzlgmata.c and zggmata.c with
19 -DGLOBAL
20
21 */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <math.h>
26 #include <string.h>
27
28 #ifndef max
29 #define max(a,b) (((a)>(b))?(a):(b))
30 #define min(a,b) (((a)<(b))?(a):(b))
31 #endif
32
33 #include "zzgmata.gbl"
34 #define XTERNAL
35 #include "upam.gbl"
36
37 extern int n0, n1; /* length of sequences */
38 extern int dnaseq;
39 extern char *aa0, *aa1; /* sequence arrays */
40 extern FILE *outfd;
41 extern int showall; /* show complete sequences, not just overlaps */
42 extern int llen;
43 extern int lnsflg;
44 extern int optwid;
45
46 int smin0, smin1, smins; /* set bounds for discons */
47 extern int markx;
48
49 #ifdef LFASTA
50 extern int oneseq;
51 extern int lcrc0[];
52 extern int lcrc1[];
53 extern int ncrc;
54 extern int maxcrc;
55 extern int iscore, gscore;
56 int pmirror;
57 #endif
58
59 int window;
60 int nident;
61 static int kcount=0;
62 static int *res=NULL;
63 static int nres;
64
65
66 #ifndef SMATCH
67 #ifdef LFASTA
dmatch(s0,s1,display)68 dmatch(s0,s1,display)
69 int s0,s1,display;
70 {
71 int crctmp;
72 #else
73 dmatch(hoff,display)
74 int hoff, display;
75 {
76 int s0, s1;
77 #endif
78 int nc, ns;
79 float percent;
80 unsigned int i,j;
81 int low, up;
82 int score;
83
84 window=min(n1,optwid);
85
86 #ifndef LFASTA
87 low = -window/2-hoff;
88 up = low+window;
89
90 if (low > up) {
91 fprintf(stderr," low: %d, up: %d/ window: %d, hoff: %d\n",
92 low, up,window,hoff);
93 return 0;
94 }
95
96 if (!display) {
97 score = FLOCAL_ALIGN(aa0-1,aa1-1,n0,n1, low, up,
98 pam2,-(gdelval-ggapval),-ggapval,optwid);
99 if (score < 0) return 0;
100 if (lnsflg)
101 return (int)((double)score*log((double)n0)/log((double)n1)+0.5);
102 else return score;
103 }
104
105 score=LOCAL_ALIGN(aa0-1,aa1-1,n0,n1, low, up,
106 pam2,-(gdelval-ggapval),-ggapval,1,
107 &min0,&min1,&max0,&max1,optwid);
108
109 if (score <=0) return 0;
110
111 if (showall==1) {
112 if (hoff>0) maxc=optwid+((n0-hoff>=n1) ? n0 : n1+hoff);
113 else maxc=optwid+((n1+hoff>=n0) ? n1 : n0-hoff);
114 }
115 else maxc = min(n0,n1)+2+4*optwid+2*llen;
116 initres(n0+2+4*optwid);
117 #else
118 low = - window/2 - (s0-s1);
119 up = low + window;
120
121 /* fprintf(stderr," starting at: %3d %3d (%3d %3d)\n",
122 s0, s1, low, up);
123 */
124 RLOCAL_ALIGN(aa0-1,aa1-1,s0+1,s1+1, low, up, pam2,
125 -(gdelval-ggapval), -ggapval,
126 &min0, &min1, &max0, &max1,optwid);
127
128 /* fprintf(stderr," starting from: %3d %3d\n",
129 min0, min1);
130 */
131 maxv = LLOCAL_ALIGN(aa0-1+min0-1,aa1-1+min1-1,n0-min0+1,n1-min1+1,
132 low-(min1-min0), up-(min1-min0),
133 pam2,-(gdelval-ggapval),-ggapval,0,
134 &min0,&min1,&max0,&max1,optwid);
135
136 max0 += min0-1;
137 max1 += min1-1;
138
139 /* fprintf(stderr," ending at: %3d %3d\n",
140 max0, max1);
141 */
142 maxc = n0+2+4*window;
143 initres(maxc);
144 #endif
145 /* fprintf(stderr," ALIGN: start0: %d start1: %d stop0: %d stop1: %d, bot: %d top: %d, win: %d MX %d\n",
146 min0-1,min1-1,max0-min0+1,max1-min1+1,low-(min1-min0),up-(min1-min0),
147 optwid,n0);
148 */
149 B_ALIGN(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
150 low-(min1-min0),up-(min1-min0),
151 pam2,-(gdelval-ggapval),-ggapval,res,optwid,n0);
152
153 /* DISPLAY(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
154 res,min0,min1);
155 */
156 #ifdef LFASTA
157 if (!salloc) initseq(maxc);
158 #else
159 initseq(maxc);
160 #endif
161
162 ns = calcons(aa0,n0,aa1,n1,res,&nc);
163 percent = (float)nident*100.0/nc;
164
165 #ifndef LFASTA
166 if (markx == 4 || markx == 5 || markx == 6)
167 disgraph(n0,n1,percent,score,min0,min1,max0,max1);
168 else if (markx < 10) {
169 fprintf(outfd," %6.3f%% identity in %d %s overlap\n",
170 percent,nc,sqnam);
171 discons(seqc0,seqc1,ns);
172 }
173 else if (markx == 10) {
174 fprintf(outfd,"; fa_ident: %5.3f\n",percent/100.0);
175 fprintf(outfd,"; fa_overlap: %d\n",nc);
176 discons(seqc0,seqc1,ns);
177 }
178 freeseq();
179 #else
180 if ((crctmp=crcknew(seqc0,seqc1,ns,maxcrc))== -1) {
181 fprintf(stderr," too many alignments\n");
182 return -1;
183 }
184 else if (crctmp==1) {
185 kcount++;
186 if (display) {
187 if (markx==10) {
188 fprintf(outfd,">>#%d\n",kcount);
189 fprintf(outfd,"; lfa_init: %d\n",iscore);
190 fprintf(outfd,"; lfa_opt: %d\n",maxv);
191 fprintf(outfd,"; lfa_ident: %5.3f\n",percent/100.0);
192 fprintf(outfd,"; lfa_overlap: %d\n",nc);
193 }
194 else
195 fprintf(outfd,
196 "\n %6.3f%% identity in %d %s overlap; init: %4d, opt: %4d\n",
197 percent,nc,sqnam,iscore,maxv);
198 }
199 gscore = maxv;
200 opnline((long)smin0,(long)smin1,gscore);
201 discons(seqc0,seqc1,ns);
202 clsline((long)smin0,(long)smin1,gscore);
203 #ifdef TPLOT
204 if (oneseq) {
205 pmirror = 1;
206 i = smin0;
207 smin0 = smin1;
208 smin1 = i;
209 opnline((long)smin0,(long)smin1,gscore);
210 discons(seqc1,seqc0,ns);
211 clsline((long)smin0,(long)smin1,gscore);
212 pmirror = 0;
213 }
214 #endif
215 }
216 else maxv = -1;
217 #endif
218 return maxv;
219 }
220
221 #endif /* SMATCH */
222
223 static struct swstr {
224 int H;
225 int E;
226 } *ss;
227
smatch(aa0,n0,aa1,n1,flag)228 smatch(aa0,n0,aa1,n1,flag)
229 char *aa0, *aa1;
230 int n0, n1;
231 int flag;
232 {
233 int i, j;
234 int e, f, h, p;
235 int q, r, m;
236 int score, I, J, cost, K, L;
237 int nc, minc, maxc, lc;
238 float percent;
239 register int *waa1;
240 register char *aa0p;
241
242 /* allocate space for the scoring arrays */
243 if (ss==NULL) {
244 if ((ss=(struct swstr *)calloc((size_t)n0+1,sizeof(struct swstr)))==NULL) {
245 fprintf(stderr,"cannot allocate ss struct %3d\n",n0);
246 exit(1);
247 }
248 ss++;
249 }
250
251 q = -(gdelval - ggapval);
252 r = -ggapval;
253 m = q + r;
254
255 /* initialize 0th row */
256
257 score = I = J = 0;
258 for (j=0; j<n0; j++) {
259 ss[j].H = 0;
260 ss[j].E = -q;
261 }
262
263 for (i=0; i<n1; i++) {
264 h = p = 0;
265 f = -q;
266 waa1=pam2[aa1[i]];
267 for (j=0,aa0p=aa0; j<n0; j++,aa0p++) {
268 if ((h = h - m) > (f = f - r)) f = h;
269 if ((h = ss[j].H - m) > (e = ss[j].E - r)) e = h;
270 h = p + waa1[*aa0p];
271 if (h < 0) h = 0;
272 if (h < f) h = f;
273 if (h < e) h = e;
274 p = ss[j].H;
275 ss[j].H = h;
276 ss[j].E = e;
277 if (h > score) {
278 score = h;
279 I = i;
280 J = j;
281 }
282 }
283 } /* done with forward pass */
284
285 if (flag==NO) return score;
286 if (score <= 0 ) return 0;
287
288 /* to get the start point, go backwards */
289
290 cost = K = L = 0;
291 for (j=J; j>=0; j--) ss[j].H= ss[j].E= -1;
292
293 for (i=I; i>=0; i--) {
294 h = f = -1;
295 p = (i == I) ? 0 : -1;
296 for (j=J; j>=0; j--) {
297 f = max (f,h-q)-r;
298 ss[j].E=max(ss[j].E,ss[j].H-q)-r;
299 h = max(max(ss[j].E,f),p+pam2[aa0[j]][aa1[i]]);
300 p = ss[j].H;
301 ss[j].H=h;
302 if (h > cost) {
303 cost = h;
304 K = i;
305 L = j;
306 if (cost >= score) goto found;
307 }
308 }
309 }
310
311 found:
312
313 /* printf(" %d: L: %3d-%3d/%3d; K: %3d-%3d/%3d\n",score,L,J,n0,K,I,n1); */
314
315 /* allocate consensus arrays */
316
317 initres(n0*3/2);
318
319 max0 = J+1; min0 = L+1; max1 = I+1; min1 = K+1;
320
321 ALIGN(&aa0[min0-2],&aa1[min1-2],max0-min0+1,max1-min1+1,pam2,q,r,res,&nres);
322
323 /*
324 DISPLAY(aa0-1+min0-1,aa1-1+min1-1,max0-min0+1,max1-min1+1,
325 res,min0,min1);
326 */
327
328 /* fprintf(stderr,"old: %d, nres: %d\n",min(n0,n1)*5/4, nres); */
329
330 if (showall==1) {
331 maxc = nres + max(min0,min1) + max((n0-max0),(n1-max1)) + 3;
332 initseq(maxc);
333 }
334 else {
335 maxc = nres + 4*llen;
336 initseq(maxc);
337 }
338
339 nc=calcons(aa0,n0,aa1,n1,res,&lc);
340 percent = (100.0*(float)nident)/(float)lc;
341
342 if (markx == 4)
343 disgraph(n0,n1,percent,score,min0,min1,max0,max1);
344 else if (markx < 4 || markx==5) {
345 fprintf(outfd,
346 "Smith-Waterman score: %d; %6.3f%% identity in %d %s overlap\n",
347 score, percent,lc,sqnam);
348 if (markx==5) {
349 fputc('\n',outfd);
350 disgraph(n0, n1, percent, score, min0, min1, max0, max1);
351 }
352 discons(seqc0,seqc1,nc);
353 }
354 else if (markx==10) {
355 #ifndef SMATCH
356 fprintf(outfd,"; sw_score: %d\n",score);
357 #endif
358 fprintf(outfd,"; sw_ident: %5.3f\n",percent/100.0);
359 fprintf(outfd,"; sw_overlap: %d\n",lc);
360 discons(seqc0,seqc1,nc);
361 }
362 freeseq();
363
364 return score;
365 }
366
initres(rsiz)367 initres(rsiz) /* initialize results array */
368 int rsiz;
369 {
370 if (res==NULL) res = (int *)calloc((size_t)rsiz,sizeof(int));
371 if (res==NULL)
372 {fprintf(stderr,"cannot allocate alignment results array %d\n",rsiz);
373 exit(1);}
374 }
375
initseq(seqsiz)376 initseq(seqsiz) /* initialize arrays */
377 int seqsiz;
378 {
379 seqc0=calloc((size_t)seqsiz,sizeof(char));
380 seqc1=calloc((size_t)seqsiz,sizeof(char));
381 if (seqc0==NULL || seqc1==NULL)
382 {fprintf(stderr,"cannot allocate consensus arrays %d\n",seqsiz);
383 exit(1);}
384 salloc = 1;
385 }
386
freeseq()387 freeseq()
388 {
389 free(seqc0); free(seqc1);
390 }
391
392 /*
393 this function builds a consensus sequence in place by
394 going to the maximum match and moving left and up
395 */
396
397
calcons(aa0,n0,aa1,n1,res,nc)398 calcons(aa0,n0,aa1,n1,res,nc)
399 char *aa0, *aa1;
400 int n0, n1;
401 int *res;
402 int *nc;
403 {
404 int i0, i1;
405 int op, lenc, nd, ns, itmp;
406 char *sp0, *sp1;
407 int *rp;
408
409 /* first fill in the ends */
410 min0--; min1--;
411
412 #ifndef LFASTA
413 if (min(min0,min1)<llen || showall==1) /* will we show all the start ?*/
414 if (min0>=min1) { /* aa0 extends more to left */
415 smins=0;
416 if (showall==1) mins=min0;
417 else mins = min(min0,llen/2);
418 aancpy(seqc0,aa0+min0-mins,mins);
419 smin0 = min0-mins;
420 if ((mins-min1)>0) {
421 memset(seqc1,' ',mins-min1);
422 aancpy(seqc1+mins-min1,aa1,min1);
423 smin1 = 0;
424 }
425 else {
426 aancpy(seqc1,aa1+min1-mins,mins);
427 smin1 = min1-mins;
428 }
429 }
430 else {
431 smins=0;
432 if (showall == 1) mins=min1;
433 else mins = min(min1,llen/2);
434 aancpy(seqc1,aa1+min1-mins,mins);
435 smin1 = min1-mins;
436 if ((mins-min0)>0) {
437 memset(seqc0,' ',mins-min0);
438 aancpy(seqc0+mins-min0,aa0,min0);
439 smin0 = 0;
440 }
441 else {
442 aancpy(seqc0,aa0+min0-mins,mins);
443 smin0 = min0-mins;
444 }
445 }
446 else {
447 mins= min(llen/2,min(min0,min1));
448 smins=mins;
449 smin0=min0;
450 smin1=min1;
451 aancpy(seqc0,aa0+min0-mins,mins);
452 aancpy(seqc1,aa1+min1-mins,mins);
453 }
454 #else
455 smin0 = min0;
456 smin1 = min1;
457 smins = mins = 0;
458 #endif
459
460 /* now get the middle */
461
462 sp0 = seqc0+mins;
463 sp1 = seqc1+mins;
464 rp = res;
465 lenc = nident = op = 0;
466 i0 = min0;
467 i1 = min1;
468
469 while (i0 < max0 || i1 < max1) {
470 if (op == 0 && *rp == 0) {
471 op = *rp++;
472 *sp0 = sq[aa0[i0++]];
473 *sp1 = sq[aa1[i1++]];
474 lenc++;
475 if (*sp0 == *sp1) nident++;
476 else if (dnaseq && ((*sp0 == 'T' && *sp1 == 'U') ||
477 (*sp0=='U' && *sp1=='T')))
478 nident++;
479 sp0++; sp1++;
480 }
481 else {
482 if (op==0) op = *rp++;
483 if (op>0) {
484 *sp0++ = '-';
485 *sp1++ = sq[aa1[i1++]];
486 op--;
487 lenc++;
488 }
489 else {
490 *sp0++ = sq[aa0[i0++]];
491 *sp1++ = '-';
492 op++;
493 lenc++;
494 }
495 }
496 }
497
498 *nc = lenc;
499 /* now we have the middle, get the right end */
500
501 #ifndef LFASTA
502 ns = mins + lenc + llen;
503 ns -= (itmp = ns %llen);
504 if (itmp>llen/2) ns += llen;
505 nd = ns - (mins+lenc);
506 if (nd > max(n0-max0,n1-max1)) nd = max(n0-max0,n1-max1);
507
508 if (showall==1) {
509 nd = max(n0-max0,n1-max1); /* reset for showall=1 */
510 /* get right end */
511 aancpy(seqc0+mins+lenc,aa0+max0,n0-max0);
512 aancpy(seqc1+mins+lenc,aa1+max1,n1-max1);
513 /* fill with blanks - this is required to use one 'nc' */
514 memset(seqc0+mins+lenc+n0-max0,' ',nd-(n0-max0));
515 memset(seqc1+mins+lenc+n1-max1,' ',nd-(n1-max1));
516 }
517 else {
518 aancpy(seqc0+mins+lenc,aa0+max0,nd);
519 aancpy(seqc1+mins+lenc,aa1+max1,nd);
520 if ((nd-(n0-max0))>0)
521 memset(seqc0+mins+lenc+n0-max0,' ',nd-(n0-max0));
522 if ((nd-(n1-max1))>0)
523 memset(seqc1+mins+lenc+n1-max1,' ',nd-(n1-max1));
524 }
525
526 #else /* LFASTA */
527 nd = 0;
528 #endif
529 return mins+lenc+nd;
530 }
531
532 #ifdef LFASTA
crcknew(seqc0,seqc1,nc,maxcrc)533 crcknew(seqc0,seqc1,nc,maxcrc)
534 char *seqc0, *seqc1; int nc; int maxcrc;
535 {
536 int crc0, crc1, ii;
537
538 crc0 = crck(seqc0,nc);
539 crc1 = crck(seqc1,nc);
540
541 for (ii=0; ii<ncrc; ii++)
542 if (lcrc0[ii]==crc0 && lcrc1[ii]==crc1) return 0;
543
544 if (ncrc >= maxcrc) return -1;
545 lcrc0[ncrc] = crc0;
546 lcrc1[ncrc] = crc1;
547 ncrc++;
548 return 1;
549 }
550 #endif
551