1
2 /* A PACKAGE FOR SEQUENCE COMPARISON WITH AFFINE WEIGHTS */
3 /* Maximizes a similarity score and doesn't penalize end-gaps */
4
5 /* Globally passed params and macros */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9
10 #ifdef BIGMEM
11 #define NMAX 80000
12 #else
13 #define NMAX 3000
14 #endif
15
16 static int CHECK_SCORE();
17
18 static int **w; /* w = W */
19 static int g, h, m; /* g = G, h = H, m = g+h */
20
21 #define gap(k) ((k) <= 0 ? 0 : g+h*(k)) /* k-symbol indel cost */
22
23 static int *sapp; /* Current script append ptr */
24 static int last; /* Last script op appended */
25
26 /* Append "Delete k" op */
27 #define DEL(k) \
28 { if (last < 0) \
29 last = sapp[-1] -= (k); \
30 else \
31 last = *sapp++ = -(k); \
32 }
33 /* Append "Insert k" op */
34 #define INS(k) \
35 { if (last < 0) \
36 { sapp[-1] = (k); *sapp++ = last; } \
37 else \
38 last = *sapp++ = (k); \
39 }
40
41 #define REP { last = *sapp++ = 0; } /* Append "Replace" op */
42
43 static int CC[NMAX+1], DD[NMAX+1]; /* Forward cost-only vectors */
44 static int RR[NMAX+1], SS[NMAX+1]; /* Reverse cost-only vectors */
45
46 /* align(A,B,M,N,tb,te) returns the cost of an optimum conversion between
47 A[1..M] and B[1..N] that begins(ends) with a delete if tb(te) is zero
48 and appends such a conversion to the current script. */
49
align(A,B,M,N,tb,te,topr,botr,lc,rc)50 static int align(A,B,M,N,tb,te,topr,botr,lc,rc) char *A, *B; int M, N;
51 int tb, te; char topr, botr, lc, rc;
52
53 { int midi, midj, type; /* Midpoint, type, and cost */
54 int midc;
55 int c1, c2;
56
57 { register int i, j;
58 register int c, e, d, s;
59 int t, *wa;
60
61 /* Boundary cases: M <= 1 or N == 0 */
62
63 if (N <= 0)
64 { if (M > 0) DEL(M)
65 if (topr || botr) return 0;
66 else return -gap(M);
67 }
68 if (M <= 1)
69 { if (M <= 0)
70 { INS(N);
71 if (topr || botr) return 0;
72 else return -gap(N);
73 }
74 if (topr) {
75 if (rc) midc = 0;
76 else midc = te-h;
77 midj = 0;
78 wa = w[A[1]];
79 for (j = 1; j <= N; j++)
80 { c = wa[B[j]] - gap(N-j);
81 if (c > midc)
82 { midc = c;
83 midj = j;
84 }
85 }
86 } else if (botr) {
87 if (lc) midc = 0;
88 else midc = tb-h;
89 midj = 0;
90 wa = w[A[1]];
91 for (j = 1; j <= N; j++)
92 { c = -gap(j-1) + wa[B[j]];
93 if (c > midc)
94 { midc = c;
95 midj = j;
96 }
97 }
98 } else {
99 if (tb < te) tb = te;
100 if (lc || rc) midc = -gap(N);
101 else midc = (tb-h) - gap(N);
102 midj = 0;
103 wa = w[A[1]];
104 for (j = 1; j <= N; j++)
105 { c = -gap(j-1) + wa[B[j]] - gap(N-j);
106 if (c > midc)
107 { midc = c;
108 midj = j;
109 }
110 }
111 }
112 if (midj == 0)
113 { INS(N) DEL(1) }
114 else
115 { if (midj > 1) INS(midj-1)
116 REP
117 if (midj < N) INS(N-midj)
118 }
119 return midc;
120 }
121
122 /* Divide: Find optimum midpoint (midi,midj) of cost midc */
123
124 midi = M/2; /* Forward phase: */
125 CC[0] = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */
126 if (topr) {
127 for (j = 1; j <= N; j++)
128 { CC[j] = 0;
129 DD[j] = -g;
130 }
131 } else {
132 t = -g;
133 for (j = 1; j <= N; j++)
134 { CC[j] = t = t-h;
135 DD[j] = t-g;
136 }
137 }
138 t = tb;
139 for (i = 1; i <= midi; i++)
140 { s = CC[0];
141 if (lc) {
142 CC[0] = c = 0;
143 e = -g;
144 } else {
145 CC[0] = c = t = t-h;
146 e = t-g;
147 }
148 wa = w[A[i]];
149 for (j = 1; j <= N; j++)
150 { if ((c = c - m) > (e = e - h)) e = c;
151 if ((j == N) && rc) {
152 if ((c = CC[j]) > (d = DD[j])) d = c;
153 } else {
154 if ((c = CC[j] - m) > (d = DD[j] - h)) d = c;
155 }
156 c = s + wa[B[j]];
157 if (e > c) c = e;
158 if (d > c) c = d;
159 s = CC[j];
160 CC[j] = c;
161 DD[j] = d;
162 }
163 }
164 DD[0] = CC[0];
165
166 RR[N] = 0; /* Reverse phase: */
167 /* Compute R(M/2,k) & S(M/2,k) for all k */
168 if (botr) {
169 for (j = N-1; j >= 0; j--)
170 { RR[j] = 0;
171 SS[j] = -g;
172 }
173 } else {
174 t = -g;
175 for (j = N-1; j >= 0; j--)
176 { RR[j] = t = t-h;
177 SS[j] = t-g;
178 }
179 }
180 t = te;
181 for (i = M-1; i >= midi; i--)
182 { s = RR[N];
183 if (rc) {
184 RR[N] = c = 0;
185 e = -g;
186 } else {
187 RR[N] = c = t = t-h;
188 e = t-g;
189 }
190 wa = w[A[i+1]];
191 for (j = N-1; j >= 0; j--)
192 { if ((c = c - m) > (e = e - h)) e = c;
193 if ((j == 0) && lc) {
194 if ((c = RR[j]) > (d = SS[j])) d = c;
195 } else {
196 if ((c = RR[j] - m) > (d = SS[j] - h)) d = c;
197 }
198 c = s + wa[B[j+1]];
199 if (e > c) c = e;
200 if (d > c) c = d;
201 s = RR[j];
202 RR[j] = c;
203 SS[j] = d;
204 }
205 }
206 SS[N] = RR[N];
207
208 midc = CC[0]+RR[0]; /* Find optimal midpoint */
209 midj = 0;
210 type = 1;
211 for (j = 0; j <= N; j++)
212 if ((c = CC[j] + RR[j]) >= midc)
213 if (c > midc || CC[j] != DD[j] && RR[j] == SS[j])
214 { midc = c;
215 midj = j;
216 }
217 if (rc) {
218 if ((c = DD[N] + SS[N]) > midc)
219 { midc = c;
220 midj = N;
221 type = 2;
222 }
223 } else {
224 if ((c = DD[N] + SS[N] + g) > midc)
225 { midc = c;
226 midj = N;
227 type = 2;
228 }
229 }
230 for (j = N-1; j > 0; j--)
231 if ((c = DD[j] + SS[j] + g) > midc)
232 { midc = c;
233 midj = j;
234 type = 2;
235 }
236 if (lc) {
237 if ((c = DD[0] + SS[0]) > midc)
238 { midc = c;
239 midj = 0;
240 type = 2;
241 }
242 } else {
243 if ((c = DD[0] + SS[0] + g) > midc)
244 { midc = c;
245 midj = 0;
246 type = 2;
247 }
248 }
249 }
250
251 /* Conquer: recursively around midpoint */
252
253 if (midj == 0 || midj == N) {
254 if (type == 1)
255 { align(A,B,midi,midj,tb,-g,topr,0,lc,rc);
256 align(A+midi,B+midj,M-midi,N-midj,-g,te,0,botr,lc,rc);
257 }
258 else
259 { align(A,B,midi-1,midj,tb,0,topr,0,lc,rc);
260 DEL(2);
261 align(A+midi+1,B+midj,M-midi-1,N-midj,0,te,0,botr,lc,rc);
262 }
263 } else {
264 if (type == 1)
265 { align(A,B,midi,midj,tb,-g,topr,0,lc,0);
266 align(A+midi,B+midj,M-midi,N-midj,-g,te,0,botr,0,rc);
267 }
268 else
269 { align(A,B,midi-1,midj,tb,0,topr,0,lc,0);
270 DEL(2);
271 align(A+midi+1,B+midj,M-midi-1,N-midj,0,te,0,botr,0,rc);
272 }
273 }
274 return midc;
275 }
276
277 /* Interface and top level of comparator */
278
ALIGN(A,B,M,N,W,G,H,S)279 int ALIGN(A,B,M,N,W,G,H,S)
280 char A[],B[];
281 int M,N;
282 int **W,G,H;
283 int S[];
284 {
285 int c, ck;
286 int t;
287
288 if (N > NMAX) return -1; /* Error check */
289
290 w = W; /* Setup global parameters */
291 g = G;
292 h = H;
293 m = g+h;
294 sapp = S;
295 last = 0;
296
297 c = align(A,B,M,N,-g,-g,1,1,1,1); /* OK, do it */
298 if ((abs(S[0]) < abs(S[1])) && S[0] != 0) {
299 t = S[1];
300 S[1] = S[0];
301 S[0] = t;
302 }
303 if ((abs(sapp[0]) < abs(sapp[-1])) && sapp[0] != 0) {
304 t = sapp[-1];
305 sapp[-1] = sapp[0];
306 sapp[0] = t;
307 }
308 ck = CHECK_SCORE(A,B,M,N,S);
309 if (c != ck) printf("Check_score error. c=%d, ck=%d\n",c,ck);
310 return c;
311 }
312
313 /* Alignment display routine */
314
315 static char ALINE[51], BLINE[51], CLINE[51];
316
317 void
DISPLAY(unsigned char * A,unsigned char * B,int M,int N,int * S,int AP,int BP)318 DISPLAY(unsigned char *A, unsigned char *B,
319 int M, int N,
320 int *S, int AP, int BP) {
321 register char *a, *b, *c;
322 register int i, j, op;
323 int lines, ap, bp;
324
325 i = j = op = lines = 0;
326 ap = AP;
327 bp = BP;
328 a = ALINE;
329 b = BLINE;
330 c = CLINE;
331 while (i < M || j < N) {
332 if (op == 0 && *S == 0) {
333 op = *S++;
334 *a = A[++i];
335 *b = B[++j];
336 *c++ = (*a++ == *b++) ? '|' : ' ';
337 }
338 else {
339 if (op == 0) op = *S++;
340 if (op > 0) {
341 *a++ = ' ';
342 *b++ = B[++j];
343 op--;
344 }
345 else {
346 *a++ = A[++i];
347 *b++ = ' ';
348 op++;
349 }
350 *c++ = '-';
351 }
352 if (a >= ALINE+50 || (i >= M && j >= N)) {
353 *a = *b = *c = '\0';
354 printf("\n%5d ",50*lines++);
355 for (b = ALINE+10; b <= a; b += 10) printf(" . :");
356 if (b <= a+5) printf(" .");
357 printf("\n%5d %s\n %s\n%5d %s\n",ap,ALINE,CLINE,bp,BLINE);
358 ap = AP + i;
359 bp = BP + j;
360 a = ALINE;
361 b = BLINE;
362 c = CLINE;
363 }
364 }
365 }
366
367 /* CHECK_SCORE - return the score of the alignment stored in S */
368
CHECK_SCORE(A,B,M,N,S)369 static int CHECK_SCORE(A,B,M,N,S) char A[], B[]; int M, N; int S[];
370 {
371 register int i, j, op;
372 int score;
373
374 score = i = j = op = 0;
375 while (i < M || j < N) {
376 op = *S++;
377 if (i == 0 && j == 0 && op != 0) {
378 if (op > 0) j = j+op;
379 else i = i-op;
380 } else if (i == M || j == N) {
381 i = M;
382 j = N;
383 } else if (op == 0)
384 score = w[A[++i]][B[++j]] + score;
385 else if (op > 0) {
386 score = score - (g+op*h);
387 j = j+op;
388 } else {
389 score = score - (g-op*h);
390 i = i-op;
391 }
392 }
393 return(score);
394 }
395
396 /* lib.c - library of C procedures. */
397
398 /* fatal - print message and die */
399 void
fatal(char * msg)400 fatal(char *msg) {
401
402 fprintf(stderr, "%s\n", msg);
403 exit(1);
404 }
405
406 /* fatalf - format message, print it, and die */
407 void
fatalf(char * msg,char * val)408 fatalf(char *msg, char *val) {
409 fprintf(stderr, msg, val);
410 putc('\n', stderr);
411 exit(1);
412 }
413
414 /* ckalloc - allocate space; check for success */
ckalloc(int amount)415 char *ckalloc(int amount) {
416 char *p;
417
418 if ((p = malloc((size_t) amount)) == NULL)
419 fatal("Ran out of memory.");
420 return(p);
421 }
422
423