1 static char const rcsid[] = "$Id: sim4.c,v 6.5 2003/05/30 17:25:38 coulouri Exp $";
2
3 /* sim4.c - an algorithm to align a cDNA sequence with a genomic sequence.
4
5 *Note:
6 - when compiled with flag -DDEBUG, it gives information on the
7 (intermediate and final) block decomposition of the sequences
8 - when compiled with flag -DSTATS, it generates the actual
9 alignment and gives the alignment scores on exon segments
10 */
11
12
13 /* #include <stdio.h> */
14 /* #include <string.h> */
15 /* #include <ctype.h> */
16 /* #include <math.h> */
17
18 #include <ncbi.h>
19 #include <simutil.h>
20
21 #define DEFAULT_W 8
22 #define D_MASK 65535 /* distribution mask : 2^16-1 */
23 #define MIN_EXON_SIZE W/2
24 #define DIST_CUTOFF 4
25 #define PERCENT .2
26 #define CUTOFF .2
27 #define LL 60
28 #define M1 M-W+1
29 #define N1 N-W+1
30 #define NW 5 /* number of words used in the heuristic phase from sim3 */
31
32 #define MAXINT 65535
33 #define ALPHA .5
34 #define BETA .5
35
36
37 #define MININT -99999
38 #define DEF_LIMIT 1000
39
40 typedef struct block {
41 Int4 init_pos1;
42 Int4 init_pos2;
43 Int4 end_pos1;
44 Int4 end_pos2;
45 Int4 next;
46 } Block;
47
48
49
50 typedef struct table {
51 Uint4 ecode;
52 Int4 pos;
53 struct table *next;
54 } Table;
55
56
57 typedef struct matchelt {
58 Int4 mblock;
59 Table *mref;
60 struct matchelt *mnext;
61 } Matchelt;
62
63
64 typedef struct links {
65 Matchelt *lptr;
66 struct links *lnext;
67 } Links;
68
69
70 typedef struct threshelt {
71 Int4 tblock; /* block in seq2; a value between 1..M-W+1 */
72 Table *tref;
73 } Threshelt;
74
75
76 struct edit {
77 struct edit *link; /* previous edit instruction */
78 Int4 op; /* INS or DEL */
79 Int4 line1; /* line number in file1 */
80 Int4 line2; /* line number in file2 */
81 };
82
83 typedef struct {
84 Int1Ptr seq1, seq2;
85 Int4 M, N, W;
86 Int4 encoding[128];
87 Int4 decoding[5];
88 Int4 b_offset;
89 Int4 limit;
90 } data_t;
91
92 #define MAX_BLOCKS 10000
93 typedef struct sim4global { /*the global variables used in SIM4*/
94 Block *Blist;
95 Table ***pos_tbl;
96 Table **lists;
97 Links **Link;
98 Matchelt **Matchlist;
99 }SIM4Global, PNTR SIM4GlobalPtr;
100
101
102 static Int4 segment(Int1Ptr, Int1Ptr, Int4, Int4, Int4, Int4, Int4, data_t *, SIM4GlobalPtr);
103 static void add_word(Uint4, Int4, SIM4GlobalPtr);
104 static Boolean init_tbl(Int4, SIM4GlobalPtr);
105 static void bld_table(Int1Ptr, Int4, Int4, data_t *, SIM4GlobalPtr);
106 static void bld_lists(Int1Ptr, Int4, Int4, data_t *, SIM4GlobalPtr);
107 static void create_mlists(Int4, Int4, Int4, SIM4GlobalPtr);
108 static void lisseq(Int4, Int4, Int4, Links **, SIM4GlobalPtr, ValNodePtr PNTR);
109 static Int4 btfind(Int4, Threshelt *, Int4);
110 static Int4 greedy(Int1Ptr, Int1Ptr, Int4, Int4, Int4, Int4,
111 Int4Ptr PNTR, Int4Ptr PNTR, Int4Ptr, data_t *, SIM4GlobalPtr);
112 static Int4 bgreedy(Int1Ptr, Int1Ptr, Int4, Int4, Int4, Int4, data_t *, SIM4GlobalPtr);
113 static Int4 bput_scr(struct edit *, Int4, Int4, Int4, Int4, Int4, Int4, Int4, SIM4GlobalPtr);
114 static Int4 fgreedy(Int1Ptr, Int1Ptr, Int4, Int4, Int4, Int4, data_t *, SIM4GlobalPtr);
115 static Int4 fput_scr(struct edit *, Int4, Int4, Int4, Int4, Int4, SIM4GlobalPtr);
116 static Int4 find_blocks PROTO((Int4 W, Links *lk, Int4 offset1, Int4 offset2,
117 data_t *data, SIM4GlobalPtr sgp));
118 static Int4 power2(Int4 n);
119 static Boolean sim4x(data_t *, SIM4GlobalPtr );
120 static Int4 snake(Int4, Int4, Int4, Int4, data_t *);
121 static Int4 rsnake(Int4, Int4, Int4, Int4, data_t *);
122 static void path(Int4, Int4, Int1, Int4, Int4, Int1, Int4,
123 EditScriptPtr PNTR, EditScriptPtr PNTR, data_t *);
124
125 static Int1 locate(Int4Ptr, Int4Ptr, Int4Ptr, Int4Ptr, Int1Ptr, Int1Ptr,
126 Int4, Int4, Int4, data_t *data);
127 static Int4 get_dist(Int4, Int4, Int4, Int4, data_t *data);
128 static void Condense_script(EditScriptPtr);
129 static void Reverse_Script(EditScriptPtr);
130
131 static EditScriptPtr SIM4 PROTO((Int1Ptr A, Int1Ptr B, Int4 len1, Int4 len2,
132 Int4 INPUT_W, Int4 LIMIT, Int4Ptr dist_ptr));
133 static void S2A(EditScriptPtr, Int4Ptr);
134 static void DISPLAY(Int1Ptr, Int1Ptr, Int4, Int4, Int4Ptr, Int4, Int4);
135 static void Print_Script(EditScriptPtr head, Int4 M, Int4 N, data_t *data);
136
137
SIM4ALN_choice(SeqLocPtr cslp1,SeqLocPtr cslp2,Int4 limit,Int4 word_size)138 SeqAlignPtr SIM4ALN_choice(SeqLocPtr cslp1, SeqLocPtr cslp2, Int4 limit, Int4 word_size)
139 {
140 SeqAlignPtr sap = NULL;
141 Boolean both_strand, is_dna;
142 Uint1 strand_2, strand_1;
143 CharPtr A, B;
144 Int4 len1, len2;
145 EditScriptPtr esp, c_esp;
146 Int4 dist = -1, c_dist;
147 Char buf[21];
148
149
150 if(cslp1 == NULL || cslp2 == NULL)
151 {
152 ErrPostEx(SEV_WARNING, 0, 0, "Insufficient input");
153 return NULL;
154 }
155
156 if(cslp1->choice != SEQLOC_INT || cslp2->choice !=SEQLOC_INT)
157 {
158 ErrPostEx(SEV_WARNING, 0, 0, "Incorrect type of Seq-loc. Only take Seq-int");
159 return NULL;
160 }
161 strand_1 = SeqLocStrand(cslp1);
162 strand_2 = SeqLocStrand(cslp2);
163 both_strand = check_strand_mol(cslp2, &is_dna);
164 if(!is_dna)
165 {
166 ErrPostEx(SEV_WARNING, 0, 0, "Sorry, SIM4 can only compare two DNA sequences");
167 return NULL;
168 }
169
170 if(both_strand)
171 {
172 Change_Loc_Strand(cslp1, Seq_strand_plus);
173 Change_Loc_Strand(cslp2, Seq_strand_plus);
174 }
175
176 A= make_sim_seq(cslp1, TRUE, NULL);
177 if(A == NULL)
178 {
179 Change_Loc_Strand(cslp1, strand_1);
180 Change_Loc_Strand(cslp2, strand_2);
181 SeqIdWrite(SeqLocId(cslp1), buf, PRINTID_FASTA_LONG, 20);
182 ErrPostEx(SEV_WARNING, 0, 0, "Fail to make the first sequence %s", buf);
183 return NULL;
184 }
185
186 B = make_sim_seq(cslp2, FALSE, NULL);
187 if(B == NULL)
188 {
189 MemFree(A);
190 Change_Loc_Strand(cslp1, strand_1);
191 Change_Loc_Strand(cslp2, strand_2);
192 SeqIdWrite(SeqLocId(cslp2), buf, PRINTID_FASTA_LONG, 20);
193 ErrPostEx(SEV_WARNING, 0, 0, "Fail to make the second sequence %s", buf);
194 return NULL;
195 }
196
197 len1 = SeqLocLen(cslp1);
198 len2 = SeqLocLen(cslp2);
199
200 if(limit <= 0)
201 limit = DEF_LIMIT;
202 word_size = MAX(DEFAULT_W, word_size);
203 esp = SIM4((Int1Ptr)(A+1), (Int1Ptr)(B+1), len1, len2,word_size, limit,&dist);
204 if(both_strand)
205 {
206 Change_Loc_Strand(cslp2,(Uint1)2);
207 make_sim_seq(cslp2, FALSE, B);
208 c_esp = SIM4((Int1Ptr)(A+1), (Int1Ptr)(B+1), len1, len2,word_size, limit,&c_dist);
209 if(c_esp != NULL)
210 {
211 if(esp == NULL || c_dist < dist)
212 {
213 Script_free(esp);
214 esp = c_esp;
215 dist = c_dist;
216 }
217 else
218 {
219 Change_Loc_Strand (cslp2, Seq_strand_plus);
220 Script_free(c_esp);
221 }
222 }
223 else
224 Change_Loc_Strand (cslp2, Seq_strand_plus);
225 }
226
227
228 if(esp != NULL)
229 {
230 sap = ConvertEditScript(esp, cslp1, cslp2, 0, 0, FALSE);
231 Script_free(esp);
232 }
233 else
234 sap = NULL;
235 Change_Loc_Strand(cslp1, strand_1);
236 Change_Loc_Strand(cslp2, strand_2);
237 MemFree(A);
238 MemFree(B);
239 return sap;
240
241 }
242
243
power2(Int4 n)244 static Int4 power2(Int4 n)
245 {
246 if(n < 0) {
247 ErrPostEx(SEV_FATAL, 1, 0,
248 "Positive integer power only. Perhaps invalid data.");
249 exit(1);
250 }
251
252 if(!n) return 1;
253 else return 2 * power2(n - 1);
254 }
255
init_tbl(Int4 W,SIM4GlobalPtr sgp)256 static Boolean init_tbl(Int4 W, SIM4GlobalPtr sgp)
257 {
258 Int4 i, j, No_branches;
259
260 sgp->Blist = MemGet((size_t)MAX_BLOCKS * sizeof(Block), MGET_ERRPOST);
261 if(sgp->Blist == NULL)
262 return FALSE;
263 MemSet((Pointer)(sgp->Blist), 0, MAX_BLOCKS * sizeof(Block));
264 No_branches = power2(MAX(0, 2 * W - 16));
265 sgp->pos_tbl = (Table ***)MemNew(No_branches * sizeof(Table **));
266
267 for(i = 0; i < No_branches; i++) {
268 sgp->pos_tbl[i] = (Table **)MemNew((D_MASK + 1) * sizeof(Table *));
269 for(j = 0; j <= D_MASK; j++)
270 sgp->pos_tbl[i][j] = NULL;
271 }
272 return TRUE;
273 }
274
bld_table(Int1Ptr s1,Int4 M,Int4 W,data_t * data,SIM4GlobalPtr sgp)275 static void bld_table(Int1Ptr s1, Int4 M, Int4 W, data_t *data, SIM4GlobalPtr sgp)
276 {
277 Uint4 ecode;
278 Int4 i, si, temp, mask;
279 Int1Ptr q;
280
281 /* initializing ... */
282
283 for(i = 0; i < 128; i++) data->encoding[i] = 0;
284
285 data->encoding['A'] = data->encoding['a'] = 0;
286 data->encoding['C'] = data->encoding['c'] = 1;
287 data->encoding['G'] = data->encoding['g'] = 2;
288 data->encoding['T'] = data->encoding['t'] = 3;
289 data->decoding[0] = 'A';
290 data->decoding[1] = 'C';
291 data->decoding[2] = 'G';
292 data->decoding[3] = 'T';
293
294 mask = (1 << (W + W - 2)) - 1;
295
296 q = s1 - 1;
297 ecode = 0L;
298 for(si = i = 1; i < W; ++si)
299 if((temp = data->encoding[*++q]) != 4) {
300 ecode = (ecode << 2) + temp;
301 i++;
302 }
303 for(; (*++q) && (si <= M); si++)
304 if((temp = data->encoding[*q]) != 4) {
305 ecode = ((ecode & mask) << 2) + temp;
306 add_word(ecode, (Int4)(q - s1 + 1), sgp);
307 }
308 }
309
add_word(Uint4 ecode,Int4 pos,SIM4GlobalPtr sgp)310 static void add_word(Uint4 ecode, Int4 pos, SIM4GlobalPtr sgp)
311 {
312 Table *ptr;
313
314 ptr = (Table *)MemNew(sizeof(Table));
315 ptr->ecode = ecode;
316 ptr->pos = pos;
317 ptr->next = sgp->pos_tbl[ecode >> 16][ecode&D_MASK];
318 sgp->pos_tbl[ecode >> 16][ecode&D_MASK] = ptr;
319 }
320
clean_up_global_val(SIM4GlobalPtr sgp)321 static void clean_up_global_val(SIM4GlobalPtr sgp)
322 {
323
324 MemFree(sgp->Link);
325 MemFree(sgp->lists);
326 MemFree(sgp->Blist);
327
328 }
329
SIM4(Int1Ptr in_seq1,Int1Ptr in_seq2,Int4 in_M,Int4 in_N,Int4 in_W,Int4 LIMIT,Int4Ptr dist_ptr)330 static EditScriptPtr SIM4(Int1Ptr in_seq1, Int1Ptr in_seq2,
331 Int4 in_M, Int4 in_N,
332 Int4 in_W, Int4 LIMIT, Int4Ptr dist_ptr)
333 {
334 data_t data;
335 Int4 sflag = FALSE;
336 Int4 i, i1, diff;
337 Int1Ptr aux;
338 EditScriptPtr head, tail, left, right, new;
339 SIM4Global sg;
340
341
342 data.limit = LIMIT;
343
344 /* initialize global variables ... */
345 data.seq1 = in_seq1;
346 data.seq2 = in_seq2;
347 data.M = in_M;
348 data.N = in_N;
349 data.W = in_W;
350 data.b_offset = 0;
351
352 if(data.M > data.N) {
353 aux = data.seq1;
354 data.seq1 = data.seq2;
355 data.seq2 = aux;
356
357 i = data.N;
358 data.N = data.M;
359 data.M = i;
360
361 sflag = TRUE;
362 }
363
364 /* Compute the distance between two sequences A and B */
365
366 /* printf("A heuristic phase 1 is tried.\n"); */
367
368 if(!data.limit)
369 data.limit = DEF_LIMIT;
370 /* printf("distance limit = %d\n", data.limit); */
371 *dist_ptr = 0;
372
373 if(!sim4x(&data, &sg))
374 return NULL;
375 i = 0; head = NULL;
376
377 while((i1 = sg.Blist[i].next) && sg.Blist[i1].end_pos1) {
378 diff = get_dist(sg.Blist[i1].init_pos1 - 1,
379 sg.Blist[i1].init_pos2 - 1,
380 sg.Blist[i1].end_pos1,
381 sg.Blist[i1].end_pos2,
382 &data);
383 if(diff == -1)
384 {
385 clean_up_global_val(&sg);
386 Script_free(head);
387 return NULL;
388 }
389
390 /* if(diff > MAX((Int4)(PERCENT * (sg.Blist[i1].end_pos1 - sg.Blist[i1].init_pos1 + 1)), 2 * data.W))
391 ErrPostEx(SEV_WARNING, 0, 0,
392 "Distance threshold on segment exceeded."); */
393
394 /* printf("Warning: Distance threshold on segment exceeded.\n"); */
395
396 path(sg.Blist[i1].init_pos1 - 1, sg.Blist[i1].init_pos2 - 1, SUB,
397 sg.Blist[i1].end_pos1, sg.Blist[i1].end_pos2, SUB,
398 diff, &left, &right, &data);
399 *dist_ptr += diff;
400
401 if((diff = sg.Blist[i].init_pos1 - sg.Blist[i1].end_pos1 - 1) != 0) {
402 new = (EditScriptPtr) MemNew(sizeof(EditScript));
403 new->op_type = DEL;
404 new->num = diff;
405 new->next = head;
406 head = new;
407 *dist_ptr += diff;
408 }
409 if((diff = sg.Blist[i].init_pos2 - sg.Blist[i1].end_pos2 - 1) != 0) {
410 new = (EditScriptPtr) MemNew(sizeof(EditScript));
411 new->op_type = INS;
412 new->num = diff;
413 new->next = head;
414 head = new;
415 }
416 right->next = head;
417 head = left;
418 i = i1;
419 }
420 if((diff = sg.Blist[i].init_pos2 - sg.Blist[i1].end_pos2 - 1) != 0) {
421 new = (EditScriptPtr) MemNew(sizeof(EditScript));
422 new->op_type = INS;
423 new->num = diff;
424 new->next = head;
425 head = new;
426 }
427 if((diff = sg.Blist[i].init_pos1 - sg.Blist[i1].end_pos1 - 1) != 0) {
428 new = (EditScriptPtr) MemNew(sizeof(EditScript));
429 new->op_type = DEL;
430 new->num = diff;
431 new->next = head;
432 head = new;
433 *dist_ptr += diff;
434 }
435
436 /* Merge the contiguous same operations together */
437 Condense_script(head);
438
439 /* Reset tail (due to the condensation) */
440 tail = head;
441 while(tail->next != NULL) tail = tail->next;
442
443 #ifdef STATS
444 /*printf("distance=%d\n", *dist_ptr); */
445 if(*dist_ptr > limit)
446 ErrPostEx(SEV_WARNING, 0, 0, "distance exceeds limit!");
447 /* printf("\nWarning: distance exceeds limit!\n\n"); */
448 #endif
449
450 if(sflag == TRUE) { /* Switch back the sequences */
451 aux = data.seq1;
452 data.seq1 = data.seq2;
453 data.seq2 = aux;
454 i = data.M;
455 data.M = data.N;
456 data.N = i;
457 Reverse_Script(head);
458 }
459
460 /* Print_Script(head, data.M, data.N, &data); */
461 clean_up_global_val(&sg);
462
463 return head;
464 }
465
466 /* locate possible exons and form block list */
sim4x(data_t * data,SIM4GlobalPtr sgp)467 static Boolean sim4x(data_t *data, SIM4GlobalPtr sgp)
468 {
469 Int4 bno, i, i1, diff, dist;
470 Int4Ptr last, first;
471
472 Int4 I1, I2, J1, J2;
473 Int4 offset1, offset2;
474 Int1 hflag;
475 Int4 p1, p2, length1, length2;
476
477 #ifdef XFLAG
478 Int4 ulcell, u, l;
479 Int4 j, mcell, mismatch;
480 Int1 temp[50];
481 #endif
482
483 if(data->M < data->W) data->W = data->M;
484 if(!init_tbl(data->W, sgp))
485 return FALSE;
486
487 bno = segment(data->seq1, data->seq2, data->M, data->N, data->W, 0, 0, data, sgp);
488
489 sgp->Blist[0].init_pos1 = data->M + 1;
490 sgp->Blist[0].init_pos2 = data->N + 1;
491 sgp->Blist[bno + 1].end_pos1 = sgp->Blist[bno + 1].end_pos2 = 0;
492 sgp->Blist[bno].next = bno + 1;
493 sgp->Blist[bno + 1].next = 0;
494 data->b_offset = bno + 2;
495
496 /* skip small blocks, as potential error source */
497 /* external thresholding: begin sequence */
498 i = data->b_offset - 1;
499 while(i && (sgp->Blist[i].end_pos1 - sgp->Blist[i].init_pos1 + 1 < 2.0 * data->W))
500 i--;
501 sgp->Blist[i].next = data->b_offset - 1;
502
503 /* external thresholding: end sequence */
504 i = sgp->Blist[0].next;
505 while(sgp->Blist[i].end_pos1 && (sgp->Blist[i].end_pos1 - sgp->Blist[i].init_pos1 + 1 < 2 * data->W))
506 i++;
507 sgp->Blist[0].next = i;
508
509 /* internal thresholding */
510 i = 0;
511 while((i1 = sgp->Blist[i].next) && (sgp->Blist[i1].end_pos1)) {
512 if(sgp->Blist[i1].end_pos1 - sgp->Blist[i1].init_pos1 + 1 < data->W)
513 sgp->Blist[i].next = sgp->Blist[i1].next;
514 /* ========= HERE STARTS THE NEW CODE =========== */
515
516 else if (sgp->Blist[i1].end_pos1-sgp->Blist[i1].init_pos1+1<2*data->W) {
517
518 Int4 tmp = sgp->Blist[i1].next;
519
520 /* extend block i to the left */
521 p1 = sgp->Blist[i].init_pos1-1;
522 p2 = sgp->Blist[i].init_pos2-1;
523 while ((p1>0) && (p2>0) &&
524 (p1>sgp->Blist[tmp].end_pos1-1) &&
525 (data->seq1[p1-1] == data->seq2[p2-1])) {
526 p1--; p2--;
527 }
528 sgp->Blist[i].init_pos1 = p1+1;
529 sgp->Blist[i].init_pos2 = p2+1;
530
531 /* if it exists, extend sgp->Blist[i1].next to the right */
532 p1 = sgp->Blist[tmp].end_pos1;
533 p2 = sgp->Blist[tmp].end_pos2;
534 while ((p1<sgp->Blist[i].init_pos1-1) &&
535 (data->seq1[p1] == data->seq2[p2])) {
536 p1++; p2++;
537 }
538 sgp->Blist[tmp].end_pos1 = p1;
539 sgp->Blist[tmp].end_pos2 = p2;
540
541 sgp->Blist[i1].init_pos2 += MAX(sgp->Blist[tmp].end_pos1+1,sgp->Blist[i1].init_pos1)
542 -sgp->Blist[i1].init_pos1;
543 sgp->Blist[i1].init_pos1 = MAX(sgp->Blist[tmp].end_pos1+1,sgp->Blist[i1].init_pos1);
544
545 sgp->Blist[i1].end_pos2 -= sgp->Blist[i1].end_pos1-
546 MIN(sgp->Blist[i].init_pos1-1,sgp->Blist[i1].end_pos1);
547 sgp->Blist[i1].end_pos1 = MIN(sgp->Blist[i].init_pos1-1,sgp->Blist[i1].end_pos1);
548 if (sgp->Blist[i1].end_pos1-sgp->Blist[i1].init_pos1+1<data->W) {
549
550 if (sgp->Blist[tmp].end_pos1>=sgp->Blist[i].init_pos1) {
551
552 /* i and next[i1] overlap */
553 length1 = sgp->Blist[tmp].end_pos1-sgp->Blist[tmp].init_pos1+1;
554 length2 = sgp->Blist[i].end_pos1-sgp->Blist[i].init_pos1+1;
555
556 if (length2 >= length1) {
557 sgp->Blist[tmp].end_pos2 -= sgp->Blist[tmp].end_pos1-sgp->Blist[i].init_pos1+1;
558 sgp->Blist[tmp].end_pos1 = sgp->Blist[i].init_pos1-1;
559 } else {
560 sgp->Blist[i].init_pos2 += sgp->Blist[tmp].end_pos1+1-
561 sgp->Blist[i].init_pos1;
562 sgp->Blist[i].init_pos1 = sgp->Blist[tmp].end_pos1+1;
563 }
564 }
565 sgp->Blist[i].next = sgp->Blist[i1].next;
566 } else i = i1;
567
568 /* =========== HERE ENDS THE NEW CODE =========== */
569
570 }
571
572 else i = i1;
573 }
574
575 /* separate forward greedy code for the right end */
576
577 i1 = sgp->Blist[i=0].next;
578 diff = sgp->Blist[i].init_pos1-sgp->Blist[i1].end_pos1-1;
579 if(i1 && diff) {
580
581 bno = fgreedy(data->seq1 + sgp->Blist[i1].end_pos1,
582 data->seq2 + sgp->Blist[i1].end_pos2,
583 diff,
584 sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
585 sgp->Blist[i1].end_pos1,sgp->Blist[i1].end_pos2,
586 data, sgp);
587
588 if(bno) {
589 sgp->Blist[i].next = data->b_offset;
590 sgp->Blist[data->b_offset + bno - 1].next = i1;
591 data->b_offset += bno;
592 }
593 else if(diff <= sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1) {
594 /* attempt the sim3 matching strategy */
595 hflag = locate(&I1, &J1, &I2, &J2,
596 data->seq1 + sgp->Blist[i1].end_pos1,
597 data->seq2 + sgp->Blist[i1].end_pos2,
598 diff,
599 sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
600 (Int4)(BETA * data->W), data);
601 if(hflag == TRUE) {
602 /*
603 printf("I1=%d, I2=%d,J1=%d,J2=%d\n",
604 I1+Blist[i1].end_pos1,
605 I2+Blist[i1].end_pos1,
606 J1+Blist[i1].end_pos2,
607 J2+Blist[i1].end_pos2);
608 */
609 offset1 = sgp->Blist[i1].end_pos1;
610 offset2 = sgp->Blist[i1].end_pos2;
611
612 sgp->Blist[data->b_offset].init_pos1 = offset1 + I1 + 1;
613 sgp->Blist[data->b_offset].init_pos2 = offset2 + J1 + 1;
614 sgp->Blist[data->b_offset].end_pos1 = offset1 + I2;
615 sgp->Blist[data->b_offset].end_pos2 = offset2 + J2;
616
617 sgp->Blist[i].next = data->b_offset;
618 sgp->Blist[data->b_offset].next = i1;
619 data->b_offset++;
620 }
621 /* else printf("Right end of the cDNA sequence does not match the genome.\n"); */
622 }
623
624 i = i1;
625 }
626
627
628 /* step 2: greedy method on unmatched subsequences */
629
630 while(i1 = sgp->Blist[i].next) {
631 diff = sgp->Blist[i].init_pos1-sgp->Blist[i1].end_pos1-1;
632 if (diff)
633 #ifdef XFLAG
634 if ((diff<=MIN_EXON_SIZE) && sgp->Blist[i1].end_pos1){
635 if (diff<=sgp->Blist[i].init_pos2-sgp->Blist[i1].end_pos2-1)
636 {
637 strncpy(temp,seq1+sgp->Blist[i1].end_pos1,diff);
638 temp[diff] = '\0';
639 ulcell = -1; mcell = MAXINT;
640 l = sgp->Blist[i1].end_pos2;
641 u = sgp->Blist[i].init_pos2-diff-1;
642 while (l<=u)
643 { mismatch = 0;
644 for (j=0; j<diff; j++)
645 if (seq2[u+j]!=temp[j]) mismatch++;
646 if (((sgp->Blist[i].init_pos2-u-diff<=ALPHA*W) && (mismatch<=1)) ||
647 ((sgp->Blist[i].init_pos2-u-diff>ALPHA*W) && !mismatch))
648 { ulcell = l = u;
649 mcell = mismatch;
650 break; }
651
652 if (mismatch<mcell)
653 { ulcell=u; mcell=mismatch; }
654 u--;
655
656 mismatch = 0;
657 for (j=0; j<diff; j++)
658 if (seq2[l+j]!=temp[j]) mismatch++;
659 if (((l+1-sgp->Blist[i1].end_pos2<=ALPHA*W) && (mismatch<=1)) ||
660 ((l+1-sgp->Blist[i1].end_pos2>ALPHA*W) && !mismatch))
661 { ulcell = u = l;
662 mcell = mismatch;
663 break; }
664 if (mismatch<mcell)
665 { ulcell=l; mcell=mismatch; }
666 l++;
667 }
668
669 /* introduce block */
670 sgp->Blist[b_offset].init_pos1 = sgp->Blist[i1].end_pos1+1;
671 sgp->Blist[b_offset].init_pos2 = ulcell+1;
672 sgp->Blist[b_offset].end_pos1 = sgp->Blist[i].init_pos1-1;
673 sgp->Blist[b_offset].end_pos2 = sgp->Blist[b_offset].init_pos2+diff-1;
674 sgp->Blist[i].next = b_offset;
675 sgp->Blist[b_offset].next = i1;
676 b_offset++;
677 }
678 else { sgp->Blist[b_offset].init_pos1 = sgp->Blist[i1].end_pos1+1;
679 sgp->Blist[b_offset].init_pos2 = sgp->Blist[i1].end_pos2+1;
680 sgp->Blist[b_offset].end_pos1 = sgp->Blist[i].init_pos1-1;
681 sgp->Blist[b_offset].end_pos2 = sgp->Blist[i].init_pos2-1;
682 sgp->Blist[i].next = b_offset;
683 sgp->Blist[b_offset].next = i1;
684 b_offset++;
685 }
686
687 }
688 else
689 #endif
690 {
691 /* gap is bigger than min-exon-size or begin sequence */
692 /*
693 print_format(diff, seq1+Blist[i1].end_pos1,LL,"d1");
694 print_format(Blist[i].init_pos2-Blist[i1].end_pos2-1,
695 seq2+Blist[i1].end_pos2,LL,"d2");
696 */
697
698 if(sgp->Blist[i1].end_pos1) {
699
700 last = (Int4Ptr )MemNew(sizeof(Int4));
701 first = (Int4Ptr )MemNew(sizeof(Int4));
702
703 bno = greedy(data->seq1 + sgp->Blist[i1].end_pos1, data->seq2 + sgp->Blist[i1].end_pos2,
704 diff, sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
705 sgp->Blist[i1].end_pos1, sgp->Blist[i1].end_pos2,
706 &first, &last, &dist, data, sgp);
707 if(dist > DIST_CUTOFF) {
708 if((hflag = locate(&I1, &J1, &I2, &J2,
709 data->seq1 + sgp->Blist[i1].end_pos1,
710 data->seq2 + sgp->Blist[i1].end_pos2,
711 diff,
712 sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
713 (Int4)(BETA * data->W), data)) == TRUE) {
714 offset1 = sgp->Blist[i1].end_pos1;
715 offset2 = sgp->Blist[i1].end_pos2;
716
717 sgp->Blist[data->b_offset].init_pos1 = offset1+I1+1;
718 sgp->Blist[data->b_offset].init_pos2 = offset2+J1+1;
719 sgp->Blist[data->b_offset].end_pos1 = offset1+I2;
720 sgp->Blist[data->b_offset].end_pos2 = offset2+J2;
721
722 sgp->Blist[i].next = data->b_offset;
723 sgp->Blist[data->b_offset].next = i1;
724 data->b_offset++;
725 }
726 else { /* else if hflag */
727 /* heuristic sim3 fails; use result produced by greedy */
728 sgp->Blist[i].next = *first;
729 sgp->Blist[*last].next = i1;
730 data->b_offset += bno;
731 }
732 }
733 else {
734 sgp->Blist[i].next = *first;
735 sgp->Blist[*last].next = i1;
736 data->b_offset += bno;
737 }
738 MemFree(last);
739 MemFree(first);
740 }
741 else {
742 bno = bgreedy(data->seq1 + sgp->Blist[i1].end_pos1,
743 data->seq2 + sgp->Blist[i1].end_pos2,
744 diff,
745 sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
746 sgp->Blist[i1].end_pos1,
747 sgp->Blist[i1].end_pos2,
748 data, sgp);
749 if(bno) {
750 sgp->Blist[i].next = bno - 1 + data->b_offset;
751 sgp->Blist[data->b_offset].next = i1;
752 data->b_offset += bno;
753 }
754 else if(diff <= sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1) {
755 /* attempt the matching strategy of sim3 */
756 hflag = locate(&I1, &J1, &I2, &J2,
757 data->seq1 + sgp->Blist[i1].end_pos1,
758 data->seq2 + sgp->Blist[i1].end_pos2,
759 diff,
760 sgp->Blist[i].init_pos2 - sgp->Blist[i1].end_pos2 - 1,
761 (Int4)(BETA * data->W), data);
762 if(hflag == TRUE) {
763 /*
764 printf("I1=%d, I2=%d,J1=%d,J2=%d\n",
765 I1+Blist[i1].end_pos1,
766 I2+Blist[i1].end_pos1,
767 J1+Blist[i1].end_pos2,
768 J2+Blist[i1].end_pos2);
769 */
770 offset1 = sgp->Blist[i1].end_pos1;
771 offset2 = sgp->Blist[i1].end_pos2;
772
773 sgp->Blist[data->b_offset].init_pos1 = offset1+I1+1;
774 sgp->Blist[data->b_offset].init_pos2 = offset2+J1+1;
775 sgp->Blist[data->b_offset].end_pos1 = offset1+I2;
776 sgp->Blist[data->b_offset].end_pos2 = offset2+J2;
777
778 sgp->Blist[i].next = data->b_offset;
779 sgp->Blist[data->b_offset].next = i1;
780 data->b_offset++;
781 }
782 /* else printf("Left end of the cDNA sequence does not match the genome.\n"); */
783 }
784 }
785 }
786 i = i1;
787 }
788
789 /* just printing ...*/
790 #ifdef DEBUG
791 i=0;
792 while (i=sgp->Blist[i].next)
793 {
794 fprintf (stderr,"B[%2d]: %5d %5d %5d %5d\n",i,sgp->Blist[i].init_pos1,
795 sgp->Blist[i].init_pos2,sgp->Blist[i].end_pos1,sgp->Blist[i].end_pos2);
796 }
797
798 fprintf(stderr,"\n**********************************\n\n");
799 fprintf(stderr,"NORMALIZATION:\n\n");
800 #endif
801
802
803 /* second cross - normalize blocks; compaction step */
804
805 i = sgp->Blist[0].next;
806 while((i1 = sgp->Blist[i].next) && (sgp->Blist[i1].init_pos1)) {
807 /*
808 if (Blist[i].init_pos2-Blist[i1].end_pos2<=
809 Blist[i].init_pos1-Blist[i1].end_pos1+ALPHA*W)
810 */
811 if(ABS(sgp->Blist[i].init_pos2 - sgp->Blist[i].init_pos1 -
812 sgp->Blist[i1].end_pos2 + sgp->Blist[i1].end_pos1) <= ALPHA * data->W)
813 /* discard block */
814 {
815 sgp->Blist[i].init_pos1 = sgp->Blist[i1].init_pos1;
816 sgp->Blist[i].init_pos2 = sgp->Blist[i1].init_pos2;
817 sgp->Blist[i].next = sgp->Blist[i1].next;
818 }
819 else i = i1;
820 }
821
822 #ifdef DEBUG
823 /* just printing ... */
824 printf("\n*******************************************\n");
825 printf ("\nBlock decomposition:\n\n");
826 i=0;
827 while ((i=sgp->Blist[i].next) && (sgp->Blist[i].end_pos1))
828 {
829 printf ("Block: I1=%5d, I2=%5d, J1=%5d, J2=%5d\n", sgp->Blist[i].init_pos1,
830 sgp->Blist[i].init_pos2,sgp->Blist[i].end_pos1,sgp->Blist[i].end_pos2);
831 }
832 printf("\n*******************************************\n\n");
833 #endif
834 return TRUE;
835 }
836
free_match_list_chain(Matchelt * curr)837 static void free_match_list_chain (Matchelt *curr)
838 {
839 Matchelt *next;
840
841 while(curr)
842 {
843 next = curr->mnext;
844 MemFree(curr);
845 curr = next;
846 }
847
848 }
849
free_table_chain(Table * curr)850 static void free_table_chain (Table *curr)
851 {
852 Table *next;
853
854 while(curr)
855 {
856 next = curr->next;
857 MemFree(curr);
858 curr = next;
859 }
860 }
861
segment(Int1 * s1,Int1 * s2,Int4 M,Int4 N,Int4 W,Int4 offset1,Int4 offset2,data_t * data,SIM4GlobalPtr sgp)862 static Int4 segment(Int1 *s1,Int1 *s2,Int4 M,Int4 N,Int4 W,
863 Int4 offset1, Int4 offset2, data_t *data, SIM4GlobalPtr sgp)
864 {
865 Int4 No_branches, bno;
866 Int4 i, j;
867 Links *Lkptr;
868 ValNodePtr data_list;
869
870 bld_table(s1, M, W, data, sgp);
871
872 /* attach to each position in seq2 the list of
873 positions in seq1 having the same content */
874 bld_lists(s2, N, W, data, sgp);
875
876 /* create matching lists */
877 create_mlists(M,N,W, sgp);
878
879 /* find longest increasing subsequence of positions */
880 lisseq(M,N,W,&Lkptr, sgp, &data_list);
881
882 bno = find_blocks(W, Lkptr, offset1, offset2, data, sgp);
883
884 No_branches = power2(MAX(0,2*W-16));
885 for (i=0; i<No_branches; i++)
886 {
887 for(j = 0; j <= D_MASK; j++)
888 {
889 if(sgp->pos_tbl[i][j] != NULL)
890 {
891 free_table_chain (sgp->pos_tbl[i][j]);
892 sgp->pos_tbl[i][j] = NULL;
893 }
894 }
895 MemFree(sgp->pos_tbl[i]);
896 }
897 MemFree(sgp->pos_tbl);
898
899
900 for(i = 0; i <= M - W + 1; i++)
901 {
902 if(sgp->Matchlist[i] != NULL)
903 {
904 free_match_list_chain (sgp->Matchlist[i]);
905 sgp->Matchlist[i] = NULL;
906 }
907 }
908 MemFree(sgp->Matchlist);
909
910 ValNodeFreeData(data_list);
911 return bno;
912 }
913
lisseq(Int4 M,Int4 N,Int4 W,Links ** Lkptr,SIM4GlobalPtr sgp,ValNodePtr PNTR data_list)914 static void lisseq(Int4 M, Int4 N, Int4 W, Links **Lkptr, SIM4GlobalPtr sgp, ValNodePtr PNTR data_list)
915 {
916 Threshelt *Thresh;
917 Matchelt *mptr;
918 Links *new;
919 Int4 count, k, thr_size; /* k an index of Thresh[.]; */
920 ValNodePtr curr, prev;
921 /* so k in 1,...MIN(M1,N1) */
922
923 *data_list = NULL;
924 prev = NULL;
925 thr_size = MIN(N1, M1); /* initially N1; modified */
926 Thresh = (Threshelt *)MemNew((thr_size + 1) * sizeof(Threshelt));
927 for(count = 1; count <= thr_size; count++) {
928 Thresh[count].tblock = N1 + 1;
929 Thresh[count].tref = NULL;
930 }
931
932 Thresh[0].tblock = 0;
933 Thresh[0].tref = NULL;
934
935 sgp->Link = (Links **)MemNew((thr_size + 1) * sizeof(Links *));
936 sgp->Link[0] = NULL;
937
938 for(count = 1; count <= M1; count++) {
939 /* Link[count] = NULL; */
940 /* for j in Matchlist[i] */
941 mptr = sgp->Matchlist[count];
942 while(mptr != NULL) {
943
944 /* find k */
945 if(!(k = btfind(mptr->mblock, Thresh, thr_size))) {
946 ErrPostEx(SEV_FATAL, 1, 0, "Invalid data!");
947 exit(1);
948 }
949 if((mptr->mblock<Thresh[k].tblock) ||
950 ((mptr->mblock==Thresh[k].tblock) &&
951 (count>(Thresh[k].tref)->pos))) {
952 Thresh[k].tblock = mptr->mblock;
953 Thresh[k].tref = mptr->mref;
954 new = (Links *)MemNew(sizeof(Links));
955 new->lptr = mptr;
956 new->lnext = sgp->Link[k-1];
957 sgp->Link[k] = new;
958
959 /*store the data into a chain. for freeing in the future */
960 curr = ValNodeNew(NULL);
961 curr->data.ptrvalue = new;
962 if(prev == NULL)
963 *data_list = curr;
964 else
965 prev->next = curr;
966 prev = curr;
967
968 }
969 mptr = mptr->mnext;
970 }
971 }
972 k = MIN(M1, N1);
973 while(Thresh[k].tblock == N1 + 1) k--;
974 *Lkptr = sgp->Link[k];
975 MemFree(Thresh);
976 }
977
978
btfind(Int4 key,Threshelt * Thr,Int4 thr_size)979 static Int4 btfind(Int4 key, Threshelt *Thr, Int4 thr_size)
980 {
981 Int4 l, u, index;
982
983 l = 1; u = thr_size;
984
985 while(l <= u) {
986 index = (Int4)(l + u)/2;
987 if((Thr[index - 1].tblock<key) && (key <= Thr[index].tblock))
988 return index;
989 else
990 if(key <= Thr[index].tblock) u = index - 1;
991 else l = index + 1;
992 }
993 return 0;
994 }
995
996 /* static void code2str(Uint4 ecode, Int1 *temp, int W) */
997 /* { int i, mask = 3; */
998
999 /* temp[W] = '\0'; */
1000 /* for (i=W-1; i>=0; i--) */
1001 /* { temp[i] = decoding[mask&ecode]; */
1002 /* ecode = ecode>>2; */
1003 /* } */
1004 /* } */
1005
1006 /* Int4 get_seq(Int1 *filename, Int1 **seq) */
1007 /* { */
1008 /* FILE *fp; */
1009 /* Int1 buffer[1000], *buf, *s; */
1010 /* int nlines=0; */
1011
1012 /* fp = ckopen(filename,"r"); */
1013 /* while (fgets(buffer,1000,fp)!=NULL) */
1014 /* if (buffer[0]!='>') nlines++; */
1015 /* s = *seq = (Int1 *)MemNew(nlines*LL+1); */
1016 /* rewind(fp); */
1017 /* while (fgets(buffer,1000,fp)!=NULL) */
1018 /* if (buffer[0]!='>') { */
1019 /* buf = buffer; */
1020 /* while ((*buf) && (*buf!='\n')) */
1021 /* *s++ = *buf++; */
1022 /* } */
1023 /* *s = '\0'; */
1024 /* return s-*seq; */
1025 /* } */
1026
create_mlists(Int4 M,Int4 N,Int4 W,SIM4GlobalPtr sgp)1027 static void create_mlists(Int4 M, Int4 N, Int4 W, SIM4GlobalPtr sgp)
1028 /* Matchlist[i] - retains the list corresponding to position i+W-1
1029 in sequence 2 (the long one ) */
1030 {
1031 Int4 i;
1032 Matchelt *new;
1033 Table *ptr;
1034
1035 sgp->Matchlist = (Matchelt **)MemNew((M - W + 2)*sizeof(Matchelt *));
1036 for(i = 0; i <= M - W + 1; i++)
1037 sgp->Matchlist[i] = NULL;
1038
1039 /* traverse s2 */
1040 for(i = 1; i <= N - W + 1; i++) {
1041 ptr = sgp->lists[i];
1042 while(ptr != NULL) {
1043 new = (Matchelt *)MemNew(sizeof(Matchelt));
1044 new->mblock = i;
1045 new->mnext = sgp->Matchlist[ptr->pos - W + 1];
1046 new->mref = ptr;
1047 sgp->Matchlist[ptr->pos - W + 1] = new;
1048 ptr = ptr->next;
1049 }
1050 }
1051 }
1052
bld_lists(Int1 * s2,Int4 N,Int4 W,data_t * data,SIM4GlobalPtr sgp)1053 static void bld_lists(Int1 *s2, Int4 N, Int4 W, data_t *data, SIM4GlobalPtr sgp)
1054 {
1055 Int4 i, si;
1056 Uint4 ecode, mask, temp;
1057 Int1 *s, *marker;
1058
1059 sgp->lists = (Table **)MemNew((N - W + 2) * sizeof(Table *));
1060 for(i = 0; i <= N - W + 1; i++) sgp->lists[i] = NULL;
1061 mask = (1 << (W + W - 2)) - 1;
1062 s = s2 - 1;
1063
1064 ecode = 0L;
1065 for(i = si = 1; i < W; ++si)
1066 if((temp = data->encoding[*++s]) != 4) {
1067 ecode = (ecode << 2) + temp;
1068 i++;
1069 }
1070 marker = s+1;
1071 for(; (*++s) && (si <= N); si++)
1072 if((temp = data->encoding[*s]) != 4) {
1073 ecode = ((ecode & mask) << 2) + temp;
1074 sgp->lists[s - marker + 1] = sgp->pos_tbl[ecode >> 16][ecode&D_MASK];
1075 }
1076 }
1077
1078
find_blocks(Int4 W,Links * lk,Int4 offset1,Int4 offset2,data_t * data,SIM4GlobalPtr sgp)1079 static Int4 find_blocks(Int4 W, Links *lk, Int4 offset1, Int4 offset2, data_t *data, SIM4GlobalPtr sgp)
1080 {
1081 Links *lkptr;
1082
1083 Int4 i, bno;
1084 Int4 ppos1, ppos2, cpos2, cpos1;
1085
1086 lkptr = lk;
1087 bno = 0;
1088 ppos2 = MAXINT;
1089 cpos1 = cpos2 = ppos1 = 0;
1090 while(lkptr != NULL) {
1091 cpos2 = lkptr->lptr->mblock+W-1;
1092 cpos1 = lkptr->lptr->mref->pos;
1093
1094 if(abs(ppos2 - ppos1 - cpos2 + cpos1) > ALPHA * W) {
1095 if((bno < 2) || (sgp->Blist[bno - 1 + data->b_offset].init_pos1 - offset1 - cpos1 > W)) {
1096 sgp->Blist[bno + data->b_offset].init_pos1 = ppos1 + offset1;
1097 sgp->Blist[bno + data->b_offset].init_pos2 = ppos2 + offset2;
1098 bno++;
1099 }
1100 sgp->Blist[bno + data->b_offset].end_pos1 = cpos1 + offset1;
1101 sgp->Blist[bno + data->b_offset].end_pos2 = cpos2 + offset2;
1102 }
1103 ppos1 = cpos1; ppos2 = cpos2;
1104 lkptr = lkptr->lnext;
1105 }
1106 if((bno < 2) || (sgp->Blist[bno - 1 + data->b_offset].init_pos1 - offset1 - cpos1 > W)) {
1107 sgp->Blist[bno + data->b_offset].init_pos1 = cpos1 + offset1;
1108 sgp->Blist[bno + data->b_offset].init_pos2 = cpos2 + offset2;
1109 }
1110 else bno--;
1111
1112 if((bno > 1) && (sgp->Blist[bno - 1 + data->b_offset].init_pos1 <= W + offset1)) bno--;
1113
1114 sgp->Blist[data->b_offset].next = data->b_offset + 1;
1115 sgp->Blist[bno + data->b_offset + 1].end_pos1 = sgp->Blist[bno + data->b_offset + 1].end_pos2 = 0;
1116
1117 for(i = 1; i <= bno; i++) {
1118 sgp->Blist[i + data->b_offset].next = i + data->b_offset + 1;
1119 cpos1 = MAX(sgp->Blist[i + data->b_offset].init_pos1 - W + 1,
1120 sgp->Blist[i + 1 + data->b_offset].end_pos1 + 1);
1121 sgp->Blist[i + data->b_offset].init_pos2 =
1122 MAX(sgp->Blist[i + data->b_offset].init_pos2 - sgp->Blist[i + data->b_offset].init_pos1 + cpos1,
1123 sgp->Blist[i + 1 + data->b_offset].end_pos2 + 1);
1124 sgp->Blist[i + data->b_offset].init_pos1 = cpos1;
1125 }
1126
1127 sgp->Blist[bno + data->b_offset].next = 0;
1128
1129 return bno;
1130 }
1131
link_to_data_list(Pointer data,ValNodePtr PNTR head,ValNodePtr PNTR prev)1132 static void link_to_data_list(Pointer data, ValNodePtr PNTR head, ValNodePtr PNTR prev)
1133 {
1134 ValNodePtr curr;
1135
1136 curr = ValNodeNew(NULL);
1137 curr->data.ptrvalue = data;
1138 if(*prev == NULL)
1139 *head = curr;
1140 else
1141 (*prev)->next = curr;
1142 *prev = curr;
1143 }
1144
1145
bgreedy(Int1 * s1,Int1 * s2,Int4 m,Int4 n,Int4 offset1,Int4 offset2,data_t * data,SIM4GlobalPtr sgp)1146 static Int4 bgreedy(Int1 *s1, Int1 *s2, Int4 m, Int4 n,
1147 Int4 offset1, Int4 offset2, data_t *data, SIM4GlobalPtr sgp)
1148 {
1149 Int4
1150 col, /* column number */
1151 row, /* row number */
1152 max_d, /* bound on the length of edit script */
1153 d, /* current distance */
1154 k, /* current diagonal */
1155 bno=0, /* returned number of blocks */
1156 DELTA, /* n-m */
1157 ORIGIN,
1158 lower, /* boundaries for searching diagonals */
1159 upper;
1160
1161 Int4 *last_d, *temp_d; /* row containing the last d */
1162 struct edit **script, /* corresponding edit script */
1163 **temp_script; /* temporary edit script */
1164 struct edit *new;
1165 ValNodePtr data_list = NULL, prev = NULL;
1166
1167 DELTA = n-m;
1168 max_d = DIST_CUTOFF;
1169
1170 /*
1171 if ((DELTA<0) &&
1172 (abs(DELTA) > MAX((Int4)(ALPHA*DEFAULT_W),(Int4)(CUTOFF*n))))
1173 fprintf(stderr,"\nWarning: data do not support proper segmentation!\n\n");
1174 */
1175
1176 ORIGIN = m;
1177 for(row = m, col = n; row > 0 && col > 0 && (s1[row - 1] == s2[col - 1]); row--,col--);
1178
1179 if(row == 0) {
1180 /* hit first row; stop search */
1181 sgp->Blist[data->b_offset].init_pos1 = offset1 + 1;
1182 sgp->Blist[data->b_offset].end_pos1 = offset1 + m;
1183 sgp->Blist[data->b_offset].init_pos2 = offset2 - m + n + 1;
1184 sgp->Blist[data->b_offset].end_pos2 = offset2 + n;
1185
1186 return ++bno;
1187 }
1188 if(col == 0) {
1189 /* hit first column */
1190 sgp->Blist[data->b_offset].init_pos1 = offset1 + m - n + 1;
1191 sgp->Blist[data->b_offset].end_pos1 = offset1 + m;
1192 sgp->Blist[data->b_offset].init_pos2 = offset2 + 1;
1193 sgp->Blist[data->b_offset].end_pos2 = offset2 + n;
1194
1195 return ++bno;
1196 }
1197
1198 last_d = (Int4Ptr )MemNew((m + n + 1) * sizeof(Int4));
1199 temp_d = (Int4Ptr )MemNew((m + n + 1) * sizeof(Int4));
1200 script = (struct edit **)MemNew((m + n + 1)*sizeof(struct edit *));
1201 temp_script = (struct edit **)MemNew((m + n + 1)*sizeof(struct edit *));
1202
1203 for(k = 0; k <= m + n; script[k] = NULL, last_d[k++] = m + 1);
1204 last_d[ORIGIN+DELTA] = row;
1205
1206 lower = ORIGIN + DELTA - 1;
1207 upper = ORIGIN + DELTA + 1;
1208
1209 for(d = 1; d <= max_d ; ++d) {
1210
1211 /* for each relevant diagonal ... */
1212 for(k = lower; k <= upper; k++) {
1213 /* get space for the next edit instruction */
1214 new = (struct edit *) MemNew(sizeof(struct edit));
1215 link_to_data_list((Pointer)new, &data_list, &prev);
1216
1217 /* find a d on diagonal k */
1218 if (k==-d+DELTA+ORIGIN) {
1219
1220 /* move left from the last d-1 on diagonal k+1 */
1221 row = last_d[k+1];
1222 new->link = script[k+1];
1223 new->op = INS;
1224 } else if (k==d+DELTA+ORIGIN) {
1225
1226 /* move up from the last d-1 on diagonal k-1 */
1227 row = last_d[k-1]-1;
1228 new->link = script[k-1];
1229 new->op = DEL;
1230 } else if ((last_d[k]-1<=last_d[k+1]) &&
1231 (last_d[k]-1<=last_d[k-1]-1)) {
1232 /* substitution */
1233 row = last_d[k]-1;
1234 new->link = script[k];
1235 new->op = SUB;
1236
1237 } else if ((last_d[k-1]-1<=last_d[k+1]) &&
1238 (last_d[k-1]-1<=last_d[k]-1)) {
1239 /* move right from the last d-1 on diagonal k-1 */
1240 row = last_d[k-1]-1;
1241 new->link = script[k-1];
1242 new->op = DEL;
1243 } else {
1244
1245 /* move left from the last d-1 on diagonal k+1 */
1246 row = last_d[k+1];
1247 new->link = script[k+1];
1248 new->op = INS;
1249 }
1250 /* code common to the three cases */
1251 new->line1 = row;
1252 new->line2 = col = row + k - ORIGIN;
1253 temp_script[k] = new;
1254
1255 /* slide up the diagonal */
1256 while (row > 0 && col > 0 && (s1[row-1]==s2[col-1])) {
1257 --row;
1258 --col;
1259 }
1260 temp_d[k] = row;
1261
1262 if (row == 0 && col == 0) {
1263 /* hit northwest corner; have the answer */
1264 script[k] = temp_script[k];
1265 MemFree(last_d); MemFree(temp_d);
1266 bno = bput_scr(script[k],m,n,offset1,offset2,0,0,data->b_offset, sgp);
1267 MemFree(script);
1268 MemFree(temp_script);
1269 ValNodeFreeData(data_list);
1270
1271 return bno;
1272 }
1273 if (row == 0) {
1274
1275 /* hit last row, don't look to the right;
1276 record first and final insertions in the
1277 list -- possibly identical.
1278 */
1279 script[k] = temp_script[k];
1280
1281 /* record first INS */
1282 new = (struct edit *) MemNew(sizeof(struct edit));
1283 link_to_data_list((Pointer)new, &data_list, &prev);
1284 new->line1 = 0;
1285 new->line2 = col = --k - ORIGIN;
1286 new->op = INS;
1287 new->link = script[k+1];
1288 script[k] = new;
1289
1290 /* record last INS */
1291 new = (struct edit *) MemNew(sizeof(struct edit));
1292 link_to_data_list((Pointer)new, &data_list, &prev);
1293 new->line1 = 0;
1294 new->line2 = col = 0;
1295 new->op = INS;
1296 new->link = script[k];
1297 script[ORIGIN] = new;
1298
1299 MemFree(last_d); MemFree(temp_d);
1300 bno = bput_scr(script[ORIGIN],m,n,offset1,offset2,0,0,data->b_offset, sgp);
1301 MemFree(script);
1302 MemFree(temp_script);
1303 ValNodeFreeData(data_list);
1304
1305 return bno;
1306 }
1307
1308 if (col == 0) {
1309
1310 /* hit last column, don't look forward;
1311 record first and final deletions in the
1312 list, possibly identical.
1313 */
1314 script[k] = temp_script[k];
1315
1316 /* record first DEL */
1317 new = (struct edit *) MemNew(sizeof(struct edit));
1318 link_to_data_list((Pointer)new, &data_list, &prev);
1319 new->line1 = -(++k) + ORIGIN;
1320 new->line2 = 0;
1321 new->op = DEL;
1322 new->link = script[k-1];
1323 script[k] = new;
1324
1325 /* record last DEL */
1326 new = (struct edit *) MemNew(sizeof(struct edit));
1327 link_to_data_list((Pointer)new, &data_list, &prev);
1328 new->line1 = 0;
1329 new->line2 = 0;
1330 new->op = DEL;
1331 new->link = script[k];
1332 script[ORIGIN] = new;
1333
1334
1335 bno = bput_scr(script[ORIGIN],m,n,offset1,offset2,0,0,data->b_offset, sgp);
1336 MemFree(last_d); MemFree(temp_d);
1337
1338 MemFree(script);
1339 MemFree(temp_script);
1340 ValNodeFreeData(data_list);
1341
1342 return bno;
1343 }
1344
1345 }
1346 for (k=lower; k<=upper; k++) {
1347 script[k] = temp_script[k];
1348 last_d[k] = temp_d[k];
1349 }
1350
1351 --lower;
1352 ++upper;
1353 }
1354 MemFree(last_d);
1355 MemFree(temp_d);
1356 MemFree(script);
1357 MemFree(temp_script);
1358 ValNodeFreeData(data_list);
1359
1360 return bno=0;
1361 }
1362
fgreedy(Int1 * s1,Int1 * s2,Int4 m,Int4 n,Int4 offset1,Int4 offset2,data_t * data,SIM4GlobalPtr sgp)1363 static Int4 fgreedy(Int1 *s1, Int1 *s2, Int4 m, Int4 n, Int4 offset1, Int4 offset2, data_t *data, SIM4GlobalPtr sgp)
1364 {
1365 Int4 col, /* column number */
1366 row, /* row number */
1367 max_d, /* bound on the length of the edit script */
1368 d, /* current compressed distance */
1369 k, /* current diagonal */
1370 bno=0, /* returned number of blocks */
1371 DELTA, /* n-m */
1372 ORIGIN,
1373 lower,
1374 upper;
1375 Int4 *last_d, *temp_d; /* column containing the last p */
1376 struct edit **script, /* corresponding edit script */
1377 **temp_script;
1378 struct edit *new;
1379 ValNodePtr data_list = NULL, prev = NULL;
1380
1381
1382 DELTA = n-m;
1383 max_d = DIST_CUTOFF;
1384 /*
1385 if ((DELTA<0) &&
1386 (abs(DELTA) > MAX((int)(ALPHA*DEFAULT_W),(int)(CUTOFF*n))))
1387 fprintf(stderr,"\nWarning: data do not support proper segmentation!\n\n");
1388 */
1389
1390 ORIGIN = m;
1391 for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++);
1392
1393 if (row == m) {
1394 /* hit last row; stop search */
1395
1396 sgp->Blist[data->b_offset].init_pos1 = offset1+1;
1397 sgp->Blist[data->b_offset].end_pos1 = offset1+m;
1398 sgp->Blist[data->b_offset].init_pos2 = offset2+1;
1399 sgp->Blist[data->b_offset].end_pos2 = offset2+m;
1400
1401 return ++bno;
1402 }
1403
1404 if (col == n) {
1405 /* hit last column */
1406
1407 sgp->Blist[data->b_offset].init_pos1 = offset1+1;
1408 sgp->Blist[data->b_offset].end_pos1 = offset1+n;
1409 sgp->Blist[data->b_offset].init_pos2 = offset2+1;
1410 sgp->Blist[data->b_offset].end_pos2 = offset2+n;
1411
1412 return ++bno;
1413 }
1414
1415
1416 last_d = (Int4Ptr )MemNew((m+n+1)*sizeof(Int4));
1417 temp_d = (Int4Ptr )MemNew((m+n+1)*sizeof(Int4));
1418 script = (struct edit **)MemNew((m+n+1)*sizeof(struct edit *));
1419 temp_script = (struct edit **)MemNew((m+n+1)*sizeof(struct edit *));
1420
1421 for (k=0; k<=m+n; script[k] = NULL, last_d[k++]=-1);
1422 last_d[ORIGIN] = row;
1423
1424 lower = ORIGIN - 1;
1425 upper = ORIGIN + 1;
1426
1427 for (d = 1; d <= max_d ; ++d) {
1428
1429 /* for each relevant diagonal ... */
1430 for (k = lower; k <= upper; k++) {
1431 /* get space for the next edit instruction */
1432 new = (struct edit *) MemNew(sizeof(struct edit));
1433 link_to_data_list((Pointer)new, &data_list, &prev);
1434
1435 /* find a d on diagonal k */
1436 if (k==-d+ORIGIN) {
1437
1438 /* move down from the last d-1 on diagonal k+1 */
1439 row = last_d[k+1]+1;
1440 new->link = script[k+1];
1441 new->op = DEL;
1442 } else if (k==d+ORIGIN) {
1443
1444 /* move right from the last d-1 on diagonal k-1 */
1445 row = last_d[k-1];
1446 new->link = script[k-1];
1447 new->op = INS;
1448 } else if ((last_d[k]>=last_d[k+1]) &&
1449 (last_d[k]+1>=last_d[k-1])) {
1450
1451 /* substitution */
1452 row = last_d[k]+1;
1453 new->link = script[k];
1454 new->op = SUB;
1455 } else if ((last_d[k+1]+1>=last_d[k-1]) &&
1456 (last_d[k+1]>=last_d[k])) {
1457
1458 /* move down from the last d-1 on diagonal k+1 */
1459 row = last_d[k+1]+1;
1460 new->link = script[k+1];
1461 new->op = DEL;
1462 } else {
1463
1464 /* move right from the last d-1 on diagonal k-1 */
1465 row = last_d[k-1];
1466 new->link = script[k-1];
1467 new->op = INS;
1468 }
1469
1470 /* code common to the three cases */
1471 new->line1 = row;
1472 new->line2 = col = row + k - ORIGIN;
1473 temp_script[k] = new;
1474
1475 /* slide down the diagonal */
1476 if (row>=0)
1477 while (row < m && col < n && (s1[row]==s2[col])) {
1478 ++row;
1479 ++col;
1480 }
1481 temp_d[k] = row;
1482
1483 if (row == m && col == n) {
1484 /* hit southeast corner; have the answer */
1485 script[k] = temp_script[k];
1486 bno = fput_scr(script[k],offset1,offset2,m,n,data->b_offset, sgp);
1487 MemFree(last_d); MemFree(temp_d);
1488
1489 MemFree(script);
1490 MemFree(temp_script);
1491 ValNodeFreeData(data_list);
1492
1493 return bno;
1494 }
1495 if (row == m) {
1496 /* hit last row; don't look further */
1497 script[k] = temp_script[k];
1498
1499 /* record first INS */
1500 new = (struct edit *) MemNew(sizeof(struct edit));
1501 link_to_data_list((Pointer)new, &data_list, &prev);
1502 new->line1 = m;
1503 new->line2 = col = m + (++k) - ORIGIN;
1504 new->op = INS;
1505 new->link = script[k-1];
1506 script[k] = new;
1507
1508 /* record last INS */
1509 new = (struct edit *) MemNew(sizeof(struct edit));
1510 link_to_data_list((Pointer)new, &data_list, &prev);
1511 new->line1 = m;
1512 new->line2 = n;
1513 new->op = INS;
1514 new->link = script[k];
1515 script[ORIGIN + DELTA] = new;
1516
1517 MemFree(last_d); MemFree(temp_d);
1518 bno = fput_scr(script[ORIGIN+DELTA],offset1,offset2,m,n,data->b_offset, sgp);
1519
1520 MemFree(script);
1521 MemFree(temp_script);
1522 ValNodeFreeData(data_list);
1523
1524 return bno;
1525 }
1526
1527 if (col == n) {
1528 /* hit last column; don't look further */
1529 script[k] = temp_script[k];
1530
1531 /* record first DEL */
1532 new = (struct edit *) MemNew(sizeof(struct edit));
1533 link_to_data_list((Pointer)new, &data_list, &prev);
1534 new->line1 = n - (--k) + ORIGIN;
1535 new->line2 = n;
1536 new->op = DEL;
1537 new->link = script[k+1];
1538 script[k] = new;
1539
1540 /* record last DEL */
1541 new = (struct edit *) MemNew(sizeof(struct edit));
1542 link_to_data_list((Pointer)new, &data_list, &prev);
1543 new->line1 = m;
1544 new->line2 = n;
1545 new->op = DEL;
1546 new->link = script[k];
1547 script[ORIGIN] = new;
1548
1549 MemFree(last_d); MemFree(temp_d);
1550 bno = fput_scr(script[ORIGIN+DELTA],offset1,offset2,m,n,data->b_offset, sgp);
1551 MemFree(script);
1552 MemFree(temp_script);
1553 ValNodeFreeData(data_list);
1554 return bno;
1555 }
1556
1557
1558 }
1559 for (k=lower; k<=upper; k++) {
1560 script[k] = temp_script[k];
1561 last_d[k] = temp_d[k];
1562 }
1563
1564 --lower;
1565 ++upper;
1566 }
1567 ValNodeFreeData(data_list);
1568 MemFree(last_d);
1569 MemFree(temp_d);
1570 MemFree(script);
1571 MemFree(temp_script);
1572
1573 return bno=0;
1574 }
1575
free_2D_Int(Int4 ** val,Int4 size)1576 static void free_2D_Int(Int4 ** val, Int4 size)
1577 {
1578 Int4 i;
1579
1580 for(i = 0; i<size; ++i)
1581 MemFree(val[i]);
1582 MemFree(val);
1583 }
1584
free_this_script(struct edit *** val,Int4 size)1585 static void free_this_script (struct edit ***val, Int4 size)
1586 {
1587 Int4 i;
1588
1589 for(i = 0; i<size; ++i)
1590 MemFree(val[i]);
1591 MemFree(val);
1592 }
1593
greedy(Int1 * s1,Int1 * s2,Int4 m,Int4 n,Int4 offset1,Int4 offset2,Int4Ptr * first,Int4Ptr * last,Int4Ptr dist,data_t * data,SIM4GlobalPtr sgp)1594 static Int4 greedy(Int1 *s1, Int1 *s2, Int4 m, Int4 n,
1595 Int4 offset1, Int4 offset2,
1596 Int4Ptr *first, Int4Ptr *last, Int4Ptr dist, data_t *data, SIM4GlobalPtr sgp)
1597 {
1598 Int4 col, /* column number */
1599 d, /* current distance */
1600 k, /* current diagonal */
1601 max_d, /* bound on size of edit script */
1602 bno=0, /* returned number of blocks */
1603 Cost,
1604 blower,flower, /* boundaries for searching diagonals */
1605 bupper,fupper,
1606 row, /* row number */
1607 DELTA, /* n-m */
1608 ORIGIN;
1609 Int4 back, forth; /* backward and forward limits at exit */
1610
1611 Int4 **blast_d, **flast_d; /* rows containing the last d */
1612 Int4 *min_row, *min_diag, /* MIN (b)/ MAX (f) row (and diagonal) */
1613 *max_row, *max_diag; /* reached for cost d=0, ... m. */
1614
1615 struct edit ***bscript, /* corresponding edit scripts */
1616 ***fscript;
1617 struct edit *new;
1618 ValNodePtr data_list = NULL, prev = NULL;
1619
1620 DELTA = n-m;
1621 max_d = m+1;
1622 if (DELTA<0)
1623 { /* if (abs(DELTA) > MAX((Int4)(ALPHA*DEFAULT_W),(Int4)(CUTOFF*n)))
1624 fprintf(stderr,"\nWarning: data do not support proper segmentation!\n\n");
1625 */
1626
1627 /* record block, anyways */
1628 sgp->Blist[data->b_offset].init_pos1 = offset1+1;
1629 sgp->Blist[data->b_offset].init_pos2 = offset2+1;
1630 sgp->Blist[data->b_offset].end_pos1 = offset1+m;
1631 sgp->Blist[data->b_offset].end_pos2 = offset2+n;
1632 **last = **first = data->b_offset;
1633
1634 *dist = -1;
1635 return ++bno;
1636 }
1637
1638 ORIGIN = m;
1639 for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--);
1640
1641 if (row == 0) {
1642 /* hit last row; stop search */
1643 sgp->Blist[data->b_offset].init_pos1 = offset1+1;
1644 sgp->Blist[data->b_offset].end_pos1 = offset1+m;
1645 sgp->Blist[data->b_offset].init_pos2 = offset2-m+n+1;
1646 sgp->Blist[data->b_offset].end_pos2 = offset2+n;
1647 **last = **first = data->b_offset;
1648
1649 *dist = 0;
1650 return ++bno;
1651 }
1652
1653 blast_d = (Int4Ptr *)MemNew((m+1)*sizeof(Int4Ptr ));
1654 bscript = (struct edit ***)MemNew((m+1)*sizeof(struct edit **));
1655 for (d=0; d<=m; d++) {
1656 blast_d[d] = (Int4Ptr )MemNew((m+n+1)*sizeof(Int4));
1657 bscript[d] = (struct edit **)MemNew((m+n+1)*sizeof(struct edit *));
1658 }
1659
1660 for (d=0; d<=m; d++)
1661 for (k=0; k<=m+n; blast_d[d][k++]=m+1);
1662 blast_d[0][ORIGIN+DELTA] = row;
1663 bscript[0][ORIGIN+DELTA] = NULL;
1664
1665 blower = ORIGIN + DELTA - 1;
1666 bupper = ORIGIN + DELTA + 1;
1667
1668 for (row=0; row<n && row<m && (s1[row]==s2[row]); row++);
1669
1670 if (row == m) {
1671 /* hit last row; stop search */
1672 sgp->Blist[data->b_offset].init_pos1 = offset1+1;
1673 sgp->Blist[data->b_offset].end_pos1 = offset1+m;
1674 sgp->Blist[data->b_offset].init_pos2 = offset2+1;
1675 sgp->Blist[data->b_offset].end_pos2 = offset2+m;
1676 **last = **first = data->b_offset;
1677
1678 *dist = 0;
1679 free_this_script (bscript, m+1);
1680 free_2D_Int(blast_d, m+1);
1681
1682 return ++bno;
1683 }
1684
1685 flast_d = (Int4Ptr *)MemNew((m+1)*sizeof(Int4Ptr ));
1686 fscript = (struct edit ***)MemNew((m+1)*sizeof(struct edit **));
1687 for (d=0; d<=m; d++) {
1688 flast_d[d] = (Int4Ptr )MemNew((m+n+1)*sizeof(Int4));
1689 fscript[d] = (struct edit **)MemNew((m+n+1)*sizeof(struct edit *));
1690 }
1691
1692
1693 for (d=0; d<=m; d++)
1694 for (k=0; k<=m+n; flast_d[d][k++]=-1);
1695 flast_d[0][ORIGIN] = row;
1696 fscript[0][ORIGIN] = NULL;
1697
1698 flower = ORIGIN - 1;
1699 fupper = ORIGIN + 1;
1700
1701 max_row = (Int4Ptr )MemNew((m+1)*sizeof(Int4));
1702 min_row = (Int4Ptr )MemNew((m+1)*sizeof(Int4));
1703 max_diag = (Int4Ptr )MemNew((m+1)*sizeof(Int4));
1704 min_diag = (Int4Ptr )MemNew((m+1)*sizeof(Int4));
1705
1706 for (d=1; d<=m; d++) {
1707 min_row[d] = m+1;
1708 max_row[d] = -1;
1709 }
1710 min_row[0] = blast_d[0][ORIGIN+DELTA];
1711 min_diag[0] = ORIGIN+DELTA;
1712 max_row[0] = flast_d[0][ORIGIN];
1713 max_diag[0] = ORIGIN;
1714
1715 d = 1;
1716 while (d <= max_d) {
1717
1718 /* for each relevant diagonal ... */
1719 for (k = blower; k <= bupper; k++) {
1720 /* get space for the next edit instruction */
1721 new = (struct edit *)MemNew(sizeof(struct edit));
1722 link_to_data_list((Pointer)new, &data_list, &prev);
1723
1724 /* find a d on diagonal k */
1725 if (k==-d+DELTA+ORIGIN) {
1726
1727 /* move left from the last d-1 on diagonal k+1 */
1728 row = blast_d[d-1][k+1];
1729 new->link = bscript[d-1][k+1];
1730 new->op = INS;
1731 }
1732 else if (k==d+DELTA+ORIGIN) {
1733
1734 /* move up from the last d-1 on diagonal k-1 */
1735 row = blast_d[d-1][k-1]-1;
1736 new->link = bscript[d-1][k-1];
1737 new->op = DEL;
1738 } else if ((blast_d[d-1][k]<=blast_d[d-1][k+1]) &&
1739 (blast_d[d-1][k]-1<=blast_d[d-1][k-1])) {
1740
1741 /* substitution */
1742 row = blast_d[d-1][k]-1;
1743 new->link = bscript[d-1][k];
1744 new->op = SUB;
1745
1746 } else if ((blast_d[d-1][k-1]<=blast_d[d-1][k+1]-1) &&
1747 (blast_d[d-1][k-1]<=blast_d[d-1][k]-1)) {
1748 /* move right from the last d-1 on diagonal k-1 */
1749 row = blast_d[d-1][k-1]-1;
1750 new->link = bscript[d-1][k-1];
1751 new->op = DEL;
1752 } else {
1753 /* move left from the last d-1 on diagonal k+1 */
1754 row = blast_d[d-1][k+1];
1755 new->link = bscript[d-1][k+1];
1756 new->op = INS;
1757 }
1758 /* code common to the three cases */
1759 new->line1 = row;
1760 new->line2 = col = row + k - ORIGIN;
1761 bscript[d][k] = new;
1762
1763 /* slide up the diagonal */
1764 while (row > 0 && col > 0 && (s1[row-1]==s2[col-1])) {
1765 --row;
1766 --col;
1767 }
1768 blast_d[d][k] = row;
1769
1770 if (row == 0 && col == 0)
1771 if (d <= max_d) {
1772 /* hit northwest corner; have the answer */
1773 bno = bput_scr(bscript[d][k],m,n,offset1,offset2,0,0,data->b_offset, sgp);
1774 **first = data->b_offset+bno-1;
1775 **last = data->b_offset;
1776
1777 /* for (Cost=0; Cost<=m; Cost++) {
1778 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
1779 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
1780 } */
1781 free_this_script (bscript, m+1);
1782 free_this_script (fscript, m+1);
1783 free_2D_Int(blast_d, m+1);
1784 free_2D_Int(flast_d, m+1);
1785
1786 MemFree(min_row); MemFree(min_diag);
1787 MemFree(max_row); MemFree(max_diag);
1788 ValNodeFreeData(data_list);
1789
1790 *dist = d;
1791 return bno;
1792 } else goto fin;
1793
1794 if (row == 0)
1795 /* hit last row; fill in insertions */
1796 if (d <= max_d) {
1797 new = (struct edit *) MemNew(sizeof(struct edit));
1798 link_to_data_list((Pointer)new, &data_list, &prev);
1799 new->line1 = 0;
1800 new->line2 = col = --k - ORIGIN;
1801 new->op = INS;
1802 new->link = bscript[d][k+1];
1803 bscript[d][k] = new;
1804
1805 /* record last INS */
1806 new = (struct edit *)MemNew(sizeof(struct edit));
1807 link_to_data_list((Pointer)new, &data_list, &prev);
1808 new->line1 = 0;
1809 new->line2 = col = 0;
1810 new->op = INS;
1811 new->link = bscript[d][k];
1812
1813 bno = bput_scr(new,m,n,offset1,offset2,0,0,data->b_offset, sgp);
1814 **first = data->b_offset+bno-1;
1815 **last = data->b_offset;
1816
1817 /* for (Cost = 0; Cost<=m; Cost++) {
1818 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
1819 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
1820 }
1821 MemFree(min_row); MemFree(min_diag);
1822 MemFree(max_row); MemFree(max_diag);*/
1823 free_this_script (bscript, m+1);
1824 free_this_script (fscript, m+1);
1825 free_2D_Int(blast_d, m+1);
1826 free_2D_Int(flast_d, m+1);
1827
1828 MemFree(min_row); MemFree(min_diag);
1829 MemFree(max_row); MemFree(max_diag);
1830 ValNodeFreeData(data_list);
1831
1832
1833 *dist = d;
1834 return bno;
1835
1836 } else goto fin;
1837
1838 if (col == 0)
1839 if (d <= max_d) {
1840 /* hit last column; fill in deletions */
1841 /* record first DEL */
1842 new = (struct edit *) MemNew(sizeof(struct edit));
1843 link_to_data_list((Pointer)new, &data_list, &prev);
1844 new->line1 = -(++k) + ORIGIN;
1845 new->line2 = 0;
1846 new->op = DEL;
1847 new->link = bscript[d][k-1];
1848 bscript[d][k] = new;
1849
1850 /* record last DEL */
1851 new = (struct edit *)MemNew(sizeof(struct edit));
1852 link_to_data_list((Pointer)new, &data_list, &prev);
1853 new->line1 = 0;
1854 new->line2 = 0;
1855 new->op = DEL;
1856 new->link = bscript[d][k];
1857
1858 bno = bput_scr(new,m,n,offset1,offset2,0,0,data->b_offset, sgp);
1859 **first = data->b_offset+bno-1;
1860 **last = data->b_offset;
1861
1862 /* for (Cost = 0; Cost<=m; Cost++) {
1863 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
1864 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
1865 }
1866 MemFree(min_row); MemFree(min_diag);
1867 MemFree(max_row); MemFree(max_diag);*/
1868
1869 free_this_script (bscript, m+1);
1870 free_this_script (fscript, m+1);
1871 free_2D_Int(blast_d, m+1);
1872 free_2D_Int(flast_d, m+1);
1873
1874 MemFree(min_row); MemFree(min_diag);
1875 MemFree(max_row); MemFree(max_diag);
1876 ValNodeFreeData(data_list);
1877
1878
1879 *dist = d;
1880 return bno;
1881 } else goto fin;
1882
1883 } /* for k */
1884
1885 min_row[d] = blast_d[d][DELTA+ORIGIN];
1886 min_diag[d] = DELTA+ORIGIN;
1887 for (k=blower; k<=bupper; ++k)
1888 if (blast_d[d][k]<min_row[d]) {
1889 min_row[d] = blast_d[d][k];
1890 min_diag[d] = k;
1891 }
1892
1893 /* record cell, if paths overlap with minimum combined cost */
1894 /* obs: it suffices to search up to Cost=min(d-1,(max_d-d)) */
1895 for (Cost=0; Cost<d; Cost++)
1896 if ((min_row[d]<=max_row[Cost]) &&
1897 (max_d > d+Cost)) {
1898 max_d = d+Cost;
1899 back = d;
1900 forth = Cost;
1901 break;
1902 }
1903
1904 --blower;
1905 ++bupper;
1906
1907
1908 /* for each relevant diagonal ... */
1909 for (k = flower; k <= fupper; k++) {
1910 /* get space for the next edit instruction */
1911 new = (struct edit *) MemNew(sizeof(struct edit));
1912 link_to_data_list((Pointer)new, &data_list, &prev);
1913
1914 /* find a d on diagonal k */
1915 if (k==-d+ORIGIN) {
1916 /* move down from the last d-1 on diagonal k+1 */
1917 row = flast_d[d-1][k+1]+1;
1918 new->link = fscript[d-1][k+1];
1919 new->op = DEL;
1920
1921 } else if (k==d+ORIGIN) {
1922 /* move right from the last d-1 on diagonal k-1 */
1923 row = flast_d[d-1][k-1];
1924 new->link = fscript[d-1][k-1];
1925 new->op = INS;
1926
1927 } else if ((flast_d[d-1][k]>=flast_d[d-1][k+1]) &&
1928 (flast_d[d-1][k]+1>=flast_d[d-1][k-1])) {
1929
1930 /* substitution */
1931 row = flast_d[d-1][k]+1;
1932 new->link = fscript[d-1][k];
1933 new->op = SUB;
1934
1935 } else if ((flast_d[d-1][k+1]+1>=flast_d[d-1][k-1]) &&
1936 (flast_d[d-1][k+1]>=flast_d[d-1][k])) {
1937
1938 /* move left from the last d-1 on diagonal k+1 */
1939 row = flast_d[d-1][k+1]+1;
1940 new->link = fscript[d-1][k+1];
1941 new->op = DEL;
1942 } else {
1943 /* move right from the last d-1 on diagonal k-1 */
1944 row = flast_d[d-1][k-1];
1945 new->link = fscript[d-1][k-1];
1946 new->op = INS;
1947
1948 }
1949 /* code common to the three cases */
1950 new->line1 = row;
1951 new->line2 = col = row + k - ORIGIN;
1952 fscript[d][k] = new;
1953
1954 /* slide down the diagonal */
1955 if (row>=0)
1956 while (row < m && col < n && (s1[row]==s2[col])) {
1957 ++row;
1958 ++col;
1959 }
1960 flast_d[d][k] = row;
1961
1962 if (row == m && col == n)
1963 if (d<=max_d) {
1964 /* hit southeast corner; have the answer */
1965 bno = fput_scr(fscript[d][k],offset1,offset2,m,n,data->b_offset, sgp);
1966
1967 **first = data->b_offset;
1968 **last = data->b_offset+bno-1;
1969
1970 /* for (Cost = 0; Cost<=m; Cost++) {
1971 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
1972 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
1973 }
1974 MemFree(min_row); MemFree(min_diag);
1975 MemFree(max_row); MemFree(max_diag); */
1976
1977 free_this_script (bscript, m+1);
1978 free_this_script (fscript, m+1);
1979 free_2D_Int(blast_d, m+1);
1980 free_2D_Int(flast_d, m+1);
1981
1982 MemFree(min_row); MemFree(min_diag);
1983 MemFree(max_row); MemFree(max_diag);
1984 ValNodeFreeData(data_list);
1985
1986
1987 *dist = d;
1988 return bno;
1989 } else goto fin;
1990
1991 if (row == m)
1992 if (d<=max_d) {
1993 /* hit last row; don't look forward */
1994 /* record first INS */
1995 new = (struct edit *)MemNew(sizeof(struct edit));
1996 link_to_data_list((Pointer)new, &data_list, &prev);
1997 new->line1 = m;
1998 new->line2 = col = m + (++k) - ORIGIN;
1999 new->op = INS;
2000 new->link = fscript[d][k-1];
2001 fscript[d][k] = new;
2002
2003 /* record last INS */
2004 new = (struct edit *)MemNew(sizeof(struct edit));
2005 link_to_data_list((Pointer)new, &data_list, &prev);
2006 new->line1 = m;
2007 new->line2 = n;
2008 new->op = INS;
2009 new->link = fscript[d][k];
2010
2011 bno = fput_scr(new,offset1,offset2,m,n,data->b_offset, sgp);
2012 **first = data->b_offset;
2013 **last = data->b_offset+bno-1;
2014
2015 /* for (Cost = 0; Cost<=m; Cost++) {
2016 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
2017 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
2018 }
2019 MemFree(min_row); MemFree(min_diag);
2020 MemFree(max_row); MemFree(max_diag); */
2021
2022 free_this_script (bscript, m+1);
2023 free_this_script (fscript, m+1);
2024 free_2D_Int(blast_d, m+1);
2025 free_2D_Int(flast_d, m+1);
2026
2027 MemFree(min_row); MemFree(min_diag);
2028 MemFree(max_row); MemFree(max_diag);
2029 ValNodeFreeData(data_list);
2030
2031
2032 *dist = d;
2033 return bno;
2034
2035 } else goto fin;
2036
2037 if (col == n)
2038 /* hit last column; don't look forward */
2039 if (d<=max_d) {
2040 /* record first DEL */
2041 new = (struct edit *)MemNew(sizeof(struct edit));
2042 link_to_data_list((Pointer)new, &data_list, &prev);
2043 new->line1 = n - (--k) + ORIGIN;
2044 new->line2 = n;
2045 new->op = DEL;
2046 new->link = fscript[d][k+1];
2047 fscript[d][k] = new;
2048
2049 /* record last DEL */
2050 new = (struct edit *)MemNew(sizeof(struct edit));
2051 link_to_data_list((Pointer)new, &data_list, &prev);
2052 new->line1 = m;
2053 new->line2 = n;
2054 new->op = DEL;
2055 new->link = fscript[d][k];
2056
2057 bno = fput_scr(new,offset1,offset2,m,n,data->b_offset, sgp);
2058 **first = data->b_offset;
2059 **last = data->b_offset+bno-1;
2060
2061 /* for (Cost = 0; Cost<=m; Cost++) {
2062 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
2063 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
2064 }
2065 MemFree(min_row); MemFree(min_diag);
2066 MemFree(max_row); MemFree(max_diag); */
2067
2068 free_this_script (bscript, m+1);
2069 free_this_script (fscript, m+1);
2070 free_2D_Int(blast_d, m+1);
2071 free_2D_Int(flast_d, m+1);
2072
2073 MemFree(min_row); MemFree(min_diag);
2074 MemFree(max_row); MemFree(max_diag);
2075 ValNodeFreeData(data_list);
2076
2077
2078 *dist = d;
2079 return bno;
2080 } else goto fin;
2081 } /* for k */
2082
2083
2084 max_row[d] = flast_d[d][ORIGIN];
2085 max_diag[d] = ORIGIN;
2086 for (k=flower; k<=fupper; ++k)
2087 if (flast_d[d][k]>max_row[d]) {
2088 max_row[d] = flast_d[d][k];
2089 max_diag[d] = k;
2090 }
2091
2092 /* record backward and forward limits, if minimum combined
2093 * cost in overlapping. Note: it suffices to search up to
2094 * Cost=min(d,(max_d-d)).
2095 */
2096 for (Cost=0; Cost<=d; Cost++)
2097 if ((min_row[Cost]<=max_row[d]) &&
2098 (max_d > d+Cost)) {
2099 max_d = d+Cost;
2100 back = Cost;
2101 forth = d;
2102 break;
2103 }
2104 --flower;
2105 ++fupper;
2106
2107 ++d; /* for d */
2108 }
2109
2110
2111 fin:
2112 if (m-min_row[back]>=max_row[forth]) {
2113 bno = bput_scr(bscript[back][min_diag[back]],m,n,
2114 offset1,offset2,
2115 min_row[back],
2116 min_row[back]+min_diag[back]-ORIGIN,
2117 data->b_offset, sgp);
2118 **first = (bno) ? data->b_offset+bno-1 : data->b_offset+bno;
2119 col = fput_scr(fscript[forth][max_diag[forth]],
2120 offset1,offset2,
2121 min_row[back],
2122 min_row[back]+max_diag[forth]-ORIGIN,
2123 data->b_offset+bno, sgp);
2124 } else {
2125 bno = bput_scr(bscript[back][min_diag[back]],m,n,
2126 offset1,offset2,
2127 max_row[forth],
2128 max_row[forth]+min_diag[back]-ORIGIN,
2129 data->b_offset, sgp);
2130 **first = (bno) ? data->b_offset+bno-1 : data->b_offset+bno;
2131 col = fput_scr(fscript[forth][max_diag[forth]],
2132 offset1,offset2,
2133 max_row[forth],
2134 max_row[forth]+max_diag[forth]-ORIGIN,
2135 data->b_offset+bno, sgp);
2136 }
2137 if (bno) sgp->Blist[data->b_offset].next = data->b_offset+bno;
2138 **last = data->b_offset+bno+col-1;
2139
2140 /* for (Cost=0; Cost<=m; Cost++) {
2141 MemFree(blast_d[Cost]); MemFree(flast_d[Cost]);
2142 MemFree(bscript[Cost]); MemFree(fscript[Cost]);
2143 }
2144 MemFree(min_row); MemFree(min_diag);
2145 MemFree(max_row); MemFree(max_diag); */
2146
2147 free_this_script (bscript, m+1);
2148 free_this_script (fscript, m+1);
2149 free_2D_Int(blast_d, m+1);
2150 free_2D_Int(flast_d, m+1);
2151
2152 MemFree(min_row); MemFree(min_diag);
2153 MemFree(max_row); MemFree(max_diag);
2154 ValNodeFreeData(data_list);
2155
2156
2157 *dist = back+forth;
2158 return bno+col;
2159
2160 }
2161
2162
bput_scr(struct edit * start,Int4 len1,Int4 len2,Int4 offset1,Int4 offset2,Int4 row,Int4 col,Int4 offset,SIM4GlobalPtr sgp)2163 static Int4 bput_scr(struct edit *start, Int4 len1, Int4 len2,
2164 Int4 offset1, Int4 offset2, Int4 row, Int4 col,
2165 Int4 offset, SIM4GlobalPtr sgp)
2166 { struct edit *eptr;
2167 Int4 bno=0, ppos1, ppos2, i;
2168
2169 eptr = start;
2170 ppos1 = row; ppos2 = col-1;
2171
2172 while (eptr != NULL) {
2173 if (ppos1>eptr->line1) /* not used in this version */
2174 eptr = eptr->link;
2175 else
2176 switch (eptr->op) {
2177 case DEL :
2178 case SUB :
2179 eptr = eptr->link;
2180 break;
2181 case INS :
2182 if (ppos1==eptr->line1) {
2183 ppos2 = eptr->line2;
2184 } else {
2185 sgp->Blist[bno+offset].init_pos1 = ppos1+offset1+1;
2186 sgp->Blist[bno+offset].end_pos1 = eptr->line1-1+offset1+1;
2187 sgp->Blist[bno+offset].init_pos2 = ppos2+1+offset2+1;
2188 sgp->Blist[bno+offset].end_pos2 = eptr->line2-1+offset2+1;
2189 ppos1 = eptr->line1;
2190 ppos2 = eptr->line2;
2191 bno++;
2192 }
2193 eptr=eptr->link;
2194 }
2195 }
2196 if ((ppos1!=len1) || (ppos2+1!=len2)) {
2197
2198 sgp->Blist[bno+offset].init_pos1 = ppos1+offset1+1;
2199 sgp->Blist[bno+offset].end_pos1 = len1-1+offset1+1;
2200 sgp->Blist[bno+offset].init_pos2 = ppos2+1+offset2+1;
2201 sgp->Blist[bno+offset].end_pos2 = len2-1+offset2+1;
2202 bno++;
2203 }
2204 for (i=offset; i<bno+offset; i++)
2205 sgp->Blist[i+1].next = i;
2206
2207 return bno;
2208 }
2209
fput_scr(struct edit * start,Int4 offset1,Int4 offset2,Int4 row,Int4 col,Int4 offset,SIM4GlobalPtr sgp)2210 static Int4 fput_scr(struct edit *start, Int4 offset1, Int4 offset2,
2211 Int4 row, Int4 col, Int4 offset, SIM4GlobalPtr sgp)
2212 { struct edit *eptr;
2213 Int4 bno=0, ppos1, ppos2, i;
2214
2215 eptr = start;
2216 ppos1 = row; ppos2 = col+1;
2217
2218 while (eptr != NULL) {
2219 if (ppos1 < eptr->line1) /* not used in this version */
2220 eptr = eptr->link;
2221 else
2222 switch (eptr->op) {
2223 case DEL :
2224 case SUB :
2225 eptr = eptr->link;
2226 break;
2227 case INS :
2228 if (ppos1==eptr->line1) {
2229 ppos2 = eptr->line2;
2230 } else {
2231 sgp->Blist[bno+offset].init_pos1 = eptr->line1+1+offset1;
2232 sgp->Blist[bno+offset].end_pos1 = ppos1+offset1;
2233 sgp->Blist[bno+offset].init_pos2 = eptr->line2+1+offset2;
2234 sgp->Blist[bno+offset].end_pos2 = ppos2-1+offset2;
2235 ppos1 = eptr->line1;
2236 ppos2 = eptr->line2;
2237 bno++;
2238 }
2239 eptr=eptr->link;
2240 }
2241 }
2242 if (ppos1!=0) { /* not a final insert */
2243 sgp->Blist[bno+offset].init_pos1 = 1+offset1;
2244 sgp->Blist[bno+offset].end_pos1 = ppos1+offset1;
2245 sgp->Blist[bno+offset].init_pos2 = 1+offset2;
2246 sgp->Blist[bno+offset].end_pos2 = ppos2-1+offset2;
2247 bno++;
2248 }
2249
2250 for (i=offset; i<bno+offset; i++)
2251 sgp->Blist[i].next = i+1;
2252
2253 return bno;
2254 }
2255
path(Int4 i1,Int4 j1,Int1 type1,Int4 i2,Int4 j2,Int1 type2,Int4 dist,EditScriptPtr PNTR head,EditScriptPtr PNTR tail,data_t * data)2256 static void path(Int4 i1, Int4 j1, Int1 type1, Int4 i2, Int4 j2, Int1 type2, Int4 dist,
2257 EditScriptPtr PNTR head, EditScriptPtr PNTR tail, data_t *data)
2258 {
2259 Int4Ptr SS, DD, II; /* Forward vectors */
2260 Int4Ptr RS, RD, RI; /* Backward vectors */
2261
2262 EditScriptPtr head1, tail1, head2, tail2;
2263 Int4 midc, rmidc;
2264 Int4 start, lower, upper;
2265 Int4 rstart, rlower, rupper;
2266 Int4 c, k, t1, t2, t;
2267 Int4 maxint;
2268 Int4 mi, mj, mtype;
2269 Int1 flag;
2270
2271 /* Boundary cases */
2272 if (i1 == i2) {
2273 if (j1 == j2) *head = NULL;
2274 else {
2275 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2276 head1->op_type = INS;
2277 head1->num = j2-j1;
2278 head1->next = NULL;
2279 *head = *tail = head1;
2280 }
2281 return;
2282 }
2283
2284 if (j1 == j2) {
2285 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2286 head1->op_type = DEL;
2287 head1->num = i2-i1;
2288 head1->next = NULL;
2289 *head = *tail = head1;
2290 return;
2291 }
2292
2293 if (dist <= 1) {
2294 if (j2-i2 == j1-i1) {
2295 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2296 head1->op_type = SUB;
2297 head1->num = i2-i1;
2298 head1->next = NULL;
2299 *head = *tail = head1;
2300 } else if (j2-i2 > j1-i1) {
2301 if (type1 == INS) {
2302 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2303 head1->op_type = INS;
2304 head1->num = 1;
2305 head2 = (EditScriptPtr) MemNew(sizeof(EditScript));
2306 head2->op_type = SUB;
2307 head2->num = i2-i1;
2308 } else {
2309 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2310 head1->op_type = SUB;
2311 head1->num = i2-i1;
2312 head2 = (EditScriptPtr) MemNew(sizeof(EditScript));
2313 head2->op_type = INS;
2314 head2->num = 1;
2315 }
2316 head1->next = head2;
2317 head2->next = NULL;
2318 *head = head1;
2319 *tail = head2;
2320 } else if (j2-i2 < j1-i1) {
2321 if (type1 == DEL) {
2322 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2323 head1->op_type = DEL;
2324 head1->num = 1;
2325 head2 = (EditScriptPtr) MemNew(sizeof(EditScript));
2326 head2->op_type = SUB;
2327 head2->num = j2-j1;
2328 } else {
2329 head1 = (EditScriptPtr) MemNew(sizeof(EditScript));
2330 head1->op_type = SUB;
2331 head1->num = j2-j1;
2332 head2 = (EditScriptPtr) MemNew(sizeof(EditScript));
2333 head2->op_type = DEL;
2334 head2->num = 1;
2335 }
2336 head1->next = head2;
2337 head2->next = NULL;
2338 *head = head1;
2339 *tail = head2;
2340 }
2341 return;
2342 }
2343
2344 /* Divide the problem at the middle cost */
2345 midc = dist/2;
2346 rmidc = dist - midc;
2347
2348 /* Compute the boundary diagonals */
2349 start = j1 - i1;
2350 lower = MAX(j1-i2, start-midc);
2351 upper = MIN(j2-i1, start+midc);
2352 rstart = j2-i2;
2353 rlower = MAX(j1-i2, rstart-rmidc);
2354 rupper = MIN(j2-i1, rstart+rmidc);
2355
2356 /* Allocate space for forward vectors */
2357 SS = (Int4Ptr )MemNew((upper-lower+1)*sizeof(Int4)) - lower;
2358 DD = (Int4Ptr )MemNew((upper-lower+2)*sizeof(Int4)) - lower;
2359 II = (Int4Ptr )MemNew((upper-lower+2)*sizeof(Int4)) - lower + 1;
2360
2361 /* Forward computation */
2362 for (k=lower; k<=upper; ++k) SS[k] = MININT;
2363 for (k=lower; k<=upper+1; ++k) DD[k] = MININT;
2364 for (k=lower-1; k<=upper; ++k) II[k] = MININT;
2365 if (type1 == SUB) SS[start] = snake(start, i1, i2, j2, data);
2366 else if (type1 == DEL) {
2367 DD[start] = i1;
2368 SS[start] = snake(start, i1, i2, j2, data);
2369 } else {
2370 II[start] = i1;
2371 SS[start] = snake(start, i1, i2, j2, data);
2372 }
2373
2374 for (c = 1; c <= midc; ++c) {
2375 t = MAX(lower, start-c);
2376 t1 = II[t-1];
2377 for (k = t; k <= MIN(upper, start + c); ++k) {
2378 t2 = II[k];
2379 II[k] = MAX(t1, SS[k]);
2380 t1 = t2;
2381 DD[k] = MAX(DD[k + 1] + 1, SS[k]);
2382 SS[k] = snake(k, MIN(j2 - k, MAX(MAX(SS[k] + 1, II[k]), DD[k])), i2, j2, data);
2383 }
2384 }
2385
2386 /* Allocate space for backward vectors */
2387 RS = (Int4Ptr )MemNew((rupper-rlower+1)*sizeof(Int4)) - rlower;
2388 RD = (Int4Ptr )MemNew((rupper-rlower+2)*sizeof(Int4)) - rlower + 1;
2389 RI = (Int4Ptr )MemNew((rupper-rlower+2)*sizeof(Int4)) - rlower;
2390
2391 /* Backward computation */
2392 maxint = i2 + dist + data->N;
2393 for(k=rlower; k<=rupper; ++k) RS[k] = maxint;
2394 for(k=rlower-1; k<=rupper; ++k) RD[k] = maxint;
2395 for(k=rlower; k<=rupper+1; ++k) RI[k] = maxint;
2396 if(type2 == SUB)
2397 RI[rstart] = RD[rstart] = RS[rstart] = rsnake(rstart, i2, i1, j1, data);
2398 else if (type2 == DEL) RD[rstart] = i2;
2399 else RI[rstart] = i2;
2400
2401 for(c = 1; c <= rmidc; ++c) {
2402 t = MAX(rlower, rstart - c);
2403 t1 = RD[t - 1];
2404 for(k = t; k <= MIN(rupper, rstart + c); ++k) {
2405 RS[k] = rsnake(k, MAX(j1 - k, MIN(MIN(RS[k] - 1, RD[k]), RI[k])), i1, j1, data);
2406 t2 = RD[k];
2407 RD[k] = MIN(t1 - 1, RS[k]);
2408 t1 = t2;
2409 RI[k] = MIN(RI[k + 1], RS[k]);
2410 }
2411 }
2412
2413 /* Find (mi, mj, mtype) such that
2414 the distance from (i1, j1, type1) to (mi, mj, mtype) is midc
2415 and the distance from (mi, mj, mtype) to (i2, j2, type2) is rmidc.
2416 */
2417
2418 flag = FALSE;
2419 for (k=MAX(lower,rlower); k<=MIN(upper,rupper);++k) {
2420
2421 if (SS[k]>=RS[k] || DD[k]>=RD[k] || II[k]>=RI[k]) {
2422 if (DD[k]>=RD[k]) {
2423 mi = DD[k];
2424 mj = k+mi;
2425 mtype = DEL;
2426 } else if (II[k] >= RI[k]) {
2427 mi = II[k];
2428 mj = k+mi;
2429 mtype = INS;
2430 } else {
2431 mi = SS[k];
2432 mj = k+mi;
2433 mtype = SUB;
2434 }
2435
2436 flag = TRUE;
2437 break;
2438 }
2439 }
2440
2441 /* Free working vectors */
2442 MemFree(SS+lower);
2443 MemFree(DD+lower);
2444 MemFree(II+lower-1);
2445 MemFree(RS+rlower);
2446 MemFree(RD+rlower-1);
2447 MemFree(RI+rlower);
2448
2449 if (flag) {
2450 /* Find a path from (i1,j1,type1) to (mi,mj,mtype) */
2451 path(i1, j1, type1, mi, mj, mtype, midc, &head1, &tail1, data);
2452
2453 /* Find a path from (mi,mj,mtype) to (i2,j2,type2) */
2454 path(mi, mj, mtype, i2, j2, type2, rmidc, &head2, &tail2, data);
2455
2456 /* Join these two paths together */
2457 if(head1) tail1->next = head2;
2458 else head1 = head2;
2459 }
2460 else {
2461 ErrPostEx(SEV_WARNING, 0, 0, "Something wrong when dividing");
2462
2463 /* printf("Something wrong when dividing\n"); */
2464 head1 = NULL;
2465 }
2466 *head = head1;
2467 if (head2) *tail = tail2;
2468 else *tail = tail1;
2469 }
2470
2471 /* Condense_script - merge contiguous operations of the same type together */
Condense_script(EditScriptPtr head)2472 static void Condense_script(EditScriptPtr head)
2473 {
2474 EditScriptPtr tp, tp1;
2475
2476 tp = head;
2477 while (tp != NULL) {
2478 while (((tp1 = tp->next) != NULL) && (tp->op_type == tp1->op_type)) {
2479 tp->num = tp->num + tp1->num;
2480 tp->next = tp1->next;
2481 MemFree(tp1);
2482 }
2483 tp = tp->next;
2484 }
2485 }
2486
Reverse_Script(EditScriptPtr head)2487 static void Reverse_Script(EditScriptPtr head)
2488 {
2489 EditScriptPtr tp;
2490
2491 tp = head;
2492 while (tp != NULL) {
2493 if (tp->op_type == INS) tp->op_type = DEL;
2494 else if (tp->op_type == DEL) tp->op_type = INS;
2495 tp = tp->next;
2496 }
2497 }
2498
Print_Script(EditScriptPtr head,Int4 M,Int4 N,data_t * data)2499 static void Print_Script(EditScriptPtr head, Int4 M, Int4 N, data_t *data)
2500 {
2501 EditScriptPtr tp;
2502 Int4 i, j, k, count;
2503
2504 i = j = 0;
2505 tp = head;
2506
2507 while(tp) {
2508 if(tp->op_type == SUB) {
2509 k = 0;
2510 while(k < tp->num) {
2511 count = 0;
2512 while((data->seq1[i] == data->seq2[j]) && (k < tp->num)) {
2513 ++i;
2514 ++j;
2515 ++count;
2516 ++k;
2517 }
2518 if(count > 0) fprintf(stdout, "copy %d\n", count);
2519 if(k < tp->num) {
2520 fprintf(stdout, "replace %c by %c\n", data->seq1[i], data->seq2[j]);
2521 ++i;
2522 ++j;
2523 ++k;
2524 }
2525 }
2526 }
2527 else if (tp->op_type == INS) {
2528 if((tp == head || tp->next == NULL) && (M <= N))
2529 fprintf(stdout, "skip (second sequence) %d\n", tp->num);
2530 else {
2531 /*
2532 printf("insert %d\n", tp->num);
2533 */
2534 fprintf(stdout, "insert ");
2535 if(tp->num <= 10)
2536 for(k = j; k < j + tp->num; ++k) fprintf(stdout, "%c", data->seq2[k]);
2537 else fprintf(stdout, " %d ", tp->num);
2538 fprintf(stdout, "\n");
2539 }
2540 j += tp->num;
2541 }
2542 else { /* DEL */
2543 if((tp == head || tp->next == NULL) && (M > N))
2544 fprintf(stdout, "skip (first sequence) %d\n", tp->num);
2545 else {
2546 /*
2547 printf("delete %d\n", tp->num);
2548 */
2549 fprintf(stdout, "delete ");
2550 if(tp->num <= 10)
2551 for(k=i; k<i+tp->num; ++k)
2552 fprintf(stdout, "%c", data->seq1[k]);
2553 else
2554 fprintf(stdout, "%d ", tp->num);
2555 fprintf(stdout, "\n");
2556 }
2557 i += tp->num;
2558 }
2559 tp = tp->next;
2560 }
2561 }
2562
S2A(EditScriptPtr head,Int4Ptr S)2563 static void S2A(EditScriptPtr head, Int4Ptr S)
2564 {
2565 EditScriptPtr tp;
2566 Int4Ptr lastS;
2567 Int4 i;
2568
2569 tp = head;
2570 lastS = S;
2571 while (tp != NULL) {
2572 /*
2573 printf("tp->op_type=%d, tp->num=%d\n",tp->op_type, tp->num);
2574 */
2575 if (tp->op_type == SUB) {
2576 for (i=0; i<tp->num; ++i) *lastS++ = 0;
2577 } else if (tp->op_type == INS) {
2578 *lastS++ = tp->num;
2579 } else { /* DEL */
2580 *lastS++ = 0 - tp->num;
2581 }
2582 tp = tp->next;
2583 }
2584 }
2585
2586
2587
2588 /* Alignment display routine */
2589
2590 static Int1 ALINE[51], BLINE[51], CLINE[51];
2591
DISPLAY(Int1 A[],Int1 B[],Int4 M,Int4 N,Int4 S[],Int4 AP,Int4 BP)2592 static void DISPLAY(Int1 A[], Int1 B[], Int4 M, Int4 N, Int4 S[], Int4 AP, Int4 BP)
2593 { register Int1 *a, *b, *c;
2594 register Int4 i, j, op;
2595 Int4 lines, ap, bp;
2596
2597 i = j = op = lines = 0;
2598 ap = AP;
2599 bp = BP;
2600 a = ALINE;
2601 b = BLINE;
2602 c = CLINE;
2603 while (i < M || j < N)
2604 { if (op == 0 && *S == 0)
2605 { op = *S++;
2606 *a = A[++i];
2607 *b = B[++j];
2608 *c++ = (*a++ == *b++) ? '|' : ' ';
2609 }
2610 else
2611 { if (op == 0)
2612 op = *S++;
2613 if (op > 0)
2614 { *a++ = ' ';
2615 *b++ = B[++j];
2616 op--;
2617 }
2618 else
2619 { *a++ = A[++i];
2620 *b++ = ' ';
2621 op++;
2622 }
2623 *c++ = '-';
2624 }
2625 if (a >= ALINE+50 || i >= M && j >= N)
2626 { *a = *b = *c = '\0';
2627 fprintf(stdout, "\n%6ld ",(long int)50*lines++);
2628 for (b = ALINE+10; b <= a; b += 10)
2629 fprintf(stdout, " . :");
2630 if (b <= a+5)
2631 fprintf(stdout, " .");
2632 fprintf(stdout, "\n%6d %s\n %s\n%6d %s\n",ap,ALINE,CLINE,bp,BLINE);
2633 ap = AP + i;
2634 bp = BP + j;
2635 a = ALINE;
2636 b = BLINE;
2637 c = CLINE;
2638 }
2639 }
2640 }
2641
2642
2643
snake(Int4 k,Int4 x,Int4 endx,Int4 endy,data_t * data)2644 static Int4 snake(Int4 k, Int4 x, Int4 endx, Int4 endy, data_t *data)
2645 {
2646 Int4 y;
2647
2648 if(x < 0) return x;
2649 y = x + k;
2650
2651 while (x < endx && y < endy && data->seq1[x] == data->seq2[y]) {
2652 ++x; ++y;
2653 }
2654
2655 return x;
2656 }
2657
2658
rsnake(Int4 k,Int4 x,Int4 startx,Int4 starty,data_t * data)2659 static Int4 rsnake(Int4 k, Int4 x, Int4 startx, Int4 starty, data_t *data)
2660 {
2661 Int4 y;
2662
2663 if(x > data->M) return x;
2664 y = x + k;
2665 while(x > startx && y > starty && data->seq1[x - 1] == data->seq2[y - 1]) {
2666 --x; --y;
2667 }
2668
2669 return x;
2670 }
2671
get_dist(Int4 i1,Int4 j1,Int4 i2,Int4 j2,data_t * data)2672 static Int4 get_dist(Int4 i1, Int4 j1, Int4 i2, Int4 j2, data_t *data)
2673 {
2674 Int4Ptr SS, DD, II;
2675 Int4 goal_diag;
2676 Int4 c, k, t1, t2, t;
2677 Int4 start, lower, upper;
2678
2679 /* Compute the boundary diagonals */
2680 /* printf("i1 = %ld, j1 = %ld, i2= %ld, j2=%ld, limit = %ld", i1, j1, i2, j2, data->limit); */
2681 start = j1 - i1;
2682 lower = MAX(j1 - i2, start - data->limit);
2683 upper = MIN(j2 - i1, start + data->limit);
2684 goal_diag = j2 - i2;
2685
2686 if(goal_diag > upper || goal_diag < lower) {
2687 /* ErrPostEx(SEV_WARNING, 0, 0, "The two sequences are not really similar"); */
2688 return -1;
2689
2690 /* printf("The two sequences are not really similar.\n");
2691 printf("Please try an exact aligning method.\n");
2692 exit(1); */
2693 }
2694
2695 /* Allocate space for forward vectors */
2696 SS = (Int4Ptr)MemNew((upper - lower + 1) * sizeof(Int4)) - lower;
2697 DD = (Int4Ptr)MemNew((upper - lower + 2) * sizeof(Int4)) - lower;
2698 II = (Int4Ptr)MemNew((upper - lower + 2) * sizeof(Int4)) - lower + 1;
2699
2700 /* Initialization */
2701 for(k = lower; k <= upper; ++k) SS[k] = MININT;
2702 for(k = lower; k <= upper + 1; ++k) DD[k] = MININT;
2703 for(k = lower - 1; k <= upper; ++k) II[k] = MININT;
2704 SS[start] = snake(start, i1, i2, j2, data);
2705
2706 if(SS[goal_diag] >= i2) {
2707 /*
2708 #ifdef STATS
2709 printf("get_dist = %d\n", 0);
2710 #endif
2711 */
2712
2713 /* Free working vectors */
2714 MemFree(SS+lower);
2715 MemFree(DD+lower);
2716 MemFree(II+lower-1);
2717 return 0;
2718 }
2719
2720 for(c = 1; c <= data->limit; ++c) {
2721 t = MAX(lower, start-c);
2722 t1 = II[t - 1];
2723 for(k = t; k <= MIN(upper, start + c); ++k) {
2724 t2 = II[k];
2725 II[k] = MAX(t1, SS[k]);
2726 t1 = t2;
2727 DD[k] = MAX(DD[k + 1] + 1, SS[k]);
2728 SS[k] = snake(k, MIN(j2 - k,MAX(MAX(SS[k] + 1, II[k]), DD[k])), i2, j2, data);
2729 }
2730
2731 if (SS[goal_diag] >= i2) {
2732 #ifdef STATS
2733 printf("get_dist = %d\n",c);
2734 #endif
2735
2736 /* Free working vectors */
2737 MemFree(SS + lower);
2738 MemFree(DD + lower);
2739 MemFree(II + lower - 1);
2740 return c;
2741 }
2742 }
2743
2744 /* Ran out of distance limit */
2745 /* ErrPostEx(SEV_WARNING, 0, 0, "The two sequences are not really similar"); */
2746 return -1;
2747 /* printf("The two sequences are not really similar.\n");
2748 printf("Please try an exact method.\n");
2749 exit(1); */
2750 }
2751
2752
locate(Int4Ptr i1,Int4Ptr j1,Int4Ptr i2,Int4Ptr j2,Int1 * A,Int1 * B,Int4 m,Int4 n,Int4 W1,data_t * data)2753 static Int1 locate(Int4Ptr i1, Int4Ptr j1, Int4Ptr i2, Int4Ptr j2,
2754 Int1 *A, Int1 *B, Int4 m, Int4 n, Int4 W1,
2755 data_t *data)
2756 {
2757 Int4 mask;
2758 Int4 BUCKETS;
2759 Int1Ptr bucket;
2760 Int4 i, j;
2761 Int1Ptr p, q;
2762 Int4 ecode, ecode1;
2763 Int4 fcount, bcount, fmax, bmax;
2764 float trust_ratio;
2765 Int4 paired_fj, paired_bj, best_range, t;
2766 ValNodePtr prev = NULL, data_list = NULL;
2767
2768 typedef struct max_chain {
2769 Int4 j; /* j position */
2770 struct max_chain *next;
2771 } max_chain;
2772 max_chain *fj, *bj, *tj, *tj1, *tj2;
2773
2774 if(m < W1 + NW - 1) {
2775 /* printf("m=%d, The sequences are too short for the heuristic method.\n",m);
2776 */
2777 return FALSE;
2778 }
2779
2780 /* Set up the hash table (derived from Sequence A) */
2781
2782 mask = (1 << (W1 + W1 - 2)) - 1;
2783 BUCKETS = power2(2 * W1);
2784 bucket = (Int1Ptr) MemNew(BUCKETS * sizeof(Int1));
2785 for (i = 0; i<BUCKETS; ++i) bucket[i] = 0;
2786
2787 /* front end */
2788 p = A;
2789
2790 ecode = 0L;
2791 for(i = 1; i < W1; ++i) {
2792 ecode = (ecode << 2) + data->encoding[*p++];
2793 }
2794
2795 for(i = 0; i < NW; ++i) {
2796 ecode = ((ecode & mask) << 2) + data->encoding[*p++];
2797 bucket[ecode] = 1;
2798 }
2799 /* back end */
2800 p = A + m - W1 - NW + 1;
2801 ecode = 0L;
2802 for(i = 1; i < W1; ++i)
2803 ecode = (ecode << 2) + data->encoding[*p++];
2804 for(i = 0; i < NW; ++i) {
2805 ecode = ((ecode & mask) << 2) + data->encoding[*p++];
2806 if (bucket[ecode] == 1 || bucket[ecode] == 3) bucket[ecode] = 3;
2807 else bucket[ecode] = 2;
2808 }
2809
2810 /* Locate the interval with the maximum number of the matched 8-tuples */
2811 trust_ratio = 0.6;
2812 fmax = bmax = NW * trust_ratio;
2813 fj = bj = NULL;
2814 fcount = bcount = 0;
2815 p = B;
2816 ecode = 0L;
2817 for(j = 1; j < W1; ++j)
2818 ecode = (ecode << 2) + data->encoding[*p++];
2819 ecode1 = ecode;
2820 q = p;
2821 for(j = 0; j < NW; ++j) {
2822 ecode = ((ecode & mask) << 2) + data->encoding[*p++];
2823 if(bucket[ecode] == 1 || bucket[ecode] == 3) ++fcount;
2824 if(bucket[ecode] == 2 || bucket[ecode] == 3) ++bcount;
2825 }
2826
2827 if (fcount >= fmax) {
2828 fj = (max_chain *) MemNew(sizeof(max_chain));
2829 link_to_data_list((Pointer)fj, &data_list, &prev);
2830 fj->j = 0;
2831 fj->next = NULL;
2832 fmax = fcount;
2833 }
2834 if (bcount >= bmax) {
2835 bj = (max_chain *) MemNew(sizeof(max_chain));
2836 link_to_data_list((Pointer)bj, &data_list, &prev);
2837 bj->j = W1 + NW - 2;
2838 bj->next = NULL;
2839 bmax = bcount;
2840 }
2841
2842 for(j = 1; j <= n - W1 - NW + 1; ++j) {
2843 ecode1 = ((ecode1 & mask) << 2) + data->encoding[*q++];
2844 if (bucket[ecode1] == 1 || bucket[ecode1] == 3) --fcount;
2845 if (bucket[ecode1] == 2 || bucket[ecode1] == 3) --bcount;
2846 ecode = ((ecode & mask) << 2) + data->encoding[*p++];
2847 if (bucket[ecode] == 1 || bucket[ecode] == 3) ++fcount;
2848 if (bucket[ecode] == 2 || bucket[ecode] == 3) ++bcount;
2849
2850 if(fcount > fmax) {
2851 while(fj != NULL) {
2852 tj = fj->next;
2853 /* MemFree(fj); */
2854 fj = tj;
2855 }
2856 fj = (max_chain *) MemNew(sizeof(max_chain));
2857 link_to_data_list((Pointer)fj, &data_list, &prev);
2858 fj->j = j;
2859 fj->next = NULL;
2860 fmax = fcount;
2861 }
2862 else if(fcount == fmax) {
2863 tj = (max_chain *) MemNew(sizeof(max_chain));
2864 link_to_data_list((Pointer)tj, &data_list, &prev);
2865 tj->j = j;
2866 tj->next = fj;
2867 fj = tj;
2868 }
2869 if(bcount > bmax) {
2870 while(bj != NULL) {
2871 tj = bj->next;
2872 /* MemFree(bj); */
2873 bj = tj;
2874 }
2875 bj = (max_chain *) MemNew(sizeof(max_chain));
2876 link_to_data_list((Pointer)bj, &data_list, &prev);
2877 bj->j = j + W1 + NW - 2;
2878 bj->next = NULL;
2879 bmax = bcount;
2880 }
2881 else if(bcount == bmax) {
2882 tj = (max_chain *) MemNew(sizeof(max_chain));
2883 link_to_data_list((Pointer)tj, &data_list, &prev);
2884 tj->j = j + W1 + NW - 2;
2885 tj->next = bj;
2886 bj = tj;
2887 }
2888 }
2889
2890 MemFree(bucket);
2891
2892 if(fj == NULL || bj == NULL)
2893 {
2894 if(data_list != NULL)
2895 ValNodeFreeData(data_list);
2896 return FALSE;
2897 }
2898
2899 best_range = (Int4)(PERCENT * m) + 1;
2900 for(tj1 = fj; tj1 != NULL; tj1 = tj1->next) {
2901 for(tj2 = bj; tj2 != NULL; tj2 = tj2->next) {
2902 t = tj2->j - tj1->j + 1 - m;
2903 if(t < 0) t = -t;
2904
2905 if(t < best_range) {
2906 best_range = t;
2907 paired_fj = tj1->j;
2908 paired_bj = tj2->j;
2909 }
2910 }
2911 }
2912
2913 ValNodeFreeData(data_list);
2914 if(best_range > (Int4)(PERCENT * m)) return FALSE;
2915
2916 *i1 = 0;
2917 *i2 = m;
2918 *j1 = paired_fj;
2919 *j2 = paired_bj + 1;
2920
2921 return TRUE;
2922 }
2923