1 /*
2 * sim4.c - A program to align a cDNA sequence with a genomic sequence
3 * for the gene.
4 */
5
6 #ifndef __lint
7 /*@unused@*/
8 static const char rcsid[] =
9 "$Id: sim4b1.c,v 1.64 2002/03/03 23:29:48 florea Exp $";
10 #endif
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <math.h>
16 #include <assert.h>
17
18 #include "psublast.h"
19
20 #include "sim4.h"
21 #include "sim4b1.h"
22 #include "Xtend1.h"
23 #include "align.h"
24 #include "splice.h"
25 #include "poly.h"
26
27 #define EXTEND_FW (rs.acc_flag?Xextend_fw:extend_fw)
28 #define EXTEND_BW (rs.acc_flag?Xextend_bw:extend_bw)
29
30 #define SLIDE_INTRON(x) (((x)==TRUE)?sync_slide_intron:slide_intron)
31
32 uchar *seq1, *seq2;
33 int M, N, encoding[NACHARS];
34 coords last_GT, last_CT, last_AG, last_AC;
35 int file_type;
36
37 sim4_args_t rs;
38
39
40 static int numMSPs, K, W, X;
41 static int G_score, C_score;
42 static int *diag_lev;
43 static Msp_ptr msp_list, *msp;
44 static Exon_ptr exon_list;
45
46 static void merge(Exon **,Exon **);
47 static bool get_sync_flag(Exon *, Exon *, int);
48 static void slide_intron(int w, Exon **,uchar *,uchar *);
49 static void sync_slide_intron(int w, Exon **,uchar *,uchar *);
50 static void wobble(Exon **,Exon **,const char *,const char *,uchar *seq1);
51 static Exon *bmatch(uchar *,uchar *,int,int,int,int);
52 static Exon *fmatch(uchar *,uchar *,int,int,int,int);
53 static void compact_list(Exon **Lblock, Exon **Rblock);
54 static int resolve_overlap(Exon *,Exon *,uchar *);
55 static int greedy(uchar *,uchar *,int,int,int,int,Exon **, Exon **);
56 static int extend_bw(uchar *,uchar *,int,int,int,int,int *,int *);
57 static int extend_fw(uchar *,uchar *,int,int,int,int,int *,int *);
58 static void pluri_align(int *,int *,Exon *,struct edit_script_list **);
59 static void get_stats(Exon *,sim4_stats_t *);
60 static int get_edist(int,int,int,int,uchar *,uchar *);
61 static int get_msp_threshold(int len1, int len2);
62 static int find_log_entry(long *log4s, int n, int len, int offset);
63 static Exon *new_exon(int,int,int,int,int,int,int,Exon *);
64 static void add_word(int,int);
65 static void extend_hit(int,int,const uchar *const,const uchar * const,int,int,int);
66 static void sort_msps(void);
67 static void heapify(int,int);
68 static int smaller(int,int);
69 static void search(uchar *,uchar *,int,int,int);
70 static int link_msps(Msp_ptr *msp,int,int,int);
71 static int scale(int n);
72 static void msp2exons(Msp_ptr *,int,uchar *,uchar *);
73 static void free_msps(Msp_ptr **,int *);
74 static void exon_cores(uchar*,uchar*,int,int,int,int,int,int,int,int);
75 static void relink(Msp_ptr *,int,int,int,int,int,uchar *,uchar *);
76 static int dispatch_find_ends(int,int,int *,int *,edit_script_list *,int,int,int);
77 static int find_ends(edit_script_list *,int);
78 static bool get_match_quality(Exon *,Exon *,sim4_stats_t *,int);
79 static int check_consistency_intron_ori(Exon *,int,char *);
80
81 Exon *find_previous(Exon *,Exon *);
82 void script_flip_list(edit_script_list **);
83
84
85 #ifdef DEBUG
86 static void debug_print_exons(Exon *, const char *);
87 #endif
88
89 /* Not currently used: */
90 #ifdef AUXUTILS
91 static void remove_polyA_tails(Exon *,uchar *,uchar *,int);
92 static void find_introns(Exon *, Intron **);
93 static void print_introns(Intron *);
94 #endif
95
96 /* seq1 = genomic DNA (text); seq2 = cDNA */
SIM4(uchar * in_seq1,uchar * in_seq2,int in_M,int in_N,int in_W,int in_X,int in_K,int in_C,int in_H,int * dist_ptr,int * pT,int * pA,Exon ** Exons,sim4_stats_t * st)97 struct edit_script_list *SIM4(uchar *in_seq1, uchar *in_seq2,
98 int in_M, int in_N, int in_W,
99 int in_X, int in_K, int in_C, int in_H,
100 int *dist_ptr, int *pT, int *pA,
101 Exon **Exons, sim4_stats_t *st)
102 {
103 int cflag, diff, cost, rollbflag, sync_flag;
104 int u, v, I, J;
105 bool good_match;
106 Exon *Lblock, *Rblock=NULL, *tmp_block, *last, *prev, *tmp_block1,
107 *tmp_Lblock=NULL, *tmp_Rblock=NULL, *new;
108
109 struct edit_script_list *Script_head=NULL;
110 uchar tmp[50];
111 coords *sig;
112
113 seq1 = in_seq1;
114 seq2 = in_seq2;
115 M = in_M;
116 N = in_N;
117 W = in_W;
118 X = in_X;
119
120 if (M<=0 || in_N<=0) {
121 *Exons = NULL; return NULL;
122 }
123
124 if (rs.acc_flag) {
125 last_AG.pos1 = last_AG.pos2 = last_AC.pos1 = last_AC.pos2 = 0;
126 last_GT.pos1 = last_GT.pos2 = last_CT.pos1 = last_CT.pos2 = 0;
127 }
128
129 /* Compute the distance between two sequences A and B */
130
131 *dist_ptr = 0;
132
133 exon_cores(seq1-1, seq2-1, M, N, 1, 1, 0, W, in_K, PERM);
134
135 tmp_block = Lblock = exon_list;
136 while (tmp_block) {
137 if (tmp_block->next_exon==NULL)
138 Rblock = tmp_block;
139 tmp_block = tmp_block->next_exon;
140 }
141
142 if (Lblock &&
143 ((Lblock->from1>50000 && Lblock->from2>100) ||
144 ((M-Rblock->to1>50000) && (N-Rblock->to2>100)))) {
145 free_list(exon_list);
146 relink(msp,numMSPs,(in_H>0) ? in_H:DEFAULT_RELINK_H,1,1,0,seq1,seq2);
147 tmp_block = Lblock = exon_list;
148 while (tmp_block) {
149 if (tmp_block->next_exon==NULL)
150 Rblock = tmp_block;
151 tmp_block = tmp_block->next_exon;
152 }
153 }
154 free_msps(&msp, &numMSPs);
155
156 tmp_block = Lblock = exon_list;
157 while (tmp_block) {
158 if (tmp_block->next_exon==NULL)
159 Rblock = tmp_block;
160 tmp_block = tmp_block->next_exon;
161 }
162
163 /* enclose the current path in the (0,0,0,0) and (M+1,N+1,0,0) brackets */
164
165 Lblock = new_exon (0,0,0,0,0,0,0,Lblock);
166 if (Rblock == NULL)
167 Rblock = Lblock;
168 Rblock->next_exon = new_exon (M+1,N+1,0,0,0,0,0,NULL);
169
170 /* compute current statistics */
171 good_match = get_match_quality(Lblock, Rblock, st, N);
172
173 #ifdef DEBUG
174 debug_print_exons(Lblock, "LSIS");
175 #endif
176
177 tmp_block = Lblock;
178 while ((tmp_block1 = tmp_block->next_exon)!=NULL) {
179 rollbflag = 0;
180 diff = (int)(tmp_block1->from2-tmp_block->to2-1);
181 if (diff) {
182 if (diff<0) {
183 int best_u;
184
185 best_u = resolve_overlap(tmp_block,tmp_block1,seq1);
186
187 tmp_block1->from1 += best_u+1-tmp_block1->from2;
188 tmp_block1->from2 = best_u+1;
189 if (((u=tmp_block1->to2-tmp_block1->from2+1)<=0) || (u<8) ||
190 ((v=tmp_block1->to1-tmp_block1->from1+1)<=0) || (v<8)) {
191 /* remove exon associated with tmp_block1 */
192 tmp_block->next_exon = tmp_block1->next_exon;
193 tmp_block->flag = tmp_block1->flag;
194 rollbflag = 1;
195 free(tmp_block1);
196 tmp_block1 = NULL; /* not necessary, just to keep it 'clean'*/
197 }
198
199 tmp_block->to1 -= tmp_block->to2-best_u;
200 tmp_block->to2 = best_u;
201 if (((u=tmp_block->to2-tmp_block->from2+1)<=0) || (u<8) ||
202 ((v=tmp_block->to1-tmp_block->from1+1)<=0) || (v<8)) {
203
204 /* remove exon defined by tmp_block */
205 prev = find_previous(Lblock,tmp_block);
206 assert (prev!=NULL);
207 prev->next_exon = tmp_block->next_exon;
208 prev->flag = tmp_block->flag;
209 if (u>0) rollbflag = 1;
210 free(tmp_block);
211 tmp_block = prev;
212 }
213
214 if (tmp_block->to1)
215 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
216 if (tmp_block1 && tmp_block1->to1)
217 tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
218
219 } else {
220 /* bridge the gap */
221 cflag = (tmp_block1->to2 && tmp_block->to2) ? 0 : 1;
222 if (diff && (tmp_block1->from1-tmp_block->to1-1>0)) {
223 if (!cflag) {
224 if (diff<=MAX_GRINIT) {
225 cost = greedy(seq2+tmp_block->to2,
226 seq1+tmp_block->to1,
227 diff,
228 tmp_block1->from1-tmp_block->to1-1,
229 tmp_block->to2,tmp_block->to1,
230 &tmp_Lblock, &tmp_Rblock);
231 } else cost = max(W,(int)(P*diff+1))+1;
232
233 if (cost>max(W,(int)(P*diff+1))) {
234 if (!tmp_block->flag && !tmp_block1->flag) {
235 exon_cores(seq1+tmp_block->to1-1,
236 seq2+tmp_block->to2-1,
237 tmp_block1->from1-tmp_block->to1-1,
238 diff,
239 tmp_block->to1+1,
240 tmp_block->to2+1,
241 1,
242 min(8,W),
243 in_C,
244 /* (min(8,W)==W) ? PERM : TEMP); */
245 TEMP);
246 tmp_Lblock = tmp_Rblock = exon_list;
247 while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
248 tmp_Rblock = tmp_Rblock->next_exon;
249
250 if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
251 (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
252 (tmp_Lblock->from1-tmp_block->from1>50000)) ||
253 (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
254 (tmp_block1->from1-tmp_Rblock->from1>50000))) {
255 /* possible large intron; increase the score weight */
256 free_list(tmp_Lblock);
257 relink(msp, numMSPs,
258 (in_H>0) ? in_H:DEFAULT_RELINK_H,
259 tmp_block->to1+1,
260 tmp_block->to2+1,
261 1, seq1, seq2);
262 tmp_Lblock = tmp_Rblock = exon_list;
263 while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
264 tmp_Rblock = tmp_Rblock->next_exon;
265 }
266 free_msps(&msp, &numMSPs);
267
268 if (tmp_Lblock) rollbflag = 1;
269 else rollbflag = 0; /* already 0 */
270 } else
271 tmp_Lblock = tmp_Rblock = NULL;
272 }
273 } else if (tmp_block1->to1) {
274 /* start of seq; find last_AG, last_AC */
275 if (rs.acc_flag) {
276 for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++)
277 if (!strncmp((char *)(seq1+v-2),"AG",(size_t)2)) {
278 last_AG.pos1 = v+1;
279 last_AG.pos2 = tmp_block1->from2+
280 (v-tmp_block1->from1)+1;
281 break;
282 }
283 for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++)
284 if (!strncmp((char *)(seq1+v-2),"AC",(size_t)2)) {
285 last_AC.pos1 = v+1;
286 last_AC.pos2 = tmp_block1->from2+
287 (v-tmp_block1->from1)+1;
288 break;
289 }
290 } /* end acc_flag */
291
292 diff = (int)(min(diff,(int)(MAX_GRINIT/2)));
293 u = min(4*diff,tmp_block1->from1-tmp_block->to1-1);
294 cost = EXTEND_BW(seq2+tmp_block->to2+
295 (tmp_block1->from2-tmp_block->to2-1)-diff,
296 seq1+tmp_block->to1+
297 (tmp_block1->from1-tmp_block->to1-1)-u,
298 (int)diff, u,
299 tmp_block->to2+
300 (tmp_block1->from2-tmp_block->to2-1)-diff,
301 tmp_block->to1+
302 (tmp_block1->from1-tmp_block->to1-1)-u,
303 &I, &J);
304 if ((good_match==FALSE) || tmp_block->flag || (J==0) || (I==0)) {
305 tmp_block1->from2 = I+1;
306 tmp_block1->from1 = J+1;
307 tmp_block1->edist += cost;
308 tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
309 }
310
311 /* use blast if marginal gap still exists, and this is first scan */
312 if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) ||
313 tmp_block->flag) {
314 /* blast-treated region or no gap */
315 tmp_Rblock = tmp_Lblock = NULL;
316 } else {
317 exon_cores(seq1+tmp_block->to1-1,
318 seq2+tmp_block->to2-1,
319 tmp_block1->from1-tmp_block->to1-1,
320 diff,
321 tmp_block->to1+1,
322 tmp_block->to2+1,
323 1,
324 min(10,W),
325 in_C,
326 /* (min(10,W)==W) ? PERM : TEMP); */
327 TEMP);
328 tmp_block -> flag = 1;
329 tmp_Lblock = tmp_Rblock = exon_list;
330 while (tmp_Rblock && tmp_Rblock->next_exon)
331 tmp_Rblock = tmp_Rblock->next_exon;
332
333 if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
334 (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
335 (tmp_Lblock->from1-tmp_block->from1>50000)) ||
336 (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
337 (tmp_block1->from1-tmp_Rblock->from1>50000))) {
338 /* possible large intron; increase the score weight */
339 free_list(tmp_Lblock);
340 relink(msp, numMSPs,
341 (in_H>0) ? in_H:DEFAULT_RELINK_H,
342 tmp_block->to1+1,
343 tmp_block->to2+1,
344 1,seq1,seq2);
345 tmp_Lblock = tmp_Rblock = exon_list;
346 while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
347 tmp_Rblock = tmp_Rblock->next_exon;
348 }
349 free_msps(&msp, &numMSPs);
350
351 if (tmp_Lblock) rollbflag = 1;
352 else {
353 tmp_block1->from2 = I+1;
354 tmp_block1->from1 = J+1;
355 tmp_block1->edist += cost;
356 tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
357 }
358 }
359
360 } else {
361 if (rs.acc_flag) {
362 for (v=tmp_block->to1; v>=tmp_block->from1; v--)
363 if (!strncmp((char *)(seq1+v),"GT",(size_t)2)) {
364 last_GT.pos1 = v;
365 last_GT.pos2 = tmp_block->to2-(tmp_block->to1-v);
366 break;
367 }
368 for (v=tmp_block->to1; v>=tmp_block->from1; v--)
369 if (!strncmp((char *)(seq1+v),"CT",(size_t)2)) {
370 last_CT.pos1 = v;
371 last_CT.pos2 = tmp_block->to2-(tmp_block->to1-v);
372 break;
373 }
374 }
375
376 diff = (int)(min(diff,(int)(MAX_GRINIT/2)));
377 cost = EXTEND_FW(seq2+tmp_block->to2,
378 seq1+tmp_block->to1,
379 diff,
380 min(4*diff,tmp_block1->from1-tmp_block->to1-1),
381 tmp_block->to2,tmp_block->to1,
382 &I, &J);
383 if ((good_match==FALSE) || tmp_block1->flag || (I==M) || (J==N)) {
384 if (tmp_block->to1) {
385 tmp_block->to2 = I;
386 tmp_block->to1 = J;
387 tmp_block->edist += cost;
388 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
389 tmp_Rblock = tmp_Lblock = NULL;
390 } else
391 /* special case: no initial exon */
392 tmp_Lblock = tmp_Rblock = NULL;
393 }
394 /* use blast if marginal gap still exists, and this is first scan */
395 if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) ||
396 tmp_block1->flag) {
397 /* blast-treated region or no gap */
398 tmp_Rblock = tmp_Lblock = NULL;
399 } else {
400 exon_cores(seq1+tmp_block->to1-1,
401 seq2+tmp_block->to2-1,
402 tmp_block1->from1-tmp_block->to1-1,
403 diff,
404 tmp_block->to1+1,
405 tmp_block->to2+1,
406 1,
407 min(10,W),
408 in_C,
409 /* (min(10,W)==W) ? PERM : TEMP); */
410 TEMP);
411 tmp_Lblock = tmp_Rblock = exon_list;
412 while (tmp_Rblock && tmp_Rblock->next_exon)
413 tmp_Rblock = tmp_Rblock->next_exon;
414
415 if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
416 (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
417 (tmp_Lblock->from1-tmp_block->from1>50000)) ||
418 (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
419 (tmp_block1->from1-tmp_Rblock->from1>50000))) {
420 /* possible large intron; increase the score weight */
421 free_list(tmp_Lblock);
422 relink(msp, numMSPs,
423 (in_H>0) ? in_H:DEFAULT_RELINK_H,
424 tmp_block->to1+1,
425 tmp_block->to2+1,
426 1,seq1,seq2);
427 tmp_Lblock = tmp_Rblock = exon_list;
428 while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
429 tmp_Rblock = tmp_Rblock->next_exon;
430 }
431 free_msps(&msp, &numMSPs);
432
433 tmp_block1->flag = 1;
434 if (tmp_Lblock) rollbflag = 1;
435 else {
436 if (tmp_block->to1) {
437 tmp_block->to2 = I;
438 tmp_block->to1 = J;
439 tmp_block->edist += cost;
440 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
441 tmp_Rblock = tmp_Lblock = NULL;
442 } else
443 /* special case: no initial exon */
444 tmp_Lblock = tmp_Rblock = NULL;
445 }
446 }
447 }
448 } else if (diff) {
449 tmp_Rblock = tmp_Lblock = NULL;
450 }
451
452 /* merge block in the exon list; make connections
453 to the previous list of blocks; maintain
454 increasing order */
455
456 if (tmp_Lblock) {
457 tmp_block->next_exon = tmp_Lblock;
458 tmp_Rblock->next_exon = tmp_block1;
459 merge(&tmp_block,&tmp_block1);
460 }
461 }
462 } /* diff!=0 */
463 if (!rollbflag) tmp_block = tmp_block1;
464 }
465
466 /* just printing ... */
467 #ifdef DEBUG
468 debug_print_exons(Lblock, "EXTENSIONS");
469 #endif
470
471 /* compaction step; note: it resets the right end of the list to */
472 /* the last item in the block list */
473
474 compact_list(&(Lblock->next_exon), &Rblock);
475
476 /* just printing ... */
477 #ifdef DEBUG
478 debug_print_exons(Lblock, "NORMALIZATION");
479 #endif
480
481 /* eliminate marginal small blocks at the start of the sequence; */
482 /* resets the empty alignment to one block (Lblock) only */
483
484 tmp_block = Lblock->next_exon;
485
486 while ((tmp_block!=NULL) && (tmp_block->length<W) && tmp_block->to1) {
487 tmp_block1 = tmp_block; /* free */
488 tmp_block = tmp_block->next_exon;
489 free(tmp_block1); /* free */
490 }
491 Lblock->next_exon = tmp_block;
492
493 /* eliminate marginal small blocks at the end of the sequence */
494
495 last = Lblock->next_exon;
496 tmp_block = last;
497 while (tmp_block!=NULL) {
498 if (tmp_block->length>=W)
499 last = tmp_block;
500 tmp_block = tmp_block->next_exon;
501 }
502 if (last && last->to1)
503 last->next_exon = Rblock->next_exon;
504 Rblock = last;
505
506 /* if high accuracy requirement, adjust boundaries of marginal exons */
507 if (rs.acc_flag) {
508 tmp_block = Lblock->next_exon;
509
510 /* condition for non-signal */
511 if (tmp_block && tmp_block->to1 &&
512 (strncmp((char *)(seq1+tmp_block->from1-3), END_SIG, (size_t)2) ||
513 (tmp_block->from2!=1))) {
514 sig = (G_score>=abs(C_score)) ? &last_AG : &last_AC;
515 if (sig->pos1 && (sig->pos2<=20)) {
516 /* generated in extend_bw */
517 assert(sig->pos2 > 1);
518 (void)strcpy((char *)tmp,END_SIG);
519 (void)strncpy((char *)(tmp+2),(char *)seq2,(size_t)sig->pos2-1);
520 (void)strcpy((char *)(tmp+sig->pos2+1), START_SIG);
521 new = bmatch(seq1,tmp,tmp_block->from1-3,sig->pos2+3,1,1);
522 if (new) {
523 Lblock->next_exon->from1 = sig->pos1;
524 Lblock->next_exon->from2 = sig->pos2;
525 Lblock->next_exon->length -= sig->pos2-1;
526 new->next_exon = Lblock->next_exon;
527 new->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
528 Lblock->next_exon = new;
529 }
530 }
531 }
532 while (tmp_block && tmp_block->next_exon && tmp_block->next_exon->to1)
533 tmp_block = tmp_block->next_exon;
534 if (tmp_block && tmp_block->to1 &&
535 (strncmp((char *)(seq1+tmp_block->to1),START_SIG,(size_t)2) || (tmp_block->to2!=N))) {
536 sig = (G_score>=abs(C_score)) ? &last_GT : &last_CT;
537 if (sig->pos1 && (N-sig->pos2<=20)) {
538 assert(N-sig->pos2 >= 0);
539 (void)strcpy((char *)tmp,END_SIG);
540 (void)strncpy((char *)(tmp+2),(char *)(seq2+sig->pos2),
541 (size_t)N-sig->pos2);
542 (void)strcpy((char *)(tmp+N-sig->pos2+2),START_SIG);
543 new = fmatch(seq1+sig->pos1-1,tmp,
544 M-sig->pos1+1,N-sig->pos2+4,
545 sig->pos1-1,sig->pos2+1);
546 if (new) {
547 tmp_block->to1 = sig->pos1;
548 tmp_block->to2 = sig->pos2;
549 new->next_exon = tmp_block->next_exon;
550 tmp_block->next_exon = new;
551 tmp_block->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
552 }
553 }
554 }
555 }
556
557 /* Slide exon boundaries for optimal intron signals */
558 sync_flag = get_sync_flag(Lblock, Rblock, 6);
559 SLIDE_INTRON(sync_flag)(6,&Lblock,seq1,seq2);
560
561 /* decreasingly; script will be in reverse order */
562 flip_list(&Lblock, &Rblock);
563 pluri_align(dist_ptr,&(st->nmatches),Lblock,&Script_head);
564 flip_list(&Lblock, &Rblock); /* increasingly */
565
566 if (rs.poly_flag)
567 remove_poly(&Script_head,Lblock,seq1,seq2,N,pT,pA);
568 else
569 *pT = *pA = 0;
570
571 get_stats(Lblock, st);
572
573 *Exons = Lblock->next_exon;
574 free(Lblock);
575 if (!rs.ali_flag) {
576 free_align(Script_head);
577 return NULL;
578 } else
579 return Script_head;
580
581 }
582
583
584 struct hash_node {
585 int ecode; /* integer encoding of the word */
586 int pos; /* positions where word hits query sequence
587 */
588 struct hash_node *link; /* next word with same last 7.5 letters */
589 };
590
591 #define HASH_SIZE 32767 /* 2**15 - 1 */
592 #define GEN_LOG4_ENTRIES 45
593 #define CDNA_LOG4_ENTRIES 25
594 static struct hash_node *phashtab[HASH_SIZE+1];
595 static struct hash_node **hashtab;
596 static int mask;
597 static int *next_pos, *pnext_pos;
598
599 /* The log4 arrays were computed to mimick the behaviour of the log formula
600 for computing the msp threshold in exon_cores(). For genomic_log4s,
601 entry i stores the value for the length of a genomic sequence
602 for which the contribution to the msp threshold is i/2, i.e.:
603 1.4*log_4(3/4*len1) = i/2;
604 Similarly, cDNA_log4s entries store lengths of the cDNA sequence for which
605 the contribution to the msp threshold is i/2, i.e.:
606 1.4*log_4(len2) = i/2;
607 Both arrays are sorted in increasing order, and can be searched with
608 binary search.
609 */
610 static long genomic_log4s[]= {1, 2, 3, 5, 9, 15, 26, 42, 70, 114, \
611 188, 309, 507, 832, 1365, 1365, 2240, 2240, 3675, 6029,\
612 9892, 16231, 26629, 43690, 71681, \
613 117606, 192953, 316573, 519392, 852152,
614 1398101, 2293823, 3763409, 6174516, 10130347, \
615 16620564, 27268873, 44739242, 73402365, 120429110, \
616 197584514, 324171126, 531858072, 872603963, 1431655765
617 };
618 static long cDNA_log4s[]= {1, 1, 2, 4, 7, 11, 19, 32, 52, 86, \
619 141, 231, 380, 624, 1024, 1680, 2756, 4522, 7419, 12173, \
620 19972, 32768, 53761, 88204, 144715
621 };
622
623
get_msp_threshold(int len1,int len2)624 static int get_msp_threshold(int len1, int len2)
625 {
626 int i, j;
627
628 i = find_log_entry(genomic_log4s, GEN_LOG4_ENTRIES, len1, 0);
629 j = find_log_entry(cDNA_log4s, CDNA_LOG4_ENTRIES, len2, 0);
630
631 if (!(i % 2)) return (int)(i/2+j/2);
632 else if (!(j % 2)) return (int)(i/2+j/2);
633 else return (int)(i/2+j/2+1);
634 }
635
find_log_entry(long * log4s,int n,int len,int offset)636 static int find_log_entry(long *log4s, int n, int len, int offset)
637 {
638 int a;
639
640 a = n/2;
641 if ((len<log4s[a]) && (!a || (len>=log4s[a-1])))
642 return max(0,(a-1))+offset;
643 else if ((len>=log4s[a]) && ((a==n-1) || (len<log4s[a+1])))
644 return min(n-1,(a+1))+offset;
645 else if (len<log4s[a])
646 return find_log_entry(log4s,a-1,len, offset);
647 else if (len>log4s[a])
648 return find_log_entry(log4s+a+1,n-a-1,len, offset+a+1);
649 return -1;
650 }
651
652 /* -------------------- exon_cores() --------------------- */
653
exon_cores(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2,int flag,int in_W,int in_K,int type)654 static void exon_cores(uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2, int flag, int in_W, int in_K, int type)
655 {
656 int i, W, last_msp, lower, upper;
657 int *allocated;
658 Exon *tmp_block;
659
660
661 if (in_K<=0) {
662 /* compute expected length of longest exact match .. */
663 /* K = (int) (log(.75*(double)len1)+log((double)len2))/log(4.0); */
664 /* .. and throw in a fudge factor */
665 /* K *= 1.4; */
666
667 K = get_msp_threshold(len1, len2);
668 if (K>=0) K--; /* compensate for the rounding in the log formula */
669 /* commented this to avoid fragmentation
670 if (flag) K = min(K, DEFAULT_C); second pass
671 */
672 } else
673 K = in_K;
674
675 numMSPs = 0;
676 exon_list = NULL;
677
678 allocated = ckalloc((len1+len2+1)*sizeof(int));
679 lower = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP))
680 ? -len1 : -len2;
681 upper = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP))
682 ? len2 : len1;
683 diag_lev = allocated - lower;
684 for (i=lower; i<=upper; ++i) diag_lev[i]=0;
685
686 W = min(in_W,len2);
687 switch (file_type) {
688 case EST_GEN: bld_table(s2,len2,W, type);
689 /* use longer sequence for permanent tables */
690 search(s1,s2,len1,len2,W);
691 break;
692 case GEN_EST: if (type!=TEMP) {
693 uchar *aux; int auxi;
694
695 aux = s1; s1 = s2; s2 = aux;
696 auxi = len1; len1 = len2; len2 = auxi;
697 }
698 bld_table(s2,len2,W, type);
699 /* use longer sequence for permanent tables */
700 search(s1,s2,len1,len2,W);
701 if (type!=TEMP) {
702 register int auxi;
703 uchar *aux;
704 Msp_ptr mp;
705
706 /* change s1 and s2 back */
707 aux = s1; s1 = s2; s2 = aux;
708 auxi = len1; len1 = len2; len2 = auxi;
709
710 for (mp=msp_list, i=0; i<numMSPs; i++) {
711 auxi = mp->pos1;
712 mp->pos1 = mp->pos2;
713 mp->pos2 = auxi;
714 mp = mp->next_msp;
715 }
716 }
717 break;
718 default: fatal("sim4b1.c: Invalid file type code.");
719 }
720
721 free(allocated);
722 if (type==TEMP) {
723 register struct hash_node *hptr, *tptr;
724 register int hval;
725
726 free(next_pos);
727 for (hval=0; hval<HASH_SIZE+1; hval++) {
728 hptr = hashtab[hval];
729 while (hptr) {
730 tptr = hptr;
731 hptr = hptr->link;
732 free(tptr);
733 }
734 }
735 free(hashtab);
736 }
737
738 msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr));
739 { Msp_ptr mp = msp_list;
740 for (i = 0; i < numMSPs; ++i) {
741 msp[i] = mp;
742 mp = mp->next_msp;
743 }
744 }
745
746 sort_msps(); /* sort in order of mp->pos2, in the shorter seq */
747
748 /* organize Blast hits (MSPs) into exons */
749 last_msp = link_msps(msp, numMSPs, DEFAULT_WEIGHT, LINK);
750
751 #ifdef DEBUG
752 for (i = last_msp; i >= 0; i = msp[i]->prev)
753 (void)printf("%d-%d\n", msp[i]->pos1, msp[i]->pos1 + msp[i]->len - 1);
754 #endif
755
756 msp2exons(msp,last_msp,s1,s2);
757
758 /* now free msp[]? No - may need to re-link */
759 /*
760 for (i=0; i<numMSPs; ++i)
761 free(msp[i]);
762 free(msp);
763 */
764
765 tmp_block = exon_list;
766 while (tmp_block!=NULL) {
767 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
768 tmp_block->to1 += offset1;
769 tmp_block->from1 += offset1;
770 tmp_block->to2 += offset2;
771 tmp_block->from2 += offset2;
772 tmp_block->flag = flag;
773
774 tmp_block = tmp_block->next_exon;
775 }
776
777 return ;
778 }
779
relink(Msp_ptr * in_msp,int in_numMSPs,int H,int offset1,int offset2,int flag,uchar * s1,uchar * s2)780 static void relink(Msp_ptr *in_msp, int in_numMSPs, int H, int offset1, int offset2, int flag, uchar *s1, uchar *s2)
781 {
782 int last_msp;
783 Exon *tmp_block;
784
785 exon_list = NULL;
786
787 last_msp = link_msps(in_msp, in_numMSPs, H, RELINK);
788
789 msp2exons(in_msp,last_msp,s1,s2);
790
791 tmp_block = exon_list;
792 while (tmp_block!=NULL) {
793 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
794 tmp_block->to1 += offset1;
795 tmp_block->from1 += offset1;
796 tmp_block->to2 += offset2;
797 tmp_block->from2 += offset2;
798 tmp_block->flag = flag;
799
800 tmp_block = tmp_block->next_exon;
801 }
802
803 return ;
804 }
805
free_msps(Msp_ptr ** in_msp,int * in_numMSPs)806 static void free_msps(Msp_ptr **in_msp, int *in_numMSPs)
807 {
808 int i;
809
810 for (i=0; i<*in_numMSPs; ++i) free((*in_msp)[i]);
811 free(*in_msp); *in_msp = NULL; *in_numMSPs = 0;
812 }
813
scale(int n)814 static int scale(int n)
815 {
816 return (n<=100000) ? n : (100000+(int)(10*log((double)(n-100000))));
817 }
818
link_msps(Msp_ptr * msp,int numMSPs,int H,int flag)819 static int link_msps(Msp_ptr *msp, int numMSPs, int H, int flag)
820 {
821 int i, j, f1, f2, best, diag, diff_diag, best_sc, try;
822
823
824 for (best = -1, best_sc = MININT, i = 0; i < numMSPs; ++i) {
825 f1 = msp[i]->pos1; /* start position in seq1 */
826 f2 = msp[i]->pos2; /* start position in seq2 */
827 diag = f1 - f2;
828 msp[i]->prev = -1;
829 msp[i]->Score = 0;
830 for (j = 0; j < i; ++j) {
831 int var_L =
832 ((msp[i]->pos2+msp[i]->len-msp[j]->pos2-msp[j]->len>2*W) &&
833 (msp[i]->pos2-msp[j]->pos2>2*W)) ? 2*L : L;
834
835 diff_diag = diag - msp[j]->pos1 + msp[j]->pos2;
836 if (diff_diag < -rs.DRANGE ||
837 (diff_diag > rs.DRANGE && diff_diag < MIN_INTRON) ||
838 (msp[j]->pos2+msp[j]->len-1-f2>var_L) ||
839 (msp[j]->pos1+msp[j]->len-1-f1>var_L))
840
841 continue;
842 try = msp[j]->Score - ((flag==RELINK) ?
843 scale(abs(diff_diag)) : abs(diff_diag));
844
845 if (try > msp[i]->Score) {
846 msp[i]->Score = try;
847 msp[i]->prev = j;
848 }
849 }
850 msp[i]->Score += (H*msp[i]->score);
851 if (msp[i]->Score > best_sc) {
852 best = i;
853 best_sc = msp[i]->Score;
854 }
855 }
856
857 return best;
858 }
859
860 /* ----------- build table of W-tuples in one of the sequences ------------*/
861
bld_table(uchar * s,int len,int in_W,int type)862 void bld_table(uchar *s, int len, int in_W, int type)
863 {
864 int ecode;
865 int i, j;
866 uchar *t;
867
868 if (type == PERM) {
869 mask = (1 << (in_W+in_W-2)) - 1;
870 next_pos = pnext_pos;
871 hashtab = phashtab;
872 return;
873 }
874
875 /* perform initializations */
876 if (type == INIT) {
877
878 /* perform initializations */
879 for (i=0; i<NACHARS; ++i) encoding[i]=-1;
880
881 encoding['A'] = 0;
882 encoding['C'] = 1;
883 encoding['G'] = 2;
884 encoding['T'] = 3;
885 mask = (1 << (in_W+in_W-2)) - 1;
886 /* added +1 in the allocation below */
887 next_pos = pnext_pos = (int *)ckalloc((len+1)*sizeof(int));
888 hashtab = phashtab;
889 } else {
890 mask = (1 << (in_W+in_W-2)) - 1;
891 next_pos = (int *)ckalloc((len+1)*sizeof(int));
892 hashtab = (struct hash_node **)
893 ckalloc((HASH_SIZE+1)*sizeof(struct hash_node *));
894 for (i=0; i<=HASH_SIZE; ++i) hashtab[i] = NULL;
895 }
896
897 /* skip any word containing an N/X */
898 t = s+1;
899 for (i=1; (i<=len) && *t; ) {
900 restart:
901 ecode = 0L;
902 for (j=1; (j<in_W) && (i<=len) && *t; ++j) {
903 int tmp = encoding[*t++]; ++i;
904 if (tmp<0) goto restart;
905 ecode = (ecode << 2) + tmp;
906 }
907
908 for (; (i<=len) && *t; ) {
909 int tmp = encoding[*t++]; i++;
910 if (tmp<0) goto restart;
911 ecode = ((ecode & mask) << 2) + tmp;
912 add_word(ecode, (int)(t-s-1));
913 }
914 }
915
916 /* previous version: include 'N's in the table
917 t = s;
918 ecode = 0L;
919 for (i = 1; i < in_W; ++i)
920 ecode = (ecode << 2) + encoding[*++t];
921 for (; (i<=len) && (*++t); i++) {
922 ecode = ((ecode & mask) << 2) + encoding[*t];
923 add_word(ecode, (int)(t - s));
924 }
925 */
926 }
927
928 /* add_word - add a word to the table of critical words */
add_word(int ecode,int pos)929 static void add_word(int ecode, int pos)
930 {
931 struct hash_node *h;
932 int hval;
933
934 hval = ecode & HASH_SIZE;
935 for (h = hashtab[hval]; h; h = h->link)
936 if (h->ecode == ecode)
937 break;
938 if (h == NULL) {
939 h = (struct hash_node *) ckalloc (sizeof(struct hash_node));
940 h->link = hashtab[hval];
941 hashtab[hval] = h;
942 h->ecode = ecode;
943 h->pos = -1;
944 }
945 next_pos[pos] = h->pos;
946 h->pos = pos;
947 }
948
949 /* ----------------------- search the other sequence ---------------------*/
950
search(uchar * s1,uchar * s2,int len1,int len2,int in_W)951 static void search(uchar *s1, uchar *s2, int len1, int len2, int in_W)
952 {
953 register struct hash_node *h;
954 register uchar *t;
955 register int ecode, hval;
956 int i, j, p;
957
958 t = s1+1;
959 for (i=1; (i<=len1) && *t; ) {
960 restart:
961 ecode = 0L;
962 for (j=1; (j<in_W) && (i<=len1) && *t; ++j) {
963 int tmp = encoding[*t++]; ++i;
964 if (tmp<0) goto restart;
965 ecode = (ecode << 2) + tmp;
966 }
967 for (; (i<=len1) && *t; ) {
968 int tmp = encoding[*t++]; ++i;
969 if (tmp<0) goto restart;
970 ecode = ((ecode & mask) << 2) + tmp;
971 hval = ecode & HASH_SIZE;
972 for (h = hashtab[hval]; h; h = h->link)
973 if (h->ecode == ecode) {
974 for (p = h->pos; p >= 0; p = next_pos[p])
975 extend_hit((int)(t-s1-1),p,s1,s2,len1,len2, in_W);
976 break;
977 }
978 }
979 }
980
981 /* previous version: allow 'N's in the table --------
982 t = s1;
983 ecode = 0L;
984 for (i = 1; i < in_W; ++i)
985 ecode = (ecode << 2) + encoding[*++t];
986 for (; (i<=len1) && (*++t); i++) {
987 ecode = ((ecode & mask) << 2) + encoding[*t];
988 hval = ecode & HASH_SIZE;
989 for (h = hashtab[hval]; h; h = h->link)
990 if (h->ecode == ecode) {
991 for (p = h->pos; p >= 0; p = next_pos[p])
992 extend_hit((int)(t-s1),p,s1,s2,len1,len2, in_W);
993 break;
994 }
995 }
996 ---------------------------------------------------*/
997 }
998
999 /* extend_hit - extend a word-sized hit to a longer match */
extend_hit(int pos1,int pos2,const uchar * const s1,const uchar * const s2,int len1,int len2,int in_W)1000 static void extend_hit(int pos1, int pos2,
1001 const uchar * const s1, const uchar * const s2,
1002 int len1, int len2, int in_W)
1003 {
1004 const uchar *beg2, *beg1, *end1, *q, *s;
1005 int right_sum, left_sum, sum, diag, score;
1006
1007 diag = pos2 - pos1;
1008 if (diag_lev[diag] > pos1)
1009 return;
1010
1011 /* extend to the right */
1012 left_sum = sum = 0;
1013 q = s1+1+pos1;
1014 s = s2+1+pos2;
1015 end1 = q;
1016 while ((*s != '\0') && (*q != '\0') &&
1017 (s<=s2+len2) && (q<=s1+len1) && sum >= left_sum - X) {
1018 sum += ((*s++ == *q++) ? MATCH : MISMATCH);
1019 if (sum > left_sum) {
1020 left_sum = sum;
1021 end1 = q;
1022 }
1023 }
1024
1025 /* extend to the left */
1026 right_sum = sum = 0;
1027 beg1 = q = (s1+1+pos1) - in_W;
1028 beg2 = s = (s2+1+pos2) - in_W;
1029 while ((s>s2+1) && (q>s1+1) && sum >= right_sum - X) {
1030 sum += ((*(--s) == *(--q)) ? MATCH : MISMATCH);
1031 if (sum > right_sum) {
1032 right_sum = sum;
1033 beg2 = s;
1034 beg1 = q;
1035 }
1036 }
1037
1038 score = in_W + left_sum + right_sum;
1039 if (score >= K) {
1040 Msp_ptr mp = (Msp_ptr)ckalloc(sizeof(*mp));
1041
1042 mp->len = end1 - beg1;
1043 mp->score = score;
1044 mp->pos1 = beg1 - (s1+1);
1045 mp->pos2 = beg2 - (s2+1);
1046 mp->next_msp = msp_list;
1047 msp_list = mp;
1048 ++numMSPs;
1049 }
1050
1051 /*diag_lev[diag] = (end1 - (s1+1)) + in_W; */
1052 diag_lev[diag] = (end1 - s1) - 1 + in_W;
1053 }
1054
1055 /* ---------------------------- sort the MSPs ----------------------------*/
1056
1057 /* sort_msps - order database sequence for printing */
sort_msps(void)1058 static void sort_msps(void)
1059 {
1060 int i;
1061 Msp_ptr mp;
1062
1063 for (i = (numMSPs/2) - 1; i >= 0; --i)
1064 heapify(i, (int) numMSPs-1);
1065 for (i = numMSPs-1; i > 0; --i) {
1066 mp = msp[0];
1067 msp[0] = msp[i];
1068 msp[i] = mp;
1069 if (i > 1)
1070 heapify(0, i-1);
1071 }
1072 }
1073
1074 /* heapify - core procedure for heapsort */
heapify(i,last)1075 static void heapify(i, last)
1076 int i, last;
1077 {
1078 int lim = (last-1)/2, left_son, small_son;
1079 Msp_ptr mp;
1080
1081 while (i <= lim) {
1082 left_son = 2*i + 1;
1083 if (left_son == last)
1084 small_son = left_son;
1085 else
1086 small_son = smaller(left_son, left_son+1);
1087 if (smaller(i, small_son) == small_son) {
1088 mp = msp[i];
1089 msp[i] = msp[small_son];
1090 msp[small_son] = mp;
1091 i = small_son;
1092 } else
1093 break;
1094 }
1095 }
1096 /* smaller - determine ordering relationship between two MSPs */
smaller(i,j)1097 static int smaller(i, j)
1098 int i, j;
1099 {
1100 Msp_ptr ki = msp[i], kj = msp[j];
1101
1102 if (ki->pos2 > kj->pos2) return i;
1103 if (ki->pos2 < kj->pos2) return j;
1104
1105 return ((ki->pos1 >= kj->pos1) ? i : j);
1106 }
1107
1108 /* --------------------- organize the MSPs into exons ---------------------*/
1109
msp2exons(Msp_ptr * msp,int last_msp,uchar * s1,uchar * s2)1110 static void msp2exons(Msp_ptr *msp, int last_msp, uchar *s1, uchar *s2)
1111 {
1112 Msp_ptr mp;
1113 int diag_dist, diff;
1114
1115 exon_list = NULL;
1116 if (last_msp<0) return;
1117
1118 /* Note: in new_exon, the 'flag' and 'length' fields need not be computed */
1119 mp = msp[last_msp];
1120 exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1,
1121 mp->pos2+mp->len-1, -1,
1122 (mp->len*MATCH-mp->score)/(MATCH-MISMATCH), 0, exon_list);
1123
1124 last_msp = mp->prev;
1125
1126 while (last_msp>=0) {
1127 mp = msp[last_msp];
1128 if (((diag_dist=abs((exon_list->from2-exon_list->from1)-(mp->pos2-mp->pos1)))<=L)
1129 && (exon_list->from2-(mp->pos2+mp->len-1))<MAX_INTERNAL_GAP) {
1130 /* merge with previous exon */
1131 exon_list->edist += diag_dist;
1132 exon_list->edist += (mp->len*MATCH-mp->score)/(MATCH-MISMATCH);
1133 if ((diff=mp->pos2+mp->len-exon_list->from2)>0) { /* overlap */
1134 int dist1, dist2;
1135 dist1 = get_edist(exon_list->from1,mp->pos2+mp->len-diff,
1136 exon_list->from1+diff-1,mp->pos2+mp->len-1,s1,s2);
1137 dist2 = get_edist(mp->pos1+mp->len-diff,mp->pos2+mp->len-diff,
1138 mp->pos1+mp->len-1,mp->pos2+mp->len-1,s1,s2);
1139 exon_list->edist -= max(dist1,dist2);
1140 } else if (diff<0) { /* gap */
1141 exon_list->edist += 0.5*P*(-1)*diff;
1142 }
1143 exon_list->to1 = max(exon_list->to1,mp->pos1+mp->len-1);
1144 exon_list->to2 = max(exon_list->to2,mp->pos2+mp->len-1);
1145 exon_list->from1 = min(exon_list->from1,mp->pos1);
1146 exon_list->from2 = min(exon_list->from2,mp->pos2);
1147 } else {
1148 /* new exon */
1149 exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1,
1150 mp->pos2+mp->len-1, -1,
1151 (mp->len*MATCH-mp->score)/(MATCH-MISMATCH),
1152 0, exon_list);
1153 }
1154 last_msp = mp->prev;
1155 }
1156 }
1157
get_edist(int f1,int f2,int t1,int t2,uchar * seq1,uchar * seq2)1158 static int get_edist(int f1, int f2, int t1, int t2, uchar *seq1, uchar *seq2)
1159 {
1160 uchar *s1, *s2, *q1, *q2;
1161 int dist=0;
1162
1163 s1 = seq1+f1+1; /* bc at this stage, the msp pos do not have added +1 */
1164 s2 = seq2+f2+1;
1165 q1 = seq1+t1+1;
1166 q2 = seq2+t2+1;
1167
1168 while (s1<=q1 && s2<=q2) { dist += (*s1!=*s2); s1++; s2++; }
1169
1170 return dist;
1171 }
1172
1173 /* ---------------------- print endpoints of exons --------------------*/
1174
1175 #ifdef AUXUTILS
find_introns(Exon * eleft,Intron ** Ilist)1176 static void find_introns(Exon *eleft, Intron **Ilist)
1177 {
1178 Exon *tmp_exon, *tmp_exon1;
1179 Intron *new, *tail;
1180 int GTAG_score, CTAC_score;
1181
1182 *Ilist = tail = NULL;
1183 if (!eleft) fatal("sim4b1.c: Something wrong in the exon list.\n");
1184
1185 tmp_exon = eleft->next_exon;
1186 while ((tmp_exon!=NULL) && (tmp_exon1=tmp_exon->next_exon) &&
1187 tmp_exon1->to1) {
1188
1189 new = (Intron *)ckalloc(sizeof(Intron));
1190 new->from1 = tmp_exon->to1+1;
1191 new->to1 = tmp_exon1->from1-1;
1192 new->from2 = tmp_exon->to2;
1193 new->to2 = tmp_exon1->from2;
1194 new->length = new->to1-new->from1+1;
1195 new->next_intron = NULL;
1196 if (!tail) *Ilist = new;
1197 else tail->next_intron = new;
1198 tail = new;
1199
1200 /* find orientation */
1201 GTAG_score = CTAC_score = 0;
1202 if (*(seq1+new->from1-1)=='G') GTAG_score++;
1203 else
1204 if (*(seq1+new->from1-1)=='C') CTAC_score++;
1205
1206 if (*(seq1+new->from1-1)=='T')
1207 { GTAG_score++; CTAC_score++; }
1208
1209 if (*(seq1+new->to1-1)=='A')
1210 { GTAG_score++; CTAC_score++; }
1211
1212 if (*(seq1+new->to1-1)=='G') GTAG_score++;
1213 else
1214 if (*(seq1+new->to1-1)=='C') CTAC_score++;
1215
1216 if (GTAG_score>=CTAC_score)
1217 new->orientation = '+';
1218 else new->orientation = 'c';
1219
1220 tmp_exon = tmp_exon1;
1221 }
1222
1223 }
1224 #endif
1225
1226
1227 /* should only be called when (file_type==EST_GEN) && (match_ori==BWD) */
complement_exons(Exon ** left,int M,int N)1228 void complement_exons(Exon **left, int M, int N)
1229 {
1230 Exon *tmp_block, *right;
1231 char prev, ch;
1232
1233 prev = 'U'; /* unknown; should trigger error */
1234 tmp_block = *left;
1235 while (tmp_block) {
1236 if (tmp_block->to1) {
1237 register int aux;
1238
1239 if (tmp_block->next_exon && tmp_block->next_exon->to1) {
1240 ch = tmp_block->ori;
1241 tmp_block->ori = prev;
1242 switch (ch) {
1243 case 'C': prev = 'G'; break;
1244 case 'G': prev = 'C'; break;
1245 case 'N': prev = 'N'; break;
1246 case 'E': prev = 'E'; break;
1247 default : fatal("sim4b1.c: Inconsistency. Check exon orientation at complementation.");
1248 }
1249 } else
1250 tmp_block->ori = prev;
1251 aux = tmp_block->from1;
1252 tmp_block->from1 = M+1-tmp_block->to1;
1253 tmp_block->to1 = M+1-aux;
1254 aux = tmp_block->from2;
1255 tmp_block->from2 = N+1-tmp_block->to2;
1256 tmp_block->to2 = N+1-aux;
1257 }
1258 tmp_block = tmp_block->next_exon;
1259 if (tmp_block && tmp_block->to1)
1260 right = tmp_block;
1261 }
1262 flip_list(left,&right);
1263 }
1264
print_exons(Exon * left)1265 void print_exons(Exon *left)
1266 {
1267 Exon *tmp_block, *tmp_block1;
1268
1269 tmp_block = left;
1270 while (tmp_block!=NULL) {
1271 if (tmp_block->to1) {
1272
1273 if (file_type==EST_GEN)
1274 (void)fprintf(stdout,"%d-%d (%d-%d) %d%%",
1275 tmp_block->from2, tmp_block->to2,
1276 tmp_block->from1, tmp_block->to1,
1277 tmp_block->match);
1278 else
1279 /* file_type==GEN_EST */
1280 (void)fprintf(stdout,"%d-%d (%d-%d) %d%%",
1281 tmp_block->from1, tmp_block->to1,
1282 tmp_block->from2, tmp_block->to2,
1283 tmp_block->match);
1284 if (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1)
1285 switch (tmp_block->ori) {
1286 case 'C': (void)fprintf(stdout," <-\n");
1287 break;
1288 case 'E': (void)fprintf(stdout," ==\n");
1289 break;
1290 case 'G': (void)fprintf(stdout," ->\n");
1291 break;
1292 case 'N': (void)fprintf(stdout," --\n");
1293 break;
1294 default : fatal("sim4b1.c: Inconsistency. Check exon orientations.");
1295 }
1296 }
1297 tmp_block = tmp_block->next_exon;
1298 }
1299 }
1300
1301 /* to and from are in the original cDNA sequence */
print_pipmaker_exons(Exon * exons,edit_script_list * aligns,char * gene,int from,int to,int M,int N,uchar * seq1,uchar * seq2,int match_ori)1302 void print_pipmaker_exons(Exon *exons, edit_script_list *aligns, char *gene,
1303 int from, int to, int M, int N, uchar *seq1, uchar *seq2,
1304 int match_ori)
1305 {
1306 Exon *tmp_block, *left, *right;
1307 int From, To, cov, ori;
1308
1309 /* print the first line in the record */
1310 if ((exons==NULL) ||
1311 (!exons->to1 && ((exons->next_exon==NULL) || !exons->next_exon)))
1312 return;
1313
1314 left = right = tmp_block = (exons->to1) ? exons : exons->next_exon;
1315 while (tmp_block) {
1316 if (tmp_block->to1) right = tmp_block;
1317 tmp_block = tmp_block->next_exon;
1318 }
1319 /* report any inconsistencies between the intron orientations, as well
1320 as between the mRNA (CDS) match strand and the introns orientations */
1321 ori = check_consistency_intron_ori(exons, match_ori, gene);
1322
1323 /* determine the matching coordinates for the CDS */
1324 if ((from>0) && (to>0) && aligns) {
1325 cov = dispatch_find_ends(from, to, &From, &To, aligns, M, N, match_ori);
1326 switch (cov) {
1327 case OK: if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3))
1328 fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene);
1329 else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3))
1330 fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1331 if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) &&
1332 strncmp((char *)(seq1+To-3),"TAG",3) &&
1333 strncmp((char *)(seq1+To-3),"TGA",3))
1334 fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1335 else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) &&
1336 strncmp((char *)(seq1+From-1),"CTA",3) &&
1337 strncmp((char *)(seq1+From-1),"TCA",3))
1338 fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene);
1339 break;
1340
1341 case FREE_START:
1342 fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene);
1343 if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) &&
1344 strncmp((char *)(seq1+To-3),"TAG",3) &&
1345 strncmp((char *)(seq1+To-3),"TGA",3))
1346 fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1347 else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) &&
1348 strncmp((char *)(seq1+From-1),"CTA",3) &&
1349 strncmp((char *)(seq1+From-1),"TCA",3))
1350 fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene);
1351
1352 break;
1353
1354 case FREE_END:
1355 fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene);
1356 if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3))
1357 fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene);
1358 else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3))
1359 fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1360
1361 break;
1362
1363 case FREE_BOTH_ENDS:
1364 fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene);
1365 fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene);
1366 break;
1367
1368 default: fatal("Unrecognized warning code.");
1369 }
1370 }
1371
1372 /* report codon inconsistencies in the cDNA */
1373 if (to>0 && from>0) {
1374 if (strncmp((char *)(seq2+from-1),"ATG",3))
1375 fprintf(stderr, "Warning: No start codon at location %d in the mRNA (%s).\n", from, gene);
1376 if (strncmp((char *)(seq2+to-3),"TAA",3) &&
1377 strncmp((char *)(seq2+to-3),"TAG",3) &&
1378 strncmp((char *)(seq2+to-3),"TGA",3))
1379 fprintf(stderr, "Warning: No end codon at location %d in the mRNA (%s).\n", to-2, gene);
1380 }
1381
1382 printf("%c %d %d %s%s\n",
1383 (ori==FWD) ? '>':'<', left->from1, right->to1, gene ? gene:"",
1384 (match_ori==FWD) ? "":" (complement)");
1385 if ((from>0) && (to>0) && aligns)
1386 printf("+ %d %d\n", From, To);
1387
1388 /* now print the exons */
1389 /* if (match_ori==BWD) flip_list(&left,&right); not accepted by PipMaker */
1390 tmp_block = left;
1391 while (tmp_block!=NULL) {
1392 if (tmp_block->to1)
1393 (void)fprintf(stdout,"%d %d\n",
1394 tmp_block->from1, tmp_block->to1);
1395 tmp_block = tmp_block->next_exon;
1396 }
1397 if (match_ori==BWD) flip_list(&left, &right);
1398
1399 return;
1400 }
1401
check_consistency_intron_ori(Exon * exons,int match_ori,char * gene)1402 static int check_consistency_intron_ori(Exon *exons, int match_ori, char *gene)
1403 {
1404 Exon *t=exons;
1405 int numG, numC, numE, numN;
1406
1407 numG = numC = numE = numN = 0;
1408
1409 if (!t->to1) t = t->next_exon;
1410 while (t && t->to1) {
1411 if (t->next_exon && t->next_exon->to1) {
1412 switch (t->ori) {
1413 case 'G': numG++; break;
1414 case 'C': numC++; break;
1415 case 'N': numN++; break;
1416 case 'E': numE++; break;
1417 default : fatal("sim4b1.c: Unrecognized intron orientation.");
1418 }
1419 }
1420 t = t->next_exon;
1421 }
1422 if (numG && numC)
1423 fprintf(stderr, "Warning: Introns reported on both strands (%s).\n", gene);
1424 /*
1425 Note: a match can be reverse complemented either b/c the complement
1426 genomic sequence was used as input, while the mRNA was actually transcribed
1427 in forward orientation (CT_AC), or b/c the forward strand was given for the
1428 genomic sequence, but the mRNA was transcribed in the reverse direction
1429 (GT_AG); hence there is no relevant test for this
1430 else if ((numG && (match_ori==BWD)) || (numC && (match_ori==FWD)))
1431 fprintf(stderr, "Warning: Introns orientations inconsistent with the reported match strand (%s).\n", gene);
1432 */
1433
1434 if (numN)
1435 fprintf(stderr, "Warning: Ambiguous intron orientation (%s).\n", gene);
1436 if (numE)
1437 fprintf(stderr, "Warning: Internal gap in the mRNA (%s).\n", gene);
1438
1439 return (numG>=numC) ? FWD:BWD;
1440 }
1441
1442
1443 /* from and to are given in the original cDNA sequence */
dispatch_find_ends(int from,int to,int * From,int * To,edit_script_list * aligns,int M,int N,int match_ori)1444 static int dispatch_find_ends(int from, int to, int *From, int *To, edit_script_list *aligns, int M, int N, int match_ori)
1445 {
1446 int f1, f2, t1, t2, ot1, ot2, xto, xfrom;
1447 int free_start, free_end;
1448
1449 free_start = free_end = 1;
1450
1451 if (aligns->next_script && (aligns->offset2 > aligns->next_script->offset2))
1452 script_flip_list(&aligns);
1453
1454 if (match_ori==FWD) {
1455 xto = to; xfrom = from;
1456 } else if (file_type == EST_GEN) {
1457 xto = to; xfrom = from;
1458 } else {
1459 xto = N-from+1; xfrom = N-to+1;
1460 }
1461
1462 *From = *To = 0; t1 = t2 = 0;
1463 while (aligns) {
1464 ot2 = t2; ot1 = t1;
1465 f1 = aligns->offset1; f2 = aligns->offset2;
1466 t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1;
1467 if (ot2 < xfrom && xfrom < f2) {
1468 *From = f1; break;
1469 }
1470 if (f2 <= xfrom && xfrom <= t2) {
1471 *From = find_ends(aligns, xfrom); free_start = 0; break;
1472 }
1473 aligns = aligns->next_script;
1474 }
1475 if (*From == 0) return FREE_BOTH_ENDS;
1476
1477 if (ot2 < xto && xto < f2) *To = ot1;
1478 else if (xto <= t2) { *To = find_ends(aligns, xto); free_end = 0; }
1479 else {
1480 *To = 0;
1481 while (aligns && ((aligns=aligns->next_script)!=NULL)) {
1482 ot2 = t2; ot1 = t1;
1483 f1 = aligns->offset1; f2 = aligns->offset2;
1484 t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1;
1485 if (ot2 < xto && xto < f2) { *To = ot1; break; }
1486 if (t2 < xto) *To = t1;
1487 else if (f2 <= xto && xto <=t2) {
1488 *To = find_ends(aligns, xto); free_end = 0; break;
1489 }
1490 }
1491 if (*To==0) *To = t1;
1492 }
1493
1494 if (*To == 0) { *From = 0; free_start = 1; }
1495
1496 if (*To && *From && match_ori==BWD && file_type==EST_GEN) {
1497 int aux = M-(*From)+1; *From = M-(*To)+1; *To = aux;
1498 }
1499
1500 if (free_start && free_end) return FREE_BOTH_ENDS;
1501 else if (free_start) return FREE_START;
1502 else if (free_end) return FREE_END;
1503
1504 return OK;
1505 }
1506
find_ends(edit_script_list * head,int j0)1507 static int find_ends(edit_script_list *head, int j0)
1508 {
1509 int i, j, e1, e2;
1510 edit_script *tp;
1511
1512 i = head->offset1; e1 = i+head->len1-1;
1513 j = head->offset2; e2 = j+head->len2-1;
1514
1515 tp = head->script;
1516 i--; j--;
1517 while (i<=e1 && j<=e2 && tp) {
1518 if (j==j0) return i;
1519 switch (tp->op_type) {
1520 case DELETE: i += tp->num; break;
1521 case INSERT: j += tp->num; if (j>=j0) return i; break;
1522 case SUBSTITUTE: i += tp->num; j += tp->num;
1523 if (j>=j0) return (i-(j-j0)); break;
1524 default: fatal("Illegal opcode in script.");
1525 }
1526 tp = tp->next;
1527 }
1528
1529 /* not found: failure */
1530 fatal("Inconsistency in script.");
1531 }
1532
1533
1534 #ifdef AUXUTILS
print_introns(Intron_ptr intron_list)1535 static void print_introns(Intron_ptr intron_list)
1536 {
1537 Intron_ptr ep=intron_list;
1538
1539 (void)printf("\nIntrons {\n\n");
1540 while (ep!=NULL) {
1541 (void)printf("genome: %6ld - %-6ld cDNA: %5ld - %-5ld, l: %-5d, o: %c \n",
1542 ep->from1, ep->to1, ep->from2, ep->to2,
1543 ep->length,
1544 ep->orientation);
1545 ep = ep->next_intron;
1546 }
1547 (void)printf("}\n\n");
1548 }
1549 #endif
1550
1551
pluri_align(int * dist_ptr,int * num_matches,Exon * lblock,struct edit_script_list ** Aligns)1552 static void pluri_align(int *dist_ptr,int *num_matches,Exon *lblock,struct edit_script_list **Aligns)
1553 {
1554 int tmpi, di_count, i, end1, end2, diff, ali_dist, nmatches;
1555 uchar *a, *b;
1556 Exon *tmp_block=lblock, *tmp_block1;
1557
1558 struct edit_script_list *enew;
1559 struct edit_script *head, *tmp_script, *new, *left, *right, *prev;
1560
1561 nmatches = 0;
1562
1563 head = NULL;
1564 *Aligns = NULL;
1565 *dist_ptr = ali_dist = 0;
1566
1567 end1 = M;
1568 end2 = N;
1569
1570 while (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1) {
1571
1572 if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0) {
1573 if (tmp_block->to1) {
1574 enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1575 enew->next_script = *Aligns;
1576 *Aligns = enew;
1577 (*Aligns)->script = head;
1578 (*Aligns)->offset1 = tmp_block->from1;
1579 (*Aligns)->offset2 = tmp_block->from2;
1580 (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1581 (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1582 (*Aligns)->score = ali_dist;
1583 ali_dist = 0;
1584 head = NULL;
1585 }
1586 end1 = tmp_block1->to1;
1587 end2 = tmp_block1->to2;
1588
1589 } else if (((diff=tmp_block->from1-tmp_block1->to1-1)!=0) &&
1590 tmp_block->to1) {
1591 new = (edit_script *) ckalloc(sizeof(edit_script));
1592 new->op_type = DELETE;
1593 new->num = diff;
1594 new->next = head;
1595 head = new;
1596 } else if (diff)
1597 end1 = tmp_block1->to1;
1598
1599 diff = align_get_dist(tmp_block1->from1-1, tmp_block1->from2-1,
1600 tmp_block1->to1, tmp_block1->to2,
1601 max(1000,.2*(tmp_block1->to2-tmp_block1->from2+1)));
1602
1603 if (diff<0) {
1604 (void)printf("The two sequences are not really similar.\n");
1605 (void)printf("Please try an exact method.\n");
1606 exit(1);
1607 }
1608
1609 #ifdef STATS
1610 if (diff>P*(tmp_block1->to2-tmp_block1->from2+1))
1611 (void)printf("Warning: Distance threshold on segment exceeded.\n");
1612 #endif
1613
1614 align_path(tmp_block1->from1-1, tmp_block1->from2-1,
1615 tmp_block1->to1, tmp_block1->to2, diff, &left, &right);
1616
1617 Condense_both_Ends(&left, &right, &prev);
1618
1619 if (!tmp_block->to1 && right->op_type == DELETE) {
1620 /* remove gaps at end of alignment */
1621 diff -= 0+right->num; /* subtract GAP_OPEN = 0 */
1622 tmp_block1->to1 -= right->num;
1623 end1 -= right->num;
1624 if (head && (head->op_type == DELETE))
1625 head->num += right->num;
1626 free(right); prev->next = NULL;
1627 right = prev;
1628 }
1629 if ((!tmp_block1->next_exon || !tmp_block1->next_exon->to1) &&
1630 left && (left->op_type == DELETE)) {
1631 diff -= 0+left->num; /* subtract GAP_OPEN = 0 */
1632 tmp_block1->from1 += left->num;
1633 tmp_script = left->next;
1634 if (right == left) right = tmp_script;
1635 free(left); left = tmp_script;
1636 }
1637
1638 *dist_ptr += diff;
1639 ali_dist += diff;
1640
1641 a = seq1+tmp_block1->from1-1;
1642 b = seq2+tmp_block1->from2-1;
1643 tmpi = di_count = 0;
1644 tmp_script = left;
1645 while (tmp_script) {
1646 switch (tmp_script->op_type) {
1647 case DELETE:
1648 di_count += tmp_script->num;
1649 tmpi += tmp_script->num;
1650 a += tmp_script->num;
1651 break;
1652 case INSERT:
1653 di_count += tmp_script->num;
1654 tmpi += tmp_script->num;
1655 b += tmp_script->num;
1656 break;
1657 case SUBSTITUTE:
1658 for (i=0; i<tmp_script->num; ++i, ++a, ++b)
1659 if (*a!=*b) tmpi++; else nmatches++;
1660 break;
1661 }
1662 tmp_script = tmp_script->next;
1663 }
1664 tmp_block1->alen = (int)((tmp_block1->to1-tmp_block1->from1+1+
1665 tmp_block1->to2-tmp_block1->from2+1+di_count)/(double)2);
1666 tmp_block1->nmatches = tmp_block1->alen - tmpi;
1667 tmp_block1->match = (int)floor(100*(1-tmpi/(double)tmp_block1->alen));
1668
1669 right->next = head;
1670 head = left;
1671 tmp_block = tmp_block1;
1672 }
1673
1674
1675 /* at the beginning of the sequences */
1676 if (tmp_block1!=NULL) {
1677
1678 if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0 && (diff!=N)) {
1679 enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1680 enew->next_script = *Aligns;
1681 *Aligns = enew;
1682 (*Aligns)->offset1 = tmp_block->from1;
1683 (*Aligns)->offset2 = tmp_block->from2;
1684 (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1685 (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1686 (*Aligns)->script = head;
1687 (*Aligns)->score = ali_dist;
1688
1689 } else if (diff!=N) {
1690
1691 /* modified to cut introns at the beginning of the sequence */
1692 enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1693 enew->next_script = *Aligns;
1694 *Aligns = enew;
1695 (*Aligns)->offset1 = tmp_block->from1;
1696 (*Aligns)->offset2 = 1;
1697 (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1698 (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1699 (*Aligns)->script = head;
1700 (*Aligns)->score = ali_dist;
1701 }
1702 }
1703 *num_matches = nmatches;
1704 }
1705
new_exon(int f1,int f2,int t1,int t2,int len,int edist,int flag,Exon * next)1706 static Exon *new_exon(int f1, int f2, int t1, int t2, int len, int edist, int flag, Exon *next)
1707 {
1708 Exon *new = (Exon *)ckalloc(sizeof(Exon));
1709
1710 new->from1 = f1;
1711 new->from2 = f2;
1712 new->to1 = t1;
1713 new->to2 = t2;
1714 new->length = (len < 0) ? (t2-f2+1) : len;
1715 new->edist = edist;
1716 new->flag = flag;
1717 new->next_exon = next;
1718
1719 return new;
1720 }
1721
1722
get_stats(Exon * lblock,sim4_stats_t * st)1723 static void get_stats(Exon *lblock, sim4_stats_t *st)
1724 {
1725 Exon *t, *t1;
1726
1727 #ifdef _STATS
1728 t = lblock;
1729 if (!t->next_exon) { /* no alignment found */
1730 st->marginals = 2.0;
1731 } else {
1732 while (t) {
1733 if ((t1 = t->next_exon)!=NULL) {
1734 if (!t->to1 && t1->to1)
1735 st->marginals += (float)(t1->from2-1)/t1->to2;
1736 else if (t->to1 && !t1->to1)
1737 st->marginals += (float)(N-t->to2)/
1738 (N-t->from2+1);
1739 else if (!t->to1 && !t1->to1)
1740 st->marginals = 2.0;
1741 }
1742 t = t1;
1743 }
1744 }
1745 st->marginals = st->marginals/2;
1746 #endif
1747
1748 st->icoverage = 0; st->internal = 1; st->mult = 0;
1749 t = lblock->next_exon;
1750 while (t) {
1751 st->icoverage += t->length;
1752 if (t->length) st->mult++;
1753 t = t->next_exon;
1754 }
1755 st->fcoverage = ((float)st->icoverage)/N;
1756
1757 t = lblock;
1758 if ((t->next_exon==NULL) || !t->next_exon->to1)
1759 st->internal = 0;
1760 while (t) {
1761 if ((t->to1) && ((t1=t->next_exon)!=NULL) &&
1762 (t1->from2-t->to2-1>0) && t1->to1)
1763 st->internal = 0;
1764 t = t->next_exon;
1765 }
1766 }
1767
resolve_overlap(Exon * tmp_block,Exon * tmp_block1,uchar * seq1)1768 static int resolve_overlap(Exon *tmp_block, Exon *tmp_block1, uchar *seq1)
1769 {
1770 int diff, best_u, l0, l1, u, cost;
1771 int GTAG_score, CTAC_score;
1772 uchar *s1, *s2, *e1;
1773
1774 diff = tmp_block1->from2-tmp_block->to2-1;
1775 if (diff>=0) return (tmp_block1->from2-1);
1776
1777 /* resolve overlap using the GT-AG criterion */
1778 /* u-1 = actual position in the sequence */
1779
1780 l0 = tmp_block->length-diff;
1781 l1 = tmp_block1->length;
1782
1783 best_u = u = tmp_block1->from2-1;
1784 s1 = seq1+tmp_block->to1-(tmp_block->to2-u);
1785 s2 = seq1-2+tmp_block1->from1+u-tmp_block1->from2;
1786
1787 cost = 0;
1788 e1 = seq1+tmp_block->to1;
1789 while (s1<=e1) {
1790 GTAG_score = CTAC_score = 0;
1791 GTAG_score += ((char)(*s1)=='G') ? 1 : 0;
1792 GTAG_score += ((char)(*(s1+1))=='T') ? 1 : 0;
1793 GTAG_score += ((char)(*s2)=='A') ? 1 : 0;
1794 GTAG_score += ((char)(*(s2+1))=='G') ? 1 : 0;
1795
1796 if (GTAG_score > abs(cost) && ((l0>=8) || (l1>=8))) {
1797 cost = GTAG_score;
1798 best_u = u;
1799 if (cost == 4) break;
1800 }
1801
1802 CTAC_score += ((char)(*s1)=='C') ? 1 : 0;
1803 CTAC_score += ((char)(*(s1+1))=='T') ? 1 : 0;
1804 CTAC_score += ((char)(*s2)=='A') ? 1 : 0;
1805 CTAC_score += ((char)(*(s2+1))=='C') ? 1 : 0;
1806
1807 if (CTAC_score > abs(cost)) {
1808 cost = -CTAC_score;
1809 best_u = u;
1810 if (cost == 4) break;
1811 }
1812
1813 u++; s1++; s2++;
1814 l0++; l1--;
1815 }
1816
1817 return best_u;
1818 }
1819
1820
greedy(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,Exon ** lblock,Exon ** rblock)1821 static int greedy(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, Exon **lblock, Exon **rblock)
1822 {
1823 int col, /* column number */
1824 d, /* current distance */
1825 k, /* current diagonal */
1826 max_d, /* bound on size of edit script */
1827 Cost,
1828 blower,flower, /* boundaries for searching diagonals */
1829 bupper,fupper,
1830 row, /* row number */
1831 DELTA, /* n-m */
1832 MAX_D,
1833 B_ORIGIN, F_ORIGIN;
1834 int back, forth; /* backward and forward limits at exit */
1835
1836 int *blast_d, *flast_d, /* rows containing the last d (at crt step, d-1) */
1837 *btemp_d, *ftemp_d; /* rows containing tmp values for the last d */
1838 int *min_row, *min_diag, /* min (b)/ max (f) row (and diagonal) */
1839 *max_row, *max_diag; /* reached for cost d=0, ... m. */
1840
1841
1842 DELTA = n-m;
1843 /*max_d = MAX_D = m+1; */
1844 max_d = MAX_D = max(W,(int)(P*m+1));
1845
1846 if (DELTA<0) {
1847 if (m<=min(W,(1+P)*n)) {
1848 *lblock = *rblock = new_exon(offset2+1,offset1+1,offset2+n,offset1+m,
1849 m,n-m+(int)(P*m+1),0,NULL);
1850 return m-n+(int)(P*n+1);
1851 } else {
1852 *lblock = *rblock = NULL;
1853 return max(W,(int)(P*m+1))+1;
1854 }
1855 }
1856
1857 F_ORIGIN = MAX_D;
1858 B_ORIGIN = MAX_D-DELTA;
1859 for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
1860 /*LINTED empty loop body*/;
1861
1862 if (row == 0) {
1863 /* hit last row; stop search */
1864 *lblock = *rblock = new_exon(offset2-m+n+1,offset1+1,offset2+n,
1865 offset1+m,m,0,0,NULL);
1866 return 0;
1867 }
1868
1869
1870 blast_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1871 btemp_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1872
1873 for (k=0; k<=MAX_D+n; ++k) { blast_d[k]=m+1; btemp_d[k]=m+1; }
1874 blast_d[B_ORIGIN+DELTA] = row;
1875
1876 blower = B_ORIGIN + DELTA - 1;
1877 bupper = B_ORIGIN + DELTA + 1;
1878
1879
1880 for (row=0; row<n && row<m && (s1[row]==s2[row]); row++)
1881 /*LINTED empty loop body*/;
1882
1883 if (row == m) {
1884 /* hit last row; stop search */
1885 *lblock = *rblock = new_exon(offset2+1,offset1+1,offset2+m,
1886 offset1+m,m,0,0,NULL);
1887 free(blast_d); free(btemp_d);
1888
1889 return 0;
1890 }
1891
1892 flast_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1893 ftemp_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1894
1895 for (k=0; k<=MAX_D+n; ++k) { flast_d[k]=-1; ftemp_d[k]=-1; }
1896 flast_d[F_ORIGIN] = row;
1897
1898 flower = F_ORIGIN - 1;
1899 fupper = F_ORIGIN + 1;
1900
1901 max_row = (int *)ckalloc((MAX_D+1)*sizeof(int));
1902 min_row = (int *)ckalloc((MAX_D+1)*sizeof(int));
1903 max_diag = (int *)ckalloc((MAX_D+1)*sizeof(int));
1904 min_diag = (int *)ckalloc((MAX_D+1)*sizeof(int));
1905
1906 for (d=1; d<=MAX_D; d++) {
1907 min_row[d] = m+1; max_row[d] = -1;
1908 }
1909 min_row[0] = blast_d[B_ORIGIN+DELTA];
1910 min_diag[0] = B_ORIGIN+DELTA;
1911 max_row[0] = flast_d[F_ORIGIN];
1912 max_diag[0] = F_ORIGIN;
1913
1914 back = forth = -1;
1915
1916 d = 1;
1917 while (d <= max_d) {
1918
1919 /* for each relevant diagonal ... */
1920 for (k = blower; k <= bupper; k++) {
1921 /* process the next edit instruction */
1922
1923 /* find a d on diagonal k */
1924 if (k==-d+DELTA+B_ORIGIN) {
1925
1926 /* move left from the last d-1 on diagonal k+1 */
1927 row = blast_d[k+1]; /* INSERT */
1928 } else if (k==d+DELTA+B_ORIGIN) {
1929
1930 /* move up from the last d-1 on diagonal k-1 */
1931 row = blast_d[k-1]-1; /* DELETE */
1932 } else if ((blast_d[k]<=blast_d[k+1]) &&
1933 (blast_d[k]-1<=blast_d[k-1])) {
1934
1935 /* substitution */
1936 row = blast_d[k]-1; /* SUBSTITUTE */
1937
1938 } else if ((blast_d[k-1]<=blast_d[k+1]-1) &&
1939 (blast_d[k-1]<=blast_d[k]-1)) {
1940 /* move right from the last d-1 on diagonal k-1 */
1941 row = blast_d[k-1]-1; /* DELETE */
1942 } else {
1943 /* move left from the last d-1 on diagonal k+1 */
1944 row = blast_d[k+1]; /* INSERT */
1945 }
1946 /* code common to the three cases */
1947 col = row + k - B_ORIGIN;
1948
1949 /* slide up the diagonal */
1950 while (row > 0 && col > 0 && (s1[row-1]==s2[col-1])) {
1951 --row;
1952 --col;
1953 }
1954 btemp_d[k] = row;
1955
1956 /* if (row == 0 || col == 0) max_d = d; */
1957
1958 } /* for k */
1959
1960 min_row[d] = btemp_d[DELTA+B_ORIGIN];
1961 min_diag[d] = DELTA+B_ORIGIN;
1962 for (k=blower; k<=bupper; ++k) {
1963 blast_d[k] = btemp_d[k]; btemp_d[k] = m+1;
1964 if (blast_d[k]<min_row[d]) {
1965 min_row[d] = blast_d[k];
1966 min_diag[d] = k;
1967 }
1968 }
1969
1970 /* record cell, if paths overlap with minimum combined cost */
1971 /* obs: it suffices to search up to Cost=min(d-1,(max_d-d)) */
1972 for (Cost=0; Cost<d; Cost++) {
1973 if ((min_row[d]<=max_row[Cost]) &&
1974 ((max_d > d+Cost) || (max_d==d+Cost && (forth<0)))) {
1975 max_d = d+Cost;
1976 back = d;
1977 forth = Cost;
1978 break;
1979 }
1980 }
1981
1982 --blower; ++bupper;
1983
1984 /* for each relevant diagonal ... */
1985 for (k = flower; k <= fupper; k++) {
1986 /* process the next edit instruction */
1987
1988 /* find a d on diagonal k */
1989 if (k==-d+F_ORIGIN) {
1990 /* move down from the last d-1 on diagonal k+1 */
1991 row = flast_d[k+1]+1; /* DELETE */
1992
1993 } else if (k==d+F_ORIGIN) {
1994 /* move right from the last d-1 on diagonal k-1 */
1995 row = flast_d[k-1]; /* INSERT */
1996
1997 } else if ((flast_d[k]>=flast_d[k+1]) &&
1998 (flast_d[k]+1>=flast_d[k-1])) {
1999
2000 /* substitution */
2001 row = flast_d[k]+1; /* SUBSTITUTE */
2002
2003 } else if ((flast_d[k+1]+1>=flast_d[k-1]) &&
2004 (flast_d[k+1]>=flast_d[k])) {
2005
2006 /* move left from the last d-1 on diagonal k+1 */
2007 row = flast_d[k+1]+1; /* DELETE */
2008 } else {
2009 /* move right from the last d-1 on diagonal k-1 */
2010 row = flast_d[k-1]; /* INSERT */
2011 }
2012 /* code common to the three cases */
2013 col = row + k - F_ORIGIN;
2014
2015 /* slide down the diagonal */
2016 if (row>=0)
2017 while (row < m && col < n && (s1[row]==s2[col])) {
2018 ++row;
2019 ++col;
2020 }
2021 ftemp_d[k] = row;
2022
2023 /* if (row == m || col == n) max_d = d; */
2024 } /* for k */
2025
2026 max_row[d] = ftemp_d[F_ORIGIN];
2027 max_diag[d] = F_ORIGIN;
2028 for (k=flower; k<=fupper; ++k) {
2029 flast_d[k] = ftemp_d[k]; ftemp_d[k] = -1;
2030 if (flast_d[k]>max_row[d]) {
2031 max_row[d] = flast_d[k];
2032 max_diag[d] = k;
2033 }
2034 }
2035
2036 /* record backward and forward limits, if minimum combined
2037 * cost in overlapping. Note: it suffices to search up to
2038 * Cost=min(d,(max_d-d)).
2039 */
2040 for (Cost=0; Cost<=d; Cost++) {
2041 if ((min_row[Cost]<=max_row[d]) &&
2042 ((max_d>d+Cost) || (max_d==d+Cost && (forth<0)))) {
2043 max_d = d+Cost;
2044 back = Cost;
2045 forth = d;
2046 break;
2047 }
2048 }
2049 --flower; ++fupper;
2050
2051 ++d; /* for d */
2052 }
2053
2054 if (d>MAX_D) {
2055 *lblock = *rblock = NULL;
2056
2057 free(blast_d); free(btemp_d);
2058 free(flast_d); free(ftemp_d);
2059 free(min_row); free(min_diag);
2060 free(max_row); free(max_diag);
2061
2062 return d;
2063 }
2064 /*fin:*/
2065 if (m-min_row[back]>=max_row[forth]) {
2066 *rblock = new_exon(offset2+1+min_row[back]+min_diag[back]-B_ORIGIN,
2067 offset1+1+min_row[back],
2068 offset2+n,offset1+m,
2069 m-min_row[back],back,0,NULL);
2070 *lblock = new_exon(offset2+1,offset1+1,
2071 offset2+min_row[back]+max_diag[forth]-F_ORIGIN,
2072 offset1+min_row[back],
2073 min_row[back],forth,0,*rblock);
2074 } else {
2075 *rblock = new_exon(offset2+1+max_row[forth]+min_diag[back]-B_ORIGIN,
2076 offset1+1+max_row[forth],
2077 offset2+n,offset1+m,m-max_row[forth],back,0,NULL);
2078 *lblock = new_exon(offset2+1,offset1+1,
2079 offset2+max_row[forth]+max_diag[forth]-F_ORIGIN,
2080 offset1+max_row[forth],max_row[forth],forth,0,*rblock);
2081 }
2082
2083 free(blast_d); free(btemp_d);
2084 free(flast_d); free(ftemp_d);
2085 free(min_row); free(min_diag);
2086 free(max_row); free(max_diag);
2087
2088 return back+forth;
2089 }
2090
2091
flip_list(Exon ** left,Exon ** right)2092 void flip_list(Exon **left, Exon **right)
2093 {
2094 Exon *ep, *ahead, *behind;
2095
2096 *right = *left;
2097 ahead = *left;
2098 ep = NULL;
2099 while (ahead!=NULL) {
2100 behind = ep;
2101 ep = ahead;
2102 ahead = ahead->next_exon;
2103 ep->next_exon = behind;
2104 }
2105 *left = ep;
2106 }
2107
2108 /* reverse a list of edit script chains */
script_flip_list(edit_script_list ** left)2109 void script_flip_list(edit_script_list **left)
2110 {
2111 edit_script_list *ep, *ahead, *behind;
2112
2113 ahead = *left;
2114 ep = NULL;
2115 while (ahead!=NULL) {
2116 behind = ep; ep = ahead;
2117 ahead = ahead->next_script;
2118 ep->next_script = behind;
2119 }
2120 *left = ep;
2121 }
2122
2123
2124 /* operates on a list sorted in increasing order of exon coordinates */
compact_list(Exon ** Lblock,Exon ** Rblock)2125 static void compact_list(Exon **Lblock, Exon **Rblock)
2126 {
2127 Exon *tmp_block=*Lblock, *tmp_block1;
2128 int diff;
2129
2130 while ((tmp_block!=NULL) &&
2131 ((tmp_block1=tmp_block->next_exon)!=NULL) &&
2132 tmp_block1->to1) {
2133 if ((abs((tmp_block1->from2-tmp_block1->from1) -
2134 (tmp_block->to2-tmp_block->to1))<=W) &&
2135 ((diff=tmp_block1->from2-tmp_block->to2-1)<=MAX_INTERNAL_GAP)) {
2136 /* merge blocks */
2137 tmp_block->to1 = tmp_block1->to1;
2138 tmp_block->to2 = tmp_block1->to2;
2139 tmp_block->length = tmp_block->to2-tmp_block->from2+1;
2140 tmp_block->edist += tmp_block1->edist;
2141 tmp_block->edist -= P*diff;
2142 tmp_block->next_exon = tmp_block1->next_exon;
2143
2144 free(tmp_block1);
2145 } else
2146 tmp_block = tmp_block1;
2147 }
2148 /* reset right end of the list */
2149 *Rblock = tmp_block;
2150 }
2151
2152 /* ------------------ memory management routines --------------- */
2153
link_to_data_list(Pointer data,ValNodePtr * head,ValNodePtr * prev)2154 void link_to_data_list(Pointer data, ValNodePtr *head, ValNodePtr *prev)
2155 {
2156 ValNodePtr curr;
2157
2158 curr = (ValNodePtr)ckalloc(sizeof(struct ValNode));
2159 curr->data = data;
2160 curr->next = NULL;
2161
2162 if(*prev == NULL)
2163 *head = curr;
2164 else
2165 (*prev)->next = curr;
2166 *prev = curr;
2167 }
2168
ValNodeFreeData(ValNodePtr data_list)2169 void ValNodeFreeData(ValNodePtr data_list)
2170 {
2171 ValNodePtr tmp_node;
2172
2173 while ((tmp_node=data_list)!=NULL) {
2174 free(tmp_node->data);
2175 data_list = data_list->next;
2176 free(tmp_node);
2177 }
2178 }
2179
2180
good_ratio(int length)2181 int good_ratio(int length)
2182 {
2183 if (length<=W/2) return 2;
2184 else if (length<2*W) return rs.cutoff;
2185 else return (int)(.75*P*length+1);
2186 }
2187
extend_bw(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,int * line1,int * line2)2188 static int extend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
2189 {
2190 int col, /* column number */
2191 row, /* row number */
2192 max_d, /* bound on the length of the edit script
2193 */
2194 d, /* current compressed distance */
2195 k, /* current diagonal */
2196 DELTA, /* n-m */
2197 ORIGIN,
2198 lower,
2199 upper;
2200 int *last_d, *temp_d; /* column containing the last p */
2201 int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
2202 /* reached for cost d=0, ... m. */
2203 DELTA = n-m;
2204 max_d = m+1;
2205
2206 ORIGIN = m;
2207 for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
2208 /*LINTED empty loop body*/;
2209
2210 if ((row == 0) || (col == 0)) {
2211 *line1 = row+offset1;
2212 *line2 = col+offset2;
2213
2214 return 0;
2215 }
2216
2217 last_d = (int *)ckalloc((m+n+1)*sizeof(int));
2218 temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
2219
2220 for (k=0; k<=m+n; ++k) last_d[k]=m+1;
2221 last_d[ORIGIN+DELTA] = row;
2222
2223 lower = ORIGIN + DELTA - 1;
2224 upper = ORIGIN + DELTA + 1;
2225
2226 min_row = (int *)ckalloc((m+1)*sizeof(int));
2227 min_diag = (int *)ckalloc((m+1)*sizeof(int));
2228
2229 for (d=1; d<=m; d++)
2230 min_row[d] = m+1;
2231
2232 min_row[0] = last_d[ORIGIN+DELTA];
2233 min_diag[0] = ORIGIN + DELTA;
2234
2235 d = 0;
2236 while ((++d<=max_d) &&
2237 ((d-1<=good_ratio(m-min_row[d-1])) ||
2238 ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) {
2239
2240 /* for each relevant diagonal ... */
2241 for (k = lower; k <= upper; k++) {
2242
2243 /* find a d on diagonal k */
2244 if (k==-d+DELTA+ORIGIN) {
2245 /* move down from the last d-1 on diagonal k+1 */
2246 row = last_d[k+1];
2247 /* op = INSERT; */
2248
2249 } else if (k==d+DELTA+ORIGIN) {
2250 /* move right from the last d-1 on diagonal k-1 */
2251 row = last_d[k-1]-1;
2252 /* op = DELETE; */
2253
2254 } else if ((last_d[k]-1<=last_d[k+1]) &&
2255 (last_d[k]-1<=last_d[k-1]-1)) {
2256 /* substitution */
2257 row = last_d[k]-1;
2258 /* op = SUBSTITUTE; */
2259
2260 } else if ((last_d[k-1]-1<=last_d[k+1]) &&
2261 (last_d[k-1]-1<=last_d[k]-1)) {
2262 /* move right from the last d-1 on diagonal k-1 */
2263 row = last_d[k-1]-1;
2264 /* op = DELETE; */
2265
2266 } else {
2267 /* move left from the last d-1 on diagonal k+1 */
2268 row = last_d[k+1];
2269 /* op = INSERT; */
2270
2271 }
2272
2273 /* code common to the three cases */
2274 /* slide down the diagonal */
2275
2276 col = row+k-ORIGIN;
2277
2278 while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1]))
2279 { row--; col--; }
2280
2281 temp_d[k] = row;
2282
2283 if ((row == 0) && (col == 0)) {
2284 /* hit southeast corner; have the answer */
2285
2286 free(last_d); free(temp_d);
2287 free(min_row); free(min_diag);
2288
2289 *line1 = row+offset1;
2290 *line2 = col+offset2;
2291
2292 return d;
2293 }
2294 if (row == 0) {
2295 /* hit first row; don't look further */
2296
2297 free(last_d); free(temp_d);
2298 free(min_row); free(min_diag);
2299
2300 *line1 = row+offset1;
2301 *line2 = col+offset2;
2302
2303 return d;
2304 }
2305
2306 if (col == 0) {
2307 /* hit last column; don't look further */
2308 free(last_d); free(temp_d);
2309 free(min_row); free(min_diag);
2310
2311 *line1 = row+offset1;
2312 *line2 = col+offset2;
2313
2314 return d;
2315 }
2316 }
2317
2318 min_row[d] = last_d[ORIGIN+DELTA];
2319 min_diag[d] = ORIGIN+DELTA;
2320 for (k=lower; k<=upper; ++k)
2321 if (temp_d[k]<min_row[d]) {
2322 min_row[d] = temp_d[k];
2323 min_diag[d] = k;
2324 }
2325
2326 for (k=lower; k<=upper; k++) {
2327 last_d[k] = temp_d[k];
2328 }
2329
2330 --lower;
2331 ++upper;
2332 }
2333
2334 /* report here the previous maximal match, stored in min_diag and min_row */
2335 while ((d>0) && (min_row[d-1]-min_row[d]<3))
2336 d--;
2337
2338 *line1 = min_row[d]+offset1;
2339 *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
2340
2341 free(min_row);
2342 free(min_diag);
2343 free(last_d);
2344 free(temp_d);
2345
2346 return d;
2347 }
2348
2349
extend_fw(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,int * line1,int * line2)2350 static int extend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
2351 {
2352 int col, /* column number */
2353 row, /* row number */
2354 max_d, /* bound on the length of the edit script
2355 */
2356 d, /* current compressed distance */
2357 k, /* current diagonal */
2358 ORIGIN,
2359 lower,
2360 upper;
2361 int *last_d, *temp_d; /* column containing the last p */
2362 int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */
2363 /* reached for cost d=0, ... m. */
2364 max_d = m+1;
2365
2366 ORIGIN = m;
2367 for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++)
2368 /*LINTED empty loop body*/;
2369
2370 if (row == m) {
2371 *line1 = row+offset1;
2372 *line2 = col+offset2;
2373
2374 return 0;
2375 }
2376 if (col == n) {
2377 *line1 = row+offset1;
2378 *line2 = col+offset2;
2379
2380 return 0;
2381 }
2382
2383 last_d = (int *)ckalloc((m+n+1)*sizeof(int));
2384 temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
2385
2386 for (k=0; k<=m+n; ++k) last_d[k]=-1;
2387 last_d[ORIGIN] = row;
2388
2389 lower = ORIGIN - 1;
2390 upper = ORIGIN + 1;
2391
2392 max_row = (int *)ckalloc((m+1)*sizeof(int));
2393 max_diag = (int *)ckalloc((m+1)*sizeof(int));
2394
2395 for (d=1; d<=m; d++)
2396 max_row[d] = -1;
2397
2398 max_row[0] = last_d[ORIGIN];
2399 max_diag[0] = ORIGIN;
2400
2401
2402 d = 0;
2403 while ((++d<=max_d) &&
2404 ((d-1<=good_ratio(max_row[d-1])) ||
2405 ((d>=2) && (d-2<=good_ratio(max_row[d-2]))))) {
2406
2407 /* for each relevant diagonal ... */
2408 for (k = lower; k <= upper; k++) {
2409
2410 /* find a d on diagonal k */
2411 if (k==-d+ORIGIN) {
2412
2413 /* move down from the last d-1 on diagonal k+1 */
2414 row = last_d[k+1]+1;
2415 /* op = DELETE; */
2416 } else if (k==d+ORIGIN) {
2417
2418 /* move right from the last d-1 on diagonal k-1 */
2419 row = last_d[k-1];
2420 /* op = INSERT; */
2421 } else if ((last_d[k]>=last_d[k+1]) &&
2422 (last_d[k]+1>=last_d[k-1])) {
2423
2424 /* substitution */
2425 row = last_d[k]+1;
2426 /* op = SUBSTITUTE; */
2427 } else if ((last_d[k+1]+1>=last_d[k-1]) &&
2428 (last_d[k+1]>=last_d[k])) {
2429
2430 /* move down from the last d-1 on diagonal k+1 */
2431 row = last_d[k+1]+1;
2432 /* op = DELETE; */
2433 } else {
2434
2435 /* move right from the last d-1 on diagonal k-1 */
2436 row = last_d[k-1];
2437 /* op = INSERT; */
2438 }
2439
2440 /* code common to the three cases */
2441 /* slide down the diagonal */
2442
2443 col = row+k-ORIGIN;
2444
2445 if (row>=0)
2446 while ((row < m) && (col < n) && (s1[row]==s2[col]))
2447 { row++; col++; }
2448
2449 temp_d[k] = row;
2450
2451 if ((row == m) && (col == n)) {
2452 /* hit southeast corner; have the answer */
2453 free(last_d); free(temp_d);
2454 free(max_row); free(max_diag);
2455 *line1 = row+offset1;
2456 *line2 = col+offset2;
2457
2458 return d;
2459 }
2460 if (row == m) {
2461 /* hit last row; don't look further */
2462 free(temp_d); free(last_d);
2463 free(max_row); free(max_diag);
2464
2465 *line1 = row+offset1;
2466 *line2 = col+offset2;
2467
2468 return d;
2469 }
2470
2471 if (col == n) {
2472 /* hit last column; don't look further */
2473 free(temp_d); free(last_d);
2474 free(max_row); free(max_diag);
2475
2476 *line1 = row+offset1;
2477 *line2 = col+offset2;
2478
2479 return d;
2480 }
2481 }
2482 max_row[d] = last_d[ORIGIN];
2483 max_diag[d] = ORIGIN;
2484 for (k=lower; k<=upper; ++k)
2485 if (temp_d[k]>max_row[d]) {
2486 max_row[d] = temp_d[k];
2487 max_diag[d] = k;
2488 }
2489
2490 for (k=lower; k<=upper; k++) {
2491 last_d[k] = temp_d[k];
2492 }
2493
2494 --lower;
2495 ++upper;
2496 }
2497
2498 /* report here the previous maximal match, stored in max_diag and max_row */
2499
2500 while ((d>0) && (max_row[d]-max_row[d-1]<3))
2501 d--;
2502
2503 *line1 = max_row[d]+offset1;
2504 *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
2505
2506 free(max_row);
2507 free(max_diag);
2508 free(last_d);
2509 free(temp_d);
2510
2511 return d;
2512
2513 /*
2514 if ((d>2) && (max_row[d-1]-max_row[d-2]<3)) {
2515 *line1 = max_row[d-2]+offset1;
2516 *line2 = max_row[d-2]+max_diag[d-2]-ORIGIN+offset2;
2517
2518 free(max_row); free(max_diag);
2519 free(last_d); free(temp_d);
2520
2521 return d-2;
2522 }
2523
2524 *line1 = max_row[d-1]+offset1;
2525 *line2 = max_row[d-1]+max_diag[d-1]-ORIGIN+offset2;
2526
2527 free(max_row);
2528 free(max_diag);
2529 free(last_d);
2530 free(temp_d);
2531
2532 return d-1;
2533 */
2534 }
2535
merge(Exon ** t0,Exon ** t1)2536 static void merge(Exon **t0, Exon **t1)
2537 {
2538 Exon *tmp0, *tmp1;
2539 int diff;
2540
2541 if ((*t0) && !(*t0)->to1)
2542 tmp0 = (*t0)->next_exon;
2543 else
2544 tmp0 = *t0;
2545
2546 while (tmp0 && (tmp0!=*t1)) {
2547 tmp1 = tmp0->next_exon;
2548 assert(tmp1!=NULL);
2549 if (tmp1 && tmp1->to1 && tmp0->to1 &&
2550 (abs((tmp1->from2-tmp1->from1)-(tmp0->to2-tmp0->to1))<=W) &&
2551 ((diff=tmp1->from2-tmp0->to2-1<=W))) {
2552
2553 /* merge blocks tmp0 and tmp1 */
2554 tmp0->from1 = min(tmp0->from1, tmp1->from1);
2555 tmp0->from2 = min(tmp0->from2, tmp1->from2);
2556 tmp0->to1 = max(tmp1->to1, tmp0->to1);
2557 tmp0->to2 = max(tmp1->to2, tmp0->to2);
2558 tmp0->length = tmp0->to2-tmp0->from2+1;
2559 tmp0->flag = tmp1->flag;
2560 tmp0->edist += tmp1->edist;
2561 tmp0->edist -= P*diff;
2562 if (tmp1==*t1) {
2563 /* tmp0->flag = (*t1)->flag; */
2564 *t1 = tmp0;
2565 }
2566 tmp0->next_exon = tmp1->next_exon;
2567
2568 free(tmp1);
2569 } else
2570 tmp0 = tmp0->next_exon;
2571 }
2572 }
2573
free_align(edit_script_list * aligns)2574 void free_align(edit_script_list *aligns)
2575 {
2576 edit_script_list *head;
2577
2578 head = aligns;
2579
2580 while ((head=aligns)!=NULL) {
2581 aligns = aligns->next_script;
2582 Free_script(head->script);
2583 free(head);
2584 }
2585 }
2586
2587
free_list(Exon * left)2588 void free_list(Exon *left)
2589 {
2590 Exon *tmp_block;
2591
2592 while ((tmp_block=left)!=NULL) {
2593 left = left->next_exon;
2594 free(tmp_block);
2595 }
2596 }
2597
free_table(void)2598 void free_table(void)
2599 {
2600 register struct hash_node *hptr, *tptr;
2601 register int hval;
2602
2603 free(pnext_pos);
2604 for (hval=0; hval<HASH_SIZE+1; hval++) {
2605 hptr = phashtab[hval];
2606 while (hptr) {
2607 tptr = hptr;
2608 hptr = hptr->link;
2609 free(tptr);
2610 }
2611 }
2612 }
2613
bmatch(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2)2614 static Exon *bmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2)
2615 {
2616 int i, j, i1, score;
2617 Exon *new=NULL;
2618
2619 for (i1=i=len1-3; i>=len2-3; i--, i1=i) {
2620 for (j=len2-3; j>=2; j--, i1--)
2621 if (*(s1+i1)!=*(s2+j))
2622 break;
2623
2624 if (j<2) {
2625 /* exact match for CDS found; check signals */
2626 score = 0;
2627 if (*(s1+(i1--))==*(s2+(j--))) score++;
2628 if (*(s1+(i1--))==*(s2+(j--))) score++;
2629 if (*(s1+i1+len2-1)==*(s2+j+len2-1)) score++;
2630 if (*(s1+i1+len2)==*(s2+j+len2)) score++;
2631 if (score>=3) {
2632 new = new_exon(i1+3+offset1, offset2, i1+3+offset1+len2-5,
2633 offset2+len2-5, len2-4, 0, 0, NULL);
2634 new->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
2635
2636 return new;
2637 }
2638 }
2639 }
2640 return NULL;
2641 }
2642
fmatch(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2)2643 static Exon *fmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2)
2644 {
2645 int i, j, i1, score;
2646 Exon *new=NULL;
2647
2648 for (i1=i=2; i<len1-len2+3; i++, i1=i) {
2649 for (j=2; j<len2-2; j++, i1++)
2650 if (*(s1+i1)!=*(s2+j))
2651 break;
2652
2653 if (j>=len2-2) {
2654 /* exact match found for internal part, look for signals */
2655 score = 0;
2656 if (*(s1+(i1++))==*(s2+(j++))) score++;
2657 if (*(s1+(i1++))==*(s2+(j++))) score++;
2658 if (*(s1+i1-len2)==*s2) score++;
2659 if (*(s1+i1-len2+1)==*(s2+1)) score++;
2660 if (score>=3) {
2661 new = new_exon(i+offset1,offset2,i1+offset1-2,offset2+len2-5,
2662 len2-4,0,0,NULL);
2663 new->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
2664
2665 return new;
2666 }
2667 }
2668 }
2669 return NULL;
2670 }
2671
2672 #ifdef DEBUG
debug_print_exons(Exon * lblock,const char * label)2673 static void debug_print_exons (Exon *lblock, const char *label)
2674 {
2675 Exon *tmp_block = lblock;
2676
2677 (void)fprintf(stderr,"\n====================%s:\n\n", label);
2678 while (tmp_block) {
2679 (void)fprintf(stderr," [ %d, %d, %d, %d, l: %d ]\n ", tmp_block->from1,
2680 tmp_block->from2, tmp_block->to1,
2681 tmp_block->to2, tmp_block->length);
2682 tmp_block = tmp_block->next_exon;
2683 }
2684 }
2685 #endif
2686
2687 /* -------------------- to be added to psublast ---------------------- */
2688
seq_toupper(uchar * seq,int len,char * filename)2689 void seq_toupper(uchar *seq, int len, char *filename)
2690 {
2691 int i=0, flag = 0;
2692 uchar *s=seq;
2693
2694 for (; *s && (i<min(100,len)); i++, s++)
2695 if (islower(*s)) { flag = 1; break; }
2696
2697 if (flag) {
2698 fprintf(stderr, "Warning: lowercase letter in %s data. Convert sequence to uppercase.\n", filename);
2699 for (s=seq, i=0; *s && (i<len); s++, i++)
2700 *s = toupper(*s);
2701 }
2702 }
2703
get_sync_flag(Exon * lblock,Exon * rblock,int w)2704 static bool get_sync_flag(Exon *lblock, Exon *rblock, int w)
2705 {
2706 int numx=0, e2;
2707 Exon *t;
2708
2709 if (((t=lblock->next_exon)==NULL) || !t->to1) return FALSE;
2710 numx++; e2 = t->to2;
2711
2712 while (((t=t->next_exon)!=NULL) && t->to1) {
2713 ++numx;
2714 if ((t->from2-e2>1) ||
2715 (t!=rblock && ((t->to2-t->from2+1<2*w+2) ||
2716 (t->to1-t->from1+1<2*w+2)))) return FALSE;
2717 e2 = t->to2;
2718 }
2719 return ((numx<3) ? FALSE:TRUE);
2720 }
2721
2722
sync_slide_intron(int in_w,Exon ** lblock,uchar * seq1,uchar * seq2)2723 static void sync_slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2)
2724 {
2725 Exon *t0=NULL, *t1=NULL, *head = *lblock;
2726 splice_t *g=NULL, *c=NULL, *cell=NULL;
2727 splice_t *Glist[500], *Clist[500];
2728 int Gscore=0, Cscore=0;
2729 char oris[500];
2730 int w1, w2, ni, i, numC, numG;
2731
2732 memset(Glist, 0, 200*sizeof(splice_t *));
2733 memset(Clist, 0, 200*sizeof(splice_t *));
2734
2735 ni = 0; numG = numC = 0;
2736
2737 /* assume forward orientation */
2738 t0 = head->next_exon;
2739 while (t0 && (t1=t0->next_exon) && t1->to1) {
2740 g = c = NULL;
2741 if (t1->from2-t0->to2-1==0) {
2742 if (!strncmp((char *)(seq1+t0->to1),"GT",2) &&
2743 !strncmp((char *)(seq1+t1->from1-3),"AG",2)) {
2744 g = new_splice('G',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL);
2745 t0->ori = 'G';
2746 oris[ni] = 'G';
2747 numG++;
2748 } else if (!strncmp((char *)(seq1+t0->to1),"CT",2) &&
2749 !strncmp((char *)(seq1+t1->from1-3),"AC",2)) {
2750 c = new_splice('C',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL);
2751 t0->ori = 'C';
2752 oris[ni] = 'C';
2753 numC++;
2754 } else {
2755 w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2756 w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2757 splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2,
2758 seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH);
2759
2760 Gscore += g->score; Cscore += c->score;
2761 cell = NULL; oris[ni] = '*';
2762 if (g->score>c->score) {
2763 numG++; cell = g; oris[ni] = 'G';
2764 } else if (c->score>g->score) {
2765 numC++; cell = c; oris[ni] = 'C';
2766 } else if (c->score==g->score) {
2767 numG++; numC++; cell = g; oris[ni] = 'G';
2768 }
2769 t0->ori = oris[ni];
2770 t0->to1 = cell->xs; t0->to2 = cell->ys;
2771 t1->from1 = cell->xe; t1->from2 = cell->ye;
2772 t0->length = t0->to2-t0->from2+1;
2773 t1->length = t1->to2-t1->from2+1;
2774 }
2775 Clist[ni] = c; Glist[ni] = g;
2776 } else {
2777 t0->ori = 'E'; oris[ni] = 'E';
2778 }
2779 ni++;
2780 t0 = t1;
2781 }
2782
2783 /*
2784 if (numG==ni) {
2785 for (i=0; i<ni; i++) {
2786 if (Glist[i]) free(Glist[i]);
2787 if (Clist[i]) free(Clist[i]);
2788 }
2789 return;
2790 }
2791 */
2792 if ((numG==1) && (numC==1) &&
2793 (!Glist[0] || !Clist[0] || !Glist[1] || !Clist[1])) goto free_all;
2794
2795 if (numG>=numC) {
2796 /* revisit all previous assignments that are inconsistent */
2797 for (i=0, t0=head->next_exon; i<ni; i++, t0=t1) {
2798 t1 = t0->next_exon;
2799 switch (oris[i]) {
2800 case 'G': break;
2801 case 'C': if (Glist[i]==NULL) {
2802 /* compute the values for C */
2803 w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2804 w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2805 splice(seq1, t0->to1-w1, t0->to1+w1,
2806 t1->from1-w2, t1->from1+w2, seq2,
2807 t0->to2-w1, t1->from2+w2, &g, &c, FWD);
2808 } else g = Glist[i];
2809
2810 t0->ori = 'G';
2811 t0->to1 = g->xs; t0->to2 = g->ys;
2812 t1->from1 = g->xe; t1->from2 = g->ye;
2813 t0->length = t0->to2-t0->from2+1;
2814 t1->length = t1->to2-t1->from2+1;
2815
2816 break;
2817 case 'E': break;
2818 default : fatal("sim4b1.c: intron orientation not initialized.");
2819 }
2820 if (oris[i]!='E') wobble(&t0,&t1,"GT","AG",seq1);
2821 }
2822 } else {
2823 /* analyze all assignments for consistency */
2824 for (i=0, t0=head->next_exon; i<ni; i++, t0=t1) {
2825 t1 = t0->next_exon;
2826 switch (oris[i]) {
2827 case 'C': break;
2828 case 'G': if (Clist[i]==NULL) {
2829 /* compute the values for C */
2830 w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2831 w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2832 splice(seq1, t0->to1-w1, t0->to1+w1,
2833 t1->from1-w2, t1->from1+w2,
2834 seq2, t0->to2-w1, t1->from2+w2, &g, &c, BWD);
2835 } else c = Clist[i];
2836
2837 t0->ori = 'C';
2838 t0->to1 = c->xs; t0->to2 = c->ys;
2839 t1->from1 = c->xe; t1->from2 = c->ye;
2840 t0->length = t0->to2-t0->from2+1;
2841 t1->length = t1->to2-t1->from2+1;
2842 break;
2843 case 'E': break;
2844 default : fatal("sim4b1.c: intron orientation not initialized.");
2845 }
2846 if (oris[i]!='E') wobble(&t0,&t1,"CT","AC",seq1);
2847 }
2848 }
2849
2850 /* now free all memory allocated */
2851 free_all:
2852 for (i=0; i<ni; i++) {
2853 if (Glist[i]) free(Glist[i]);
2854 if (Clist[i]) free(Clist[i]);
2855 }
2856 return;
2857 }
2858
wobble(Exon ** t0,Exon ** t1,const char * donor,const char * acceptor,uchar * seq1)2859 static void wobble(Exon **t0, Exon **t1, const char *donor, const char *acceptor, uchar *seq1)
2860 {
2861 uchar *s = seq1+(*t0)->to1; /* first nt of donor */
2862 uchar *q = seq1+(*t1)->from1-3; /* first nt of acceptor */
2863
2864 if (!strncmp((char *)(s), donor, 2)) {
2865 /* match in place */
2866 if (!strncmp((char *)(q), acceptor, 2)) {
2867 return;
2868 } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2869 (*t1)->from1--; return;
2870 } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2871 (*t1)->from1++; return;
2872 }
2873
2874 } else if (!strncmp((char *)(s-1), donor, 2)) {
2875 /* match is 1 off to the left */
2876 if (!strncmp((char *)(q), acceptor, 2)) {
2877 (*t0)->to1--; return;
2878 } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2879 (*t0)->to1--; (*t1)->from1--;
2880 (*t0)->to2--; (*t1)->from2--;
2881 (*t0)->length++; (*t1)->length--;
2882 return;
2883 } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2884 (*t0)->to1--; (*t1)->from1++; return;
2885 }
2886
2887 } else if (!strncmp((char *)(s+1), donor, 2)) {
2888 /* match is 1 off to the right */
2889 if (!strncmp((char *)(q), acceptor, 2)) {
2890 (*t0)->to1++; return;
2891 } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2892 (*t0)->to1++; (*t1)->from1--; return;
2893 } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2894 (*t0)->to1++; (*t1)->from1++;
2895 (*t0)->to2++; (*t1)->from2++;
2896 (*t0)->length--; (*t1)->length++;
2897 return;
2898 }
2899 } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2900 /* match is 1 off to the left */
2901 (*t1)->from1--; return;
2902
2903 } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2904 /* match is 1 off to the right */
2905 (*t1)->from1++; return;
2906 }
2907
2908 return;
2909 }
2910
slide_intron(int in_w,Exon ** lblock,uchar * seq1,uchar * seq2)2911 static void slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2)
2912 {
2913 Exon *t0, *t1, *head = *lblock;
2914 splice_t *g, *c, *cell;
2915 char type;
2916 int w1, w2;
2917
2918 t0 = head->next_exon;
2919 while (t0 && (t1=t0->next_exon) && t1->to1) {
2920 g = c = NULL;
2921 if (t1->from2-t0->to2-1==0) {
2922 if (!strncmp((char *)(seq1+t0->to1),"GT",2) &&
2923 !strncmp((char *)(seq1+t1->from1-3),"AG",2))
2924 t0->ori = 'G';
2925 else if (!strncmp((char *)(seq1+t0->to1),"CT",2) &&
2926 !strncmp((char *)(seq1+t1->from1-3),"AC",2))
2927 t0->ori = 'C';
2928 else {
2929 int gtag=0, ctac=0;
2930 uchar *s;
2931
2932 w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2933 w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2934 splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2,
2935 seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH);
2936 if (g->score>c->score) { cell = g; type = 'G'; }
2937 else if (c->score>g->score) { cell = c; type = 'C'; }
2938 else { cell = g; type = 'G'; }
2939
2940 t0->to1 = cell->xs; t0->to2 = cell->ys;
2941 t1->from1 = cell->xe; t1->from2 = cell->ye;
2942 t0->length = t0->to2-t0->from2+1;
2943 t1->length = t1->to2-t1->from2+1;
2944
2945 wobble(&t0,&t1,(type=='G')? "GT":"CT",(type=='G')? "AG":"AC",seq1);
2946
2947 free(g); free(c);
2948
2949 /* determine the type, based on the # matches w/ GT-AG (CT-AC) */
2950 s = seq1+t0->to1;
2951 if (*s=='G') gtag++; else if (*s=='C') ctac++;
2952 ++s;
2953 if (*s=='T') { gtag++; ctac++;}
2954 s = seq1+t1->from1-3;
2955 if (*s=='A') { gtag++; ctac++; }
2956 ++s;
2957 if (*s=='G') gtag++; else if (*s=='C') ctac++;
2958 if (gtag>ctac) type = 'G';
2959 else if (ctac>gtag) type = 'C';
2960 else type = 'N';
2961
2962 t0->ori = type;
2963 }
2964 } else
2965 t0->ori = 'E';
2966 t0 = t1;
2967 }
2968
2969 }
2970
2971 #ifdef AUXUTILS
remove_polyA_tails(Exon * lblock,uchar * seq1,uchar * seq2,int len2)2972 static void remove_polyA_tails(Exon *lblock, uchar *seq1, uchar *seq2, int len2)
2973 {
2974 Exon *t, *prev;
2975 uchar *s, *q;
2976 int xcut, diff, u, tmp, I, J, first = 1;
2977
2978 t = lblock->next_exon; prev = lblock;
2979
2980 s = seq2; q = seq2+len2-1; while (s<=q && *s=='T') s++;
2981 if (s>=seq2+t->from2-1) {
2982 while (t && t->to1) {
2983 s = seq2+t->from2-1; q = seq2+t->to2-1;
2984 if (first && strncmp((char *)s,"TTTTT",5)) break;
2985 first = 0;
2986 while ((s<=q) && (*s=='T')) s++;
2987 diff = t->to2-(s-seq2); u = min(diff,12);
2988 xcut = t->to2-t->from2+1-diff;
2989 if (diff>6) {
2990 tmp = (int)((1+P)*u); /* was diff */
2991 /*
2992 EXTEND_BW(s,seq1+t->to1-(diff-u)-tmp,u,tmp,
2993 s-seq2,t->to1-(diff-u)-tmp,&I,&J);
2994 */
2995 EXTEND_BW(seq2+t->from2+xcut-1,
2996 seq1+t->from1+(int)((1-P)*xcut)-1,
2997 u,(int)(P*xcut+(1+P)*u)+1, /* 1 is for looser margin */
2998 t->from2+xcut-1,t->from1+(int)((1-P)*xcut)-1,
2999 &I,&J);
3000 t->from2 = I+1; t->from1 = J+1;
3001 t->length = t->to2-t->from2+1;
3002 break;
3003 } else if (diff>0) {
3004 prev->next_exon = t->next_exon; free(t); t = prev->next_exon;
3005
3006 tmp = (int)((1+P)*diff);
3007 EXTEND_BW(s,seq1+t->from1-tmp,diff,tmp,
3008 s-seq2,t->from1-tmp,&I,&J);
3009 t->from2 = I+1; t->from1 = J+1; t->length = t->to2-t->from2+1;
3010 break;
3011 } else {
3012 /* remove entire exon and repeat the process */
3013 prev->next_exon = t->next_exon; free(t); t = prev->next_exon;
3014 continue;
3015 }
3016 }
3017 }
3018
3019 while (t && t->next_exon && t->next_exon->to1) {
3020 prev = t; t = t->next_exon;
3021 }
3022
3023 first = 1;
3024 s = seq2; q = seq2+len2-1; while (q>=s && *q=='A') q--;
3025 if (t && t->to1 && q<seq2+t->to2-1) {
3026 while (t && t->to1) {
3027 s = seq2+t->to2-1; q = seq2+t->from2-1;
3028 if (first && strncmp((char *)(s-4), "AAAAA", 5)) break;
3029 first = 0;
3030 while ((s>=q) && (*s=='A')) s--;
3031 diff = (int)(s-seq2+1)-t->from2+1; u = min(diff, 12);
3032 xcut = t->to2-t->from2+1-diff;
3033 if (diff>6) {
3034 /*
3035 EXTEND_FW(seq2+t->from2+(diff-u)-1, seq1+t->from1+(diff-u)-1,
3036 u, (int)((1+P)*u), t->from2+(diff-u)-1,
3037 t->from1+(diff-u)-1, &I, &J);
3038
3039 */
3040 EXTEND_FW(s-u+1, seq1+t->to1-(int)((1+P)*xcut)-(int)((1+P)*u),
3041 u,(int)((1+P)*u+P*xcut)+1, /*+1 is for looser margin */
3042 t->from2+(diff-u)-1,
3043 t->to1-(int)((1+P)*xcut)-(int)((1+P)*u), &I, &J);
3044
3045 t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1;
3046 break;
3047 } else if (diff>0) {
3048 if (prev==NULL) prev = find_previous(lblock, t);
3049 assert(prev!=NULL);
3050 prev->next_exon = t->next_exon; free(t);
3051 t = prev->next_exon; prev = NULL;
3052 EXTEND_FW(seq2+t->to2, seq1+t->to1, diff, (int)((1+P)*diff),
3053 t->to2, t->to1, &I, &J);
3054 t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1;
3055 break;
3056 } else {
3057 if (prev==NULL) prev = find_previous(lblock, t);
3058 assert(prev!=NULL);
3059 prev->next_exon = t->next_exon; free(t);
3060 t = prev->next_exon; prev = NULL;
3061 continue;
3062 }
3063 }
3064 }
3065
3066 return;
3067 }
3068 #endif
3069
find_previous(Exon * lblock,Exon * tgt)3070 Exon *find_previous(Exon *lblock, Exon *tgt)
3071 {
3072 Exon *t=lblock;
3073
3074 while (t && (t->next_exon!=tgt)) t = t->next_exon;
3075 if (t==NULL)
3076 fatal("sim4b1.c: Corrupted exon list: could not find previous.");
3077
3078 return t;
3079 }
3080
get_match_quality(Exon * lblock,Exon * rblock,sim4_stats_t * st,int N)3081 static bool get_match_quality(Exon *lblock, Exon *rblock, sim4_stats_t *st, int N)
3082 {
3083 int tcov;
3084 bool good_match;
3085 Exon *t;
3086
3087 good_match = TRUE; st->icoverage = 0;
3088 t = lblock->next_exon;
3089 while (t->to1) {
3090 st->icoverage += t->to2-t->from2+1;
3091 if (100*t->edist>=5*(t->to2-t->from2+1)) {
3092 good_match = FALSE; break;
3093 }
3094 t = t->next_exon;
3095 }
3096 tcov = rblock->to2-lblock->next_exon->from2+1;
3097 if (lblock->next_exon->from2>=.5*N &&
3098 tcov>=.8*(N-lblock->next_exon->from2) &&
3099 st->icoverage>=max(.95*tcov,100))
3100 ;
3101 else if (rblock->to2<=.5*N && tcov>=.8*rblock->to2 &&
3102 st->icoverage>=max(.95*tcov,100))
3103 ;
3104 else if ((tcov<.8*N) || (st->icoverage<.9*tcov))
3105 good_match = FALSE;
3106
3107 return good_match;
3108 }
3109