1 /*
2  * sim4.c - A program to align a cDNA sequence with a genomic sequence
3  *          for the gene.
4  */
5 
6 #ifndef __lint
7 /*@unused@*/
8 static const char rcsid[] =
9 "$Id: sim4b1.c,v 1.64 2002/03/03 23:29:48 florea Exp $";
10 #endif
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <math.h>
16 #include <assert.h>
17 
18 #include "psublast.h"
19 
20 #include "sim4.h"
21 #include "sim4b1.h"
22 #include "Xtend1.h"
23 #include "align.h"
24 #include "splice.h"
25 #include "poly.h"
26 
27 #define  EXTEND_FW (rs.acc_flag?Xextend_fw:extend_fw)
28 #define  EXTEND_BW (rs.acc_flag?Xextend_bw:extend_bw)
29 
30 #define  SLIDE_INTRON(x)   (((x)==TRUE)?sync_slide_intron:slide_intron)
31 
32 uchar      *seq1, *seq2;
33 int         M, N, encoding[NACHARS];
34 coords      last_GT, last_CT, last_AG, last_AC;
35 int         file_type;
36 
37 sim4_args_t rs;
38 
39 
40 static int         numMSPs, K, W, X;
41 static int         G_score, C_score;
42 static int        *diag_lev;
43 static Msp_ptr     msp_list, *msp;
44 static Exon_ptr    exon_list;
45 
46 static void   merge(Exon **,Exon **);
47 static bool   get_sync_flag(Exon *, Exon *, int);
48 static void   slide_intron(int w, Exon **,uchar *,uchar *);
49 static void   sync_slide_intron(int w, Exon **,uchar *,uchar *);
50 static void   wobble(Exon **,Exon **,const char *,const char *,uchar *seq1);
51 static Exon  *bmatch(uchar *,uchar *,int,int,int,int);
52 static Exon  *fmatch(uchar *,uchar *,int,int,int,int);
53 static void   compact_list(Exon **Lblock, Exon **Rblock);
54 static int    resolve_overlap(Exon *,Exon *,uchar *);
55 static int    greedy(uchar *,uchar *,int,int,int,int,Exon **, Exon **);
56 static int    extend_bw(uchar *,uchar *,int,int,int,int,int *,int *);
57 static int    extend_fw(uchar *,uchar *,int,int,int,int,int *,int *);
58 static void   pluri_align(int *,int *,Exon *,struct edit_script_list **);
59 static void   get_stats(Exon *,sim4_stats_t *);
60 static int    get_edist(int,int,int,int,uchar *,uchar *);
61 static int    get_msp_threshold(int len1, int len2);
62 static int    find_log_entry(long *log4s, int n, int len, int offset);
63 static Exon  *new_exon(int,int,int,int,int,int,int,Exon *);
64 static void   add_word(int,int);
65 static void   extend_hit(int,int,const uchar *const,const uchar * const,int,int,int);
66 static void   sort_msps(void);
67 static void   heapify(int,int);
68 static int    smaller(int,int);
69 static void   search(uchar *,uchar *,int,int,int);
70 static int    link_msps(Msp_ptr *msp,int,int,int);
71 static int    scale(int n);
72 static void   msp2exons(Msp_ptr *,int,uchar *,uchar *);
73 static void   free_msps(Msp_ptr **,int *);
74 static void   exon_cores(uchar*,uchar*,int,int,int,int,int,int,int,int);
75 static void   relink(Msp_ptr *,int,int,int,int,int,uchar *,uchar *);
76 static int    dispatch_find_ends(int,int,int *,int *,edit_script_list *,int,int,int);
77 static int    find_ends(edit_script_list *,int);
78 static bool   get_match_quality(Exon *,Exon *,sim4_stats_t *,int);
79 static int    check_consistency_intron_ori(Exon *,int,char *);
80 
81 Exon  *find_previous(Exon *,Exon *);
82 void   script_flip_list(edit_script_list **);
83 
84 
85 #ifdef DEBUG
86 static void   debug_print_exons(Exon *, const char *);
87 #endif
88 
89 /*  Not currently used: */
90 #ifdef AUXUTILS
91    static void   remove_polyA_tails(Exon *,uchar *,uchar *,int);
92    static void   find_introns(Exon *, Intron **);
93    static void   print_introns(Intron *);
94 #endif
95 
96 /* seq1 = genomic  DNA (text); seq2 = cDNA */
SIM4(uchar * in_seq1,uchar * in_seq2,int in_M,int in_N,int in_W,int in_X,int in_K,int in_C,int in_H,int * dist_ptr,int * pT,int * pA,Exon ** Exons,sim4_stats_t * st)97 struct edit_script_list *SIM4(uchar *in_seq1, uchar *in_seq2,
98                          int in_M, int in_N, int in_W,
99                          int in_X, int in_K, int in_C, int in_H,
100                          int *dist_ptr, int *pT, int *pA,
101                          Exon **Exons, sim4_stats_t *st)
102 {
103   int    cflag, diff, cost, rollbflag, sync_flag;
104   int    u, v, I, J;
105   bool   good_match;
106   Exon   *Lblock, *Rblock=NULL, *tmp_block, *last, *prev, *tmp_block1,
107          *tmp_Lblock=NULL, *tmp_Rblock=NULL, *new;
108 
109   struct edit_script_list *Script_head=NULL;
110   uchar tmp[50];
111   coords *sig;
112 
113   seq1 = in_seq1;
114   seq2 = in_seq2;
115   M = in_M;
116   N = in_N;
117   W = in_W;
118   X = in_X;
119 
120   if (M<=0 || in_N<=0) {
121       *Exons = NULL; return NULL;
122   }
123 
124   if (rs.acc_flag) {
125       last_AG.pos1 = last_AG.pos2 = last_AC.pos1 = last_AC.pos2 = 0;
126       last_GT.pos1 = last_GT.pos2 = last_CT.pos1 = last_CT.pos2 = 0;
127   }
128 
129   /* Compute the distance between two sequences A and B */
130 
131   *dist_ptr = 0;
132 
133   exon_cores(seq1-1, seq2-1, M, N, 1, 1, 0, W, in_K, PERM);
134 
135   tmp_block = Lblock = exon_list;
136   while (tmp_block) {
137      if (tmp_block->next_exon==NULL)
138          Rblock = tmp_block;
139      tmp_block = tmp_block->next_exon;
140   }
141 
142   if (Lblock &&
143       ((Lblock->from1>50000 && Lblock->from2>100) ||
144        ((M-Rblock->to1>50000) && (N-Rblock->to2>100)))) {
145       free_list(exon_list);
146       relink(msp,numMSPs,(in_H>0) ? in_H:DEFAULT_RELINK_H,1,1,0,seq1,seq2);
147       tmp_block = Lblock = exon_list;
148       while (tmp_block) {
149          if (tmp_block->next_exon==NULL)
150              Rblock = tmp_block;
151          tmp_block = tmp_block->next_exon;
152       }
153   }
154   free_msps(&msp, &numMSPs);
155 
156   tmp_block = Lblock = exon_list;
157   while (tmp_block) {
158      if (tmp_block->next_exon==NULL)
159          Rblock = tmp_block;
160      tmp_block = tmp_block->next_exon;
161   }
162 
163   /* enclose the current path in the (0,0,0,0) and (M+1,N+1,0,0) brackets */
164 
165   Lblock = new_exon (0,0,0,0,0,0,0,Lblock);
166   if (Rblock == NULL)
167       Rblock = Lblock;
168   Rblock->next_exon = new_exon (M+1,N+1,0,0,0,0,0,NULL);
169 
170   /* compute current statistics */
171   good_match = get_match_quality(Lblock, Rblock, st, N);
172 
173 #ifdef DEBUG
174   debug_print_exons(Lblock, "LSIS");
175 #endif
176 
177   tmp_block = Lblock;
178   while ((tmp_block1 = tmp_block->next_exon)!=NULL) {
179      rollbflag = 0;
180      diff = (int)(tmp_block1->from2-tmp_block->to2-1);
181      if (diff) {
182        if (diff<0) {
183          int best_u;
184 
185          best_u = resolve_overlap(tmp_block,tmp_block1,seq1);
186 
187          tmp_block1->from1 += best_u+1-tmp_block1->from2;
188          tmp_block1->from2 = best_u+1;
189          if (((u=tmp_block1->to2-tmp_block1->from2+1)<=0) || (u<8) ||
190              ((v=tmp_block1->to1-tmp_block1->from1+1)<=0) || (v<8)) {
191              /* remove exon associated with tmp_block1 */
192              tmp_block->next_exon = tmp_block1->next_exon;
193              tmp_block->flag = tmp_block1->flag;
194              rollbflag = 1;
195              free(tmp_block1);
196              tmp_block1 = NULL; /* not necessary, just to keep it 'clean'*/
197          }
198 
199          tmp_block->to1 -= tmp_block->to2-best_u;
200          tmp_block->to2 = best_u;
201          if (((u=tmp_block->to2-tmp_block->from2+1)<=0) || (u<8) ||
202              ((v=tmp_block->to1-tmp_block->from1+1)<=0) || (v<8)) {
203 
204              /* remove exon defined by tmp_block */
205              prev = find_previous(Lblock,tmp_block);
206              assert (prev!=NULL);
207              prev->next_exon = tmp_block->next_exon;
208              prev->flag = tmp_block->flag;
209              if (u>0) rollbflag = 1;
210              free(tmp_block);
211              tmp_block = prev;
212          }
213 
214          if (tmp_block->to1)
215              tmp_block->length = tmp_block->to2-tmp_block->from2+1;
216          if (tmp_block1 && tmp_block1->to1)
217              tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
218 
219        } else {
220          /* bridge the gap */
221          cflag = (tmp_block1->to2 && tmp_block->to2) ? 0 : 1;
222          if (diff && (tmp_block1->from1-tmp_block->to1-1>0)) {
223              if (!cflag) {
224               if (diff<=MAX_GRINIT) {
225                 cost = greedy(seq2+tmp_block->to2,
226                               seq1+tmp_block->to1,
227                               diff,
228                               tmp_block1->from1-tmp_block->to1-1,
229                               tmp_block->to2,tmp_block->to1,
230                               &tmp_Lblock, &tmp_Rblock);
231               } else cost = max(W,(int)(P*diff+1))+1;
232 
233               if (cost>max(W,(int)(P*diff+1))) {
234                   if (!tmp_block->flag && !tmp_block1->flag) {
235                       exon_cores(seq1+tmp_block->to1-1,
236                                  seq2+tmp_block->to2-1,
237                                  tmp_block1->from1-tmp_block->to1-1,
238                                  diff,
239                                  tmp_block->to1+1,
240                                  tmp_block->to2+1,
241                                  1,
242                                  min(8,W),
243                                  in_C,
244 /*                               (min(8,W)==W) ? PERM : TEMP); */
245                                  TEMP);
246                       tmp_Lblock = tmp_Rblock = exon_list;
247                       while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
248                          tmp_Rblock = tmp_Rblock->next_exon;
249 
250                       if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
251                            (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
252                            (tmp_Lblock->from1-tmp_block->from1>50000)) ||
253                           (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
254                            (tmp_block1->from1-tmp_Rblock->from1>50000))) {
255                          /* possible large intron; increase the score weight */
256                          free_list(tmp_Lblock);
257                          relink(msp, numMSPs,
258                                 (in_H>0) ? in_H:DEFAULT_RELINK_H,
259                                 tmp_block->to1+1,
260                                 tmp_block->to2+1,
261                                 1, seq1, seq2);
262                          tmp_Lblock = tmp_Rblock = exon_list;
263                          while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
264                             tmp_Rblock = tmp_Rblock->next_exon;
265                       }
266                       free_msps(&msp, &numMSPs);
267 
268                       if (tmp_Lblock) rollbflag = 1;
269                       else rollbflag = 0;   /* already 0 */
270                   } else
271                         tmp_Lblock = tmp_Rblock = NULL;
272               }
273              } else if (tmp_block1->to1) {
274                    /* start of seq; find last_AG, last_AC */
275                    if (rs.acc_flag) {
276                      for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++)
277                         if (!strncmp((char *)(seq1+v-2),"AG",(size_t)2)) {
278                           last_AG.pos1 = v+1;
279                           last_AG.pos2 = tmp_block1->from2+
280                                          (v-tmp_block1->from1)+1;
281                           break;
282                         }
283                      for (v=tmp_block1->from1-1; v<=tmp_block1->to1-3; v++)
284                         if (!strncmp((char *)(seq1+v-2),"AC",(size_t)2)) {
285                           last_AC.pos1 = v+1;
286                           last_AC.pos2 = tmp_block1->from2+
287                                          (v-tmp_block1->from1)+1;
288                           break;
289                         }
290                    } /* end acc_flag */
291 
292                    diff = (int)(min(diff,(int)(MAX_GRINIT/2)));
293                    u = min(4*diff,tmp_block1->from1-tmp_block->to1-1);
294                    cost = EXTEND_BW(seq2+tmp_block->to2+
295                                     (tmp_block1->from2-tmp_block->to2-1)-diff,
296                                     seq1+tmp_block->to1+
297                                     (tmp_block1->from1-tmp_block->to1-1)-u,
298                                     (int)diff, u,
299                                     tmp_block->to2+
300                                     (tmp_block1->from2-tmp_block->to2-1)-diff,
301                                     tmp_block->to1+
302                                     (tmp_block1->from1-tmp_block->to1-1)-u,
303                                     &I, &J);
304                    if ((good_match==FALSE) || tmp_block->flag || (J==0) || (I==0)) {
305                        tmp_block1->from2 = I+1;
306                        tmp_block1->from1 = J+1;
307                        tmp_block1->edist += cost;
308                        tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
309                    }
310 
311                    /* use blast if marginal gap still exists, and this is first scan */
312                    if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) ||
313                        tmp_block->flag) {
314                        /* blast-treated region or no gap */
315                        tmp_Rblock = tmp_Lblock = NULL;
316                    } else {
317                       exon_cores(seq1+tmp_block->to1-1,
318                                  seq2+tmp_block->to2-1,
319                                  tmp_block1->from1-tmp_block->to1-1,
320                                  diff,
321                                  tmp_block->to1+1,
322                                  tmp_block->to2+1,
323                                  1,
324                                  min(10,W),
325                                  in_C,
326 /*                               (min(10,W)==W) ? PERM : TEMP); */
327                                  TEMP);
328                       tmp_block -> flag = 1;
329                       tmp_Lblock = tmp_Rblock = exon_list;
330                       while (tmp_Rblock && tmp_Rblock->next_exon)
331                          tmp_Rblock = tmp_Rblock->next_exon;
332 
333                       if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
334                            (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
335                            (tmp_Lblock->from1-tmp_block->from1>50000)) ||
336                           (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
337                            (tmp_block1->from1-tmp_Rblock->from1>50000))) {
338                          /* possible large intron; increase the score weight */
339                          free_list(tmp_Lblock);
340                          relink(msp, numMSPs,
341                                 (in_H>0) ? in_H:DEFAULT_RELINK_H,
342                                 tmp_block->to1+1,
343                                 tmp_block->to2+1,
344                                 1,seq1,seq2);
345                          tmp_Lblock = tmp_Rblock = exon_list;
346                          while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
347                             tmp_Rblock = tmp_Rblock->next_exon;
348                       }
349                       free_msps(&msp, &numMSPs);
350 
351                       if (tmp_Lblock) rollbflag = 1;
352                       else {
353                          tmp_block1->from2 = I+1;
354                          tmp_block1->from1 = J+1;
355                          tmp_block1->edist += cost;
356                          tmp_block1->length = tmp_block1->to2-tmp_block1->from2+1;
357                       }
358                    }
359 
360              } else {
361                    if (rs.acc_flag) {
362                      for (v=tmp_block->to1; v>=tmp_block->from1; v--)
363                         if (!strncmp((char *)(seq1+v),"GT",(size_t)2)) {
364                           last_GT.pos1 = v;
365                           last_GT.pos2 = tmp_block->to2-(tmp_block->to1-v);
366                           break;
367                         }
368                      for (v=tmp_block->to1; v>=tmp_block->from1; v--)
369                         if (!strncmp((char *)(seq1+v),"CT",(size_t)2)) {
370                           last_CT.pos1 = v;
371                           last_CT.pos2 = tmp_block->to2-(tmp_block->to1-v);
372                           break;
373                         }
374                    }
375 
376                    diff = (int)(min(diff,(int)(MAX_GRINIT/2)));
377                    cost = EXTEND_FW(seq2+tmp_block->to2,
378                                     seq1+tmp_block->to1,
379                                     diff,
380                                     min(4*diff,tmp_block1->from1-tmp_block->to1-1),
381                                     tmp_block->to2,tmp_block->to1,
382                                     &I, &J);
383                    if ((good_match==FALSE) || tmp_block1->flag || (I==M) || (J==N)) {
384                        if (tmp_block->to1) {
385                           tmp_block->to2 = I;
386                           tmp_block->to1 = J;
387                           tmp_block->edist += cost;
388                           tmp_block->length = tmp_block->to2-tmp_block->from2+1;
389                           tmp_Rblock = tmp_Lblock = NULL;
390                        } else
391                           /* special case: no initial exon */
392                           tmp_Lblock = tmp_Rblock = NULL;
393                    }
394                    /* use blast if marginal gap still exists, and this is first scan */
395                    if (!(diff=(int)(tmp_block1->from2-tmp_block->to2-1)) ||
396                         tmp_block1->flag) {
397                        /* blast-treated region or no gap */
398                        tmp_Rblock = tmp_Lblock = NULL;
399                    } else {
400                       exon_cores(seq1+tmp_block->to1-1,
401                                  seq2+tmp_block->to2-1,
402                                  tmp_block1->from1-tmp_block->to1-1,
403                                  diff,
404                                  tmp_block->to1+1,
405                                  tmp_block->to2+1,
406                                  1,
407                                  min(10,W),
408                                  in_C,
409 /*                               (min(10,W)==W) ? PERM : TEMP); */
410                                  TEMP);
411                       tmp_Lblock = tmp_Rblock = exon_list;
412                       while (tmp_Rblock && tmp_Rblock->next_exon)
413                          tmp_Rblock = tmp_Rblock->next_exon;
414 
415                       if ((!tmp_Lblock && tmp_block1->from1-tmp_block->to1>50000) ||
416                            (tmp_Lblock && (tmp_Lblock->from2-tmp_block->to2>100) &&
417                            (tmp_Lblock->from1-tmp_block->from1>50000)) ||
418                           (tmp_Lblock && (tmp_block1->from2-tmp_Rblock->to2>100) &&
419                            (tmp_block1->from1-tmp_Rblock->from1>50000))) {
420                          /* possible large intron; increase the score weight */
421                          free_list(tmp_Lblock);
422                          relink(msp, numMSPs,
423                                 (in_H>0) ? in_H:DEFAULT_RELINK_H,
424                                 tmp_block->to1+1,
425                                 tmp_block->to2+1,
426                                 1,seq1,seq2);
427                          tmp_Lblock = tmp_Rblock = exon_list;
428                          while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
429                             tmp_Rblock = tmp_Rblock->next_exon;
430                       }
431                       free_msps(&msp, &numMSPs);
432 
433                       tmp_block1->flag = 1;
434                       if (tmp_Lblock) rollbflag = 1;
435                       else {
436                          if (tmp_block->to1) {
437                             tmp_block->to2 = I;
438                             tmp_block->to1 = J;
439                             tmp_block->edist += cost;
440                             tmp_block->length = tmp_block->to2-tmp_block->from2+1;
441                             tmp_Rblock = tmp_Lblock = NULL;
442                          } else
443                             /* special case: no initial exon */
444                             tmp_Lblock = tmp_Rblock = NULL;
445                       }
446                    }
447              }
448            } else if (diff) {
449                   tmp_Rblock = tmp_Lblock = NULL;
450            }
451 
452            /* merge block in the exon list; make connections
453               to the previous list of blocks; maintain
454               increasing order */
455 
456            if (tmp_Lblock) {
457                tmp_block->next_exon = tmp_Lblock;
458                tmp_Rblock->next_exon = tmp_block1;
459                merge(&tmp_block,&tmp_block1);
460            }
461        }
462     }  /* diff!=0 */
463     if (!rollbflag) tmp_block = tmp_block1;
464   }
465 
466   /* just printing ... */
467 #ifdef DEBUG
468   debug_print_exons(Lblock, "EXTENSIONS");
469 #endif
470 
471   /* compaction step; note: it resets the right end of the list to   */
472   /* the last item in the block list                                 */
473 
474   compact_list(&(Lblock->next_exon), &Rblock);
475 
476   /* just printing ... */
477 #ifdef DEBUG
478   debug_print_exons(Lblock, "NORMALIZATION");
479 #endif
480 
481   /* eliminate marginal small blocks at the start of the sequence;   */
482   /* resets the empty alignment to one block (Lblock) only           */
483 
484   tmp_block = Lblock->next_exon;
485 
486   while ((tmp_block!=NULL) && (tmp_block->length<W) && tmp_block->to1) {
487           tmp_block1 = tmp_block; /* free */
488           tmp_block = tmp_block->next_exon;
489           free(tmp_block1); /* free */
490   }
491   Lblock->next_exon = tmp_block;
492 
493   /* eliminate marginal small blocks at the end of the sequence      */
494 
495   last = Lblock->next_exon;
496   tmp_block = last;
497   while (tmp_block!=NULL) {
498      if (tmp_block->length>=W)
499          last = tmp_block;
500      tmp_block = tmp_block->next_exon;
501   }
502   if (last && last->to1)
503       last->next_exon = Rblock->next_exon;
504   Rblock = last;
505 
506   /* if high accuracy requirement, adjust boundaries of marginal exons */
507   if (rs.acc_flag) {
508       tmp_block = Lblock->next_exon;
509 
510       /* condition for non-signal */
511       if (tmp_block && tmp_block->to1 &&
512           (strncmp((char *)(seq1+tmp_block->from1-3), END_SIG, (size_t)2) ||
513           (tmp_block->from2!=1))) {
514           sig = (G_score>=abs(C_score)) ? &last_AG : &last_AC;
515           if (sig->pos1 && (sig->pos2<=20)) {
516               /* generated in extend_bw */
517 	      assert(sig->pos2 > 1);
518               (void)strcpy((char *)tmp,END_SIG);
519               (void)strncpy((char *)(tmp+2),(char *)seq2,(size_t)sig->pos2-1);
520               (void)strcpy((char *)(tmp+sig->pos2+1), START_SIG);
521               new = bmatch(seq1,tmp,tmp_block->from1-3,sig->pos2+3,1,1);
522               if (new) {
523                   Lblock->next_exon->from1 = sig->pos1;
524                   Lblock->next_exon->from2 = sig->pos2;
525                   Lblock->next_exon->length -= sig->pos2-1;
526                   new->next_exon = Lblock->next_exon;
527                   new->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
528                   Lblock->next_exon = new;
529               }
530           }
531       }
532       while (tmp_block && tmp_block->next_exon && tmp_block->next_exon->to1)
533              tmp_block = tmp_block->next_exon;
534       if (tmp_block && tmp_block->to1 &&
535           (strncmp((char *)(seq1+tmp_block->to1),START_SIG,(size_t)2) || (tmp_block->to2!=N))) {
536           sig = (G_score>=abs(C_score)) ? &last_GT : &last_CT;
537           if (sig->pos1 && (N-sig->pos2<=20)) {
538 	      assert(N-sig->pos2 >= 0);
539               (void)strcpy((char *)tmp,END_SIG);
540               (void)strncpy((char *)(tmp+2),(char *)(seq2+sig->pos2),
541 			    (size_t)N-sig->pos2);
542               (void)strcpy((char *)(tmp+N-sig->pos2+2),START_SIG);
543               new = fmatch(seq1+sig->pos1-1,tmp,
544                            M-sig->pos1+1,N-sig->pos2+4,
545                            sig->pos1-1,sig->pos2+1);
546               if (new) {
547                   tmp_block->to1 = sig->pos1;
548                   tmp_block->to2 = sig->pos2;
549                   new->next_exon = tmp_block->next_exon;
550                   tmp_block->next_exon = new;
551                   tmp_block->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
552               }
553           }
554       }
555   }
556 
557   /* Slide exon boundaries for optimal intron signals */
558   sync_flag = get_sync_flag(Lblock, Rblock, 6);
559   SLIDE_INTRON(sync_flag)(6,&Lblock,seq1,seq2);
560 
561   /* decreasingly; script will be in reverse order */
562   flip_list(&Lblock, &Rblock);
563   pluri_align(dist_ptr,&(st->nmatches),Lblock,&Script_head);
564   flip_list(&Lblock, &Rblock);      /* increasingly */
565 
566   if (rs.poly_flag)
567       remove_poly(&Script_head,Lblock,seq1,seq2,N,pT,pA);
568   else
569       *pT = *pA = 0;
570 
571   get_stats(Lblock, st);
572 
573   *Exons = Lblock->next_exon;
574   free(Lblock);
575   if (!rs.ali_flag) {
576      free_align(Script_head);
577      return NULL;
578   } else
579      return Script_head;
580 
581 }
582 
583 
584 struct hash_node {
585         int ecode;             /* integer encoding of the word */
586         int pos;                /* positions where word hits query sequence
587  */
588         struct hash_node *link; /* next word with same last 7.5 letters */
589 };
590 
591 #define HASH_SIZE 32767        /* 2**15 - 1 */
592 #define GEN_LOG4_ENTRIES 45
593 #define CDNA_LOG4_ENTRIES 25
594 static struct hash_node *phashtab[HASH_SIZE+1];
595 static struct hash_node **hashtab;
596 static int mask;
597 static int *next_pos, *pnext_pos;
598 
599 /* The log4 arrays were computed to mimick the behaviour of the log formula
600    for computing the msp threshold in exon_cores(). For genomic_log4s,
601    entry i stores the value for the length of a genomic sequence
602    for which the contribution to the msp threshold is i/2, i.e.:
603                1.4*log_4(3/4*len1) = i/2;
604    Similarly, cDNA_log4s entries store lengths of the cDNA sequence for which
605    the contribution to the msp threshold is i/2, i.e.:
606                1.4*log_4(len2) = i/2;
607    Both arrays are sorted in increasing order, and can be searched with
608    binary search.
609 */
610 static long genomic_log4s[]= {1, 2, 3, 5, 9, 15, 26, 42, 70, 114, \
611             188, 309, 507, 832, 1365, 1365, 2240, 2240, 3675, 6029,\
612             9892, 16231, 26629, 43690, 71681, \
613             117606, 192953, 316573, 519392, 852152,
614             1398101, 2293823, 3763409, 6174516, 10130347, \
615             16620564, 27268873, 44739242, 73402365, 120429110, \
616             197584514, 324171126, 531858072, 872603963, 1431655765
617             };
618 static long cDNA_log4s[]= {1, 1, 2, 4, 7, 11, 19, 32, 52, 86, \
619             141, 231, 380, 624, 1024, 1680, 2756, 4522, 7419, 12173, \
620             19972, 32768, 53761, 88204, 144715
621             };
622 
623 
get_msp_threshold(int len1,int len2)624 static int get_msp_threshold(int len1, int len2)
625 {
626     int i, j;
627 
628     i = find_log_entry(genomic_log4s, GEN_LOG4_ENTRIES, len1, 0);
629     j = find_log_entry(cDNA_log4s, CDNA_LOG4_ENTRIES, len2, 0);
630 
631     if (!(i % 2)) return (int)(i/2+j/2);
632     else if (!(j % 2)) return (int)(i/2+j/2);
633     else return (int)(i/2+j/2+1);
634 }
635 
find_log_entry(long * log4s,int n,int len,int offset)636 static int find_log_entry(long *log4s, int n, int len, int offset)
637 {
638     int a;
639 
640     a = n/2;
641     if ((len<log4s[a]) && (!a || (len>=log4s[a-1])))
642                 return max(0,(a-1))+offset;
643     else if ((len>=log4s[a]) && ((a==n-1) || (len<log4s[a+1])))
644                 return min(n-1,(a+1))+offset;
645     else if (len<log4s[a])
646                 return find_log_entry(log4s,a-1,len, offset);
647     else if (len>log4s[a])
648                 return find_log_entry(log4s+a+1,n-a-1,len, offset+a+1);
649     return -1;
650 }
651 
652 /* --------------------   exon_cores()   --------------------- */
653 
exon_cores(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2,int flag,int in_W,int in_K,int type)654 static void  exon_cores(uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2, int flag, int in_W, int in_K, int type)
655 {
656     int       i, W, last_msp, lower, upper;
657     int      *allocated;
658     Exon     *tmp_block;
659 
660 
661     if (in_K<=0) {
662         /* compute expected length of longest exact match .. */
663         /* K = (int) (log(.75*(double)len1)+log((double)len2))/log(4.0); */
664         /* .. and throw in a fudge factor */
665         /* K *= 1.4; */
666 
667         K = get_msp_threshold(len1, len2);
668         if (K>=0) K--; /* compensate for the rounding in the log formula */
669 /*      commented this to avoid fragmentation
670         if (flag) K = min(K, DEFAULT_C); second pass
671 */
672     } else
673         K = in_K;
674 
675     numMSPs = 0;
676     exon_list = NULL;
677 
678     allocated = ckalloc((len1+len2+1)*sizeof(int));
679     lower = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP))
680              ? -len1 : -len2;
681     upper = ((file_type==EST_GEN) || (file_type==GEN_EST && type==TEMP))
682              ?  len2 :  len1;
683     diag_lev = allocated - lower;
684     for (i=lower; i<=upper; ++i) diag_lev[i]=0;
685 
686     W = min(in_W,len2);
687     switch (file_type) {
688        case EST_GEN:  bld_table(s2,len2,W, type);
689                       /* use longer sequence for permanent tables */
690                       search(s1,s2,len1,len2,W);
691                       break;
692        case GEN_EST:  if (type!=TEMP) {
693                       uchar *aux; int   auxi;
694 
695                       aux = s1; s1 = s2; s2 = aux;
696                       auxi = len1; len1 = len2; len2 = auxi;
697                       }
698                       bld_table(s2,len2,W, type);
699                       /* use longer sequence for permanent tables */
700                       search(s1,s2,len1,len2,W);
701                       if (type!=TEMP) {
702                           register int   auxi;
703                           uchar *aux;
704                           Msp_ptr mp;
705 
706                           /* change s1 and s2 back */
707                           aux = s1; s1 = s2; s2 = aux;
708                           auxi = len1; len1 = len2; len2 = auxi;
709 
710                           for (mp=msp_list, i=0; i<numMSPs; i++) {
711 		      	      auxi = mp->pos1;
712 			      mp->pos1 = mp->pos2;
713 			      mp->pos2 = auxi;
714 			      mp = mp->next_msp;
715                           }
716                       }
717                       break;
718        default:       fatal("sim4b1.c: Invalid file type code.");
719     }
720 
721     free(allocated);
722     if (type==TEMP) {
723         register struct hash_node *hptr, *tptr;
724         register int    hval;
725 
726         free(next_pos);
727         for (hval=0; hval<HASH_SIZE+1; hval++) {
728              hptr = hashtab[hval];
729              while (hptr) {
730                 tptr = hptr;
731                 hptr = hptr->link;
732                 free(tptr);
733              }
734         }
735         free(hashtab);
736     }
737 
738     msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr));
739     { Msp_ptr mp = msp_list;
740       for (i = 0; i < numMSPs; ++i) {
741                 msp[i] = mp;
742 		mp = mp->next_msp;
743       }
744     }
745 
746     sort_msps();    /* sort in order of mp->pos2, in the shorter seq */
747 
748     /* organize Blast hits (MSPs) into exons */
749     last_msp = link_msps(msp, numMSPs, DEFAULT_WEIGHT, LINK);
750 
751 #ifdef DEBUG
752     for (i = last_msp; i >= 0; i = msp[i]->prev)
753          (void)printf("%d-%d\n", msp[i]->pos1, msp[i]->pos1 + msp[i]->len - 1);
754 #endif
755 
756     msp2exons(msp,last_msp,s1,s2);
757 
758     /* now free msp[]? No - may need to re-link */
759     /*
760     for (i=0; i<numMSPs; ++i)
761              free(msp[i]);
762     free(msp);
763     */
764 
765     tmp_block = exon_list;
766     while (tmp_block!=NULL) {
767        tmp_block->length = tmp_block->to2-tmp_block->from2+1;
768        tmp_block->to1 += offset1;
769        tmp_block->from1 += offset1;
770        tmp_block->to2 += offset2;
771        tmp_block->from2 += offset2;
772        tmp_block->flag = flag;
773 
774        tmp_block = tmp_block->next_exon;
775     }
776 
777     return ;
778 }
779 
relink(Msp_ptr * in_msp,int in_numMSPs,int H,int offset1,int offset2,int flag,uchar * s1,uchar * s2)780 static void relink(Msp_ptr *in_msp, int in_numMSPs, int H, int offset1, int offset2, int flag, uchar *s1, uchar *s2)
781 {
782     int last_msp;
783     Exon *tmp_block;
784 
785     exon_list = NULL;
786 
787     last_msp = link_msps(in_msp, in_numMSPs, H, RELINK);
788 
789     msp2exons(in_msp,last_msp,s1,s2);
790 
791     tmp_block = exon_list;
792     while (tmp_block!=NULL) {
793        tmp_block->length = tmp_block->to2-tmp_block->from2+1;
794        tmp_block->to1 += offset1;
795        tmp_block->from1 += offset1;
796        tmp_block->to2 += offset2;
797        tmp_block->from2 += offset2;
798        tmp_block->flag = flag;
799 
800        tmp_block = tmp_block->next_exon;
801     }
802 
803     return ;
804 }
805 
free_msps(Msp_ptr ** in_msp,int * in_numMSPs)806 static void free_msps(Msp_ptr **in_msp, int *in_numMSPs)
807 {
808      int i;
809 
810      for (i=0; i<*in_numMSPs; ++i) free((*in_msp)[i]);
811      free(*in_msp); *in_msp = NULL; *in_numMSPs = 0;
812 }
813 
scale(int n)814 static int scale(int n)
815 {
816     return (n<=100000) ? n : (100000+(int)(10*log((double)(n-100000))));
817 }
818 
link_msps(Msp_ptr * msp,int numMSPs,int H,int flag)819 static int link_msps(Msp_ptr *msp, int numMSPs, int H, int flag)
820 {
821 	int i, j, f1, f2, best, diag, diff_diag, best_sc, try;
822 
823 
824 	for (best = -1, best_sc = MININT, i = 0; i < numMSPs; ++i) {
825 		f1 = msp[i]->pos1;      /* start position in seq1 */
826 		f2 = msp[i]->pos2;      /* start position in seq2 */
827 		diag = f1 - f2;
828 		msp[i]->prev = -1;
829 		msp[i]->Score = 0;
830 		for (j = 0; j < i; ++j) {
831                      int var_L =
832                      ((msp[i]->pos2+msp[i]->len-msp[j]->pos2-msp[j]->len>2*W) &&
833                       (msp[i]->pos2-msp[j]->pos2>2*W)) ? 2*L : L;
834 
835 		     diff_diag = diag - msp[j]->pos1 + msp[j]->pos2;
836 	             if (diff_diag < -rs.DRANGE ||
837                          (diff_diag > rs.DRANGE && diff_diag < MIN_INTRON) ||
838                          (msp[j]->pos2+msp[j]->len-1-f2>var_L) ||
839                          (msp[j]->pos1+msp[j]->len-1-f1>var_L))
840 
841 				continue;
842 		     try = msp[j]->Score - ((flag==RELINK) ?
843                            scale(abs(diff_diag)) : abs(diff_diag));
844 
845 		     if (try > msp[i]->Score) {
846 		                msp[i]->Score = try;
847 				msp[i]->prev = j;
848 		     }
849 		}
850 		msp[i]->Score += (H*msp[i]->score);
851 		if (msp[i]->Score > best_sc) {
852 			best = i;
853 			best_sc = msp[i]->Score;
854 		}
855 	}
856 
857 	return best;
858 }
859 
860 /* -----------   build table of W-tuples in one of the sequences  ------------*/
861 
bld_table(uchar * s,int len,int in_W,int type)862 void bld_table(uchar *s, int len, int in_W, int type)
863 {
864         int ecode;
865         int i, j;
866         uchar *t;
867 
868         if (type == PERM) {
869             mask = (1 << (in_W+in_W-2)) - 1;
870             next_pos = pnext_pos;
871             hashtab = phashtab;
872             return;
873         }
874 
875         /* perform initializations */
876         if (type == INIT) {
877 
878            /* perform initializations */
879            for (i=0; i<NACHARS; ++i) encoding[i]=-1;
880 
881            encoding['A'] = 0;
882            encoding['C'] = 1;
883            encoding['G'] = 2;
884            encoding['T'] = 3;
885            mask = (1 << (in_W+in_W-2)) - 1;
886            /* added +1 in the allocation below */
887            next_pos = pnext_pos = (int *)ckalloc((len+1)*sizeof(int));
888            hashtab = phashtab;
889         } else {
890            mask = (1 << (in_W+in_W-2)) - 1;
891            next_pos = (int *)ckalloc((len+1)*sizeof(int));
892            hashtab = (struct hash_node **)
893                       ckalloc((HASH_SIZE+1)*sizeof(struct hash_node *));
894            for (i=0; i<=HASH_SIZE; ++i) hashtab[i] = NULL;
895         }
896 
897         /* skip any word containing an N/X  */
898         t = s+1;
899         for (i=1; (i<=len) && *t; ) {
900         restart:
901            ecode = 0L;
902            for (j=1; (j<in_W) && (i<=len) && *t; ++j) {
903                 int tmp = encoding[*t++]; ++i;
904                 if (tmp<0) goto restart;
905                 ecode = (ecode << 2) + tmp;
906            }
907 
908            for (; (i<=len) && *t; ) {
909                 int tmp = encoding[*t++]; i++;
910                 if (tmp<0) goto restart;
911                 ecode = ((ecode & mask) << 2) + tmp;
912                 add_word(ecode, (int)(t-s-1));
913            }
914         }
915 
916 /* previous version: include 'N's in the table
917         t = s;
918         ecode = 0L;
919         for (i = 1; i < in_W; ++i)
920                 ecode = (ecode << 2) + encoding[*++t];
921         for (; (i<=len) && (*++t); i++) {
922                 ecode = ((ecode & mask) << 2) + encoding[*t];
923                 add_word(ecode, (int)(t - s));
924         }
925  */
926 }
927 
928 /* add_word - add a word to the table of critical words */
add_word(int ecode,int pos)929 static void add_word(int ecode, int pos)
930 {
931         struct hash_node *h;
932         int hval;
933 
934         hval = ecode & HASH_SIZE;
935         for (h = hashtab[hval]; h; h = h->link)
936                 if (h->ecode == ecode)
937                         break;
938         if (h == NULL) {
939                 h = (struct hash_node *) ckalloc (sizeof(struct hash_node));
940                 h->link = hashtab[hval];
941                 hashtab[hval] = h;
942                 h->ecode = ecode;
943                 h->pos = -1;
944         }
945         next_pos[pos] = h->pos;
946         h->pos = pos;
947 }
948 
949 /* -----------------------   search the other sequence   ---------------------*/
950 
search(uchar * s1,uchar * s2,int len1,int len2,int in_W)951 static void search(uchar *s1, uchar *s2, int len1, int len2, int in_W)
952 {
953         register struct hash_node *h;
954         register uchar *t;
955         register int ecode, hval;
956         int i, j, p;
957 
958         t = s1+1;
959         for (i=1; (i<=len1) && *t; ) {
960         restart:
961                 ecode = 0L;
962                 for (j=1; (j<in_W) && (i<=len1) && *t; ++j) {
963                     int tmp = encoding[*t++]; ++i;
964                     if (tmp<0) goto restart;
965                     ecode = (ecode << 2) + tmp;
966                 }
967                 for (; (i<=len1) && *t; ) {
968                     int tmp = encoding[*t++]; ++i;
969                     if (tmp<0) goto restart;
970                     ecode = ((ecode & mask) << 2) + tmp;
971                     hval = ecode & HASH_SIZE;
972                     for (h = hashtab[hval]; h; h = h->link)
973                          if (h->ecode == ecode) {
974                              for (p = h->pos; p >= 0; p = next_pos[p])
975                                   extend_hit((int)(t-s1-1),p,s1,s2,len1,len2, in_W);
976                              break;
977                          }
978                 }
979         }
980 
981 /* previous version: allow 'N's in the table --------
982         t = s1;
983         ecode = 0L;
984         for (i = 1; i < in_W; ++i)
985                 ecode = (ecode << 2) + encoding[*++t];
986         for (; (i<=len1) && (*++t); i++) {
987                 ecode = ((ecode & mask) << 2) + encoding[*t];
988                 hval = ecode & HASH_SIZE;
989                 for (h = hashtab[hval]; h; h = h->link)
990                         if (h->ecode == ecode) {
991                                 for (p = h->pos; p >= 0; p = next_pos[p])
992                                         extend_hit((int)(t-s1),p,s1,s2,len1,len2, in_W);
993                                 break;
994                         }
995         }
996 ---------------------------------------------------*/
997 }
998 
999 /* extend_hit - extend a word-sized hit to a longer match */
extend_hit(int pos1,int pos2,const uchar * const s1,const uchar * const s2,int len1,int len2,int in_W)1000 static void extend_hit(int pos1, int pos2,
1001 		       const uchar * const s1, const uchar * const s2,
1002 		       int len1, int len2, int in_W)
1003 {
1004         const uchar *beg2, *beg1, *end1, *q, *s;
1005         int right_sum, left_sum, sum, diag, score;
1006 
1007         diag = pos2 - pos1;
1008         if (diag_lev[diag] > pos1)
1009                 return;
1010 
1011         /* extend to the right */
1012         left_sum = sum = 0;
1013 	q = s1+1+pos1;
1014         s = s2+1+pos2;
1015         end1 = q;
1016         while ((*s != '\0') && (*q != '\0') &&
1017                (s<=s2+len2) && (q<=s1+len1) && sum >= left_sum - X) {
1018                 sum += ((*s++ == *q++) ? MATCH : MISMATCH);
1019                 if (sum > left_sum) {
1020                         left_sum = sum;
1021                         end1 = q;
1022                 }
1023         }
1024 
1025         /* extend to the left */
1026         right_sum = sum = 0;
1027         beg1 = q = (s1+1+pos1) - in_W;
1028         beg2 = s = (s2+1+pos2) - in_W;
1029         while ((s>s2+1) && (q>s1+1) && sum >= right_sum - X) {
1030                 sum += ((*(--s) == *(--q)) ? MATCH : MISMATCH);
1031                 if (sum > right_sum) {
1032                         right_sum = sum;
1033                         beg2 = s;
1034                         beg1 = q;
1035                 }
1036         }
1037 
1038         score = in_W + left_sum + right_sum;
1039         if (score >= K) {
1040                 Msp_ptr mp = (Msp_ptr)ckalloc(sizeof(*mp));
1041 
1042                 mp->len = end1 - beg1;
1043                 mp->score = score;
1044                 mp->pos1 = beg1 - (s1+1);
1045                 mp->pos2 = beg2 - (s2+1);
1046                 mp->next_msp = msp_list;
1047                 msp_list = mp;
1048                 ++numMSPs;
1049         }
1050 
1051         /*diag_lev[diag] = (end1 - (s1+1)) + in_W; */
1052         diag_lev[diag] = (end1 - s1) - 1 + in_W;
1053 }
1054 
1055 /*  ----------------------------   sort the MSPs  ----------------------------*/
1056 
1057 /* sort_msps - order database sequence for printing */
sort_msps(void)1058 static void sort_msps(void)
1059 {
1060         int i;
1061         Msp_ptr mp;
1062 
1063         for (i = (numMSPs/2) - 1; i >= 0; --i)
1064                 heapify(i, (int) numMSPs-1);
1065         for (i = numMSPs-1; i > 0; --i) {
1066                 mp = msp[0];
1067                 msp[0] = msp[i];
1068                 msp[i] = mp;
1069                 if (i > 1)
1070                         heapify(0, i-1);
1071         }
1072 }
1073 
1074 /* heapify - core procedure for heapsort */
heapify(i,last)1075 static void heapify(i, last)
1076 int i, last;
1077 {
1078         int lim = (last-1)/2, left_son, small_son;
1079         Msp_ptr mp;
1080 
1081         while (i <= lim) {
1082                 left_son = 2*i + 1;
1083                 if (left_son == last)
1084                         small_son = left_son;
1085                 else
1086                         small_son = smaller(left_son, left_son+1);
1087                 if (smaller(i, small_son) == small_son) {
1088                         mp = msp[i];
1089                         msp[i] = msp[small_son];
1090                         msp[small_son] = mp;
1091                         i = small_son;
1092                 } else
1093                         break;
1094         }
1095 }
1096 /* smaller - determine ordering relationship between two MSPs */
smaller(i,j)1097 static int smaller(i, j)
1098 int i, j;
1099 {
1100 	Msp_ptr ki = msp[i], kj = msp[j];
1101 
1102         if (ki->pos2 > kj->pos2) return i;
1103         if (ki->pos2 < kj->pos2) return j;
1104 
1105 	return ((ki->pos1 >= kj->pos1) ? i : j);
1106 }
1107 
1108 /* ---------------------  organize the MSPs into exons  ---------------------*/
1109 
msp2exons(Msp_ptr * msp,int last_msp,uchar * s1,uchar * s2)1110 static void msp2exons(Msp_ptr *msp, int last_msp, uchar *s1, uchar *s2)
1111 {
1112   Msp_ptr   mp;
1113   int diag_dist, diff;
1114 
1115   exon_list = NULL;
1116   if (last_msp<0) return;
1117 
1118   /* Note: in new_exon, the 'flag' and 'length' fields need not be computed */
1119   mp = msp[last_msp];
1120   exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1,
1121                         mp->pos2+mp->len-1, -1,
1122                         (mp->len*MATCH-mp->score)/(MATCH-MISMATCH), 0, exon_list);
1123 
1124   last_msp = mp->prev;
1125 
1126   while (last_msp>=0) {
1127     mp = msp[last_msp];
1128     if (((diag_dist=abs((exon_list->from2-exon_list->from1)-(mp->pos2-mp->pos1)))<=L)
1129         && (exon_list->from2-(mp->pos2+mp->len-1))<MAX_INTERNAL_GAP) {
1130           /* merge with previous exon */
1131           exon_list->edist += diag_dist;
1132           exon_list->edist += (mp->len*MATCH-mp->score)/(MATCH-MISMATCH);
1133           if ((diff=mp->pos2+mp->len-exon_list->from2)>0) {   /* overlap */
1134              int dist1, dist2;
1135              dist1 = get_edist(exon_list->from1,mp->pos2+mp->len-diff,
1136                                exon_list->from1+diff-1,mp->pos2+mp->len-1,s1,s2);
1137              dist2 = get_edist(mp->pos1+mp->len-diff,mp->pos2+mp->len-diff,
1138                                mp->pos1+mp->len-1,mp->pos2+mp->len-1,s1,s2);
1139              exon_list->edist -= max(dist1,dist2);
1140           } else if (diff<0) {  /* gap */
1141              exon_list->edist += 0.5*P*(-1)*diff;
1142           }
1143           exon_list->to1 = max(exon_list->to1,mp->pos1+mp->len-1);
1144           exon_list->to2 = max(exon_list->to2,mp->pos2+mp->len-1);
1145           exon_list->from1 = min(exon_list->from1,mp->pos1);
1146           exon_list->from2 = min(exon_list->from2,mp->pos2);
1147     } else {
1148           /* new exon */
1149           exon_list = new_exon (mp->pos1, mp->pos2, mp->pos1+mp->len-1,
1150                                 mp->pos2+mp->len-1, -1,
1151                                 (mp->len*MATCH-mp->score)/(MATCH-MISMATCH),
1152                                 0, exon_list);
1153     }
1154     last_msp = mp->prev;
1155   }
1156 }
1157 
get_edist(int f1,int f2,int t1,int t2,uchar * seq1,uchar * seq2)1158 static int get_edist(int f1, int f2, int t1, int t2, uchar *seq1, uchar *seq2)
1159 {
1160     uchar *s1, *s2, *q1, *q2;
1161     int dist=0;
1162 
1163     s1 = seq1+f1+1;   /* bc at this stage, the msp pos do not have added +1 */
1164     s2 = seq2+f2+1;
1165     q1 = seq1+t1+1;
1166     q2 = seq2+t2+1;
1167 
1168     while (s1<=q1 && s2<=q2) { dist += (*s1!=*s2); s1++; s2++; }
1169 
1170     return dist;
1171 }
1172 
1173 /* ----------------------  print endpoints of exons  --------------------*/
1174 
1175 #ifdef AUXUTILS
find_introns(Exon * eleft,Intron ** Ilist)1176 static void find_introns(Exon *eleft, Intron **Ilist)
1177 {
1178   Exon   *tmp_exon, *tmp_exon1;
1179   Intron *new, *tail;
1180   int     GTAG_score, CTAC_score;
1181 
1182   *Ilist = tail = NULL;
1183   if (!eleft) fatal("sim4b1.c: Something wrong in the exon list.\n");
1184 
1185   tmp_exon = eleft->next_exon;
1186   while ((tmp_exon!=NULL) && (tmp_exon1=tmp_exon->next_exon) &&
1187          tmp_exon1->to1) {
1188 
1189      new = (Intron *)ckalloc(sizeof(Intron));
1190      new->from1 = tmp_exon->to1+1;
1191      new->to1 = tmp_exon1->from1-1;
1192      new->from2 = tmp_exon->to2;
1193      new->to2 = tmp_exon1->from2;
1194      new->length = new->to1-new->from1+1;
1195      new->next_intron = NULL;
1196      if (!tail) *Ilist = new;
1197      else tail->next_intron = new;
1198      tail = new;
1199 
1200      /* find orientation */
1201      GTAG_score = CTAC_score = 0;
1202      if (*(seq1+new->from1-1)=='G') GTAG_score++;
1203      else
1204        if (*(seq1+new->from1-1)=='C') CTAC_score++;
1205 
1206      if (*(seq1+new->from1-1)=='T')
1207           { GTAG_score++;  CTAC_score++; }
1208 
1209      if (*(seq1+new->to1-1)=='A')
1210           { GTAG_score++;  CTAC_score++; }
1211 
1212      if (*(seq1+new->to1-1)=='G')  GTAG_score++;
1213      else
1214        if (*(seq1+new->to1-1)=='C') CTAC_score++;
1215 
1216      if  (GTAG_score>=CTAC_score)
1217           new->orientation = '+';
1218      else new->orientation = 'c';
1219 
1220      tmp_exon = tmp_exon1;
1221   }
1222 
1223 }
1224 #endif
1225 
1226 
1227 /* should only be called when (file_type==EST_GEN) && (match_ori==BWD) */
complement_exons(Exon ** left,int M,int N)1228 void complement_exons(Exon **left, int M, int N)
1229 {
1230   Exon  *tmp_block, *right;
1231   char   prev, ch;
1232 
1233   prev = 'U'; /* unknown; should trigger error */
1234   tmp_block = *left;
1235   while (tmp_block) {
1236      if (tmp_block->to1) {
1237          register int aux;
1238 
1239          if (tmp_block->next_exon && tmp_block->next_exon->to1) {
1240                ch = tmp_block->ori;
1241                tmp_block->ori = prev;
1242                switch (ch) {
1243                  case  'C': prev = 'G'; break;
1244                  case  'G': prev = 'C'; break;
1245                  case  'N': prev = 'N'; break;
1246                  case  'E': prev = 'E'; break;
1247                  default  : fatal("sim4b1.c: Inconsistency. Check exon orientation at complementation.");
1248                }
1249          } else
1250                tmp_block->ori = prev;
1251          aux = tmp_block->from1;
1252          tmp_block->from1 = M+1-tmp_block->to1;
1253          tmp_block->to1 = M+1-aux;
1254          aux = tmp_block->from2;
1255          tmp_block->from2 = N+1-tmp_block->to2;
1256          tmp_block->to2 = N+1-aux;
1257      }
1258      tmp_block = tmp_block->next_exon;
1259      if (tmp_block && tmp_block->to1)
1260          right = tmp_block;
1261   }
1262   flip_list(left,&right);
1263 }
1264 
print_exons(Exon * left)1265 void print_exons(Exon *left)
1266 {
1267   Exon  *tmp_block, *tmp_block1;
1268 
1269   tmp_block = left;
1270   while (tmp_block!=NULL) {
1271      if (tmp_block->to1) {
1272 
1273          if (file_type==EST_GEN)
1274             (void)fprintf(stdout,"%d-%d  (%d-%d)   %d%%",
1275                           tmp_block->from2, tmp_block->to2,
1276                           tmp_block->from1, tmp_block->to1,
1277                           tmp_block->match);
1278          else
1279             /* file_type==GEN_EST */
1280             (void)fprintf(stdout,"%d-%d  (%d-%d)   %d%%",
1281                            tmp_block->from1, tmp_block->to1,
1282                            tmp_block->from2, tmp_block->to2,
1283                            tmp_block->match);
1284          if (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1)
1285               switch (tmp_block->ori) {
1286                  case 'C':  (void)fprintf(stdout," <-\n");
1287                             break;
1288                  case 'E':  (void)fprintf(stdout," ==\n");
1289                             break;
1290                  case 'G':  (void)fprintf(stdout," ->\n");
1291                             break;
1292                  case 'N':  (void)fprintf(stdout," --\n");
1293                             break;
1294                  default :  fatal("sim4b1.c: Inconsistency. Check exon orientations.");
1295               }
1296      }
1297      tmp_block = tmp_block->next_exon;
1298   }
1299 }
1300 
1301 /* to and from are in the original cDNA sequence */
print_pipmaker_exons(Exon * exons,edit_script_list * aligns,char * gene,int from,int to,int M,int N,uchar * seq1,uchar * seq2,int match_ori)1302 void print_pipmaker_exons(Exon *exons, edit_script_list *aligns, char *gene,
1303                           int from, int to, int M, int N, uchar *seq1, uchar *seq2,
1304                           int match_ori)
1305 {
1306    Exon  *tmp_block, *left, *right;
1307    int From, To, cov, ori;
1308 
1309    /* print the first line in the record */
1310    if ((exons==NULL) ||
1311        (!exons->to1 && ((exons->next_exon==NULL) || !exons->next_exon)))
1312        return;
1313 
1314    left = right = tmp_block = (exons->to1) ? exons : exons->next_exon;
1315    while (tmp_block) {
1316       if (tmp_block->to1) right = tmp_block;
1317       tmp_block = tmp_block->next_exon;
1318    }
1319    /* report any inconsistencies between the intron orientations, as well
1320    as between the mRNA (CDS) match strand and the introns orientations */
1321    ori = check_consistency_intron_ori(exons, match_ori, gene);
1322 
1323    /* determine the matching coordinates for the CDS */
1324    if ((from>0) && (to>0) && aligns) {
1325        cov = dispatch_find_ends(from, to, &From, &To, aligns, M, N, match_ori);
1326        switch (cov) {
1327           case OK: if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3))
1328                       fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene);
1329                    else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3))
1330                       fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1331                    if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) &&
1332                        strncmp((char *)(seq1+To-3),"TAG",3) &&
1333                        strncmp((char *)(seq1+To-3),"TGA",3))
1334                        fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1335                    else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) &&
1336                        strncmp((char *)(seq1+From-1),"CTA",3) &&
1337                        strncmp((char *)(seq1+From-1),"TCA",3))
1338                        fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene);
1339                    break;
1340 
1341           case FREE_START:
1342                    fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene);
1343                    if ((match_ori==FWD) && strncmp((char *)(seq1+To-3),"TAA",3) &&
1344                        strncmp((char *)(seq1+To-3),"TAG",3) &&
1345                        strncmp((char *)(seq1+To-3),"TGA",3))
1346                        fprintf(stderr, "Warning: No stop codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1347                    else if ((match_ori==BWD) && strncmp((char *)(seq1+From-1),"TTA",3) &&
1348                        strncmp((char *)(seq1+From-1),"CTA",3) &&
1349                        strncmp((char *)(seq1+From-1),"TCA",3))
1350                        fprintf(stderr, "Warning: No (complement) stop codon at location %d in the genomic sequence (%s).\n", From, gene);
1351 
1352                    break;
1353 
1354           case FREE_END:
1355                    fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene);
1356                    if ((match_ori==FWD) && strncmp((char *)(seq1+From-1),"ATG",3))
1357                       fprintf(stderr, "Warning: No start codon at location %d in the genomic sequence (%s).\n", From, gene);
1358                    else if ((match_ori==BWD) && strncmp((char *)(seq1+To-3),"CAT",3))
1359                       fprintf(stderr, "Warning: No (complement) start codon at location %d in the genomic sequence (%s).\n", To-2, gene);
1360 
1361                    break;
1362 
1363           case FREE_BOTH_ENDS:
1364                    fprintf(stderr, "Warning: Start of CDS does not match (%s).\n", gene);
1365                    fprintf(stderr, "Warning: End of CDS does not match (%s).\n", gene);
1366                    break;
1367 
1368           default: fatal("Unrecognized warning code.");
1369        }
1370    }
1371 
1372    /* report codon inconsistencies in the cDNA */
1373    if (to>0 && from>0) {
1374       if (strncmp((char *)(seq2+from-1),"ATG",3))
1375          fprintf(stderr, "Warning: No start codon at location %d in the mRNA (%s).\n", from, gene);
1376       if (strncmp((char *)(seq2+to-3),"TAA",3) &&
1377           strncmp((char *)(seq2+to-3),"TAG",3) &&
1378           strncmp((char *)(seq2+to-3),"TGA",3))
1379          fprintf(stderr, "Warning: No end codon at location %d in the mRNA (%s).\n", to-2, gene);
1380    }
1381 
1382    printf("%c %d %d %s%s\n",
1383           (ori==FWD) ? '>':'<', left->from1, right->to1, gene ? gene:"",
1384           (match_ori==FWD) ? "":" (complement)");
1385    if ((from>0) && (to>0) && aligns)
1386           printf("+ %d %d\n", From, To);
1387 
1388    /* now print the exons */
1389    /* if (match_ori==BWD) flip_list(&left,&right); not accepted by PipMaker */
1390    tmp_block = left;
1391    while (tmp_block!=NULL) {
1392       if (tmp_block->to1)
1393           (void)fprintf(stdout,"%d %d\n",
1394                            tmp_block->from1, tmp_block->to1);
1395       tmp_block = tmp_block->next_exon;
1396    }
1397    if (match_ori==BWD) flip_list(&left, &right);
1398 
1399    return;
1400 }
1401 
check_consistency_intron_ori(Exon * exons,int match_ori,char * gene)1402 static int check_consistency_intron_ori(Exon *exons, int match_ori, char *gene)
1403 {
1404    Exon *t=exons;
1405    int numG, numC, numE, numN;
1406 
1407    numG = numC = numE = numN = 0;
1408 
1409    if (!t->to1) t = t->next_exon;
1410    while (t && t->to1) {
1411      if (t->next_exon  && t->next_exon->to1) {
1412        switch (t->ori) {
1413          case 'G': numG++; break;
1414          case 'C': numC++; break;
1415          case 'N': numN++; break;
1416          case 'E': numE++; break;
1417          default : fatal("sim4b1.c: Unrecognized intron orientation.");
1418        }
1419      }
1420      t = t->next_exon;
1421    }
1422    if (numG && numC)
1423     fprintf(stderr, "Warning: Introns reported on both strands (%s).\n", gene);
1424 /*
1425    Note: a match can be reverse complemented either b/c the complement
1426    genomic sequence was used as input, while the mRNA was actually transcribed
1427    in forward orientation (CT_AC), or b/c the forward strand was given for the
1428    genomic sequence, but the mRNA was transcribed in the reverse direction
1429    (GT_AG); hence there is no relevant test for this
1430    else if ((numG && (match_ori==BWD)) || (numC && (match_ori==FWD)))
1431     fprintf(stderr, "Warning: Introns orientations inconsistent with the reported match strand (%s).\n", gene);
1432 */
1433 
1434    if (numN)
1435     fprintf(stderr, "Warning: Ambiguous intron orientation (%s).\n", gene);
1436    if (numE)
1437     fprintf(stderr, "Warning: Internal gap in the mRNA (%s).\n", gene);
1438 
1439    return (numG>=numC) ? FWD:BWD;
1440 }
1441 
1442 
1443 /* from and to are given in the original cDNA sequence */
dispatch_find_ends(int from,int to,int * From,int * To,edit_script_list * aligns,int M,int N,int match_ori)1444 static int dispatch_find_ends(int from, int to, int *From, int *To, edit_script_list *aligns, int M, int N, int match_ori)
1445 {
1446    int f1, f2, t1, t2, ot1, ot2, xto, xfrom;
1447    int free_start, free_end;
1448 
1449    free_start = free_end = 1;
1450 
1451    if (aligns->next_script && (aligns->offset2 > aligns->next_script->offset2))
1452        script_flip_list(&aligns);
1453 
1454    if (match_ori==FWD) {
1455        xto = to; xfrom = from;
1456    } else if (file_type == EST_GEN) {
1457        xto = to; xfrom = from;
1458    } else {
1459        xto = N-from+1; xfrom = N-to+1;
1460    }
1461 
1462    *From = *To = 0; t1 = t2 = 0;
1463    while (aligns) {
1464       ot2 = t2; ot1 = t1;
1465       f1 = aligns->offset1; f2 = aligns->offset2;
1466       t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1;
1467       if (ot2 < xfrom && xfrom < f2) {
1468          *From = f1; break;
1469       }
1470       if (f2 <= xfrom && xfrom <= t2) {
1471          *From = find_ends(aligns, xfrom); free_start = 0; break;
1472       }
1473       aligns = aligns->next_script;
1474    }
1475    if (*From == 0) return FREE_BOTH_ENDS;
1476 
1477    if (ot2 < xto && xto < f2) *To = ot1;
1478    else if (xto <= t2) { *To = find_ends(aligns, xto); free_end = 0; }
1479    else {
1480      *To = 0;
1481      while (aligns && ((aligns=aligns->next_script)!=NULL)) {
1482         ot2 = t2; ot1 = t1;
1483         f1 = aligns->offset1; f2 = aligns->offset2;
1484         t1 = f1+aligns->len1-1; t2 = f2+aligns->len2-1;
1485         if (ot2 < xto && xto < f2) { *To = ot1; break; }
1486         if (t2 < xto) *To = t1;
1487         else if (f2 <= xto && xto <=t2) {
1488           *To = find_ends(aligns, xto); free_end = 0; break;
1489         }
1490      }
1491      if (*To==0) *To = t1;
1492    }
1493 
1494    if (*To == 0) { *From = 0; free_start = 1; }
1495 
1496    if (*To && *From && match_ori==BWD && file_type==EST_GEN) {
1497        int aux = M-(*From)+1; *From = M-(*To)+1; *To = aux;
1498    }
1499 
1500    if (free_start && free_end) return FREE_BOTH_ENDS;
1501    else if (free_start) return FREE_START;
1502    else if (free_end) return FREE_END;
1503 
1504    return OK;
1505 }
1506 
find_ends(edit_script_list * head,int j0)1507 static int find_ends(edit_script_list *head, int j0)
1508 {
1509    int i, j, e1, e2;
1510    edit_script *tp;
1511 
1512    i = head->offset1; e1 = i+head->len1-1;
1513    j = head->offset2; e2 = j+head->len2-1;
1514 
1515    tp = head->script;
1516    i--; j--;
1517    while (i<=e1 && j<=e2 && tp) {
1518       if (j==j0) return i;
1519       switch (tp->op_type) {
1520          case DELETE:     i += tp->num; break;
1521          case INSERT:     j += tp->num; if (j>=j0) return i; break;
1522          case SUBSTITUTE: i += tp->num; j += tp->num;
1523                           if (j>=j0) return (i-(j-j0)); break;
1524          default:         fatal("Illegal opcode in script.");
1525       }
1526       tp = tp->next;
1527    }
1528 
1529    /* not found: failure */
1530    fatal("Inconsistency in script.");
1531 }
1532 
1533 
1534 #ifdef AUXUTILS
print_introns(Intron_ptr intron_list)1535 static void print_introns(Intron_ptr intron_list)
1536 {
1537   Intron_ptr ep=intron_list;
1538 
1539   (void)printf("\nIntrons {\n\n");
1540   while (ep!=NULL) {
1541      (void)printf("genome: %6ld - %-6ld    cDNA: %5ld - %-5ld,  l: %-5d,  o: %c \n",
1542                    ep->from1, ep->to1, ep->from2, ep->to2,
1543                    ep->length,
1544                    ep->orientation);
1545      ep = ep->next_intron;
1546   }
1547   (void)printf("}\n\n");
1548 }
1549 #endif
1550 
1551 
pluri_align(int * dist_ptr,int * num_matches,Exon * lblock,struct edit_script_list ** Aligns)1552 static void pluri_align(int *dist_ptr,int *num_matches,Exon *lblock,struct edit_script_list **Aligns)
1553 {
1554    int    tmpi, di_count, i, end1, end2, diff, ali_dist, nmatches;
1555    uchar *a, *b;
1556    Exon  *tmp_block=lblock, *tmp_block1;
1557 
1558    struct edit_script_list *enew;
1559    struct edit_script *head, *tmp_script, *new, *left, *right, *prev;
1560 
1561    nmatches = 0;
1562 
1563    head = NULL;
1564    *Aligns = NULL;
1565    *dist_ptr = ali_dist = 0;
1566 
1567    end1 = M;
1568    end2 = N;
1569 
1570    while (((tmp_block1=tmp_block->next_exon)!=NULL) && tmp_block1->to1) {
1571 
1572          if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0) {
1573            if (tmp_block->to1) {
1574               enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1575               enew->next_script = *Aligns;
1576               *Aligns = enew;
1577               (*Aligns)->script = head;
1578               (*Aligns)->offset1 = tmp_block->from1;
1579               (*Aligns)->offset2 = tmp_block->from2;
1580               (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1581               (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1582               (*Aligns)->score = ali_dist;
1583               ali_dist = 0;
1584               head = NULL;
1585             }
1586             end1 = tmp_block1->to1;
1587             end2 = tmp_block1->to2;
1588 
1589          } else if (((diff=tmp_block->from1-tmp_block1->to1-1)!=0) &&
1590                       tmp_block->to1) {
1591               new = (edit_script *) ckalloc(sizeof(edit_script));
1592               new->op_type = DELETE;
1593               new->num = diff;
1594               new->next = head;
1595               head = new;
1596          } else if (diff)
1597               end1 = tmp_block1->to1;
1598 
1599          diff = align_get_dist(tmp_block1->from1-1, tmp_block1->from2-1,
1600                                tmp_block1->to1, tmp_block1->to2,
1601                                max(1000,.2*(tmp_block1->to2-tmp_block1->from2+1)));
1602 
1603          if (diff<0) {
1604              (void)printf("The two sequences are not really similar.\n");
1605              (void)printf("Please try an exact method.\n");
1606              exit(1);
1607          }
1608 
1609 #ifdef STATS
1610          if (diff>P*(tmp_block1->to2-tmp_block1->from2+1))
1611               (void)printf("Warning: Distance threshold on segment exceeded.\n");
1612 #endif
1613 
1614          align_path(tmp_block1->from1-1, tmp_block1->from2-1,
1615                     tmp_block1->to1, tmp_block1->to2, diff, &left, &right);
1616 
1617          Condense_both_Ends(&left, &right, &prev);
1618 
1619          if (!tmp_block->to1 && right->op_type == DELETE) {
1620             /* remove gaps at end of alignment */
1621             diff -= 0+right->num;         /* subtract GAP_OPEN = 0 */
1622             tmp_block1->to1 -= right->num;
1623             end1 -= right->num;
1624             if (head && (head->op_type == DELETE))
1625                 head->num += right->num;
1626             free(right); prev->next = NULL;
1627             right = prev;
1628          }
1629          if ((!tmp_block1->next_exon || !tmp_block1->next_exon->to1) &&
1630               left && (left->op_type == DELETE)) {
1631             diff -= 0+left->num;          /* subtract GAP_OPEN = 0 */
1632             tmp_block1->from1 += left->num;
1633             tmp_script = left->next;
1634             if (right == left) right = tmp_script;
1635             free(left); left = tmp_script;
1636          }
1637 
1638          *dist_ptr += diff;
1639          ali_dist += diff;
1640 
1641          a = seq1+tmp_block1->from1-1;
1642          b = seq2+tmp_block1->from2-1;
1643          tmpi = di_count = 0;
1644          tmp_script = left;
1645          while (tmp_script) {
1646             switch (tmp_script->op_type) {
1647                 case  DELETE:
1648                       di_count += tmp_script->num;
1649                       tmpi += tmp_script->num;
1650                       a += tmp_script->num;
1651                       break;
1652                 case  INSERT:
1653                       di_count += tmp_script->num;
1654                       tmpi += tmp_script->num;
1655                       b += tmp_script->num;
1656                       break;
1657                 case  SUBSTITUTE:
1658                       for (i=0; i<tmp_script->num; ++i, ++a, ++b)
1659                         if (*a!=*b) tmpi++; else nmatches++;
1660                       break;
1661             }
1662             tmp_script = tmp_script->next;
1663          }
1664          tmp_block1->alen = (int)((tmp_block1->to1-tmp_block1->from1+1+
1665                              tmp_block1->to2-tmp_block1->from2+1+di_count)/(double)2);
1666          tmp_block1->nmatches = tmp_block1->alen - tmpi;
1667          tmp_block1->match = (int)floor(100*(1-tmpi/(double)tmp_block1->alen));
1668 
1669          right->next = head;
1670          head = left;
1671          tmp_block = tmp_block1;
1672   }
1673 
1674 
1675    /* at the beginning of the sequences */
1676   if (tmp_block1!=NULL) {
1677 
1678     if ((diff=tmp_block->from2-tmp_block1->to2-1)!=0 && (diff!=N)) {
1679          enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1680          enew->next_script = *Aligns;
1681          *Aligns = enew;
1682          (*Aligns)->offset1 = tmp_block->from1;
1683          (*Aligns)->offset2 = tmp_block->from2;
1684          (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1685          (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1686          (*Aligns)->script = head;
1687          (*Aligns)->score = ali_dist;
1688 
1689     } else if (diff!=N) {
1690 
1691          /* modified to cut introns at the beginning of the sequence */
1692          enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
1693          enew->next_script = *Aligns;
1694          *Aligns = enew;
1695          (*Aligns)->offset1 = tmp_block->from1;
1696          (*Aligns)->offset2 = 1;
1697          (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
1698          (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
1699          (*Aligns)->script = head;
1700          (*Aligns)->score = ali_dist;
1701     }
1702   }
1703   *num_matches = nmatches;
1704 }
1705 
new_exon(int f1,int f2,int t1,int t2,int len,int edist,int flag,Exon * next)1706 static Exon *new_exon(int f1, int f2, int t1, int t2, int len, int edist, int flag, Exon *next)
1707 {
1708   Exon *new = (Exon *)ckalloc(sizeof(Exon));
1709 
1710   new->from1 = f1;
1711   new->from2 = f2;
1712   new->to1 = t1;
1713   new->to2 = t2;
1714   new->length = (len < 0) ? (t2-f2+1) : len;
1715   new->edist = edist;
1716   new->flag = flag;
1717   new->next_exon = next;
1718 
1719   return new;
1720 }
1721 
1722 
get_stats(Exon * lblock,sim4_stats_t * st)1723 static void get_stats(Exon *lblock, sim4_stats_t *st)
1724 {
1725    Exon *t, *t1;
1726 
1727 #ifdef _STATS
1728    t = lblock;
1729    if (!t->next_exon) { /* no alignment found */
1730        st->marginals = 2.0;
1731    } else {
1732      while (t) {
1733        if ((t1 = t->next_exon)!=NULL) {
1734         if (!t->to1 && t1->to1)
1735             st->marginals += (float)(t1->from2-1)/t1->to2;
1736         else if (t->to1 && !t1->to1)
1737             st->marginals += (float)(N-t->to2)/
1738                                 (N-t->from2+1);
1739         else if (!t->to1 && !t1->to1)
1740             st->marginals = 2.0;
1741        }
1742        t = t1;
1743      }
1744    }
1745    st->marginals = st->marginals/2;
1746 #endif
1747 
1748    st->icoverage = 0;  st->internal = 1; st->mult = 0;
1749    t = lblock->next_exon;
1750    while (t) {
1751       st->icoverage += t->length;
1752       if (t->length) st->mult++;
1753       t = t->next_exon;
1754    }
1755    st->fcoverage = ((float)st->icoverage)/N;
1756 
1757    t = lblock;
1758    if ((t->next_exon==NULL) || !t->next_exon->to1)
1759       st->internal = 0;
1760    while (t) {
1761       if ((t->to1) && ((t1=t->next_exon)!=NULL) &&
1762           (t1->from2-t->to2-1>0) && t1->to1)
1763                     st->internal = 0;
1764       t = t->next_exon;
1765    }
1766 }
1767 
resolve_overlap(Exon * tmp_block,Exon * tmp_block1,uchar * seq1)1768 static int resolve_overlap(Exon *tmp_block, Exon *tmp_block1, uchar *seq1)
1769 {
1770      int   diff, best_u, l0, l1, u, cost;
1771      int    GTAG_score, CTAC_score;
1772      uchar *s1, *s2, *e1;
1773 
1774      diff = tmp_block1->from2-tmp_block->to2-1;
1775      if (diff>=0) return (tmp_block1->from2-1);
1776 
1777      /* resolve overlap using the GT-AG criterion */
1778      /* u-1 = actual position in the sequence */
1779 
1780      l0 = tmp_block->length-diff;
1781      l1 = tmp_block1->length;
1782 
1783      best_u = u = tmp_block1->from2-1;
1784      s1 = seq1+tmp_block->to1-(tmp_block->to2-u);
1785      s2 = seq1-2+tmp_block1->from1+u-tmp_block1->from2;
1786 
1787      cost = 0;
1788      e1 = seq1+tmp_block->to1;
1789      while (s1<=e1) {
1790        GTAG_score = CTAC_score = 0;
1791        GTAG_score += ((char)(*s1)=='G') ? 1 : 0;
1792        GTAG_score += ((char)(*(s1+1))=='T') ? 1 : 0;
1793        GTAG_score += ((char)(*s2)=='A') ? 1 : 0;
1794        GTAG_score += ((char)(*(s2+1))=='G') ? 1 : 0;
1795 
1796        if (GTAG_score > abs(cost) && ((l0>=8) || (l1>=8))) {
1797            cost = GTAG_score;
1798            best_u = u;
1799            if (cost == 4) break;
1800        }
1801 
1802        CTAC_score += ((char)(*s1)=='C') ? 1 : 0;
1803        CTAC_score += ((char)(*(s1+1))=='T') ? 1 : 0;
1804        CTAC_score += ((char)(*s2)=='A') ? 1 : 0;
1805        CTAC_score += ((char)(*(s2+1))=='C') ? 1 : 0;
1806 
1807        if (CTAC_score > abs(cost)) {
1808            cost = -CTAC_score;
1809            best_u = u;
1810            if (cost == 4) break;
1811        }
1812 
1813        u++; s1++; s2++;
1814        l0++; l1--;
1815      }
1816 
1817      return best_u;
1818 }
1819 
1820 
greedy(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,Exon ** lblock,Exon ** rblock)1821 static int  greedy(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, Exon **lblock, Exon **rblock)
1822 {
1823   int     col,                    /* column number */
1824           d,                      /* current distance */
1825           k,                      /* current diagonal */
1826           max_d,                  /* bound on size of edit script */
1827           Cost,
1828           blower,flower,          /* boundaries for searching diagonals */
1829           bupper,fupper,
1830           row,                    /* row number */
1831           DELTA,                  /* n-m  */
1832           MAX_D,
1833           B_ORIGIN, F_ORIGIN;
1834   int     back, forth;            /* backward and forward limits at exit */
1835 
1836   int     *blast_d, *flast_d,     /* rows containing the last d (at crt step, d-1) */
1837           *btemp_d, *ftemp_d;     /* rows containing tmp values for the last d */
1838   int     *min_row, *min_diag,    /* min (b)/ max (f) row (and diagonal) */
1839           *max_row, *max_diag;    /* reached for cost d=0, ... m.  */
1840 
1841 
1842   DELTA = n-m;
1843 /*max_d = MAX_D = m+1; */
1844   max_d = MAX_D = max(W,(int)(P*m+1));
1845 
1846   if (DELTA<0) {
1847       if (m<=min(W,(1+P)*n)) {
1848           *lblock = *rblock = new_exon(offset2+1,offset1+1,offset2+n,offset1+m,
1849                                        m,n-m+(int)(P*m+1),0,NULL);
1850           return m-n+(int)(P*n+1);
1851       } else {
1852           *lblock = *rblock = NULL;
1853           return max(W,(int)(P*m+1))+1;
1854       }
1855   }
1856 
1857   F_ORIGIN = MAX_D;
1858   B_ORIGIN = MAX_D-DELTA;
1859   for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
1860         /*LINTED empty loop body*/;
1861 
1862   if (row == 0) {
1863       /* hit last row; stop search */
1864       *lblock = *rblock = new_exon(offset2-m+n+1,offset1+1,offset2+n,
1865                                    offset1+m,m,0,0,NULL);
1866       return 0;
1867   }
1868 
1869 
1870   blast_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1871   btemp_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1872 
1873   for (k=0; k<=MAX_D+n; ++k) { blast_d[k]=m+1; btemp_d[k]=m+1; }
1874   blast_d[B_ORIGIN+DELTA] = row;
1875 
1876   blower = B_ORIGIN + DELTA - 1;
1877   bupper = B_ORIGIN + DELTA + 1;
1878 
1879 
1880   for (row=0; row<n && row<m && (s1[row]==s2[row]); row++)
1881         /*LINTED empty loop body*/;
1882 
1883   if (row == m) {
1884        /* hit last row; stop search */
1885        *lblock = *rblock = new_exon(offset2+1,offset1+1,offset2+m,
1886                                     offset1+m,m,0,0,NULL);
1887        free(blast_d); free(btemp_d);
1888 
1889        return 0;
1890   }
1891 
1892   flast_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1893   ftemp_d = (int *)ckalloc((MAX_D+n+1)*sizeof(int));
1894 
1895   for (k=0; k<=MAX_D+n; ++k) { flast_d[k]=-1; ftemp_d[k]=-1; }
1896   flast_d[F_ORIGIN] = row;
1897 
1898   flower = F_ORIGIN - 1;
1899   fupper = F_ORIGIN + 1;
1900 
1901   max_row = (int *)ckalloc((MAX_D+1)*sizeof(int));
1902   min_row = (int *)ckalloc((MAX_D+1)*sizeof(int));
1903   max_diag = (int *)ckalloc((MAX_D+1)*sizeof(int));
1904   min_diag = (int *)ckalloc((MAX_D+1)*sizeof(int));
1905 
1906   for (d=1; d<=MAX_D; d++) {
1907        min_row[d] = m+1; max_row[d] = -1;
1908   }
1909   min_row[0] = blast_d[B_ORIGIN+DELTA];
1910   min_diag[0] = B_ORIGIN+DELTA;
1911   max_row[0] = flast_d[F_ORIGIN];
1912   max_diag[0] = F_ORIGIN;
1913 
1914   back = forth = -1;
1915 
1916   d = 1;
1917   while (d <= max_d) {
1918 
1919         /* for each relevant diagonal ... */
1920         for (k = blower; k <= bupper; k++) {
1921              /* process the next edit instruction */
1922 
1923              /* find a d on diagonal k */
1924              if (k==-d+DELTA+B_ORIGIN) {
1925 
1926                       /* move left from the last d-1 on diagonal k+1 */
1927                       row = blast_d[k+1];   /* INSERT */
1928              } else if (k==d+DELTA+B_ORIGIN) {
1929 
1930                       /* move up from the last d-1 on diagonal k-1 */
1931                       row = blast_d[k-1]-1;  /* DELETE */
1932              } else if ((blast_d[k]<=blast_d[k+1]) &&
1933                         (blast_d[k]-1<=blast_d[k-1])) {
1934 
1935                       /* substitution */
1936                       row = blast_d[k]-1;  /* SUBSTITUTE */
1937 
1938              } else if ((blast_d[k-1]<=blast_d[k+1]-1) &&
1939                         (blast_d[k-1]<=blast_d[k]-1)) {
1940                       /* move right from the last d-1 on diagonal k-1 */
1941                       row = blast_d[k-1]-1;  /* DELETE */
1942              } else  {
1943                       /* move left from the last d-1 on diagonal k+1 */
1944                       row = blast_d[k+1];    /* INSERT */
1945              }
1946              /* code common to the three cases */
1947              col = row + k - B_ORIGIN;
1948 
1949              /* slide up the diagonal */
1950              while (row > 0 && col > 0 && (s1[row-1]==s2[col-1])) {
1951                              --row;
1952                              --col;
1953              }
1954              btemp_d[k] = row;
1955 
1956 /*           if (row == 0 || col == 0) max_d = d;   */
1957 
1958         }     /* for k */
1959 
1960         min_row[d] = btemp_d[DELTA+B_ORIGIN];
1961         min_diag[d] = DELTA+B_ORIGIN;
1962         for (k=blower; k<=bupper; ++k) {
1963              blast_d[k] = btemp_d[k]; btemp_d[k] = m+1;
1964              if (blast_d[k]<min_row[d]) {
1965                        min_row[d] = blast_d[k];
1966                        min_diag[d] = k;
1967              }
1968         }
1969 
1970         /* record cell, if paths overlap with minimum combined cost */
1971         /* obs: it suffices to search up to Cost=min(d-1,(max_d-d)) */
1972         for (Cost=0; Cost<d; Cost++) {
1973              if ((min_row[d]<=max_row[Cost]) &&
1974                  ((max_d > d+Cost) || (max_d==d+Cost && (forth<0)))) {
1975                            max_d = d+Cost;
1976                            back = d;
1977                            forth = Cost;
1978                            break;
1979              }
1980         }
1981 
1982         --blower; ++bupper;
1983 
1984         /* for each relevant diagonal ... */
1985         for (k = flower; k <= fupper; k++) {
1986                /* process the next edit instruction */
1987 
1988                /* find a d on diagonal k */
1989                if (k==-d+F_ORIGIN) {
1990                         /* move down from the last d-1 on diagonal k+1 */
1991                         row = flast_d[k+1]+1; /* DELETE */
1992 
1993                } else if (k==d+F_ORIGIN) {
1994                         /* move right from the last d-1 on diagonal k-1 */
1995                         row = flast_d[k-1]; /* INSERT */
1996 
1997                } else if ((flast_d[k]>=flast_d[k+1]) &&
1998                           (flast_d[k]+1>=flast_d[k-1])) {
1999 
2000                         /* substitution */
2001                         row = flast_d[k]+1;   /* SUBSTITUTE */
2002 
2003                } else if ((flast_d[k+1]+1>=flast_d[k-1]) &&
2004                           (flast_d[k+1]>=flast_d[k])) {
2005 
2006                         /* move left from the last d-1 on diagonal k+1 */
2007                         row = flast_d[k+1]+1;  /* DELETE */
2008                } else {
2009                         /* move right from the last d-1 on diagonal k-1 */
2010                         row = flast_d[k-1];    /* INSERT */
2011                }
2012                /* code common to the three cases */
2013                col = row + k - F_ORIGIN;
2014 
2015                /* slide down the diagonal */
2016                if (row>=0)
2017                while (row < m && col < n && (s1[row]==s2[col])) {
2018                                 ++row;
2019                                 ++col;
2020                }
2021                ftemp_d[k] = row;
2022 
2023 /*             if (row == m || col == n) max_d = d;  */
2024           }     /* for k */
2025 
2026           max_row[d] = ftemp_d[F_ORIGIN];
2027           max_diag[d] = F_ORIGIN;
2028           for (k=flower; k<=fupper; ++k) {
2029                flast_d[k] = ftemp_d[k]; ftemp_d[k] = -1;
2030                if (flast_d[k]>max_row[d]) {
2031                    max_row[d] = flast_d[k];
2032                    max_diag[d] = k;
2033                }
2034           }
2035 
2036           /* record backward and forward limits, if minimum combined
2037            * cost in overlapping. Note: it suffices to search up to
2038            * Cost=min(d,(max_d-d)).
2039            */
2040           for (Cost=0; Cost<=d; Cost++) {
2041              if ((min_row[Cost]<=max_row[d]) &&
2042                  ((max_d>d+Cost) || (max_d==d+Cost && (forth<0)))) {
2043                       max_d = d+Cost;
2044                       back = Cost;
2045                       forth = d;
2046                       break;
2047              }
2048           }
2049           --flower; ++fupper;
2050 
2051           ++d;  /* for d */
2052   }
2053 
2054   if (d>MAX_D) {
2055       *lblock = *rblock = NULL;
2056 
2057       free(blast_d); free(btemp_d);
2058       free(flast_d); free(ftemp_d);
2059       free(min_row); free(min_diag);
2060       free(max_row); free(max_diag);
2061 
2062       return d;
2063   }
2064 /*fin:*/
2065   if (m-min_row[back]>=max_row[forth]) {
2066       *rblock = new_exon(offset2+1+min_row[back]+min_diag[back]-B_ORIGIN,
2067                          offset1+1+min_row[back],
2068                          offset2+n,offset1+m,
2069                          m-min_row[back],back,0,NULL);
2070       *lblock = new_exon(offset2+1,offset1+1,
2071                          offset2+min_row[back]+max_diag[forth]-F_ORIGIN,
2072                          offset1+min_row[back],
2073                          min_row[back],forth,0,*rblock);
2074   } else {
2075       *rblock = new_exon(offset2+1+max_row[forth]+min_diag[back]-B_ORIGIN,
2076                          offset1+1+max_row[forth],
2077                          offset2+n,offset1+m,m-max_row[forth],back,0,NULL);
2078       *lblock = new_exon(offset2+1,offset1+1,
2079                          offset2+max_row[forth]+max_diag[forth]-F_ORIGIN,
2080                          offset1+max_row[forth],max_row[forth],forth,0,*rblock);
2081   }
2082 
2083   free(blast_d); free(btemp_d);
2084   free(flast_d); free(ftemp_d);
2085   free(min_row); free(min_diag);
2086   free(max_row); free(max_diag);
2087 
2088   return back+forth;
2089 }
2090 
2091 
flip_list(Exon ** left,Exon ** right)2092 void flip_list(Exon **left, Exon **right)
2093 {
2094    Exon   *ep, *ahead, *behind;
2095 
2096    *right = *left;
2097    ahead = *left;
2098    ep = NULL;
2099    while (ahead!=NULL) {
2100           behind = ep;
2101           ep = ahead;
2102           ahead = ahead->next_exon;
2103           ep->next_exon = behind;
2104   }
2105   *left = ep;
2106 }
2107 
2108 /* reverse a list of edit script chains */
script_flip_list(edit_script_list ** left)2109 void script_flip_list(edit_script_list **left)
2110 {
2111    edit_script_list  *ep, *ahead, *behind;
2112 
2113    ahead = *left;
2114    ep = NULL;
2115    while (ahead!=NULL) {
2116           behind = ep; ep = ahead;
2117           ahead = ahead->next_script;
2118           ep->next_script = behind;
2119   }
2120   *left = ep;
2121 }
2122 
2123 
2124 /* operates on a list sorted in increasing order of exon coordinates */
compact_list(Exon ** Lblock,Exon ** Rblock)2125 static void compact_list(Exon **Lblock, Exon **Rblock)
2126 {
2127      Exon *tmp_block=*Lblock, *tmp_block1;
2128      int diff;
2129 
2130      while ((tmp_block!=NULL) &&
2131             ((tmp_block1=tmp_block->next_exon)!=NULL) &&
2132             tmp_block1->to1) {
2133         if ((abs((tmp_block1->from2-tmp_block1->from1) -
2134                 (tmp_block->to2-tmp_block->to1))<=W) &&
2135                 ((diff=tmp_block1->from2-tmp_block->to2-1)<=MAX_INTERNAL_GAP)) {
2136             /* merge blocks */
2137             tmp_block->to1 = tmp_block1->to1;
2138             tmp_block->to2 = tmp_block1->to2;
2139             tmp_block->length = tmp_block->to2-tmp_block->from2+1;
2140             tmp_block->edist += tmp_block1->edist;
2141             tmp_block->edist -= P*diff;
2142             tmp_block->next_exon = tmp_block1->next_exon;
2143 
2144             free(tmp_block1);
2145         } else
2146             tmp_block = tmp_block1;
2147      }
2148      /* reset right end of the list */
2149      *Rblock = tmp_block;
2150 }
2151 
2152 /* ------------------  memory management routines --------------- */
2153 
link_to_data_list(Pointer data,ValNodePtr * head,ValNodePtr * prev)2154 void link_to_data_list(Pointer data, ValNodePtr *head, ValNodePtr *prev)
2155 {
2156      ValNodePtr curr;
2157 
2158      curr = (ValNodePtr)ckalloc(sizeof(struct ValNode));
2159      curr->data = data;
2160      curr->next = NULL;
2161 
2162      if(*prev == NULL)
2163         *head = curr;
2164      else
2165         (*prev)->next = curr;
2166      *prev = curr;
2167 }
2168 
ValNodeFreeData(ValNodePtr data_list)2169 void   ValNodeFreeData(ValNodePtr data_list)
2170 {
2171        ValNodePtr   tmp_node;
2172 
2173        while ((tmp_node=data_list)!=NULL) {
2174           free(tmp_node->data);
2175           data_list = data_list->next;
2176           free(tmp_node);
2177        }
2178 }
2179 
2180 
good_ratio(int length)2181 int  good_ratio(int length)
2182 {
2183      if (length<=W/2) return 2;
2184      else if (length<2*W) return rs.cutoff;
2185      else return (int)(.75*P*length+1);
2186 }
2187 
extend_bw(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,int * line1,int * line2)2188 static int extend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
2189 {
2190   int     col,                    /* column number */
2191           row,                    /* row number */
2192           max_d,                  /* bound on the length of the edit script
2193  */
2194           d,                      /* current compressed distance */
2195           k,                      /* current diagonal */
2196           DELTA,                  /* n-m  */
2197           ORIGIN,
2198           lower,
2199           upper;
2200   int     *last_d, *temp_d;       /* column containing the last p */
2201   int     *min_row, *min_diag;    /* min (b)/ max (f) row (and diagonal) */
2202                                   /* reached for cost d=0, ... m.  */
2203   DELTA = n-m;
2204   max_d = m+1;
2205 
2206   ORIGIN = m;
2207   for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
2208         /*LINTED empty loop body*/;
2209 
2210   if ((row == 0) || (col == 0)) {
2211        *line1 = row+offset1;
2212        *line2 = col+offset2;
2213 
2214        return 0;
2215   }
2216 
2217   last_d = (int *)ckalloc((m+n+1)*sizeof(int));
2218   temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
2219 
2220   for (k=0; k<=m+n; ++k) last_d[k]=m+1;
2221   last_d[ORIGIN+DELTA] = row;
2222 
2223   lower = ORIGIN + DELTA - 1;
2224   upper = ORIGIN + DELTA + 1;
2225 
2226   min_row = (int *)ckalloc((m+1)*sizeof(int));
2227   min_diag = (int *)ckalloc((m+1)*sizeof(int));
2228 
2229   for (d=1; d<=m; d++)
2230        min_row[d] = m+1;
2231 
2232   min_row[0] = last_d[ORIGIN+DELTA];
2233   min_diag[0] = ORIGIN + DELTA;
2234 
2235   d = 0;
2236   while ((++d<=max_d) &&
2237          ((d-1<=good_ratio(m-min_row[d-1])) ||
2238           ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) {
2239 
2240           /* for each relevant diagonal ... */
2241           for (k = lower; k <= upper; k++) {
2242 
2243                /* find a d on diagonal k */
2244                if (k==-d+DELTA+ORIGIN) {
2245                         /* move down from the last d-1 on diagonal k+1 */
2246                         row = last_d[k+1];
2247                         /* op = INSERT; */
2248 
2249                } else if (k==d+DELTA+ORIGIN) {
2250                         /* move right from the last d-1 on diagonal k-1 */
2251                         row = last_d[k-1]-1;
2252                         /* op = DELETE; */
2253 
2254                } else if ((last_d[k]-1<=last_d[k+1]) &&
2255                           (last_d[k]-1<=last_d[k-1]-1)) {
2256                         /* substitution */
2257                         row = last_d[k]-1;
2258                         /* op = SUBSTITUTE; */
2259 
2260                } else if ((last_d[k-1]-1<=last_d[k+1]) &&
2261                           (last_d[k-1]-1<=last_d[k]-1)) {
2262                         /* move right from the last d-1 on diagonal k-1 */
2263                         row = last_d[k-1]-1;
2264                         /* op = DELETE; */
2265 
2266                } else  {
2267                         /* move left from the last d-1 on diagonal k+1 */
2268                         row = last_d[k+1];
2269                         /* op = INSERT; */
2270 
2271                }
2272 
2273                /* code common to the three cases */
2274                /* slide down the diagonal */
2275 
2276                col = row+k-ORIGIN;
2277 
2278                while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1]))
2279                   { row--; col--; }
2280 
2281                temp_d[k] = row;
2282 
2283                if ((row == 0) && (col == 0)) {
2284                    /* hit southeast corner; have the answer */
2285 
2286                    free(last_d); free(temp_d);
2287                    free(min_row); free(min_diag);
2288 
2289                    *line1 = row+offset1;
2290                    *line2 = col+offset2;
2291 
2292                    return d;
2293                }
2294                if (row == 0) {
2295                    /* hit first row; don't look further */
2296 
2297                    free(last_d); free(temp_d);
2298                    free(min_row); free(min_diag);
2299 
2300                    *line1 = row+offset1;
2301                    *line2 = col+offset2;
2302 
2303                    return d;
2304                }
2305 
2306                if (col == 0) {
2307                    /* hit last column; don't look further */
2308                    free(last_d); free(temp_d);
2309                    free(min_row); free(min_diag);
2310 
2311                    *line1 = row+offset1;
2312                    *line2 = col+offset2;
2313 
2314                    return d;
2315                }
2316           }
2317 
2318           min_row[d] = last_d[ORIGIN+DELTA];
2319           min_diag[d] = ORIGIN+DELTA;
2320           for (k=lower; k<=upper; ++k)
2321                  if (temp_d[k]<min_row[d]) {
2322                      min_row[d] = temp_d[k];
2323                      min_diag[d] = k;
2324                  }
2325 
2326           for (k=lower; k<=upper; k++) {
2327                last_d[k] = temp_d[k];
2328           }
2329 
2330           --lower;
2331           ++upper;
2332      }
2333 
2334     /* report here the previous maximal match, stored in min_diag and min_row */
2335      while ((d>0) && (min_row[d-1]-min_row[d]<3))
2336         d--;
2337 
2338      *line1 = min_row[d]+offset1;
2339      *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
2340 
2341      free(min_row);
2342      free(min_diag);
2343      free(last_d);
2344      free(temp_d);
2345 
2346      return d;
2347 }
2348 
2349 
extend_fw(uchar * s1,uchar * s2,int m,int n,int offset1,int offset2,int * line1,int * line2)2350 static int extend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
2351 {
2352   int     col,                    /* column number */
2353           row,                    /* row number */
2354           max_d,                  /* bound on the length of the edit script
2355  */
2356           d,                      /* current compressed distance */
2357           k,                      /* current diagonal */
2358           ORIGIN,
2359           lower,
2360           upper;
2361   int     *last_d, *temp_d;       /* column containing the last p */
2362   int     *max_row, *max_diag;    /* min (b)/ max (f) row (and diagonal) */
2363                                   /* reached for cost d=0, ... m.  */
2364   max_d = m+1;
2365 
2366   ORIGIN = m;
2367   for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++)
2368         /*LINTED empty loop body*/;
2369 
2370   if (row == m) {
2371        *line1 = row+offset1;
2372        *line2 = col+offset2;
2373 
2374        return 0;
2375   }
2376   if (col == n) {
2377        *line1 = row+offset1;
2378        *line2 = col+offset2;
2379 
2380        return 0;
2381   }
2382 
2383   last_d = (int *)ckalloc((m+n+1)*sizeof(int));
2384   temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
2385 
2386   for (k=0; k<=m+n; ++k) last_d[k]=-1;
2387   last_d[ORIGIN] = row;
2388 
2389   lower = ORIGIN - 1;
2390   upper = ORIGIN + 1;
2391 
2392   max_row = (int *)ckalloc((m+1)*sizeof(int));
2393   max_diag = (int *)ckalloc((m+1)*sizeof(int));
2394 
2395   for (d=1; d<=m; d++)
2396        max_row[d] = -1;
2397 
2398   max_row[0] = last_d[ORIGIN];
2399   max_diag[0] = ORIGIN;
2400 
2401 
2402   d = 0;
2403   while ((++d<=max_d) &&
2404          ((d-1<=good_ratio(max_row[d-1])) ||
2405           ((d>=2) && (d-2<=good_ratio(max_row[d-2]))))) {
2406 
2407           /* for each relevant diagonal ... */
2408           for (k = lower; k <= upper; k++) {
2409 
2410                /* find a d on diagonal k */
2411                if (k==-d+ORIGIN) {
2412 
2413                         /* move down from the last d-1 on diagonal k+1 */
2414                         row = last_d[k+1]+1;
2415                         /* op = DELETE; */
2416                } else if (k==d+ORIGIN) {
2417 
2418                         /* move right from the last d-1 on diagonal k-1 */
2419                         row = last_d[k-1];
2420                         /* op = INSERT; */
2421                } else if ((last_d[k]>=last_d[k+1]) &&
2422                           (last_d[k]+1>=last_d[k-1])) {
2423 
2424                         /* substitution */
2425                         row = last_d[k]+1;
2426                         /* op = SUBSTITUTE; */
2427                } else if ((last_d[k+1]+1>=last_d[k-1]) &&
2428                           (last_d[k+1]>=last_d[k])) {
2429 
2430                         /* move down from the last d-1 on diagonal k+1 */
2431                         row = last_d[k+1]+1;
2432                         /* op = DELETE; */
2433                } else {
2434 
2435                         /* move right from the last d-1 on diagonal k-1 */
2436                         row = last_d[k-1];
2437                         /* op = INSERT; */
2438                }
2439 
2440                /* code common to the three cases */
2441                /* slide down the diagonal */
2442 
2443                col = row+k-ORIGIN;
2444 
2445                if (row>=0)
2446                while ((row < m) && (col < n) && (s1[row]==s2[col]))
2447                          { row++; col++; }
2448 
2449                temp_d[k] = row;
2450 
2451                if ((row == m) && (col == n)) {
2452                    /* hit southeast corner; have the answer */
2453                    free(last_d); free(temp_d);
2454                    free(max_row); free(max_diag);
2455                    *line1 = row+offset1;
2456                    *line2 = col+offset2;
2457 
2458                    return d;
2459                }
2460                if (row == m) {
2461                    /* hit last row; don't look further */
2462                    free(temp_d); free(last_d);
2463                    free(max_row); free(max_diag);
2464 
2465                    *line1 = row+offset1;
2466                    *line2 = col+offset2;
2467 
2468                    return d;
2469                }
2470 
2471                if (col == n) {
2472                    /* hit last column; don't look further */
2473                    free(temp_d); free(last_d);
2474                    free(max_row); free(max_diag);
2475 
2476                    *line1 = row+offset1;
2477                    *line2 = col+offset2;
2478 
2479                    return d;
2480                }
2481           }
2482           max_row[d] = last_d[ORIGIN];
2483           max_diag[d] = ORIGIN;
2484           for (k=lower; k<=upper; ++k)
2485                  if (temp_d[k]>max_row[d]) {
2486                      max_row[d] = temp_d[k];
2487                      max_diag[d] = k;
2488                  }
2489 
2490           for (k=lower; k<=upper; k++) {
2491                last_d[k] = temp_d[k];
2492           }
2493 
2494           --lower;
2495           ++upper;
2496      }
2497 
2498     /* report here the previous maximal match, stored in max_diag and max_row */
2499 
2500      while ((d>0) && (max_row[d]-max_row[d-1]<3))
2501         d--;
2502 
2503      *line1 = max_row[d]+offset1;
2504      *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
2505 
2506      free(max_row);
2507      free(max_diag);
2508      free(last_d);
2509      free(temp_d);
2510 
2511      return d;
2512 
2513 /*
2514      if ((d>2) && (max_row[d-1]-max_row[d-2]<3)) {
2515           *line1 = max_row[d-2]+offset1;
2516           *line2 = max_row[d-2]+max_diag[d-2]-ORIGIN+offset2;
2517 
2518           free(max_row); free(max_diag);
2519           free(last_d); free(temp_d);
2520 
2521           return d-2;
2522      }
2523 
2524      *line1 = max_row[d-1]+offset1;
2525      *line2 = max_row[d-1]+max_diag[d-1]-ORIGIN+offset2;
2526 
2527      free(max_row);
2528      free(max_diag);
2529      free(last_d);
2530      free(temp_d);
2531 
2532      return d-1;
2533 */
2534 }
2535 
merge(Exon ** t0,Exon ** t1)2536 static  void merge(Exon **t0, Exon **t1)
2537 {
2538     Exon  *tmp0, *tmp1;
2539     int    diff;
2540 
2541     if ((*t0) && !(*t0)->to1)
2542          tmp0 = (*t0)->next_exon;
2543     else
2544          tmp0 = *t0;
2545 
2546     while (tmp0 && (tmp0!=*t1)) {
2547        tmp1 = tmp0->next_exon;
2548        assert(tmp1!=NULL);
2549        if (tmp1 && tmp1->to1 && tmp0->to1 &&
2550            (abs((tmp1->from2-tmp1->from1)-(tmp0->to2-tmp0->to1))<=W) &&
2551            ((diff=tmp1->from2-tmp0->to2-1<=W))) {
2552 
2553           /* merge blocks tmp0 and tmp1 */
2554           tmp0->from1 = min(tmp0->from1, tmp1->from1);
2555           tmp0->from2 = min(tmp0->from2, tmp1->from2);
2556           tmp0->to1 = max(tmp1->to1, tmp0->to1);
2557           tmp0->to2 = max(tmp1->to2, tmp0->to2);
2558           tmp0->length = tmp0->to2-tmp0->from2+1;
2559           tmp0->flag = tmp1->flag;
2560           tmp0->edist += tmp1->edist;
2561           tmp0->edist -= P*diff;
2562           if (tmp1==*t1) {
2563           /*  tmp0->flag = (*t1)->flag; */
2564               *t1 = tmp0;
2565           }
2566           tmp0->next_exon = tmp1->next_exon;
2567 
2568           free(tmp1);
2569        } else
2570           tmp0 = tmp0->next_exon;
2571     }
2572 }
2573 
free_align(edit_script_list * aligns)2574 void free_align(edit_script_list *aligns)
2575 {
2576      edit_script_list *head;
2577 
2578      head = aligns;
2579 
2580      while ((head=aligns)!=NULL) {
2581         aligns = aligns->next_script;
2582         Free_script(head->script);
2583         free(head);
2584      }
2585 }
2586 
2587 
free_list(Exon * left)2588 void free_list(Exon *left)
2589 {
2590      Exon *tmp_block;
2591 
2592      while ((tmp_block=left)!=NULL) {
2593           left = left->next_exon;
2594           free(tmp_block);
2595      }
2596 }
2597 
free_table(void)2598 void free_table(void)
2599 {
2600      register struct hash_node *hptr, *tptr;
2601      register int    hval;
2602 
2603      free(pnext_pos);
2604      for (hval=0; hval<HASH_SIZE+1; hval++) {
2605           hptr = phashtab[hval];
2606           while (hptr) {
2607              tptr = hptr;
2608              hptr = hptr->link;
2609              free(tptr);
2610           }
2611      }
2612 }
2613 
bmatch(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2)2614 static Exon *bmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2)
2615 {
2616        int  i, j, i1, score;
2617        Exon *new=NULL;
2618 
2619        for (i1=i=len1-3; i>=len2-3; i--, i1=i) {
2620          for (j=len2-3; j>=2; j--, i1--)
2621            if (*(s1+i1)!=*(s2+j))
2622                break;
2623 
2624          if (j<2) {
2625             /* exact match for CDS found; check signals */
2626             score = 0;
2627             if (*(s1+(i1--))==*(s2+(j--))) score++;
2628             if (*(s1+(i1--))==*(s2+(j--))) score++;
2629             if (*(s1+i1+len2-1)==*(s2+j+len2-1)) score++;
2630             if (*(s1+i1+len2)==*(s2+j+len2)) score++;
2631             if (score>=3) {
2632                 new = new_exon(i1+3+offset1, offset2, i1+3+offset1+len2-5,
2633                                offset2+len2-5, len2-4, 0, 0, NULL);
2634                 new->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
2635 
2636                 return new;
2637             }
2638          }
2639        }
2640        return NULL;
2641 }
2642 
fmatch(uchar * s1,uchar * s2,int len1,int len2,int offset1,int offset2)2643 static Exon *fmatch (uchar *s1, uchar *s2, int len1, int len2, int offset1, int offset2)
2644 {
2645        int  i, j, i1, score;
2646        Exon *new=NULL;
2647 
2648        for (i1=i=2; i<len1-len2+3; i++, i1=i) {
2649          for (j=2; j<len2-2; j++, i1++)
2650            if (*(s1+i1)!=*(s2+j))
2651                break;
2652 
2653          if (j>=len2-2) {
2654             /* exact match found for internal part, look for signals */
2655             score = 0;
2656             if (*(s1+(i1++))==*(s2+(j++))) score++;
2657             if (*(s1+(i1++))==*(s2+(j++))) score++;
2658             if (*(s1+i1-len2)==*s2) score++;
2659             if (*(s1+i1-len2+1)==*(s2+1)) score++;
2660             if (score>=3) {
2661                 new = new_exon(i+offset1,offset2,i1+offset1-2,offset2+len2-5,
2662                                len2-4,0,0,NULL);
2663                 new->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
2664 
2665                 return new;
2666             }
2667          }
2668        }
2669        return NULL;
2670 }
2671 
2672 #ifdef DEBUG
debug_print_exons(Exon * lblock,const char * label)2673 static void debug_print_exons (Exon *lblock, const char *label)
2674 {
2675    Exon *tmp_block = lblock;
2676 
2677    (void)fprintf(stderr,"\n====================%s:\n\n", label);
2678    while (tmp_block) {
2679       (void)fprintf(stderr," [ %d, %d, %d, %d, l: %d ]\n ", tmp_block->from1,
2680                     tmp_block->from2, tmp_block->to1,
2681                     tmp_block->to2, tmp_block->length);
2682       tmp_block = tmp_block->next_exon;
2683   }
2684 }
2685 #endif
2686 
2687 /* -------------------- to be added to psublast ---------------------- */
2688 
seq_toupper(uchar * seq,int len,char * filename)2689 void seq_toupper(uchar *seq, int len, char *filename)
2690 {
2691      int i=0, flag = 0;
2692      uchar *s=seq;
2693 
2694      for (; *s && (i<min(100,len)); i++, s++)
2695        if (islower(*s)) { flag = 1; break; }
2696 
2697      if (flag) {
2698        fprintf(stderr, "Warning: lowercase letter in %s data. Convert sequence to uppercase.\n", filename);
2699        for (s=seq, i=0; *s && (i<len); s++, i++)
2700             *s = toupper(*s);
2701      }
2702 }
2703 
get_sync_flag(Exon * lblock,Exon * rblock,int w)2704 static bool get_sync_flag(Exon *lblock, Exon *rblock, int w)
2705 {
2706      int numx=0, e2;
2707      Exon *t;
2708 
2709      if (((t=lblock->next_exon)==NULL) || !t->to1) return FALSE;
2710      numx++; e2 = t->to2;
2711 
2712      while (((t=t->next_exon)!=NULL) && t->to1) {
2713         ++numx;
2714         if ((t->from2-e2>1) ||
2715             (t!=rblock && ((t->to2-t->from2+1<2*w+2) ||
2716                            (t->to1-t->from1+1<2*w+2)))) return FALSE;
2717         e2 = t->to2;
2718      }
2719      return ((numx<3) ? FALSE:TRUE);
2720 }
2721 
2722 
sync_slide_intron(int in_w,Exon ** lblock,uchar * seq1,uchar * seq2)2723 static void sync_slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2)
2724 {
2725     Exon *t0=NULL, *t1=NULL, *head = *lblock;
2726     splice_t *g=NULL, *c=NULL, *cell=NULL;
2727     splice_t *Glist[500], *Clist[500];
2728     int Gscore=0, Cscore=0;
2729     char oris[500];
2730     int w1, w2, ni, i, numC, numG;
2731 
2732     memset(Glist, 0, 200*sizeof(splice_t *));
2733     memset(Clist, 0, 200*sizeof(splice_t *));
2734 
2735     ni = 0; numG = numC = 0;
2736 
2737     /* assume forward orientation */
2738     t0 = head->next_exon;
2739     while (t0 && (t1=t0->next_exon) && t1->to1) {
2740        g = c = NULL;
2741        if (t1->from2-t0->to2-1==0) {
2742          if (!strncmp((char *)(seq1+t0->to1),"GT",2) &&
2743              !strncmp((char *)(seq1+t1->from1-3),"AG",2)) {
2744               g = new_splice('G',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL);
2745               t0->ori = 'G';
2746               oris[ni] = 'G';
2747               numG++;
2748          } else if (!strncmp((char *)(seq1+t0->to1),"CT",2) &&
2749                   !strncmp((char *)(seq1+t1->from1-3),"AC",2)) {
2750               c = new_splice('C',t0->to1,t1->from1,t0->to2,t1->from2,-1,NULL);
2751               t0->ori = 'C';
2752               oris[ni] = 'C';
2753               numC++;
2754          } else {
2755               w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2756               w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2757               splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2,
2758                      seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH);
2759 
2760               Gscore += g->score; Cscore += c->score;
2761               cell = NULL; oris[ni] = '*';
2762               if (g->score>c->score) {
2763                   numG++; cell = g; oris[ni] = 'G';
2764               } else if (c->score>g->score) {
2765                   numC++; cell = c; oris[ni] = 'C';
2766               } else if (c->score==g->score) {
2767                   numG++; numC++; cell = g;  oris[ni] = 'G';
2768               }
2769               t0->ori = oris[ni];
2770               t0->to1 = cell->xs; t0->to2 = cell->ys;
2771               t1->from1 = cell->xe; t1->from2 = cell->ye;
2772               t0->length = t0->to2-t0->from2+1;
2773               t1->length = t1->to2-t1->from2+1;
2774          }
2775          Clist[ni] = c; Glist[ni] = g;
2776        } else {
2777          t0->ori = 'E'; oris[ni] = 'E';
2778        }
2779        ni++;
2780        t0 = t1;
2781     }
2782 
2783 /*
2784     if (numG==ni) {
2785         for (i=0; i<ni; i++) {
2786              if (Glist[i]) free(Glist[i]);
2787              if (Clist[i]) free(Clist[i]);
2788         }
2789         return;
2790     }
2791 */
2792     if ((numG==1) && (numC==1) &&
2793         (!Glist[0] || !Clist[0] || !Glist[1] || !Clist[1])) goto free_all;
2794 
2795     if (numG>=numC) {
2796         /* revisit all previous assignments that are inconsistent */
2797         for (i=0, t0=head->next_exon; i<ni; i++, t0=t1) {
2798           t1 = t0->next_exon;
2799           switch (oris[i]) {
2800              case 'G': break;
2801              case 'C': if (Glist[i]==NULL) {
2802                           /* compute the values for C */
2803                           w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2804                           w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2805                           splice(seq1, t0->to1-w1, t0->to1+w1,
2806                                  t1->from1-w2, t1->from1+w2, seq2,
2807                                  t0->to2-w1, t1->from2+w2, &g, &c, FWD);
2808                        } else g = Glist[i];
2809 
2810                        t0->ori = 'G';
2811                        t0->to1 = g->xs; t0->to2 = g->ys;
2812                        t1->from1 = g->xe; t1->from2 = g->ye;
2813                        t0->length = t0->to2-t0->from2+1;
2814                        t1->length = t1->to2-t1->from2+1;
2815 
2816                        break;
2817              case 'E': break;
2818              default : fatal("sim4b1.c: intron orientation not initialized.");
2819           }
2820           if (oris[i]!='E') wobble(&t0,&t1,"GT","AG",seq1);
2821         }
2822     } else {
2823         /* analyze all assignments for consistency */
2824         for (i=0, t0=head->next_exon; i<ni; i++, t0=t1) {
2825           t1 = t0->next_exon;
2826           switch (oris[i]) {
2827              case 'C': break;
2828              case 'G': if (Clist[i]==NULL) {
2829                           /* compute the values for C */
2830                           w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2831                           w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2832                           splice(seq1, t0->to1-w1, t0->to1+w1,
2833                                  t1->from1-w2, t1->from1+w2,
2834                                  seq2, t0->to2-w1, t1->from2+w2, &g, &c, BWD);
2835                        } else c = Clist[i];
2836 
2837                        t0->ori = 'C';
2838                        t0->to1 = c->xs; t0->to2 = c->ys;
2839                        t1->from1 = c->xe; t1->from2 = c->ye;
2840                        t0->length = t0->to2-t0->from2+1;
2841                        t1->length = t1->to2-t1->from2+1;
2842                        break;
2843              case 'E': break;
2844              default : fatal("sim4b1.c: intron orientation not initialized.");
2845           }
2846           if (oris[i]!='E') wobble(&t0,&t1,"CT","AC",seq1);
2847         }
2848     }
2849 
2850     /* now free all memory allocated */
2851     free_all:
2852     for (i=0; i<ni; i++) {
2853         if (Glist[i]) free(Glist[i]);
2854         if (Clist[i]) free(Clist[i]);
2855     }
2856     return;
2857 }
2858 
wobble(Exon ** t0,Exon ** t1,const char * donor,const char * acceptor,uchar * seq1)2859 static void wobble(Exon **t0, Exon **t1, const char *donor, const char *acceptor, uchar *seq1)
2860 {
2861     uchar *s = seq1+(*t0)->to1;  /* first nt of donor */
2862     uchar *q = seq1+(*t1)->from1-3;  /* first nt of acceptor */
2863 
2864     if (!strncmp((char *)(s), donor, 2)) {
2865        /* match in place */
2866        if (!strncmp((char *)(q), acceptor, 2)) {
2867           return;
2868        } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2869           (*t1)->from1--; return;
2870        } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2871           (*t1)->from1++; return;
2872        }
2873 
2874     } else if (!strncmp((char *)(s-1), donor, 2)) {
2875        /* match is 1 off to the left */
2876        if (!strncmp((char *)(q), acceptor, 2)) {
2877           (*t0)->to1--; return;
2878        } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2879           (*t0)->to1--; (*t1)->from1--;
2880           (*t0)->to2--; (*t1)->from2--;
2881           (*t0)->length++; (*t1)->length--;
2882           return;
2883        } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2884           (*t0)->to1--; (*t1)->from1++; return;
2885        }
2886 
2887     } else if (!strncmp((char *)(s+1), donor, 2)) {
2888        /* match is 1 off to the right */
2889        if (!strncmp((char *)(q), acceptor, 2)) {
2890           (*t0)->to1++; return;
2891        } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2892           (*t0)->to1++; (*t1)->from1--; return;
2893        } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2894           (*t0)->to1++; (*t1)->from1++;
2895           (*t0)->to2++; (*t1)->from2++;
2896           (*t0)->length--; (*t1)->length++;
2897           return;
2898        }
2899     } else if (!strncmp((char *)(q-1), acceptor, 2)) {
2900        /* match is 1 off to the left */
2901        (*t1)->from1--; return;
2902 
2903     } else if (!strncmp((char *)(q+1), acceptor, 2)) {
2904        /* match is 1 off to the right */
2905           (*t1)->from1++; return;
2906     }
2907 
2908     return;
2909 }
2910 
slide_intron(int in_w,Exon ** lblock,uchar * seq1,uchar * seq2)2911 static void slide_intron(int in_w, Exon **lblock, uchar *seq1, uchar *seq2)
2912 {
2913     Exon *t0, *t1, *head = *lblock;
2914     splice_t *g, *c, *cell;
2915     char type;
2916     int w1, w2;
2917 
2918     t0 = head->next_exon;
2919     while (t0 && (t1=t0->next_exon) && t1->to1) {
2920        g = c = NULL;
2921        if (t1->from2-t0->to2-1==0) {
2922          if (!strncmp((char *)(seq1+t0->to1),"GT",2) &&
2923              !strncmp((char *)(seq1+t1->from1-3),"AG",2))
2924                    t0->ori = 'G';
2925          else if (!strncmp((char *)(seq1+t0->to1),"CT",2) &&
2926                   !strncmp((char *)(seq1+t1->from1-3),"AC",2))
2927                    t0->ori = 'C';
2928          else {
2929            int gtag=0, ctac=0;
2930            uchar *s;
2931 
2932            w1 = min(in_w, min(t0->length-1, t0->to1-t0->from1));
2933            w2 = min(in_w, min(t1->length-1, t1->to1-t1->from1));
2934            splice(seq1, t0->to1-w1, t0->to1+w1, t1->from1-w2, t1->from1+w2,
2935                   seq2, t0->to2-w1, t1->from2+w2, &g, &c, BOTH);
2936            if (g->score>c->score) { cell = g; type = 'G'; }
2937            else if (c->score>g->score) { cell = c; type = 'C'; }
2938            else { cell = g; type = 'G'; }
2939 
2940            t0->to1 = cell->xs; t0->to2 = cell->ys;
2941            t1->from1 = cell->xe; t1->from2 = cell->ye;
2942            t0->length = t0->to2-t0->from2+1;
2943            t1->length = t1->to2-t1->from2+1;
2944 
2945            wobble(&t0,&t1,(type=='G')? "GT":"CT",(type=='G')? "AG":"AC",seq1);
2946 
2947            free(g); free(c);
2948 
2949            /* determine the type, based on the # matches w/ GT-AG (CT-AC) */
2950            s = seq1+t0->to1;
2951            if (*s=='G') gtag++; else if (*s=='C') ctac++;
2952            ++s;
2953            if (*s=='T') { gtag++; ctac++;}
2954            s = seq1+t1->from1-3;
2955            if (*s=='A') { gtag++; ctac++; }
2956            ++s;
2957            if (*s=='G') gtag++; else if (*s=='C') ctac++;
2958            if (gtag>ctac) type = 'G';
2959            else if (ctac>gtag) type = 'C';
2960            else type = 'N';
2961 
2962            t0->ori = type;
2963          }
2964        } else
2965            t0->ori = 'E';
2966        t0 = t1;
2967     }
2968 
2969 }
2970 
2971 #ifdef AUXUTILS
remove_polyA_tails(Exon * lblock,uchar * seq1,uchar * seq2,int len2)2972 static void remove_polyA_tails(Exon *lblock, uchar *seq1, uchar *seq2, int len2)
2973 {
2974     Exon *t, *prev;
2975     uchar *s, *q;
2976     int xcut, diff, u, tmp, I, J, first = 1;
2977 
2978     t = lblock->next_exon; prev = lblock;
2979 
2980     s = seq2; q = seq2+len2-1; while (s<=q && *s=='T') s++;
2981     if (s>=seq2+t->from2-1) {
2982         while (t && t->to1) {
2983            s = seq2+t->from2-1; q = seq2+t->to2-1;
2984            if (first && strncmp((char *)s,"TTTTT",5)) break;
2985            first = 0;
2986            while ((s<=q) && (*s=='T')) s++;
2987            diff = t->to2-(s-seq2); u = min(diff,12);
2988            xcut = t->to2-t->from2+1-diff;
2989            if (diff>6) {
2990                tmp = (int)((1+P)*u); /* was diff */
2991 /*
2992                EXTEND_BW(s,seq1+t->to1-(diff-u)-tmp,u,tmp,
2993                          s-seq2,t->to1-(diff-u)-tmp,&I,&J);
2994 */
2995                EXTEND_BW(seq2+t->from2+xcut-1,
2996                          seq1+t->from1+(int)((1-P)*xcut)-1,
2997                          u,(int)(P*xcut+(1+P)*u)+1, /* 1 is for looser margin */
2998                          t->from2+xcut-1,t->from1+(int)((1-P)*xcut)-1,
2999                          &I,&J);
3000                t->from2 = I+1; t->from1 = J+1;
3001                t->length = t->to2-t->from2+1;
3002                break;
3003            } else if (diff>0) {
3004                prev->next_exon = t->next_exon; free(t); t = prev->next_exon;
3005 
3006                tmp = (int)((1+P)*diff);
3007                EXTEND_BW(s,seq1+t->from1-tmp,diff,tmp,
3008                          s-seq2,t->from1-tmp,&I,&J);
3009                t->from2 = I+1; t->from1 = J+1; t->length = t->to2-t->from2+1;
3010                break;
3011            } else {
3012                /* remove entire exon and repeat the process */
3013                prev->next_exon = t->next_exon; free(t); t = prev->next_exon;
3014                continue;
3015            }
3016         }
3017     }
3018 
3019     while (t && t->next_exon && t->next_exon->to1) {
3020        prev = t; t = t->next_exon;
3021     }
3022 
3023     first = 1;
3024     s = seq2; q = seq2+len2-1; while (q>=s && *q=='A') q--;
3025     if (t && t->to1 && q<seq2+t->to2-1) {
3026         while (t && t->to1) {
3027            s = seq2+t->to2-1; q = seq2+t->from2-1;
3028            if (first && strncmp((char *)(s-4), "AAAAA", 5)) break;
3029            first = 0;
3030            while ((s>=q) && (*s=='A')) s--;
3031            diff = (int)(s-seq2+1)-t->from2+1; u = min(diff, 12);
3032            xcut = t->to2-t->from2+1-diff;
3033            if (diff>6) {
3034 /*
3035                EXTEND_FW(seq2+t->from2+(diff-u)-1, seq1+t->from1+(diff-u)-1,
3036                          u, (int)((1+P)*u), t->from2+(diff-u)-1,
3037                          t->from1+(diff-u)-1, &I, &J);
3038 
3039 */
3040                EXTEND_FW(s-u+1, seq1+t->to1-(int)((1+P)*xcut)-(int)((1+P)*u),
3041                          u,(int)((1+P)*u+P*xcut)+1, /*+1 is for looser margin */
3042                          t->from2+(diff-u)-1,
3043                          t->to1-(int)((1+P)*xcut)-(int)((1+P)*u), &I, &J);
3044 
3045                t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1;
3046                break;
3047            } else if (diff>0) {
3048                if (prev==NULL) prev = find_previous(lblock, t);
3049                assert(prev!=NULL);
3050                prev->next_exon = t->next_exon; free(t);
3051                t = prev->next_exon; prev = NULL;
3052                EXTEND_FW(seq2+t->to2, seq1+t->to1, diff, (int)((1+P)*diff),
3053                          t->to2, t->to1, &I, &J);
3054                t->to1 = J; t->to2 = I; t->length = t->to2-t->from2+1;
3055                break;
3056            } else {
3057                if (prev==NULL) prev = find_previous(lblock, t);
3058                assert(prev!=NULL);
3059                prev->next_exon = t->next_exon; free(t);
3060                t = prev->next_exon; prev = NULL;
3061                continue;
3062            }
3063        }
3064     }
3065 
3066     return;
3067 }
3068 #endif
3069 
find_previous(Exon * lblock,Exon * tgt)3070 Exon *find_previous(Exon *lblock, Exon *tgt)
3071 {
3072     Exon *t=lblock;
3073 
3074     while (t && (t->next_exon!=tgt)) t = t->next_exon;
3075     if (t==NULL)
3076         fatal("sim4b1.c: Corrupted exon list: could not find previous.");
3077 
3078     return t;
3079 }
3080 
get_match_quality(Exon * lblock,Exon * rblock,sim4_stats_t * st,int N)3081 static bool get_match_quality(Exon *lblock, Exon *rblock, sim4_stats_t *st, int N)
3082 {
3083   int  tcov;
3084   bool good_match;
3085   Exon *t;
3086 
3087   good_match = TRUE; st->icoverage = 0;
3088   t = lblock->next_exon;
3089   while (t->to1) {
3090      st->icoverage += t->to2-t->from2+1;
3091      if (100*t->edist>=5*(t->to2-t->from2+1)) {
3092          good_match = FALSE; break;
3093      }
3094      t = t->next_exon;
3095   }
3096   tcov = rblock->to2-lblock->next_exon->from2+1;
3097   if (lblock->next_exon->from2>=.5*N &&
3098       tcov>=.8*(N-lblock->next_exon->from2) &&
3099       st->icoverage>=max(.95*tcov,100))
3100          ;
3101   else if (rblock->to2<=.5*N && tcov>=.8*rblock->to2 &&
3102            st->icoverage>=max(.95*tcov,100))
3103          ;
3104   else if ((tcov<.8*N) || (st->icoverage<.9*tcov))
3105                  good_match = FALSE;
3106 
3107   return good_match;
3108 }
3109