1 static char rcsid[] = "$Id: chimera.c 215899 2018-06-29 21:30:57Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "chimera.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h>		/* For sqrt */
10 #include <string.h>		/* For memset */
11 #include "mem.h"
12 #include "genomicpos.h"
13 #include "types.h"
14 #include "maxent.h"
15 #include "intron.h"
16 #include "comp.h"
17 #include "complement.h"
18 
19 
20 #define GBUFFERLEN 1024
21 
22 /* Chimera assembly/bestpath matrix */
23 #ifdef DEBUG
24 #define debug(x) x
25 #else
26 #define debug(x)
27 #endif
28 
29 /* Chimera detection */
30 #ifdef DEBUG1
31 #define debug1(x) x
32 #else
33 #define debug1(x)
34 #endif
35 
36 /* Exon-exon boundary detection */
37 #ifdef DEBUG2
38 #define debug2(x) x
39 #else
40 #define debug2(x)
41 #endif
42 
43 /* local_join_p and distant_join_p */
44 #ifdef DEBUG3
45 #define debug3(x) x
46 #else
47 #define debug3(x)
48 #endif
49 
50 /* Chimera_bestpath */
51 #ifdef DEBUG4
52 #define debug4(x) x
53 #else
54 #define debug4(x)
55 #endif
56 
57 
58 #define T Chimera_T
59 struct T {
60   Stage3_T from;
61   Stage3_T to;
62 
63   int chimerapos;
64   int equivpos;
65 
66   int cdna_direction;
67   int exonexonpos;
68 
69   char donor1;
70   char donor2;
71   char acceptor2;
72   char acceptor1;
73 
74   bool donor_watsonp;
75   bool acceptor_watsonp;
76 
77   double donor_prob;
78   double acceptor_prob;
79 };
80 
81 
82 Stage3_T
Chimera_left_part(T this)83 Chimera_left_part (T this) {
84   return this->from;
85 }
86 
87 Stage3_T
Chimera_right_part(T this)88 Chimera_right_part (T this) {
89   return this->to;
90 }
91 
92 
93 int
Chimera_pos(T this)94 Chimera_pos (T this) {
95   return this->chimerapos;
96 }
97 
98 int
Chimera_equivpos(T this)99 Chimera_equivpos (T this) {
100   return this->equivpos;
101 }
102 
103 int
Chimera_cdna_direction(T this)104 Chimera_cdna_direction (T this) {
105   return this->cdna_direction;
106 }
107 
108 
109 void
Chimera_print_sam_tag(Filestring_T fp,T this,Univ_IIT_T chromosome_iit)110 Chimera_print_sam_tag (Filestring_T fp, T this, Univ_IIT_T chromosome_iit) {
111   char donor_strand, acceptor_strand;
112   char *donor_chr, *acceptor_chr;
113   bool alloc1p, alloc2p;
114 
115   if (this->cdna_direction >= 0) {
116     if (this->donor_watsonp == true) {
117       donor_strand = '+';
118     } else {
119       donor_strand = '-';
120     }
121     if (this->acceptor_watsonp == true) {
122       acceptor_strand = '+';
123     } else {
124       acceptor_strand = '-';
125     }
126   } else {
127     if (this->donor_watsonp == true) {
128       donor_strand = '-';
129     } else {
130       donor_strand = '+';
131     }
132     if (this->acceptor_watsonp == true) {
133       acceptor_strand = '-';
134     } else {
135       acceptor_strand = '+';
136     }
137   }
138 
139   FPRINTF(fp,"%c%c-%c%c,%.2f,%.2f",
140 	  this->donor1,this->donor2,this->acceptor2,this->acceptor1,this->donor_prob,this->acceptor_prob);
141   donor_chr = Univ_IIT_label(chromosome_iit,Stage3_chrnum(this->from),&alloc1p);
142   acceptor_chr = Univ_IIT_label(chromosome_iit,Stage3_chrnum(this->to),&alloc2p);
143   FPRINTF(fp,",%c%s@%u..%c%s@%u",
144 	  donor_strand,donor_chr,Stage3_chrend(this->from),
145 	  acceptor_strand,acceptor_chr,Stage3_chrstart(this->to));
146   FPRINTF(fp,",%d..%d",this->chimerapos+1,this->equivpos+1);
147   if (alloc2p == true) {
148     FREE(acceptor_chr);
149   }
150   if (alloc1p == true) {
151     FREE(donor_chr);
152   }
153 
154   return;
155 }
156 
157 
158 double
Chimera_donor_prob(T this)159 Chimera_donor_prob (T this) {
160   if (this->exonexonpos < 0) {
161     return 0.0;
162   } else {
163     return this->donor_prob;
164   }
165 }
166 
167 double
Chimera_acceptor_prob(T this)168 Chimera_acceptor_prob (T this) {
169   if (this->exonexonpos < 0) {
170     return 0.0;
171   } else {
172     return this->acceptor_prob;
173   }
174 }
175 
176 
177 T
Chimera_new(Stage3_T from,Stage3_T to,int chimerapos,int chimeraequivpos,int exonexonpos,int cdna_direction,char donor1,char donor2,char acceptor2,char acceptor1,bool donor_watsonp,bool acceptor_watsonp,double donor_prob,double acceptor_prob)178 Chimera_new (Stage3_T from, Stage3_T to, int chimerapos, int chimeraequivpos,
179 	     int exonexonpos, int cdna_direction,
180 	     char donor1, char donor2, char acceptor2, char acceptor1,
181 	     bool donor_watsonp, bool acceptor_watsonp,
182 	     double donor_prob, double acceptor_prob) {
183   T new = (T) MALLOC(sizeof(*new));
184 
185   new->from = from;
186   new->to = to;
187 
188   new->chimerapos = chimerapos;
189   new->equivpos = chimeraequivpos;
190   new->exonexonpos = exonexonpos;
191   new->cdna_direction = cdna_direction;
192   new->donor1 = donor1;
193   new->donor2 = donor2;
194   new->acceptor2 = acceptor2;
195   new->acceptor1 = acceptor1;
196   new->donor_watsonp = donor_watsonp;
197   new->acceptor_watsonp = acceptor_watsonp;
198   new->donor_prob = donor_prob;
199   new->acceptor_prob = acceptor_prob;
200 
201   return new;
202 }
203 
204 void
Chimera_free(T * old)205 Chimera_free (T *old) {
206   FREE(*old);
207   return;
208 }
209 
210 
211 void
Chimera_print(Filestring_T fp,T this)212 Chimera_print (Filestring_T fp, T this) {
213   if (this->exonexonpos > 0) {
214     FPRINTF(fp," *** Possible chimera with exon-exon boundary");
215     if (this->cdna_direction > 0) {
216       FPRINTF(fp," (sense)");
217     } else if (this->cdna_direction < 0) {
218       FPRINTF(fp," (antisense)");
219     }
220     FPRINTF(fp," at %d (dinucl = %c%c-%c%c, donor_prob = %.3f, acceptor_prob = %.3f)",
221 	    this->exonexonpos+1,this->donor1,this->donor2,this->acceptor2,this->acceptor1,
222 	    this->donor_prob,this->acceptor_prob);
223   } else if (this->equivpos == this->chimerapos) {
224     FPRINTF(fp," *** Possible chimera with breakpoint at %d",this->chimerapos+1);
225   } else {
226     FPRINTF(fp," *** Possible chimera with breakpoint at %d..%d",this->chimerapos+1,this->equivpos+1);
227   }
228 
229   return;
230 }
231 
232 
233 #define NPSEUDO 10.0
234 
235 int
Chimera_alignment_break(int * newstart,int * newend,Stage3_T stage3,int queryntlength,double fthreshold)236 Chimera_alignment_break (int *newstart, int *newend, Stage3_T stage3, int queryntlength, double fthreshold) {
237   int breakpoint;
238   int *matchscores;
239 
240   /* x signifies nmatches, y signifies nmismatches, x + y = n */
241   int start, end, pos, x = 0, y, n, x_left, y_left, x_right, y_right, n_left, n_right;
242   double theta, x_pseudo, theta_left, theta_right, rss, rss_left, rss_right, rss_sep;
243   double fscore, min_rss_sep, best_pos = -1, best_theta_left, best_theta_right;
244 
245   /*
246   start = Sequence_trim_start(queryseq);
247   end = Sequence_trim_end(queryseq);
248   */
249 
250   start = Stage3_querystart(stage3);
251   end = Stage3_queryend(stage3);
252 
253   if (queryntlength < MAX_QUERYLENGTH_STACK) {
254     matchscores = (int *) CALLOCA(queryntlength,sizeof(int));
255   } else {
256     matchscores = (int *) CALLOC(queryntlength,sizeof(int));
257   }
258   Pair_matchscores(matchscores,Stage3_pairarray(stage3),Stage3_npairs(stage3));
259 
260   x = 0;
261   for (pos = start; pos < end; pos++) {
262     x += matchscores[pos];
263   }
264   n = end - start;
265   y = n - x;
266 
267   /* when rss_sep == rss, fscore == 0 */
268   min_rss_sep = rss = (double) x * (double) y/(double) n;
269   if (rss == 0.0) {
270     if (queryntlength < MAX_QUERYLENGTH_STACK) {
271       FREEA(matchscores);
272     } else {
273       FREE(matchscores);
274     }
275     return 0;
276   }
277 
278   theta = (double) x/(double) n;
279   x_pseudo = NPSEUDO * theta;
280   debug1(printf("%d %d %d %f\n",x,y,n,theta));
281 
282   x_left = y_left = n_left = 0;
283   x_right = x;
284   y_right = y;
285   n_right = n;
286 
287   debug1(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
288 		"pos","match","x.left","y.left","n.left","x.right","y.right","n.right",
289 		"theta.left","theta.right","rss.left","rss.right","fscore"));
290 
291   for (pos = start; pos < end-1; pos++) {
292     if (matchscores[pos] == 1) {
293       x_left++;
294       x_right--;
295     } else {
296       y_left++;
297       y_right--;
298     }
299     n_left++;
300     n_right--;
301 
302     theta_left = ((double) x_left + x_pseudo)/((double) n_left + NPSEUDO);
303     theta_right = ((double) x_right + x_pseudo)/((double) n_right + NPSEUDO);
304     rss_left = x_left*(1.0-theta_left)*(1.0-theta_left) + y_left*theta_left*theta_left;
305     rss_right = x_right*(1.0-theta_right)*(1.0-theta_right) + y_right*theta_right*theta_right;
306     rss_sep = rss_left + rss_right;
307 
308     if (rss_sep == 0) {
309       debug1(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
310 		    pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
311 		    theta_left,theta_right,rss_left,rss_right));
312     } else {
313       debug1(
314 	     fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
315 	     printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
316 		    pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
317 		    theta_left,theta_right,rss_left,rss_right,fscore);
318 	     );
319 
320       /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
321 	 1) is maximized when rss_sep is minimized */
322 
323       if (rss_sep < min_rss_sep) {
324 	min_rss_sep = rss_sep;
325 	best_pos = pos;
326 	best_theta_left = theta_left;
327 	best_theta_right = theta_right;
328       }
329     }
330   }
331   if (queryntlength < MAX_QUERYLENGTH_STACK) {
332     FREEA(matchscores);
333   } else {
334     FREE(matchscores);
335   }
336 
337   fscore = ((double) (n - 2))*(rss - min_rss_sep)/min_rss_sep;
338   if (fscore < fthreshold) {
339     return 0;
340   } else {
341     breakpoint = best_pos;
342     debug1(printf("at %d, fscore = %f\n",breakpoint,fscore));
343     if (best_theta_left < best_theta_right) {
344       /* trim left */
345       *newstart = breakpoint;
346       *newend = end;
347       return breakpoint - start;
348     } else {
349       /* trim right */
350       *newstart = start;
351       *newend = breakpoint;
352       return end - breakpoint;
353     }
354   }
355 }
356 
357 
358 bool
Chimera_local_join_p(Stage3_T from,Stage3_T to,int chimera_slop)359 Chimera_local_join_p (Stage3_T from, Stage3_T to, int chimera_slop) {
360   debug3(printf("? local_join_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
361 		from,Stage3_querystart(from),Stage3_queryend(from),
362 		Stage3_chrstart(from),Stage3_chrend(from),
363 		to,Stage3_querystart(to),Stage3_queryend(to),
364 		Stage3_chrstart(to),Stage3_chrend(to)));
365 
366   if (Stage3_npairs(from) == 0) {
367     return false;
368 
369   } else if (Stage3_npairs(to) == 0) {
370     return false;
371 
372   } else if (Stage3_chimera_right_p(from) == true) {
373     debug3(printf("false, because from is already part of a chimera on its right\n"));
374     return false;
375 
376   } else if (Stage3_chimera_left_p(to) == true) {
377     debug3(printf("false, because to is already part of a chimera on its left\n"));
378     return false;
379 
380   } else if (Stage3_chrnum(from) != Stage3_chrnum(to)) {
381     debug3(printf("false, because different chromosomes\n"));
382     return false;
383 
384   } else if (Stage3_watsonp(from) != Stage3_watsonp(to)) {
385     debug3(printf("false, because different strands\n"));
386     return false;
387 
388   } else if (Stage3_querystart(from) >= Stage3_querystart(to) &&
389 	     Stage3_queryend(from) <= Stage3_queryend(to)) {
390     debug3(printf("false, because from %d..%d is subsumed by to %d..%d\n",
391 		  Stage3_querystart(from),Stage3_queryend(from),
392 		  Stage3_querystart(to),Stage3_queryend(to)));
393     return false;
394 
395   } else if (Stage3_querystart(to) >= Stage3_querystart(from) &&
396 	     Stage3_queryend(to) <= Stage3_queryend(from)) {
397     debug3(printf("false, because to %d..%d is subsumed by from %d..%d\n",
398 		  Stage3_querystart(to),Stage3_queryend(to),
399 		  Stage3_querystart(from),Stage3_queryend(from)));
400     return false;
401 
402   } else if (Stage3_queryend(from) - Stage3_querystart(to) > chimera_slop ||
403 	     Stage3_querystart(to) - Stage3_queryend(from) > chimera_slop) {
404     debug3(printf("false, because %d - %d > chimera_slop %d or %d - %d > chimera_slop %d\n",
405 		  Stage3_queryend(from),Stage3_querystart(to),chimera_slop,
406 		  Stage3_querystart(to),Stage3_queryend(from),chimera_slop));
407     return false;
408 
409   } else {
410     if (Stage3_watsonp(from) == true) {
411       if (Stage3_genomicend(from) > Stage3_genomicstart(to) + chimera_slop) {
412 	debug3(printf("false, because genomic %u > %u + %d\n",
413 		      Stage3_genomicend(from),Stage3_genomicstart(to),chimera_slop));
414 	return false;
415       } else {
416 	return true;
417       }
418 
419     } else {
420       if (Stage3_genomicend(from) + chimera_slop < Stage3_genomicstart(to)) {
421 	debug3(printf("false, because genomic %u + %d < %u\n",
422 		      Stage3_genomicend(from),chimera_slop,Stage3_genomicstart(to)));
423 	return false;
424       } else {
425 	return true;
426       }
427 
428     }
429 
430   }
431 }
432 
433 
434 bool
Chimera_distant_join_p(Stage3_T from,Stage3_T to,int chimera_slop)435 Chimera_distant_join_p (Stage3_T from, Stage3_T to, int chimera_slop) {
436   debug3(printf("? chimeric_join_p from %d..%d (%u..%u) -> to %d..%d (%u..%u) => ",
437 		Stage3_querystart(from),Stage3_queryend(from),
438 		Stage3_chrstart(from),Stage3_chrend(from),
439 		Stage3_querystart(to),Stage3_queryend(to),
440 		Stage3_chrstart(to),Stage3_chrend(to)));
441 
442   if (Stage3_chimera_right_p(from) == true) {
443     debug3(printf("false, because from is already part of a chimera on its right\n"));
444     return false;
445 
446   } else if (Stage3_chimera_left_p(to) == true) {
447     debug3(printf("false, because to is already part of a chimera on its left\n"));
448     return false;
449 
450   } else if (Stage3_querystart(from) >= Stage3_querystart(to) &&
451       Stage3_queryend(from) <= Stage3_queryend(to)) {
452     debug3(printf("false, because from %d..%d is subsumed by to %d..%d\n",
453 		  Stage3_querystart(from),Stage3_queryend(from),
454 		  Stage3_querystart(to),Stage3_queryend(to)));
455     return false;
456   } else if (Stage3_querystart(to) >= Stage3_querystart(from) &&
457 	     Stage3_queryend(to) <= Stage3_queryend(from)) {
458     debug3(printf("false, because to %d..%d is subsumed by from %d..%d\n",
459 		  Stage3_querystart(to),Stage3_queryend(to),
460 		  Stage3_querystart(from),Stage3_queryend(from)));
461     return false;
462   } else if (Stage3_queryend(from) - Stage3_querystart(to) <= chimera_slop &&
463 	     Stage3_querystart(to) - Stage3_queryend(from) <= chimera_slop) {
464     debug3(printf("true, because %d - %d <= %d and %d - %d <= %d\n",
465 		  Stage3_queryend(from),Stage3_querystart(to),chimera_slop,
466 		  Stage3_querystart(to),Stage3_queryend(from),chimera_slop));
467     return true;
468   } else {
469     debug3(printf(" %d and %d not within chimera_slop %d",
470 		  Stage3_queryend(from) - Stage3_querystart(to),Stage3_querystart(to) - Stage3_queryend(from),
471 		  chimera_slop));
472     debug3(printf("false\n"));
473     return false;
474   }
475 }
476 
477 
478 #define NEG_INFINITY -1000000
479 #define PRE_EXTENSION_SLOP 6
480 
481 bool
Chimera_bestpath(int * five_score,int * three_score,int * chimerapos,int * chimeraequivpos,int * bestfrom,int * bestto,Stage3_T * stage3array_sub1,int npaths_sub1,Stage3_T * stage3array_sub2,int npaths_sub2,int queryntlength,int chimera_slop,bool * circularp,bool localp)482 Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chimeraequivpos, int *bestfrom, int *bestto,
483 		  Stage3_T *stage3array_sub1, int npaths_sub1, Stage3_T *stage3array_sub2, int npaths_sub2,
484 		  int queryntlength, int chimera_slop, bool *circularp, bool localp) {
485   int **matrix_sub1, **matrix_sub2, *from, *to, *bestscoreatpos, i, j, pos, score,
486     bestscore = NEG_INFINITY;
487   bool **gapp_sub1, **gapp_sub2;
488   bool foundp = false;
489 
490   debug4(printf("Chimera_bestpath called\n"));
491 
492   from = (int *) CALLOC(queryntlength,sizeof(int));
493   to = (int *) CALLOC(queryntlength,sizeof(int));
494   bestscoreatpos = (int *) CALLOC(queryntlength,sizeof(int));
495 
496   matrix_sub1 = (int **) CALLOC(npaths_sub1,sizeof(int *));
497   gapp_sub1 = (bool **) CALLOC(npaths_sub1,sizeof(bool *));
498   debug4(printf("sub1:"));
499   for (i = 0; i < npaths_sub1; i++) {
500     debug4(printf(" %p",stage3array_sub1[i]));
501     matrix_sub1[i] = (int *) CALLOC(queryntlength,sizeof(int));
502     gapp_sub1[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
503     debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub1[i]),Stage3_npairs(stage3array_sub1[i]),true));
504     /* Allow pre_extension_slop, in case the parts need extensions to merge */
505     Pair_pathscores(gapp_sub1[i],matrix_sub1[i],Stage3_pairarray(stage3array_sub1[i]),
506 		    Stage3_npairs(stage3array_sub1[i]),Stage3_cdna_direction(stage3array_sub1[i]),
507 		    queryntlength,FIVE,PRE_EXTENSION_SLOP);
508   }
509   debug4(printf("\n"));
510 
511   debug4(printf("sub2:"));
512   matrix_sub2 = (int **) CALLOC(npaths_sub2,sizeof(int *));
513   gapp_sub2 = (bool **) CALLOC(npaths_sub2,sizeof(bool *));
514   for (i = 0; i < npaths_sub2; i++) {
515     debug4(printf(" %p",stage3array_sub2[i]));
516     matrix_sub2[i] = (int *) CALLOC(queryntlength,sizeof(int));
517     gapp_sub2[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
518     debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub2[i]),Stage3_npairs(stage3array_sub2[i]),true));
519     /* Allow pre_extension_slop, in case the parts need extensions to merge */
520     Pair_pathscores(gapp_sub2[i],matrix_sub2[i],Stage3_pairarray(stage3array_sub2[i]),
521 		    Stage3_npairs(stage3array_sub2[i]),Stage3_cdna_direction(stage3array_sub2[i]),
522 		    queryntlength,THREE,PRE_EXTENSION_SLOP);
523   }
524   debug4(printf("\n"));
525 
526   for (pos = 0; pos < queryntlength; pos++) {
527     bestscoreatpos[pos] = NEG_INFINITY;
528   }
529   debug4(printf("npaths_sub1 = %d, npaths_sub2 = %d\n",npaths_sub1,npaths_sub2));
530   for (i = 0; i < npaths_sub1; i++) {
531     if (circularp != NULL && circularp[Stage3_chrnum(stage3array_sub1[i])] == true) {
532       /* Do not make chimeras to circular chromosomes */
533     } else {
534       for (j = 0; j < npaths_sub2; j++) {
535 	if (circularp != NULL && circularp[Stage3_chrnum(stage3array_sub2[j])] == true) {
536 	  /* Do not make chimeras to circular chromosomes */
537 	} else if (stage3array_sub1[i] == stage3array_sub2[j]) {
538 	  /* Same stage3 object, so not joinable */
539 	} else if (localp == true && Chimera_local_join_p(stage3array_sub1[i],stage3array_sub2[j],chimera_slop) == false) {
540 	  /* Not joinable */
541 	} else {
542 	  for (pos = 0; pos < queryntlength - 1; pos++) {
543 	    debug4(printf("pos %d, gapp %d and %d\n",pos,gapp_sub1[i][pos],gapp_sub2[j][pos]));
544 	    if (gapp_sub1[i][pos] == false && gapp_sub2[j][pos+1] == false) {
545 #if 0
546 	      score = matrix_sub2[j][queryntlength-1] - matrix_sub2[j][pos] + matrix_sub1[i][pos] /* - 0 */;
547 #else
548 	      /* For new Pair_pairscores computation */
549 	      score = matrix_sub1[i][pos] + matrix_sub2[j][pos];
550 #endif
551 	      debug4(printf("score %d\n",score));
552 	      if (score > bestscoreatpos[pos]) {
553 		bestscoreatpos[pos] = score;
554 		from[pos] = i;
555 		to[pos] = j;
556 	      }
557 	    }
558 	  }
559 	}
560       }
561     }
562   }
563 
564   for (pos = 0; pos < queryntlength - 1; pos++) {
565     if (bestscoreatpos[pos] > bestscore) {
566       bestscore = bestscoreatpos[pos];
567       *chimerapos = *chimeraequivpos = pos;
568       *bestfrom = from[pos];
569       *bestto = to[pos];
570       foundp = true;
571     } else if (bestscoreatpos[pos] == bestscore) {
572       *chimeraequivpos = pos;
573     }
574   }
575 
576   if (foundp == true) {
577     *five_score = matrix_sub1[*bestfrom][*chimerapos] /* - 0 */;
578 #if 0
579     *three_score = matrix_sub2[*bestto][queryntlength-1] - matrix_sub2[*bestto][*chimerapos];
580 #else
581     *three_score = matrix_sub2[*bestto][*chimerapos];
582 #endif
583 
584     debug4(
585 	  for (pos = 0; pos < queryntlength - 1; pos++) {
586 	    printf("%d:",pos);
587 	    for (i = 0; i < npaths_sub1; i++) {
588 	      printf("\t%d",matrix_sub1[i][pos]);
589 	      if (gapp_sub1[i][pos] == true) {
590 		printf("X");
591 	      }
592 	    }
593 	    printf("\t|");
594 	    for (i = 0; i < npaths_sub2; i++) {
595 	      printf("\t%d",matrix_sub2[i][pos]);
596 	      if (gapp_sub2[i][pos] == true) {
597 		printf("X");
598 	      }
599 	    }
600 	    printf("\t||");
601 	    printf("%d (%d->%d)",bestscoreatpos[pos],from[pos],to[pos]);
602 	    if (pos >= *chimerapos && pos <= *chimeraequivpos) {
603 	      printf(" ** ");
604 	    }
605 	    printf("\n");
606 	  }
607 	  printf("From path %d to path %d at pos %d..%d.  5 score = %d, 3 score = %d\n",
608 		 *bestfrom,*bestto,*chimerapos,*chimeraequivpos,*five_score,*three_score);
609 	  fflush(stdout);
610 	  );
611   }
612 
613   for (i = 0; i < npaths_sub2; i++) {
614     FREE(gapp_sub2[i]);
615     FREE(matrix_sub2[i]);
616   }
617   FREE(gapp_sub2);
618   FREE(matrix_sub2);
619 
620   for (i = 0; i < npaths_sub1; i++) {
621     FREE(gapp_sub1[i]);
622     FREE(matrix_sub1[i]);
623   }
624   FREE(gapp_sub1);
625   FREE(matrix_sub1);
626 
627   FREE(bestscoreatpos);
628   FREE(to);
629   FREE(from);
630 
631   debug4(printf("Chimera_bestpath returning %d\n",foundp));
632   return foundp;
633 }
634 
635 static char *complCode = COMPLEMENT_UC;
636 
637 /* Modeled after Chimera_bestpath */
638 /* Called if Chimera_find_exonexon fails */
639 int
Chimera_find_breakpoint(int * chimeraequivpos,int * rangelow,int * rangehigh,char * donor1,char * donor2,char * acceptor2,char * acceptor1,Stage3_T left_part,Stage3_T right_part,int queryntlength,Genome_T genome,Chrpos_T left_chrlength,Chrpos_T right_chrlength)640 Chimera_find_breakpoint (int *chimeraequivpos, int *rangelow, int *rangehigh,
641 			 char *donor1, char *donor2, char *acceptor2, char *acceptor1,
642 			 Stage3_T left_part, Stage3_T right_part, int queryntlength, Genome_T genome,
643 			 Chrpos_T left_chrlength, Chrpos_T right_chrlength) {
644   int chimerapos = 0, breakpoint;
645   int *matrix_sub1, *matrix_sub2, pos, score, bestscore, secondbest;
646   bool *gapp_sub1, *gapp_sub2;
647   Univcoord_T left;
648 
649   if (Stage3_npairs(left_part) == 0 || Stage3_npairs(right_part) == 0) {
650     return -1;
651   }
652 
653   /* Don't allow pre_extension_slop here, because the ends have already been extended */
654   matrix_sub1 = (int *) CALLOC(queryntlength,sizeof(int));
655   gapp_sub1 = (bool *) CALLOC(queryntlength,sizeof(bool));
656   debug4(Pair_dump_array(Stage3_pairarray(left_part),Stage3_npairs(left_part),true));
657   Pair_pathscores(gapp_sub1,matrix_sub1,Stage3_pairarray(left_part),Stage3_npairs(left_part),
658 		  Stage3_cdna_direction(left_part),queryntlength,FIVE,/*pre_extension_slop*/0);
659 
660   matrix_sub2 = (int *) CALLOC(queryntlength,sizeof(int));
661   gapp_sub2 = (bool *) CALLOC(queryntlength,sizeof(bool));
662   debug4(Pair_dump_array(Stage3_pairarray(right_part),Stage3_npairs(right_part),true));
663   Pair_pathscores(gapp_sub2,matrix_sub2,Stage3_pairarray(right_part),Stage3_npairs(right_part),
664 		  Stage3_cdna_direction(right_part),queryntlength,THREE,/*pre_extension_slop*/0);
665 
666 
667   bestscore = secondbest = -100000;
668   for (pos = 0; pos < queryntlength - 1; pos++) {
669     debug(
670 	  printf("%d:",pos);
671 	  printf("\t%d",matrix_sub1[pos]);
672 	  if (gapp_sub1[pos] == true) {
673 	    printf("X");
674 	  }
675 	  printf("\t|");
676 	  printf("\t%d",matrix_sub2[pos]);
677 	  if (gapp_sub2[pos] == true) {
678 	    printf("X");
679 	  }
680 	  printf("\t||");
681 	  );
682 
683     if (gapp_sub1[pos] == false) {
684       if (gapp_sub2[pos+1] == false) {
685 	/* Check for the same stage3 object on both lists */
686 #if 0
687 	/* ? Old formula for use before Pair_pathscores had cdnaend argument */
688 	score = matrix_sub2[queryntlength-1] - matrix_sub2[pos] + matrix_sub1[pos] /* - 0 */;
689 #else
690 	score = matrix_sub1[pos] + matrix_sub2[pos+1];
691 #endif
692 
693 	if (score > bestscore) {
694 	  secondbest = bestscore;
695 	  bestscore = score;
696 	  chimerapos = *chimeraequivpos = pos;
697 	} else if (score == bestscore) {
698 	  *chimeraequivpos = pos;
699 	} else if (score > secondbest) {
700 	  secondbest = score;
701 	}
702 
703 	debug(
704 	      printf("%d = %d + %d",score,matrix_sub1[pos],matrix_sub2[pos+1]);
705 	      if (pos >= chimerapos && pos <= *chimeraequivpos) {
706 		printf(" ** chimerapos %d, chimeraequivpos %d",chimerapos,*chimeraequivpos);
707 	      }
708 	      );
709 
710       }
711     }
712     debug(printf("\n"));
713   }
714   debug(printf("chimerapos %d, chimeraequivpos %d\n",chimerapos,*chimeraequivpos));
715 
716 
717   /* Use secondbest to find a range for exon-exon searching */
718   *rangelow = *rangehigh = 0;
719   for (pos = 0; pos < queryntlength - 1; pos++) {
720     if (gapp_sub1[pos] == false) {
721       if (gapp_sub2[pos+1] == false) {
722 	/* Check for the same stage3 object on both lists */
723 #if 0
724 	/* ? Old formula for use before Pair_pathscores had cdnaend argument */
725 	score = matrix_sub2[queryntlength-1] - matrix_sub2[pos] + matrix_sub1[pos] /* - 0 */;
726 #else
727 	score = matrix_sub1[pos] + matrix_sub2[pos+1];
728 #endif
729 
730 	if (score == secondbest) {
731 	  if (*rangelow == 0) {
732 	    *rangelow = *rangehigh = pos;
733 	  } else {
734 	    *rangehigh = pos;
735 	  }
736 	}
737       }
738     }
739   }
740   if (*rangelow > chimerapos) {
741     *rangelow = chimerapos;
742   }
743   if (*rangehigh < *chimeraequivpos) {
744     *rangehigh = *chimeraequivpos;
745   }
746   debug(printf("For secondbest score of %d: rangelow %d, rangehigh %d\n",secondbest,*rangelow,*rangehigh));
747 
748 
749 #if 0
750   *five_score = matrix_sub1[*chimerapos] /* - 0 */;
751   *three_score = matrix_sub2[queryntlength-1] - matrix_sub2[*chimerapos];
752 #endif
753 
754   FREE(gapp_sub2);
755   FREE(matrix_sub2);
756 
757   FREE(gapp_sub1);
758   FREE(matrix_sub1);
759 
760   if (chimerapos == 0) {
761     /* Never found a breakpoint */
762     return -1;
763   } else {
764     breakpoint = (chimerapos + (*chimeraequivpos))/2;
765 
766     if (Stage3_watsonp(left_part) == true) {
767       if ((left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false)) >= left_chrlength - 2) {
768 	debug(printf("left %u >= left_chrlength %u - 2, so not finding donor dinucleotides\n",left,left_chrlength));
769 	*donor1 = *donor2 = 'N';
770       } else {
771 	debug(printf("left %u < left_chrlength %u - 2, so okay\n",left,left_chrlength));
772 	*donor1 = Genome_get_char(genome,left+1);
773 	*donor2 = Genome_get_char(genome,left+2);
774       }
775     } else {
776       if ((left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false)) < 2) {
777 	debug(printf("left %u < 2, so not finding donor dinucleotides\n",left));
778 	*donor1 = *donor2 = 'N';
779       } else {
780 	debug(printf("left %u >= 2, so okay\n",left));
781 	*donor1 = complCode[(int) Genome_get_char(genome,left-1)];
782 	*donor2 = complCode[(int) Genome_get_char(genome,left-2)];
783       }
784     }
785 
786     if (Stage3_watsonp(right_part) == true) {
787       if ((left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true)) < 2) {
788 	debug(printf("left %u < 2, so not finding acceptor dinucleotides\n",left));
789 	*acceptor1 = *acceptor2 = 'N';
790       } else {
791 	debug(printf("left %u >= 2, so okay\n",left));
792 	*acceptor2 = Genome_get_char(genome,left-2);
793 	*acceptor1 = Genome_get_char(genome,left-1);
794       }
795     } else {
796       if ((left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true)) >= right_chrlength - 2) {
797 	debug(printf("left %u >= right_chrlength %u - 2, so not finding acceptor dinucleotides\n",left,right_chrlength));
798 	*acceptor1 = *acceptor2 = 'N';
799       } else {
800 	debug(printf("left %u <right_chrlength %u - 2, so okay\n",left,right_chrlength));
801 	*acceptor2 = complCode[(int) Genome_get_char(genome,left+2)];
802 	*acceptor1 = complCode[(int) Genome_get_char(genome,left+1)];
803       }
804     }
805 
806     return chimerapos;
807   }
808 }
809 
810 
811 static double
find_exonexon_fwd(int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)812 find_exonexon_fwd (int *exonexonpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
813 		   char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
814 		   Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
815 		   Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
816   Sequence_T donor_genomicseg, acceptor_genomicseg, donor_genomicalt, acceptor_genomicalt;
817   char *donor_ptr, *acceptor_ptr, *donor_altptr, *acceptor_altptr;
818   int i, j;
819   Univcoord_T left;
820   int donor_length, acceptor_length;
821   bool revcomp;
822   char left1, left2, right2, right1;
823   char left1_alt, left2_alt, right2_alt, right1_alt;
824   int introntype;
825   double donor_prob_1, acceptor_prob_1, donor_altprob_1, acceptor_altprob_1, bestproduct = 0.0, product;
826 
827   *exonexonpos = -1;
828 
829   donor_length = breakpoint_end - breakpoint_start + DONOR_MODEL_LEFT_MARGIN + DONOR_MODEL_RIGHT_MARGIN;
830   left = Stage3_genomicpos(left_part,breakpoint_start,/*headp*/false);
831   if ((*donor_watsonp = Stage3_watsonp(left_part)) == true) {
832     left -= DONOR_MODEL_LEFT_MARGIN;
833     revcomp = false;
834   } else {
835     left += DONOR_MODEL_LEFT_MARGIN;
836     left -= donor_length;
837     revcomp = true;
838   }
839 
840   debug2(printf("Getting donor at left %u\n",left));
841   donor_genomicseg = Genome_get_segment(genome,left,donor_length+1,chromosome_iit,revcomp);
842   donor_ptr = Sequence_fullpointer(donor_genomicseg);
843   donor_genomicalt = Genome_get_segment_alt(genomealt,left,donor_length+1,chromosome_iit,revcomp);
844   donor_altptr = Sequence_fullpointer(donor_genomicalt);
845 
846   debug2(
847 	 for (i = 0; i < ACCEPTOR_MODEL_LEFT_MARGIN - DONOR_MODEL_LEFT_MARGIN; i++) {
848 	   printf(" ");
849 	 }
850 	 Sequence_stdout(donor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
851 	 );
852 
853   acceptor_length = breakpoint_end - breakpoint_start + ACCEPTOR_MODEL_LEFT_MARGIN + ACCEPTOR_MODEL_RIGHT_MARGIN;
854   left = Stage3_genomicpos(right_part,breakpoint_end+1,/*headp*/true);
855   if ((*acceptor_watsonp = Stage3_watsonp(right_part)) == true) {
856     left += ACCEPTOR_MODEL_RIGHT_MARGIN;
857     left -= acceptor_length;
858     revcomp = false;
859   } else {
860     left -= ACCEPTOR_MODEL_RIGHT_MARGIN;
861     revcomp = true;
862   }
863 
864   debug2(printf("Getting acceptor at left %u\n",left));
865   acceptor_genomicseg = Genome_get_segment(genome,left,acceptor_length+1,chromosome_iit,revcomp);
866   acceptor_ptr = Sequence_fullpointer(acceptor_genomicseg);
867   acceptor_genomicalt = Genome_get_segment(genomealt,left,acceptor_length+1,chromosome_iit,revcomp);
868   acceptor_altptr = Sequence_fullpointer(acceptor_genomicalt);
869   debug2(
870 	 for (i = 0; i < DONOR_MODEL_LEFT_MARGIN - ACCEPTOR_MODEL_LEFT_MARGIN; i++) {
871 	   printf(" ");
872 	 }
873 	 Sequence_stdout(acceptor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
874 	 );
875 
876   *donor_prob = 0.0;
877   *acceptor_prob = 0.0;
878   for (i = DONOR_MODEL_LEFT_MARGIN, j = ACCEPTOR_MODEL_LEFT_MARGIN;
879        i <= donor_length - DONOR_MODEL_RIGHT_MARGIN &&
880 	 j <= acceptor_length - ACCEPTOR_MODEL_RIGHT_MARGIN;
881        i++, j++) {
882 
883     left1 = donor_ptr[i+1];
884     left2 = donor_ptr[i+2];
885     right2 = acceptor_ptr[j-2];
886     right1 = acceptor_ptr[j-1];
887 
888     left1_alt = donor_altptr[i+1];
889     left2_alt = donor_altptr[i+2];
890     right2_alt = acceptor_altptr[j-2];
891     right1_alt = acceptor_altptr[j-1];
892 
893     debug2(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
894     introntype = Intron_type(left1,left2,right2,right1,
895 			     left1_alt,left2_alt,right2_alt,right1_alt,
896 			     /*cdna_direction*/+1);
897     debug2(printf("  Introntype is %s\n",Intron_type_string(introntype)));
898 
899     donor_prob_1 = Maxent_donor_prob(&(donor_ptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
900     acceptor_prob_1 = Maxent_acceptor_prob(&(acceptor_ptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
901     donor_altprob_1 = Maxent_donor_prob(&(donor_altptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
902     acceptor_altprob_1 = Maxent_acceptor_prob(&(acceptor_altptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
903 
904     debug2(printf("%d %c%c %c%c %.2f %.2f\n",
905 		  breakpoint_start - DONOR_MODEL_LEFT_MARGIN + i,
906 		  donor_ptr[i+1],donor_ptr[i+2],acceptor_ptr[j-2],acceptor_ptr[j-1],
907 		  donor_prob_1,acceptor_prob_1));
908 
909     if (donor_prob_1 < 0.50 && acceptor_prob_1 < 0.50 && donor_altprob_1 < 0.50 && acceptor_altprob_1 < 0.50) {
910       /* Skip */
911     } else if (introntype != NONINTRON || donor_prob_1 > 0.90 || acceptor_prob_1 > 0.90 || donor_altprob_1 > 0.90 || acceptor_altprob_1 > 0.90) {
912       if ((product = donor_prob_1*acceptor_prob_1) > bestproduct) {
913 	bestproduct = product;
914 	*donor1 = donor_ptr[i+1];
915 	*donor2 = donor_ptr[i+2];
916 	*acceptor2 = acceptor_ptr[j-2];
917 	*acceptor1 = acceptor_ptr[j-1];
918 	if (donor_prob_1 >= donor_altprob_1) {
919 	  *donor_prob = donor_prob_1;
920 	} else {
921 	  *donor_prob = donor_altprob_1;
922 	}
923 	if (acceptor_prob_1 >= acceptor_altprob_1) {
924 	  *acceptor_prob = acceptor_prob_1;
925 	} else {
926 	  *acceptor_prob = acceptor_altprob_1;
927 	}
928 	*exonexonpos = breakpoint_start - DONOR_MODEL_LEFT_MARGIN + i;
929 
930 	switch (introntype) {
931 	case GTAG_FWD: *comp = FWD_CANONICAL_INTRON_COMP; break;
932 	case GCAG_FWD: *comp = FWD_GCAG_INTRON_COMP; break;
933 	case ATAC_FWD: *comp = FWD_ATAC_INTRON_COMP; break;
934 	default: *comp = NONINTRON_COMP; break;
935 	}
936 
937       }
938     }
939   }
940 
941   Sequence_free(&acceptor_genomicalt);
942   Sequence_free(&donor_genomicalt);
943   Sequence_free(&acceptor_genomicseg);
944   Sequence_free(&donor_genomicseg);
945 
946   return bestproduct;
947 }
948 
949 static double
find_exonexon_rev(int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)950 find_exonexon_rev (int *exonexonpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
951 		   char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
952 		   Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
953 		   Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
954   Sequence_T donor_genomicseg, acceptor_genomicseg, donor_genomicalt, acceptor_genomicalt;
955   char *donor_ptr, *acceptor_ptr, *donor_altptr, *acceptor_altptr;
956   int i, j;
957   Univcoord_T left;
958   int donor_length, acceptor_length;
959   bool revcomp;
960   char left1, left2, right2, right1;
961   char left1_alt, left2_alt, right2_alt, right1_alt;
962   int introntype;
963   double donor_prob_1, acceptor_prob_1, donor_altprob_1, acceptor_altprob_1, bestproduct = 0.0, product;
964 
965   *exonexonpos = -1;
966 
967   donor_length = breakpoint_end - breakpoint_start + DONOR_MODEL_LEFT_MARGIN + DONOR_MODEL_RIGHT_MARGIN;
968   left = Stage3_genomicpos(right_part,breakpoint_end+1,/*headp*/true);
969   if ((*donor_watsonp = Stage3_watsonp(right_part)) == true) {
970     left += DONOR_MODEL_LEFT_MARGIN;
971     left -= donor_length;
972     revcomp = true;
973   } else {
974     left -= DONOR_MODEL_LEFT_MARGIN;
975     revcomp = false;
976   }
977 
978   debug2(printf("Getting donor at left %u\n",left));
979   donor_genomicseg = Genome_get_segment(genome,left,donor_length+1,chromosome_iit,revcomp);
980   donor_ptr = Sequence_fullpointer(donor_genomicseg);
981   donor_genomicalt = Genome_get_segment(genomealt,left,donor_length+1,chromosome_iit,revcomp);
982   donor_altptr = Sequence_fullpointer(donor_genomicalt);
983   debug2(
984 	 for (i = 0; i < ACCEPTOR_MODEL_LEFT_MARGIN - DONOR_MODEL_LEFT_MARGIN; i++) {
985 	   printf(" ");
986 	 }
987 	 Sequence_stdout(donor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
988 	 );
989 
990   acceptor_length = breakpoint_end - breakpoint_start + ACCEPTOR_MODEL_LEFT_MARGIN + ACCEPTOR_MODEL_RIGHT_MARGIN;
991   left = Stage3_genomicpos(left_part,breakpoint_start,/*headp*/false);
992   if ((*acceptor_watsonp = Stage3_watsonp(left_part)) == true) {
993     left -= ACCEPTOR_MODEL_RIGHT_MARGIN;
994     revcomp = true;
995   } else {
996     left += ACCEPTOR_MODEL_RIGHT_MARGIN;
997     left -= acceptor_length;
998     revcomp = false;
999   }
1000 
1001   debug2(printf("Getting acceptor at left %u\n",left));
1002   acceptor_genomicseg = Genome_get_segment(genome,left,acceptor_length+1,chromosome_iit,revcomp);
1003   acceptor_ptr = Sequence_fullpointer(acceptor_genomicseg);
1004   acceptor_genomicalt = Genome_get_segment(genomealt,left,acceptor_length+1,chromosome_iit,revcomp);
1005   acceptor_altptr = Sequence_fullpointer(acceptor_genomicalt);
1006   debug2(
1007 	 for (i = 0; i < DONOR_MODEL_LEFT_MARGIN - ACCEPTOR_MODEL_LEFT_MARGIN; i++) {
1008 	   printf(" ");
1009 	 }
1010 	 Sequence_stdout(acceptor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
1011 	 );
1012 
1013   *donor_prob = 0.0;
1014   *acceptor_prob = 0.0;
1015   for (i = DONOR_MODEL_LEFT_MARGIN, j = ACCEPTOR_MODEL_LEFT_MARGIN;
1016        i <= donor_length - DONOR_MODEL_RIGHT_MARGIN &&
1017 	 j <= acceptor_length - ACCEPTOR_MODEL_RIGHT_MARGIN;
1018        i++, j++) {
1019 
1020     left1 = donor_ptr[i+1];
1021     left2 = donor_ptr[i+2];
1022     right2 = acceptor_ptr[j-2];
1023     right1 = acceptor_ptr[j-1];
1024 
1025     left1_alt = donor_altptr[i+1];
1026     left2_alt = donor_altptr[i+2];
1027     right2_alt = acceptor_altptr[j-2];
1028     right1_alt = acceptor_altptr[j-1];
1029 
1030     /* Use cdna_direction == +1, because revcomp already applied */
1031     debug2(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
1032     introntype = Intron_type(left1,left2,right2,right1,
1033 			     left1_alt,left2_alt,right2_alt,right1_alt,
1034 			     /*cdna_direction*/+1);
1035     debug2(printf("  Introntype is %s\n",Intron_type_string(introntype)));
1036 
1037     donor_prob_1 = Maxent_donor_prob(&(donor_ptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
1038     acceptor_prob_1 = Maxent_acceptor_prob(&(acceptor_ptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
1039     donor_altprob_1 = Maxent_donor_prob(&(donor_altptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
1040     acceptor_altprob_1 = Maxent_acceptor_prob(&(acceptor_altptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
1041 
1042     debug2(printf("%d %c%c %c%c %.2f %.2f\n",
1043 		  breakpoint_end + DONOR_MODEL_LEFT_MARGIN - i,
1044 		  donor_ptr[i+1],donor_ptr[i+2],acceptor_ptr[j-2],acceptor_ptr[j-1],
1045 		  donor_prob_1,acceptor_prob_1));
1046 
1047     if (donor_prob_1 < 0.50 && acceptor_prob_1 < 0.50 && donor_altprob_1 < 0.50 && acceptor_altprob_1 < 0.50) {
1048       /* Skip */
1049     } else if (introntype != NONINTRON || donor_prob_1 > 0.90 || acceptor_prob_1 > 0.90 || donor_altprob_1 > 0.90 || acceptor_altprob_1 > 0.90) {
1050       if ((product = donor_prob_1*acceptor_prob_1) > bestproduct) {
1051 	bestproduct = product;
1052 	*donor1 = donor_ptr[i+1];
1053 	*donor2 = donor_ptr[i+2];
1054 	*acceptor2 = acceptor_ptr[j-2];
1055 	*acceptor1 = acceptor_ptr[j-1];
1056 	if (donor_prob_1 >= donor_altprob_1) {
1057 	  *donor_prob = donor_prob_1;
1058 	} else {
1059 	  *donor_prob = donor_altprob_1;
1060 	}
1061 	if (acceptor_prob_1 >= acceptor_altprob_1) {
1062 	  *acceptor_prob = acceptor_prob_1;
1063 	} else {
1064 	  *acceptor_prob = acceptor_altprob_1;
1065 	}
1066 	*exonexonpos = breakpoint_end + DONOR_MODEL_LEFT_MARGIN - i;
1067 
1068 	/* Have to look for forward intron types, but return the revcomp comp */
1069 	switch (introntype) {
1070 	case GTAG_FWD: *comp = REV_CANONICAL_INTRON_COMP; break;
1071 	case GCAG_FWD: *comp = REV_GCAG_INTRON_COMP; break;
1072 	case ATAC_FWD: *comp = REV_ATAC_INTRON_COMP; break;
1073 	default: *comp = NONINTRON_COMP; break;
1074 	}
1075       }
1076     }
1077   }
1078 
1079   Sequence_free(&acceptor_genomicalt);
1080   Sequence_free(&donor_genomicalt);
1081   Sequence_free(&acceptor_genomicseg);
1082   Sequence_free(&donor_genomicseg);
1083 
1084   return bestproduct;
1085 }
1086 
1087 
1088 int
Chimera_find_exonexon(int * found_cdna_direction,int * try_cdna_direction,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)1089 Chimera_find_exonexon (int *found_cdna_direction, int *try_cdna_direction,
1090 		       char *donor1, char *donor2, char *acceptor2, char *acceptor1,
1091 		       char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
1092 		       Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
1093 		       Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
1094   int exonexonpos_fwd, exonexonpos_rev;
1095   char donor1_fwd, donor2_fwd, acceptor2_fwd, acceptor1_fwd,
1096     donor1_rev, donor2_rev, acceptor2_rev, acceptor1_rev;
1097   char comp_fwd, comp_rev;
1098   bool donor_watsonp_fwd, acceptor_watsonp_fwd, donor_watsonp_rev, acceptor_watsonp_rev;
1099   double bestproduct_fwd, bestproduct_rev, donor_prob_fwd, donor_prob_rev, acceptor_prob_fwd, acceptor_prob_rev;
1100   int left_cdna_direction, right_cdna_direction;
1101 
1102   debug2(printf("Starting Chimera_find_exonexon with breakpoint %d..%d\n",breakpoint_start,breakpoint_end));
1103   debug2(printf("left part covers query %d to %d\n",Stage3_querystart(left_part),Stage3_queryend(left_part)));
1104   debug2(printf("right part covers query %d to %d\n",Stage3_querystart(right_part),Stage3_queryend(right_part)));
1105 
1106 #if 0
1107   if (Stage3_queryend(left_part) < Stage3_querystart(right_part)) {
1108     breakpoint_start = Stage3_queryend(left_part);
1109     breakpoint_end = Stage3_querystart(right_part);
1110   } else {
1111     breakpoint_start = Stage3_querystart(right_part);
1112     breakpoint_end = Stage3_queryend(left_part);
1113   }
1114   breakpoint_start -= 10;
1115   if (breakpoint_start < 0) {
1116     breakpoint_start = 0;
1117   }
1118   breakpoint_end += 10;
1119   if (breakpoint_end > querylength-1) {
1120     breakpoint_end = querylength-1;
1121   }
1122 #endif
1123 
1124   if (breakpoint_end < breakpoint_start) {
1125     debug2(printf("Breakpoints do not make sense, so not computing\n"));
1126     *found_cdna_direction = *try_cdna_direction = 0;
1127     return -1;
1128   }
1129 
1130   debug2(printf("Starting search for exon-exon boundary at breakpoint_start %d to breakpoint_end %d\n",
1131 		breakpoint_start,breakpoint_end));
1132 
1133   left_cdna_direction = Stage3_cdna_direction(left_part);
1134   right_cdna_direction = Stage3_cdna_direction(right_part);
1135 
1136   if (left_cdna_direction == 0 && right_cdna_direction == 0) {
1137     *try_cdna_direction = 0;
1138   } else if (left_cdna_direction >= 0 && right_cdna_direction >= 0) {
1139     *try_cdna_direction = +1;
1140   } else if (left_cdna_direction <= 0 && right_cdna_direction <= 0) {
1141     *try_cdna_direction = -1;
1142   } else {
1143     *try_cdna_direction = 0;
1144   }
1145 
1146   if (*try_cdna_direction == +1) {
1147     *found_cdna_direction = +1;
1148     bestproduct_fwd = find_exonexon_fwd(&exonexonpos_fwd,&donor1_fwd,&donor2_fwd,&acceptor2_fwd,&acceptor1_fwd,
1149 					&comp_fwd,&donor_watsonp_fwd,&acceptor_watsonp_fwd,&donor_prob_fwd,&acceptor_prob_fwd,
1150 					left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1151     bestproduct_rev = 0.0;
1152 
1153   } else if (*try_cdna_direction == -1) {
1154     *found_cdna_direction = -1;
1155     bestproduct_rev = find_exonexon_rev(&exonexonpos_rev,&donor1_rev,&donor2_rev,&acceptor2_rev,&acceptor1_rev,
1156 					&comp_rev,&donor_watsonp_rev,&acceptor_watsonp_rev,&donor_prob_rev,&acceptor_prob_rev,
1157 					left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1158     bestproduct_fwd = 0.0;
1159 
1160   } else {
1161     bestproduct_fwd = find_exonexon_fwd(&exonexonpos_fwd,&donor1_fwd,&donor2_fwd,&acceptor2_fwd,&acceptor1_fwd,
1162 					&comp_fwd,&donor_watsonp_fwd,&acceptor_watsonp_fwd,&donor_prob_fwd,&acceptor_prob_fwd,
1163 					left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1164     bestproduct_rev = find_exonexon_rev(&exonexonpos_rev,&donor1_rev,&donor2_rev,&acceptor2_rev,&acceptor1_rev,
1165 					&comp_rev,&donor_watsonp_rev,&acceptor_watsonp_rev,&donor_prob_rev,&acceptor_prob_rev,
1166 					left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1167   }
1168 
1169   if (bestproduct_fwd == 0.0 && bestproduct_rev == 0.0) {
1170     *found_cdna_direction = 0;
1171     *donor1 = 'N';
1172     *donor2 = 'N';
1173     *acceptor2 = 'N';
1174     *acceptor1 = 'N';
1175     *comp = NONINTRON_COMP;
1176     *donor_watsonp = true;
1177     *acceptor_watsonp = true;
1178     *donor_prob = 0.0;
1179     *acceptor_prob = 0.0;
1180     return -1;
1181 
1182   } else if (bestproduct_fwd >= bestproduct_rev) {
1183     *found_cdna_direction = +1;
1184     *donor1 = donor1_fwd;
1185     *donor2 = donor2_fwd;
1186     *acceptor2 = acceptor2_fwd;
1187     *acceptor1 = acceptor1_fwd;
1188     *comp = comp_fwd;
1189     *donor_watsonp = donor_watsonp_fwd;
1190     *acceptor_watsonp = acceptor_watsonp_fwd;
1191     *donor_prob = donor_prob_fwd;
1192     *acceptor_prob = acceptor_prob_fwd;
1193     return exonexonpos_fwd;
1194 
1195   } else {
1196     *found_cdna_direction = -1;
1197     *donor1 = donor1_rev;
1198     *donor2 = donor2_rev;
1199     *acceptor2 = acceptor2_rev;
1200     *acceptor1 = acceptor1_rev;
1201     *comp = comp_rev;
1202     *donor_watsonp = donor_watsonp_rev;
1203     *acceptor_watsonp = acceptor_watsonp_rev;
1204     *donor_prob = donor_prob_rev;
1205     *acceptor_prob = acceptor_prob_rev;
1206     return exonexonpos_rev;
1207   }
1208 }
1209 
1210 
1211