1 static char rcsid[] = "$Id: chimera.c 215899 2018-06-29 21:30:57Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "chimera.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h> /* For sqrt */
10 #include <string.h> /* For memset */
11 #include "mem.h"
12 #include "genomicpos.h"
13 #include "types.h"
14 #include "maxent.h"
15 #include "intron.h"
16 #include "comp.h"
17 #include "complement.h"
18
19
20 #define GBUFFERLEN 1024
21
22 /* Chimera assembly/bestpath matrix */
23 #ifdef DEBUG
24 #define debug(x) x
25 #else
26 #define debug(x)
27 #endif
28
29 /* Chimera detection */
30 #ifdef DEBUG1
31 #define debug1(x) x
32 #else
33 #define debug1(x)
34 #endif
35
36 /* Exon-exon boundary detection */
37 #ifdef DEBUG2
38 #define debug2(x) x
39 #else
40 #define debug2(x)
41 #endif
42
43 /* local_join_p and distant_join_p */
44 #ifdef DEBUG3
45 #define debug3(x) x
46 #else
47 #define debug3(x)
48 #endif
49
50 /* Chimera_bestpath */
51 #ifdef DEBUG4
52 #define debug4(x) x
53 #else
54 #define debug4(x)
55 #endif
56
57
58 #define T Chimera_T
59 struct T {
60 Stage3_T from;
61 Stage3_T to;
62
63 int chimerapos;
64 int equivpos;
65
66 int cdna_direction;
67 int exonexonpos;
68
69 char donor1;
70 char donor2;
71 char acceptor2;
72 char acceptor1;
73
74 bool donor_watsonp;
75 bool acceptor_watsonp;
76
77 double donor_prob;
78 double acceptor_prob;
79 };
80
81
82 Stage3_T
Chimera_left_part(T this)83 Chimera_left_part (T this) {
84 return this->from;
85 }
86
87 Stage3_T
Chimera_right_part(T this)88 Chimera_right_part (T this) {
89 return this->to;
90 }
91
92
93 int
Chimera_pos(T this)94 Chimera_pos (T this) {
95 return this->chimerapos;
96 }
97
98 int
Chimera_equivpos(T this)99 Chimera_equivpos (T this) {
100 return this->equivpos;
101 }
102
103 int
Chimera_cdna_direction(T this)104 Chimera_cdna_direction (T this) {
105 return this->cdna_direction;
106 }
107
108
109 void
Chimera_print_sam_tag(Filestring_T fp,T this,Univ_IIT_T chromosome_iit)110 Chimera_print_sam_tag (Filestring_T fp, T this, Univ_IIT_T chromosome_iit) {
111 char donor_strand, acceptor_strand;
112 char *donor_chr, *acceptor_chr;
113 bool alloc1p, alloc2p;
114
115 if (this->cdna_direction >= 0) {
116 if (this->donor_watsonp == true) {
117 donor_strand = '+';
118 } else {
119 donor_strand = '-';
120 }
121 if (this->acceptor_watsonp == true) {
122 acceptor_strand = '+';
123 } else {
124 acceptor_strand = '-';
125 }
126 } else {
127 if (this->donor_watsonp == true) {
128 donor_strand = '-';
129 } else {
130 donor_strand = '+';
131 }
132 if (this->acceptor_watsonp == true) {
133 acceptor_strand = '-';
134 } else {
135 acceptor_strand = '+';
136 }
137 }
138
139 FPRINTF(fp,"%c%c-%c%c,%.2f,%.2f",
140 this->donor1,this->donor2,this->acceptor2,this->acceptor1,this->donor_prob,this->acceptor_prob);
141 donor_chr = Univ_IIT_label(chromosome_iit,Stage3_chrnum(this->from),&alloc1p);
142 acceptor_chr = Univ_IIT_label(chromosome_iit,Stage3_chrnum(this->to),&alloc2p);
143 FPRINTF(fp,",%c%s@%u..%c%s@%u",
144 donor_strand,donor_chr,Stage3_chrend(this->from),
145 acceptor_strand,acceptor_chr,Stage3_chrstart(this->to));
146 FPRINTF(fp,",%d..%d",this->chimerapos+1,this->equivpos+1);
147 if (alloc2p == true) {
148 FREE(acceptor_chr);
149 }
150 if (alloc1p == true) {
151 FREE(donor_chr);
152 }
153
154 return;
155 }
156
157
158 double
Chimera_donor_prob(T this)159 Chimera_donor_prob (T this) {
160 if (this->exonexonpos < 0) {
161 return 0.0;
162 } else {
163 return this->donor_prob;
164 }
165 }
166
167 double
Chimera_acceptor_prob(T this)168 Chimera_acceptor_prob (T this) {
169 if (this->exonexonpos < 0) {
170 return 0.0;
171 } else {
172 return this->acceptor_prob;
173 }
174 }
175
176
177 T
Chimera_new(Stage3_T from,Stage3_T to,int chimerapos,int chimeraequivpos,int exonexonpos,int cdna_direction,char donor1,char donor2,char acceptor2,char acceptor1,bool donor_watsonp,bool acceptor_watsonp,double donor_prob,double acceptor_prob)178 Chimera_new (Stage3_T from, Stage3_T to, int chimerapos, int chimeraequivpos,
179 int exonexonpos, int cdna_direction,
180 char donor1, char donor2, char acceptor2, char acceptor1,
181 bool donor_watsonp, bool acceptor_watsonp,
182 double donor_prob, double acceptor_prob) {
183 T new = (T) MALLOC(sizeof(*new));
184
185 new->from = from;
186 new->to = to;
187
188 new->chimerapos = chimerapos;
189 new->equivpos = chimeraequivpos;
190 new->exonexonpos = exonexonpos;
191 new->cdna_direction = cdna_direction;
192 new->donor1 = donor1;
193 new->donor2 = donor2;
194 new->acceptor2 = acceptor2;
195 new->acceptor1 = acceptor1;
196 new->donor_watsonp = donor_watsonp;
197 new->acceptor_watsonp = acceptor_watsonp;
198 new->donor_prob = donor_prob;
199 new->acceptor_prob = acceptor_prob;
200
201 return new;
202 }
203
204 void
Chimera_free(T * old)205 Chimera_free (T *old) {
206 FREE(*old);
207 return;
208 }
209
210
211 void
Chimera_print(Filestring_T fp,T this)212 Chimera_print (Filestring_T fp, T this) {
213 if (this->exonexonpos > 0) {
214 FPRINTF(fp," *** Possible chimera with exon-exon boundary");
215 if (this->cdna_direction > 0) {
216 FPRINTF(fp," (sense)");
217 } else if (this->cdna_direction < 0) {
218 FPRINTF(fp," (antisense)");
219 }
220 FPRINTF(fp," at %d (dinucl = %c%c-%c%c, donor_prob = %.3f, acceptor_prob = %.3f)",
221 this->exonexonpos+1,this->donor1,this->donor2,this->acceptor2,this->acceptor1,
222 this->donor_prob,this->acceptor_prob);
223 } else if (this->equivpos == this->chimerapos) {
224 FPRINTF(fp," *** Possible chimera with breakpoint at %d",this->chimerapos+1);
225 } else {
226 FPRINTF(fp," *** Possible chimera with breakpoint at %d..%d",this->chimerapos+1,this->equivpos+1);
227 }
228
229 return;
230 }
231
232
233 #define NPSEUDO 10.0
234
235 int
Chimera_alignment_break(int * newstart,int * newend,Stage3_T stage3,int queryntlength,double fthreshold)236 Chimera_alignment_break (int *newstart, int *newend, Stage3_T stage3, int queryntlength, double fthreshold) {
237 int breakpoint;
238 int *matchscores;
239
240 /* x signifies nmatches, y signifies nmismatches, x + y = n */
241 int start, end, pos, x = 0, y, n, x_left, y_left, x_right, y_right, n_left, n_right;
242 double theta, x_pseudo, theta_left, theta_right, rss, rss_left, rss_right, rss_sep;
243 double fscore, min_rss_sep, best_pos = -1, best_theta_left, best_theta_right;
244
245 /*
246 start = Sequence_trim_start(queryseq);
247 end = Sequence_trim_end(queryseq);
248 */
249
250 start = Stage3_querystart(stage3);
251 end = Stage3_queryend(stage3);
252
253 if (queryntlength < MAX_QUERYLENGTH_STACK) {
254 matchscores = (int *) CALLOCA(queryntlength,sizeof(int));
255 } else {
256 matchscores = (int *) CALLOC(queryntlength,sizeof(int));
257 }
258 Pair_matchscores(matchscores,Stage3_pairarray(stage3),Stage3_npairs(stage3));
259
260 x = 0;
261 for (pos = start; pos < end; pos++) {
262 x += matchscores[pos];
263 }
264 n = end - start;
265 y = n - x;
266
267 /* when rss_sep == rss, fscore == 0 */
268 min_rss_sep = rss = (double) x * (double) y/(double) n;
269 if (rss == 0.0) {
270 if (queryntlength < MAX_QUERYLENGTH_STACK) {
271 FREEA(matchscores);
272 } else {
273 FREE(matchscores);
274 }
275 return 0;
276 }
277
278 theta = (double) x/(double) n;
279 x_pseudo = NPSEUDO * theta;
280 debug1(printf("%d %d %d %f\n",x,y,n,theta));
281
282 x_left = y_left = n_left = 0;
283 x_right = x;
284 y_right = y;
285 n_right = n;
286
287 debug1(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
288 "pos","match","x.left","y.left","n.left","x.right","y.right","n.right",
289 "theta.left","theta.right","rss.left","rss.right","fscore"));
290
291 for (pos = start; pos < end-1; pos++) {
292 if (matchscores[pos] == 1) {
293 x_left++;
294 x_right--;
295 } else {
296 y_left++;
297 y_right--;
298 }
299 n_left++;
300 n_right--;
301
302 theta_left = ((double) x_left + x_pseudo)/((double) n_left + NPSEUDO);
303 theta_right = ((double) x_right + x_pseudo)/((double) n_right + NPSEUDO);
304 rss_left = x_left*(1.0-theta_left)*(1.0-theta_left) + y_left*theta_left*theta_left;
305 rss_right = x_right*(1.0-theta_right)*(1.0-theta_right) + y_right*theta_right*theta_right;
306 rss_sep = rss_left + rss_right;
307
308 if (rss_sep == 0) {
309 debug1(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
310 pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
311 theta_left,theta_right,rss_left,rss_right));
312 } else {
313 debug1(
314 fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
315 printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
316 pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
317 theta_left,theta_right,rss_left,rss_right,fscore);
318 );
319
320 /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
321 1) is maximized when rss_sep is minimized */
322
323 if (rss_sep < min_rss_sep) {
324 min_rss_sep = rss_sep;
325 best_pos = pos;
326 best_theta_left = theta_left;
327 best_theta_right = theta_right;
328 }
329 }
330 }
331 if (queryntlength < MAX_QUERYLENGTH_STACK) {
332 FREEA(matchscores);
333 } else {
334 FREE(matchscores);
335 }
336
337 fscore = ((double) (n - 2))*(rss - min_rss_sep)/min_rss_sep;
338 if (fscore < fthreshold) {
339 return 0;
340 } else {
341 breakpoint = best_pos;
342 debug1(printf("at %d, fscore = %f\n",breakpoint,fscore));
343 if (best_theta_left < best_theta_right) {
344 /* trim left */
345 *newstart = breakpoint;
346 *newend = end;
347 return breakpoint - start;
348 } else {
349 /* trim right */
350 *newstart = start;
351 *newend = breakpoint;
352 return end - breakpoint;
353 }
354 }
355 }
356
357
358 bool
Chimera_local_join_p(Stage3_T from,Stage3_T to,int chimera_slop)359 Chimera_local_join_p (Stage3_T from, Stage3_T to, int chimera_slop) {
360 debug3(printf("? local_join_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
361 from,Stage3_querystart(from),Stage3_queryend(from),
362 Stage3_chrstart(from),Stage3_chrend(from),
363 to,Stage3_querystart(to),Stage3_queryend(to),
364 Stage3_chrstart(to),Stage3_chrend(to)));
365
366 if (Stage3_npairs(from) == 0) {
367 return false;
368
369 } else if (Stage3_npairs(to) == 0) {
370 return false;
371
372 } else if (Stage3_chimera_right_p(from) == true) {
373 debug3(printf("false, because from is already part of a chimera on its right\n"));
374 return false;
375
376 } else if (Stage3_chimera_left_p(to) == true) {
377 debug3(printf("false, because to is already part of a chimera on its left\n"));
378 return false;
379
380 } else if (Stage3_chrnum(from) != Stage3_chrnum(to)) {
381 debug3(printf("false, because different chromosomes\n"));
382 return false;
383
384 } else if (Stage3_watsonp(from) != Stage3_watsonp(to)) {
385 debug3(printf("false, because different strands\n"));
386 return false;
387
388 } else if (Stage3_querystart(from) >= Stage3_querystart(to) &&
389 Stage3_queryend(from) <= Stage3_queryend(to)) {
390 debug3(printf("false, because from %d..%d is subsumed by to %d..%d\n",
391 Stage3_querystart(from),Stage3_queryend(from),
392 Stage3_querystart(to),Stage3_queryend(to)));
393 return false;
394
395 } else if (Stage3_querystart(to) >= Stage3_querystart(from) &&
396 Stage3_queryend(to) <= Stage3_queryend(from)) {
397 debug3(printf("false, because to %d..%d is subsumed by from %d..%d\n",
398 Stage3_querystart(to),Stage3_queryend(to),
399 Stage3_querystart(from),Stage3_queryend(from)));
400 return false;
401
402 } else if (Stage3_queryend(from) - Stage3_querystart(to) > chimera_slop ||
403 Stage3_querystart(to) - Stage3_queryend(from) > chimera_slop) {
404 debug3(printf("false, because %d - %d > chimera_slop %d or %d - %d > chimera_slop %d\n",
405 Stage3_queryend(from),Stage3_querystart(to),chimera_slop,
406 Stage3_querystart(to),Stage3_queryend(from),chimera_slop));
407 return false;
408
409 } else {
410 if (Stage3_watsonp(from) == true) {
411 if (Stage3_genomicend(from) > Stage3_genomicstart(to) + chimera_slop) {
412 debug3(printf("false, because genomic %u > %u + %d\n",
413 Stage3_genomicend(from),Stage3_genomicstart(to),chimera_slop));
414 return false;
415 } else {
416 return true;
417 }
418
419 } else {
420 if (Stage3_genomicend(from) + chimera_slop < Stage3_genomicstart(to)) {
421 debug3(printf("false, because genomic %u + %d < %u\n",
422 Stage3_genomicend(from),chimera_slop,Stage3_genomicstart(to)));
423 return false;
424 } else {
425 return true;
426 }
427
428 }
429
430 }
431 }
432
433
434 bool
Chimera_distant_join_p(Stage3_T from,Stage3_T to,int chimera_slop)435 Chimera_distant_join_p (Stage3_T from, Stage3_T to, int chimera_slop) {
436 debug3(printf("? chimeric_join_p from %d..%d (%u..%u) -> to %d..%d (%u..%u) => ",
437 Stage3_querystart(from),Stage3_queryend(from),
438 Stage3_chrstart(from),Stage3_chrend(from),
439 Stage3_querystart(to),Stage3_queryend(to),
440 Stage3_chrstart(to),Stage3_chrend(to)));
441
442 if (Stage3_chimera_right_p(from) == true) {
443 debug3(printf("false, because from is already part of a chimera on its right\n"));
444 return false;
445
446 } else if (Stage3_chimera_left_p(to) == true) {
447 debug3(printf("false, because to is already part of a chimera on its left\n"));
448 return false;
449
450 } else if (Stage3_querystart(from) >= Stage3_querystart(to) &&
451 Stage3_queryend(from) <= Stage3_queryend(to)) {
452 debug3(printf("false, because from %d..%d is subsumed by to %d..%d\n",
453 Stage3_querystart(from),Stage3_queryend(from),
454 Stage3_querystart(to),Stage3_queryend(to)));
455 return false;
456 } else if (Stage3_querystart(to) >= Stage3_querystart(from) &&
457 Stage3_queryend(to) <= Stage3_queryend(from)) {
458 debug3(printf("false, because to %d..%d is subsumed by from %d..%d\n",
459 Stage3_querystart(to),Stage3_queryend(to),
460 Stage3_querystart(from),Stage3_queryend(from)));
461 return false;
462 } else if (Stage3_queryend(from) - Stage3_querystart(to) <= chimera_slop &&
463 Stage3_querystart(to) - Stage3_queryend(from) <= chimera_slop) {
464 debug3(printf("true, because %d - %d <= %d and %d - %d <= %d\n",
465 Stage3_queryend(from),Stage3_querystart(to),chimera_slop,
466 Stage3_querystart(to),Stage3_queryend(from),chimera_slop));
467 return true;
468 } else {
469 debug3(printf(" %d and %d not within chimera_slop %d",
470 Stage3_queryend(from) - Stage3_querystart(to),Stage3_querystart(to) - Stage3_queryend(from),
471 chimera_slop));
472 debug3(printf("false\n"));
473 return false;
474 }
475 }
476
477
478 #define NEG_INFINITY -1000000
479 #define PRE_EXTENSION_SLOP 6
480
481 bool
Chimera_bestpath(int * five_score,int * three_score,int * chimerapos,int * chimeraequivpos,int * bestfrom,int * bestto,Stage3_T * stage3array_sub1,int npaths_sub1,Stage3_T * stage3array_sub2,int npaths_sub2,int queryntlength,int chimera_slop,bool * circularp,bool localp)482 Chimera_bestpath (int *five_score, int *three_score, int *chimerapos, int *chimeraequivpos, int *bestfrom, int *bestto,
483 Stage3_T *stage3array_sub1, int npaths_sub1, Stage3_T *stage3array_sub2, int npaths_sub2,
484 int queryntlength, int chimera_slop, bool *circularp, bool localp) {
485 int **matrix_sub1, **matrix_sub2, *from, *to, *bestscoreatpos, i, j, pos, score,
486 bestscore = NEG_INFINITY;
487 bool **gapp_sub1, **gapp_sub2;
488 bool foundp = false;
489
490 debug4(printf("Chimera_bestpath called\n"));
491
492 from = (int *) CALLOC(queryntlength,sizeof(int));
493 to = (int *) CALLOC(queryntlength,sizeof(int));
494 bestscoreatpos = (int *) CALLOC(queryntlength,sizeof(int));
495
496 matrix_sub1 = (int **) CALLOC(npaths_sub1,sizeof(int *));
497 gapp_sub1 = (bool **) CALLOC(npaths_sub1,sizeof(bool *));
498 debug4(printf("sub1:"));
499 for (i = 0; i < npaths_sub1; i++) {
500 debug4(printf(" %p",stage3array_sub1[i]));
501 matrix_sub1[i] = (int *) CALLOC(queryntlength,sizeof(int));
502 gapp_sub1[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
503 debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub1[i]),Stage3_npairs(stage3array_sub1[i]),true));
504 /* Allow pre_extension_slop, in case the parts need extensions to merge */
505 Pair_pathscores(gapp_sub1[i],matrix_sub1[i],Stage3_pairarray(stage3array_sub1[i]),
506 Stage3_npairs(stage3array_sub1[i]),Stage3_cdna_direction(stage3array_sub1[i]),
507 queryntlength,FIVE,PRE_EXTENSION_SLOP);
508 }
509 debug4(printf("\n"));
510
511 debug4(printf("sub2:"));
512 matrix_sub2 = (int **) CALLOC(npaths_sub2,sizeof(int *));
513 gapp_sub2 = (bool **) CALLOC(npaths_sub2,sizeof(bool *));
514 for (i = 0; i < npaths_sub2; i++) {
515 debug4(printf(" %p",stage3array_sub2[i]));
516 matrix_sub2[i] = (int *) CALLOC(queryntlength,sizeof(int));
517 gapp_sub2[i] = (bool *) CALLOC(queryntlength,sizeof(bool));
518 debug4(Pair_dump_array(Stage3_pairarray(stage3array_sub2[i]),Stage3_npairs(stage3array_sub2[i]),true));
519 /* Allow pre_extension_slop, in case the parts need extensions to merge */
520 Pair_pathscores(gapp_sub2[i],matrix_sub2[i],Stage3_pairarray(stage3array_sub2[i]),
521 Stage3_npairs(stage3array_sub2[i]),Stage3_cdna_direction(stage3array_sub2[i]),
522 queryntlength,THREE,PRE_EXTENSION_SLOP);
523 }
524 debug4(printf("\n"));
525
526 for (pos = 0; pos < queryntlength; pos++) {
527 bestscoreatpos[pos] = NEG_INFINITY;
528 }
529 debug4(printf("npaths_sub1 = %d, npaths_sub2 = %d\n",npaths_sub1,npaths_sub2));
530 for (i = 0; i < npaths_sub1; i++) {
531 if (circularp != NULL && circularp[Stage3_chrnum(stage3array_sub1[i])] == true) {
532 /* Do not make chimeras to circular chromosomes */
533 } else {
534 for (j = 0; j < npaths_sub2; j++) {
535 if (circularp != NULL && circularp[Stage3_chrnum(stage3array_sub2[j])] == true) {
536 /* Do not make chimeras to circular chromosomes */
537 } else if (stage3array_sub1[i] == stage3array_sub2[j]) {
538 /* Same stage3 object, so not joinable */
539 } else if (localp == true && Chimera_local_join_p(stage3array_sub1[i],stage3array_sub2[j],chimera_slop) == false) {
540 /* Not joinable */
541 } else {
542 for (pos = 0; pos < queryntlength - 1; pos++) {
543 debug4(printf("pos %d, gapp %d and %d\n",pos,gapp_sub1[i][pos],gapp_sub2[j][pos]));
544 if (gapp_sub1[i][pos] == false && gapp_sub2[j][pos+1] == false) {
545 #if 0
546 score = matrix_sub2[j][queryntlength-1] - matrix_sub2[j][pos] + matrix_sub1[i][pos] /* - 0 */;
547 #else
548 /* For new Pair_pairscores computation */
549 score = matrix_sub1[i][pos] + matrix_sub2[j][pos];
550 #endif
551 debug4(printf("score %d\n",score));
552 if (score > bestscoreatpos[pos]) {
553 bestscoreatpos[pos] = score;
554 from[pos] = i;
555 to[pos] = j;
556 }
557 }
558 }
559 }
560 }
561 }
562 }
563
564 for (pos = 0; pos < queryntlength - 1; pos++) {
565 if (bestscoreatpos[pos] > bestscore) {
566 bestscore = bestscoreatpos[pos];
567 *chimerapos = *chimeraequivpos = pos;
568 *bestfrom = from[pos];
569 *bestto = to[pos];
570 foundp = true;
571 } else if (bestscoreatpos[pos] == bestscore) {
572 *chimeraequivpos = pos;
573 }
574 }
575
576 if (foundp == true) {
577 *five_score = matrix_sub1[*bestfrom][*chimerapos] /* - 0 */;
578 #if 0
579 *three_score = matrix_sub2[*bestto][queryntlength-1] - matrix_sub2[*bestto][*chimerapos];
580 #else
581 *three_score = matrix_sub2[*bestto][*chimerapos];
582 #endif
583
584 debug4(
585 for (pos = 0; pos < queryntlength - 1; pos++) {
586 printf("%d:",pos);
587 for (i = 0; i < npaths_sub1; i++) {
588 printf("\t%d",matrix_sub1[i][pos]);
589 if (gapp_sub1[i][pos] == true) {
590 printf("X");
591 }
592 }
593 printf("\t|");
594 for (i = 0; i < npaths_sub2; i++) {
595 printf("\t%d",matrix_sub2[i][pos]);
596 if (gapp_sub2[i][pos] == true) {
597 printf("X");
598 }
599 }
600 printf("\t||");
601 printf("%d (%d->%d)",bestscoreatpos[pos],from[pos],to[pos]);
602 if (pos >= *chimerapos && pos <= *chimeraequivpos) {
603 printf(" ** ");
604 }
605 printf("\n");
606 }
607 printf("From path %d to path %d at pos %d..%d. 5 score = %d, 3 score = %d\n",
608 *bestfrom,*bestto,*chimerapos,*chimeraequivpos,*five_score,*three_score);
609 fflush(stdout);
610 );
611 }
612
613 for (i = 0; i < npaths_sub2; i++) {
614 FREE(gapp_sub2[i]);
615 FREE(matrix_sub2[i]);
616 }
617 FREE(gapp_sub2);
618 FREE(matrix_sub2);
619
620 for (i = 0; i < npaths_sub1; i++) {
621 FREE(gapp_sub1[i]);
622 FREE(matrix_sub1[i]);
623 }
624 FREE(gapp_sub1);
625 FREE(matrix_sub1);
626
627 FREE(bestscoreatpos);
628 FREE(to);
629 FREE(from);
630
631 debug4(printf("Chimera_bestpath returning %d\n",foundp));
632 return foundp;
633 }
634
635 static char *complCode = COMPLEMENT_UC;
636
637 /* Modeled after Chimera_bestpath */
638 /* Called if Chimera_find_exonexon fails */
639 int
Chimera_find_breakpoint(int * chimeraequivpos,int * rangelow,int * rangehigh,char * donor1,char * donor2,char * acceptor2,char * acceptor1,Stage3_T left_part,Stage3_T right_part,int queryntlength,Genome_T genome,Chrpos_T left_chrlength,Chrpos_T right_chrlength)640 Chimera_find_breakpoint (int *chimeraequivpos, int *rangelow, int *rangehigh,
641 char *donor1, char *donor2, char *acceptor2, char *acceptor1,
642 Stage3_T left_part, Stage3_T right_part, int queryntlength, Genome_T genome,
643 Chrpos_T left_chrlength, Chrpos_T right_chrlength) {
644 int chimerapos = 0, breakpoint;
645 int *matrix_sub1, *matrix_sub2, pos, score, bestscore, secondbest;
646 bool *gapp_sub1, *gapp_sub2;
647 Univcoord_T left;
648
649 if (Stage3_npairs(left_part) == 0 || Stage3_npairs(right_part) == 0) {
650 return -1;
651 }
652
653 /* Don't allow pre_extension_slop here, because the ends have already been extended */
654 matrix_sub1 = (int *) CALLOC(queryntlength,sizeof(int));
655 gapp_sub1 = (bool *) CALLOC(queryntlength,sizeof(bool));
656 debug4(Pair_dump_array(Stage3_pairarray(left_part),Stage3_npairs(left_part),true));
657 Pair_pathscores(gapp_sub1,matrix_sub1,Stage3_pairarray(left_part),Stage3_npairs(left_part),
658 Stage3_cdna_direction(left_part),queryntlength,FIVE,/*pre_extension_slop*/0);
659
660 matrix_sub2 = (int *) CALLOC(queryntlength,sizeof(int));
661 gapp_sub2 = (bool *) CALLOC(queryntlength,sizeof(bool));
662 debug4(Pair_dump_array(Stage3_pairarray(right_part),Stage3_npairs(right_part),true));
663 Pair_pathscores(gapp_sub2,matrix_sub2,Stage3_pairarray(right_part),Stage3_npairs(right_part),
664 Stage3_cdna_direction(right_part),queryntlength,THREE,/*pre_extension_slop*/0);
665
666
667 bestscore = secondbest = -100000;
668 for (pos = 0; pos < queryntlength - 1; pos++) {
669 debug(
670 printf("%d:",pos);
671 printf("\t%d",matrix_sub1[pos]);
672 if (gapp_sub1[pos] == true) {
673 printf("X");
674 }
675 printf("\t|");
676 printf("\t%d",matrix_sub2[pos]);
677 if (gapp_sub2[pos] == true) {
678 printf("X");
679 }
680 printf("\t||");
681 );
682
683 if (gapp_sub1[pos] == false) {
684 if (gapp_sub2[pos+1] == false) {
685 /* Check for the same stage3 object on both lists */
686 #if 0
687 /* ? Old formula for use before Pair_pathscores had cdnaend argument */
688 score = matrix_sub2[queryntlength-1] - matrix_sub2[pos] + matrix_sub1[pos] /* - 0 */;
689 #else
690 score = matrix_sub1[pos] + matrix_sub2[pos+1];
691 #endif
692
693 if (score > bestscore) {
694 secondbest = bestscore;
695 bestscore = score;
696 chimerapos = *chimeraequivpos = pos;
697 } else if (score == bestscore) {
698 *chimeraequivpos = pos;
699 } else if (score > secondbest) {
700 secondbest = score;
701 }
702
703 debug(
704 printf("%d = %d + %d",score,matrix_sub1[pos],matrix_sub2[pos+1]);
705 if (pos >= chimerapos && pos <= *chimeraequivpos) {
706 printf(" ** chimerapos %d, chimeraequivpos %d",chimerapos,*chimeraequivpos);
707 }
708 );
709
710 }
711 }
712 debug(printf("\n"));
713 }
714 debug(printf("chimerapos %d, chimeraequivpos %d\n",chimerapos,*chimeraequivpos));
715
716
717 /* Use secondbest to find a range for exon-exon searching */
718 *rangelow = *rangehigh = 0;
719 for (pos = 0; pos < queryntlength - 1; pos++) {
720 if (gapp_sub1[pos] == false) {
721 if (gapp_sub2[pos+1] == false) {
722 /* Check for the same stage3 object on both lists */
723 #if 0
724 /* ? Old formula for use before Pair_pathscores had cdnaend argument */
725 score = matrix_sub2[queryntlength-1] - matrix_sub2[pos] + matrix_sub1[pos] /* - 0 */;
726 #else
727 score = matrix_sub1[pos] + matrix_sub2[pos+1];
728 #endif
729
730 if (score == secondbest) {
731 if (*rangelow == 0) {
732 *rangelow = *rangehigh = pos;
733 } else {
734 *rangehigh = pos;
735 }
736 }
737 }
738 }
739 }
740 if (*rangelow > chimerapos) {
741 *rangelow = chimerapos;
742 }
743 if (*rangehigh < *chimeraequivpos) {
744 *rangehigh = *chimeraequivpos;
745 }
746 debug(printf("For secondbest score of %d: rangelow %d, rangehigh %d\n",secondbest,*rangelow,*rangehigh));
747
748
749 #if 0
750 *five_score = matrix_sub1[*chimerapos] /* - 0 */;
751 *three_score = matrix_sub2[queryntlength-1] - matrix_sub2[*chimerapos];
752 #endif
753
754 FREE(gapp_sub2);
755 FREE(matrix_sub2);
756
757 FREE(gapp_sub1);
758 FREE(matrix_sub1);
759
760 if (chimerapos == 0) {
761 /* Never found a breakpoint */
762 return -1;
763 } else {
764 breakpoint = (chimerapos + (*chimeraequivpos))/2;
765
766 if (Stage3_watsonp(left_part) == true) {
767 if ((left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false)) >= left_chrlength - 2) {
768 debug(printf("left %u >= left_chrlength %u - 2, so not finding donor dinucleotides\n",left,left_chrlength));
769 *donor1 = *donor2 = 'N';
770 } else {
771 debug(printf("left %u < left_chrlength %u - 2, so okay\n",left,left_chrlength));
772 *donor1 = Genome_get_char(genome,left+1);
773 *donor2 = Genome_get_char(genome,left+2);
774 }
775 } else {
776 if ((left = Stage3_genomicpos(left_part,breakpoint,/*headp*/false)) < 2) {
777 debug(printf("left %u < 2, so not finding donor dinucleotides\n",left));
778 *donor1 = *donor2 = 'N';
779 } else {
780 debug(printf("left %u >= 2, so okay\n",left));
781 *donor1 = complCode[(int) Genome_get_char(genome,left-1)];
782 *donor2 = complCode[(int) Genome_get_char(genome,left-2)];
783 }
784 }
785
786 if (Stage3_watsonp(right_part) == true) {
787 if ((left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true)) < 2) {
788 debug(printf("left %u < 2, so not finding acceptor dinucleotides\n",left));
789 *acceptor1 = *acceptor2 = 'N';
790 } else {
791 debug(printf("left %u >= 2, so okay\n",left));
792 *acceptor2 = Genome_get_char(genome,left-2);
793 *acceptor1 = Genome_get_char(genome,left-1);
794 }
795 } else {
796 if ((left = Stage3_genomicpos(right_part,breakpoint+1,/*headp*/true)) >= right_chrlength - 2) {
797 debug(printf("left %u >= right_chrlength %u - 2, so not finding acceptor dinucleotides\n",left,right_chrlength));
798 *acceptor1 = *acceptor2 = 'N';
799 } else {
800 debug(printf("left %u <right_chrlength %u - 2, so okay\n",left,right_chrlength));
801 *acceptor2 = complCode[(int) Genome_get_char(genome,left+2)];
802 *acceptor1 = complCode[(int) Genome_get_char(genome,left+1)];
803 }
804 }
805
806 return chimerapos;
807 }
808 }
809
810
811 static double
find_exonexon_fwd(int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)812 find_exonexon_fwd (int *exonexonpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
813 char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
814 Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
815 Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
816 Sequence_T donor_genomicseg, acceptor_genomicseg, donor_genomicalt, acceptor_genomicalt;
817 char *donor_ptr, *acceptor_ptr, *donor_altptr, *acceptor_altptr;
818 int i, j;
819 Univcoord_T left;
820 int donor_length, acceptor_length;
821 bool revcomp;
822 char left1, left2, right2, right1;
823 char left1_alt, left2_alt, right2_alt, right1_alt;
824 int introntype;
825 double donor_prob_1, acceptor_prob_1, donor_altprob_1, acceptor_altprob_1, bestproduct = 0.0, product;
826
827 *exonexonpos = -1;
828
829 donor_length = breakpoint_end - breakpoint_start + DONOR_MODEL_LEFT_MARGIN + DONOR_MODEL_RIGHT_MARGIN;
830 left = Stage3_genomicpos(left_part,breakpoint_start,/*headp*/false);
831 if ((*donor_watsonp = Stage3_watsonp(left_part)) == true) {
832 left -= DONOR_MODEL_LEFT_MARGIN;
833 revcomp = false;
834 } else {
835 left += DONOR_MODEL_LEFT_MARGIN;
836 left -= donor_length;
837 revcomp = true;
838 }
839
840 debug2(printf("Getting donor at left %u\n",left));
841 donor_genomicseg = Genome_get_segment(genome,left,donor_length+1,chromosome_iit,revcomp);
842 donor_ptr = Sequence_fullpointer(donor_genomicseg);
843 donor_genomicalt = Genome_get_segment_alt(genomealt,left,donor_length+1,chromosome_iit,revcomp);
844 donor_altptr = Sequence_fullpointer(donor_genomicalt);
845
846 debug2(
847 for (i = 0; i < ACCEPTOR_MODEL_LEFT_MARGIN - DONOR_MODEL_LEFT_MARGIN; i++) {
848 printf(" ");
849 }
850 Sequence_stdout(donor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
851 );
852
853 acceptor_length = breakpoint_end - breakpoint_start + ACCEPTOR_MODEL_LEFT_MARGIN + ACCEPTOR_MODEL_RIGHT_MARGIN;
854 left = Stage3_genomicpos(right_part,breakpoint_end+1,/*headp*/true);
855 if ((*acceptor_watsonp = Stage3_watsonp(right_part)) == true) {
856 left += ACCEPTOR_MODEL_RIGHT_MARGIN;
857 left -= acceptor_length;
858 revcomp = false;
859 } else {
860 left -= ACCEPTOR_MODEL_RIGHT_MARGIN;
861 revcomp = true;
862 }
863
864 debug2(printf("Getting acceptor at left %u\n",left));
865 acceptor_genomicseg = Genome_get_segment(genome,left,acceptor_length+1,chromosome_iit,revcomp);
866 acceptor_ptr = Sequence_fullpointer(acceptor_genomicseg);
867 acceptor_genomicalt = Genome_get_segment(genomealt,left,acceptor_length+1,chromosome_iit,revcomp);
868 acceptor_altptr = Sequence_fullpointer(acceptor_genomicalt);
869 debug2(
870 for (i = 0; i < DONOR_MODEL_LEFT_MARGIN - ACCEPTOR_MODEL_LEFT_MARGIN; i++) {
871 printf(" ");
872 }
873 Sequence_stdout(acceptor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
874 );
875
876 *donor_prob = 0.0;
877 *acceptor_prob = 0.0;
878 for (i = DONOR_MODEL_LEFT_MARGIN, j = ACCEPTOR_MODEL_LEFT_MARGIN;
879 i <= donor_length - DONOR_MODEL_RIGHT_MARGIN &&
880 j <= acceptor_length - ACCEPTOR_MODEL_RIGHT_MARGIN;
881 i++, j++) {
882
883 left1 = donor_ptr[i+1];
884 left2 = donor_ptr[i+2];
885 right2 = acceptor_ptr[j-2];
886 right1 = acceptor_ptr[j-1];
887
888 left1_alt = donor_altptr[i+1];
889 left2_alt = donor_altptr[i+2];
890 right2_alt = acceptor_altptr[j-2];
891 right1_alt = acceptor_altptr[j-1];
892
893 debug2(printf(" Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
894 introntype = Intron_type(left1,left2,right2,right1,
895 left1_alt,left2_alt,right2_alt,right1_alt,
896 /*cdna_direction*/+1);
897 debug2(printf(" Introntype is %s\n",Intron_type_string(introntype)));
898
899 donor_prob_1 = Maxent_donor_prob(&(donor_ptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
900 acceptor_prob_1 = Maxent_acceptor_prob(&(acceptor_ptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
901 donor_altprob_1 = Maxent_donor_prob(&(donor_altptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
902 acceptor_altprob_1 = Maxent_acceptor_prob(&(acceptor_altptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
903
904 debug2(printf("%d %c%c %c%c %.2f %.2f\n",
905 breakpoint_start - DONOR_MODEL_LEFT_MARGIN + i,
906 donor_ptr[i+1],donor_ptr[i+2],acceptor_ptr[j-2],acceptor_ptr[j-1],
907 donor_prob_1,acceptor_prob_1));
908
909 if (donor_prob_1 < 0.50 && acceptor_prob_1 < 0.50 && donor_altprob_1 < 0.50 && acceptor_altprob_1 < 0.50) {
910 /* Skip */
911 } else if (introntype != NONINTRON || donor_prob_1 > 0.90 || acceptor_prob_1 > 0.90 || donor_altprob_1 > 0.90 || acceptor_altprob_1 > 0.90) {
912 if ((product = donor_prob_1*acceptor_prob_1) > bestproduct) {
913 bestproduct = product;
914 *donor1 = donor_ptr[i+1];
915 *donor2 = donor_ptr[i+2];
916 *acceptor2 = acceptor_ptr[j-2];
917 *acceptor1 = acceptor_ptr[j-1];
918 if (donor_prob_1 >= donor_altprob_1) {
919 *donor_prob = donor_prob_1;
920 } else {
921 *donor_prob = donor_altprob_1;
922 }
923 if (acceptor_prob_1 >= acceptor_altprob_1) {
924 *acceptor_prob = acceptor_prob_1;
925 } else {
926 *acceptor_prob = acceptor_altprob_1;
927 }
928 *exonexonpos = breakpoint_start - DONOR_MODEL_LEFT_MARGIN + i;
929
930 switch (introntype) {
931 case GTAG_FWD: *comp = FWD_CANONICAL_INTRON_COMP; break;
932 case GCAG_FWD: *comp = FWD_GCAG_INTRON_COMP; break;
933 case ATAC_FWD: *comp = FWD_ATAC_INTRON_COMP; break;
934 default: *comp = NONINTRON_COMP; break;
935 }
936
937 }
938 }
939 }
940
941 Sequence_free(&acceptor_genomicalt);
942 Sequence_free(&donor_genomicalt);
943 Sequence_free(&acceptor_genomicseg);
944 Sequence_free(&donor_genomicseg);
945
946 return bestproduct;
947 }
948
949 static double
find_exonexon_rev(int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)950 find_exonexon_rev (int *exonexonpos, char *donor1, char *donor2, char *acceptor2, char *acceptor1,
951 char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
952 Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
953 Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
954 Sequence_T donor_genomicseg, acceptor_genomicseg, donor_genomicalt, acceptor_genomicalt;
955 char *donor_ptr, *acceptor_ptr, *donor_altptr, *acceptor_altptr;
956 int i, j;
957 Univcoord_T left;
958 int donor_length, acceptor_length;
959 bool revcomp;
960 char left1, left2, right2, right1;
961 char left1_alt, left2_alt, right2_alt, right1_alt;
962 int introntype;
963 double donor_prob_1, acceptor_prob_1, donor_altprob_1, acceptor_altprob_1, bestproduct = 0.0, product;
964
965 *exonexonpos = -1;
966
967 donor_length = breakpoint_end - breakpoint_start + DONOR_MODEL_LEFT_MARGIN + DONOR_MODEL_RIGHT_MARGIN;
968 left = Stage3_genomicpos(right_part,breakpoint_end+1,/*headp*/true);
969 if ((*donor_watsonp = Stage3_watsonp(right_part)) == true) {
970 left += DONOR_MODEL_LEFT_MARGIN;
971 left -= donor_length;
972 revcomp = true;
973 } else {
974 left -= DONOR_MODEL_LEFT_MARGIN;
975 revcomp = false;
976 }
977
978 debug2(printf("Getting donor at left %u\n",left));
979 donor_genomicseg = Genome_get_segment(genome,left,donor_length+1,chromosome_iit,revcomp);
980 donor_ptr = Sequence_fullpointer(donor_genomicseg);
981 donor_genomicalt = Genome_get_segment(genomealt,left,donor_length+1,chromosome_iit,revcomp);
982 donor_altptr = Sequence_fullpointer(donor_genomicalt);
983 debug2(
984 for (i = 0; i < ACCEPTOR_MODEL_LEFT_MARGIN - DONOR_MODEL_LEFT_MARGIN; i++) {
985 printf(" ");
986 }
987 Sequence_stdout(donor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
988 );
989
990 acceptor_length = breakpoint_end - breakpoint_start + ACCEPTOR_MODEL_LEFT_MARGIN + ACCEPTOR_MODEL_RIGHT_MARGIN;
991 left = Stage3_genomicpos(left_part,breakpoint_start,/*headp*/false);
992 if ((*acceptor_watsonp = Stage3_watsonp(left_part)) == true) {
993 left -= ACCEPTOR_MODEL_RIGHT_MARGIN;
994 revcomp = true;
995 } else {
996 left += ACCEPTOR_MODEL_RIGHT_MARGIN;
997 left -= acceptor_length;
998 revcomp = false;
999 }
1000
1001 debug2(printf("Getting acceptor at left %u\n",left));
1002 acceptor_genomicseg = Genome_get_segment(genome,left,acceptor_length+1,chromosome_iit,revcomp);
1003 acceptor_ptr = Sequence_fullpointer(acceptor_genomicseg);
1004 acceptor_genomicalt = Genome_get_segment(genomealt,left,acceptor_length+1,chromosome_iit,revcomp);
1005 acceptor_altptr = Sequence_fullpointer(acceptor_genomicalt);
1006 debug2(
1007 for (i = 0; i < DONOR_MODEL_LEFT_MARGIN - ACCEPTOR_MODEL_LEFT_MARGIN; i++) {
1008 printf(" ");
1009 }
1010 Sequence_stdout(acceptor_genomicseg,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/false);
1011 );
1012
1013 *donor_prob = 0.0;
1014 *acceptor_prob = 0.0;
1015 for (i = DONOR_MODEL_LEFT_MARGIN, j = ACCEPTOR_MODEL_LEFT_MARGIN;
1016 i <= donor_length - DONOR_MODEL_RIGHT_MARGIN &&
1017 j <= acceptor_length - ACCEPTOR_MODEL_RIGHT_MARGIN;
1018 i++, j++) {
1019
1020 left1 = donor_ptr[i+1];
1021 left2 = donor_ptr[i+2];
1022 right2 = acceptor_ptr[j-2];
1023 right1 = acceptor_ptr[j-1];
1024
1025 left1_alt = donor_altptr[i+1];
1026 left2_alt = donor_altptr[i+2];
1027 right2_alt = acceptor_altptr[j-2];
1028 right1_alt = acceptor_altptr[j-1];
1029
1030 /* Use cdna_direction == +1, because revcomp already applied */
1031 debug2(printf(" Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
1032 introntype = Intron_type(left1,left2,right2,right1,
1033 left1_alt,left2_alt,right2_alt,right1_alt,
1034 /*cdna_direction*/+1);
1035 debug2(printf(" Introntype is %s\n",Intron_type_string(introntype)));
1036
1037 donor_prob_1 = Maxent_donor_prob(&(donor_ptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
1038 acceptor_prob_1 = Maxent_acceptor_prob(&(acceptor_ptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
1039 donor_altprob_1 = Maxent_donor_prob(&(donor_altptr[i+1-DONOR_MODEL_LEFT_MARGIN]));
1040 acceptor_altprob_1 = Maxent_acceptor_prob(&(acceptor_altptr[j-ACCEPTOR_MODEL_LEFT_MARGIN]));
1041
1042 debug2(printf("%d %c%c %c%c %.2f %.2f\n",
1043 breakpoint_end + DONOR_MODEL_LEFT_MARGIN - i,
1044 donor_ptr[i+1],donor_ptr[i+2],acceptor_ptr[j-2],acceptor_ptr[j-1],
1045 donor_prob_1,acceptor_prob_1));
1046
1047 if (donor_prob_1 < 0.50 && acceptor_prob_1 < 0.50 && donor_altprob_1 < 0.50 && acceptor_altprob_1 < 0.50) {
1048 /* Skip */
1049 } else if (introntype != NONINTRON || donor_prob_1 > 0.90 || acceptor_prob_1 > 0.90 || donor_altprob_1 > 0.90 || acceptor_altprob_1 > 0.90) {
1050 if ((product = donor_prob_1*acceptor_prob_1) > bestproduct) {
1051 bestproduct = product;
1052 *donor1 = donor_ptr[i+1];
1053 *donor2 = donor_ptr[i+2];
1054 *acceptor2 = acceptor_ptr[j-2];
1055 *acceptor1 = acceptor_ptr[j-1];
1056 if (donor_prob_1 >= donor_altprob_1) {
1057 *donor_prob = donor_prob_1;
1058 } else {
1059 *donor_prob = donor_altprob_1;
1060 }
1061 if (acceptor_prob_1 >= acceptor_altprob_1) {
1062 *acceptor_prob = acceptor_prob_1;
1063 } else {
1064 *acceptor_prob = acceptor_altprob_1;
1065 }
1066 *exonexonpos = breakpoint_end + DONOR_MODEL_LEFT_MARGIN - i;
1067
1068 /* Have to look for forward intron types, but return the revcomp comp */
1069 switch (introntype) {
1070 case GTAG_FWD: *comp = REV_CANONICAL_INTRON_COMP; break;
1071 case GCAG_FWD: *comp = REV_GCAG_INTRON_COMP; break;
1072 case ATAC_FWD: *comp = REV_ATAC_INTRON_COMP; break;
1073 default: *comp = NONINTRON_COMP; break;
1074 }
1075 }
1076 }
1077 }
1078
1079 Sequence_free(&acceptor_genomicalt);
1080 Sequence_free(&donor_genomicalt);
1081 Sequence_free(&acceptor_genomicseg);
1082 Sequence_free(&donor_genomicseg);
1083
1084 return bestproduct;
1085 }
1086
1087
1088 int
Chimera_find_exonexon(int * found_cdna_direction,int * try_cdna_direction,char * donor1,char * donor2,char * acceptor2,char * acceptor1,char * comp,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T left_part,Stage3_T right_part,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,int breakpoint_start,int breakpoint_end)1089 Chimera_find_exonexon (int *found_cdna_direction, int *try_cdna_direction,
1090 char *donor1, char *donor2, char *acceptor2, char *acceptor1,
1091 char *comp, bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
1092 Stage3_T left_part, Stage3_T right_part, Genome_T genome, Genome_T genomealt,
1093 Univ_IIT_T chromosome_iit, int breakpoint_start, int breakpoint_end) {
1094 int exonexonpos_fwd, exonexonpos_rev;
1095 char donor1_fwd, donor2_fwd, acceptor2_fwd, acceptor1_fwd,
1096 donor1_rev, donor2_rev, acceptor2_rev, acceptor1_rev;
1097 char comp_fwd, comp_rev;
1098 bool donor_watsonp_fwd, acceptor_watsonp_fwd, donor_watsonp_rev, acceptor_watsonp_rev;
1099 double bestproduct_fwd, bestproduct_rev, donor_prob_fwd, donor_prob_rev, acceptor_prob_fwd, acceptor_prob_rev;
1100 int left_cdna_direction, right_cdna_direction;
1101
1102 debug2(printf("Starting Chimera_find_exonexon with breakpoint %d..%d\n",breakpoint_start,breakpoint_end));
1103 debug2(printf("left part covers query %d to %d\n",Stage3_querystart(left_part),Stage3_queryend(left_part)));
1104 debug2(printf("right part covers query %d to %d\n",Stage3_querystart(right_part),Stage3_queryend(right_part)));
1105
1106 #if 0
1107 if (Stage3_queryend(left_part) < Stage3_querystart(right_part)) {
1108 breakpoint_start = Stage3_queryend(left_part);
1109 breakpoint_end = Stage3_querystart(right_part);
1110 } else {
1111 breakpoint_start = Stage3_querystart(right_part);
1112 breakpoint_end = Stage3_queryend(left_part);
1113 }
1114 breakpoint_start -= 10;
1115 if (breakpoint_start < 0) {
1116 breakpoint_start = 0;
1117 }
1118 breakpoint_end += 10;
1119 if (breakpoint_end > querylength-1) {
1120 breakpoint_end = querylength-1;
1121 }
1122 #endif
1123
1124 if (breakpoint_end < breakpoint_start) {
1125 debug2(printf("Breakpoints do not make sense, so not computing\n"));
1126 *found_cdna_direction = *try_cdna_direction = 0;
1127 return -1;
1128 }
1129
1130 debug2(printf("Starting search for exon-exon boundary at breakpoint_start %d to breakpoint_end %d\n",
1131 breakpoint_start,breakpoint_end));
1132
1133 left_cdna_direction = Stage3_cdna_direction(left_part);
1134 right_cdna_direction = Stage3_cdna_direction(right_part);
1135
1136 if (left_cdna_direction == 0 && right_cdna_direction == 0) {
1137 *try_cdna_direction = 0;
1138 } else if (left_cdna_direction >= 0 && right_cdna_direction >= 0) {
1139 *try_cdna_direction = +1;
1140 } else if (left_cdna_direction <= 0 && right_cdna_direction <= 0) {
1141 *try_cdna_direction = -1;
1142 } else {
1143 *try_cdna_direction = 0;
1144 }
1145
1146 if (*try_cdna_direction == +1) {
1147 *found_cdna_direction = +1;
1148 bestproduct_fwd = find_exonexon_fwd(&exonexonpos_fwd,&donor1_fwd,&donor2_fwd,&acceptor2_fwd,&acceptor1_fwd,
1149 &comp_fwd,&donor_watsonp_fwd,&acceptor_watsonp_fwd,&donor_prob_fwd,&acceptor_prob_fwd,
1150 left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1151 bestproduct_rev = 0.0;
1152
1153 } else if (*try_cdna_direction == -1) {
1154 *found_cdna_direction = -1;
1155 bestproduct_rev = find_exonexon_rev(&exonexonpos_rev,&donor1_rev,&donor2_rev,&acceptor2_rev,&acceptor1_rev,
1156 &comp_rev,&donor_watsonp_rev,&acceptor_watsonp_rev,&donor_prob_rev,&acceptor_prob_rev,
1157 left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1158 bestproduct_fwd = 0.0;
1159
1160 } else {
1161 bestproduct_fwd = find_exonexon_fwd(&exonexonpos_fwd,&donor1_fwd,&donor2_fwd,&acceptor2_fwd,&acceptor1_fwd,
1162 &comp_fwd,&donor_watsonp_fwd,&acceptor_watsonp_fwd,&donor_prob_fwd,&acceptor_prob_fwd,
1163 left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1164 bestproduct_rev = find_exonexon_rev(&exonexonpos_rev,&donor1_rev,&donor2_rev,&acceptor2_rev,&acceptor1_rev,
1165 &comp_rev,&donor_watsonp_rev,&acceptor_watsonp_rev,&donor_prob_rev,&acceptor_prob_rev,
1166 left_part,right_part,genome,genomealt,chromosome_iit,breakpoint_start,breakpoint_end);
1167 }
1168
1169 if (bestproduct_fwd == 0.0 && bestproduct_rev == 0.0) {
1170 *found_cdna_direction = 0;
1171 *donor1 = 'N';
1172 *donor2 = 'N';
1173 *acceptor2 = 'N';
1174 *acceptor1 = 'N';
1175 *comp = NONINTRON_COMP;
1176 *donor_watsonp = true;
1177 *acceptor_watsonp = true;
1178 *donor_prob = 0.0;
1179 *acceptor_prob = 0.0;
1180 return -1;
1181
1182 } else if (bestproduct_fwd >= bestproduct_rev) {
1183 *found_cdna_direction = +1;
1184 *donor1 = donor1_fwd;
1185 *donor2 = donor2_fwd;
1186 *acceptor2 = acceptor2_fwd;
1187 *acceptor1 = acceptor1_fwd;
1188 *comp = comp_fwd;
1189 *donor_watsonp = donor_watsonp_fwd;
1190 *acceptor_watsonp = acceptor_watsonp_fwd;
1191 *donor_prob = donor_prob_fwd;
1192 *acceptor_prob = acceptor_prob_fwd;
1193 return exonexonpos_fwd;
1194
1195 } else {
1196 *found_cdna_direction = -1;
1197 *donor1 = donor1_rev;
1198 *donor2 = donor2_rev;
1199 *acceptor2 = acceptor2_rev;
1200 *acceptor1 = acceptor1_rev;
1201 *comp = comp_rev;
1202 *donor_watsonp = donor_watsonp_rev;
1203 *acceptor_watsonp = acceptor_watsonp_rev;
1204 *donor_prob = donor_prob_rev;
1205 *acceptor_prob = acceptor_prob_rev;
1206 return exonexonpos_rev;
1207 }
1208 }
1209
1210
1211