1 static char rcsid[] = "$Id: smooth.c 218183 2019-01-17 13:03:20Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "smooth.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h> /* For pow(), log() */
10 #include "bool.h"
11 #include "mem.h"
12 #include "except.h"
13 #include "comp.h"
14 #include "pair.h"
15 #include "pairdef.h"
16 #include "intlist.h"
17 #include "doublelist.h"
18 #include "pbinom.h"
19
20
21 /* Will examine internal exons smaller than this prior to single gaps */
22 #define ZERONETGAP 9
23 #define SHORTEXONLEN_NETGAP 15
24
25 /* Will delete/mark internal exons smaller than this for dual genome gap */
26 #define DELETE_THRESHOLD 0.1
27 #define MARK_THRESHOLD 1e-7
28
29 /* Will delete internal exons smaller than this at ends */
30 #define SHORTEXONLEN_END 10
31
32
33 #define LOOSE_EXON_PVALUE 0.05
34 #define STRICT_EXON_PVALUE 1e-4
35
36
37 #ifdef GSNAP
38 /* Allow more intron predictions in ends of short reads */
39 #define SHORTEXONPROB_END 0.10
40 #else
41 #define SHORTEXONPROB_END 0.05
42 #endif
43
44 #define BIGGAP 9
45
46
47 #ifdef DEBUG
48 #define debug(x) x
49 #else
50 #define debug(x)
51 #endif
52
53
54 typedef enum {KEEP, DELETE, MARK} Exonstatus_T;
55
56
57 static bool
big_gap_p(Pair_T pair,bool bysizep)58 big_gap_p (Pair_T pair, bool bysizep) {
59
60 #if 0
61 /* No longer following the logic below */
62 /* When bysizep is true, single gaps should have been solved already
63 (in pass 2), so any gap remaining must be a poor one (must have
64 been removed because of a negative score), and we should consider
65 all gaps, regardless of size. When bysizep is false, we must be
66 doing initial smoothing, before any single gaps have been solved,
67 so we want to consider only big gaps. This change was motivated
68 by alignment of mouse HER2 against human genome, where a 58/58
69 section was deleted in stage 3, but subsequent smoothing failed
70 to recognize the short exon between this section and the next
71 intron. */
72 #endif
73
74 if (bysizep == true) {
75 return true;
76 } else if (pair->genomejump > pair->queryjump) {
77 return (pair->genomejump - pair->queryjump) > BIGGAP ? true : false;
78 } else {
79 return false;
80 }
81 }
82
83
84 static int *
get_exonlengths(int ** exonmatches,int ** exonprotects,int * nexons,List_T pairs,bool bysizep)85 get_exonlengths (int **exonmatches, int **exonprotects, int *nexons, List_T pairs, bool bysizep) {
86 int *exonlengths;
87 Intlist_T list = NULL, matchlist = NULL, protectlist = NULL;
88 Pair_T firstpair, pair;
89 int querypos, nmatches = 0, nprotected = 0;
90 #ifdef DEBUG
91 int i;
92 #endif
93
94 firstpair = List_head(pairs);
95 querypos = firstpair->querypos;
96
97 while (pairs != NULL) {
98 pair = List_head(pairs);
99 if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
100 list = Intlist_push(list,querypos-firstpair->querypos+1);
101 matchlist = Intlist_push(matchlist,nmatches);
102 protectlist = Intlist_push(protectlist,nprotected);
103
104 pairs = List_next(pairs);
105 if (pairs != NULL) {
106 firstpair = List_head(pairs);
107 querypos = firstpair->querypos;
108 nmatches = nprotected = 0;
109 }
110 } else {
111 querypos = pair->querypos;
112 if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
113 nmatches++;
114 }
115 if (pair->protectedp == true) {
116 nprotected++;
117 }
118 pairs = List_next(pairs);
119 }
120 }
121
122 list = Intlist_push(list,querypos-firstpair->querypos+1);
123 list = Intlist_reverse(list);
124 matchlist = Intlist_push(matchlist,nmatches);
125 matchlist = Intlist_reverse(matchlist);
126 protectlist = Intlist_push(protectlist,nprotected);
127 protectlist = Intlist_reverse(protectlist);
128
129 exonlengths = Intlist_to_array(&(*nexons),list);
130 *exonmatches = Intlist_to_array(&(*nexons),matchlist);
131 *exonprotects = Intlist_to_array(&(*nexons),protectlist);
132 debug(
133 printf("%d exons: ",*nexons);
134 for (i = 0; i < *nexons; i++) {
135 printf("%d (%d matches),",exonlengths[i],(*exonmatches)[i]);
136 }
137 printf("\n");
138 );
139 Intlist_free(&list);
140 Intlist_free(&matchlist);
141 Intlist_free(&protectlist);
142
143 return exonlengths;
144 }
145
146
147 static void
get_exonmatches(int * total_matches,int * total_denominator,int ** exonmatches,int * nexons,List_T pairs)148 get_exonmatches (int *total_matches, int *total_denominator, int **exonmatches,
149 int *nexons, List_T pairs) {
150 Intlist_T matchlist = NULL;
151 Pair_T pair;
152 int nmatches = 0, nmismatches = 0;
153 #ifdef DEBUG
154 int i;
155 #endif
156
157 *total_matches = *total_denominator = 0;
158
159 /* firstpair = List_head(pairs); */
160 /* querypos = firstpair->querypos; */
161
162 while (pairs != NULL) {
163 pair = List_head(pairs);
164 if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
165 matchlist = Intlist_push(matchlist,nmatches);
166 /* denominatorlist = Intlist_push(denominatorlist,nmatches + nmismatches); */
167 *total_matches += nmatches;
168 *total_denominator += (nmatches + nmismatches);
169
170 pairs = List_next(pairs);
171 if (pairs != NULL) {
172 /* firstpair = List_head(pairs); */
173 /* querypos = firstpair->querypos; */
174 nmatches = nmismatches = 0;
175 }
176 } else {
177 /* querypos = pair->querypos; */
178 if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
179 nmatches++;
180 } else {
181 nmismatches++;
182 }
183 pairs = List_next(pairs);
184 }
185 }
186
187 matchlist = Intlist_push(matchlist,nmatches);
188 matchlist = Intlist_reverse(matchlist);
189 /* denominatorlist = Intlist_push(denominatorlist,nmatches + nmismatches); */
190 /* denominatorlist = Intlist_reverse(denominatorlist); */
191 *total_matches += nmatches;
192 *total_denominator += (nmatches + nmismatches);
193
194 *exonmatches = Intlist_to_array(&(*nexons),matchlist);
195 /* *exon_denominator = Intlist_to_array(&(*nexons),denominatorlist); */
196 debug(
197 printf("%d exons: ",*nexons);
198 for (i = 0; i < *nexons; i++) {
199 printf("%d matches, %d denominator,",(*exonmatches)[i],(*exon_denominator)[i]);
200 }
201 printf("\n");
202 );
203 /* Intlist_free(&denominatorlist); */
204 Intlist_free(&matchlist);
205
206 return;
207 }
208
209
210
211 static List_T
get_intron_neighborhoods(int ** intron_matches_left,int ** intron_denominator_left,int ** intron_matches_right,int ** intron_denominator_right,int nintrons,List_T pairs)212 get_intron_neighborhoods (int **intron_matches_left, int **intron_denominator_left,
213 int **intron_matches_right, int **intron_denominator_right,
214 int nintrons, List_T pairs) {
215 List_T path, p;
216 Pair_T pair;
217 int introni;
218 int i;
219
220 *intron_matches_left = (int *) CALLOC(nintrons,sizeof(int));
221 *intron_denominator_left = (int *) CALLOC(nintrons,sizeof(int));
222 *intron_matches_right = (int *) CALLOC(nintrons,sizeof(int));
223 *intron_denominator_right = (int *) CALLOC(nintrons,sizeof(int));
224
225 introni = -1;
226 for (p = pairs; p != NULL; p = List_next(p)) {
227 pair = List_head(p);
228 if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
229 introni += 1;
230 i = 0;
231
232 } else if (introni >= 0 && i < 25) {
233 if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
234 (*intron_matches_left)[introni] += 1;
235 }
236 (*intron_denominator_left)[introni] += 1;
237 i++;
238
239 } else {
240 i++;
241 }
242 }
243
244 path = List_reverse(pairs);
245 introni = nintrons;
246 for (p = path; p != NULL; p = List_next(p)) {
247 pair = List_head(p);
248 if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
249 introni -= 1;
250 i = 0;
251
252 } else if (introni < nintrons && i < 25) {
253 if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
254 (*intron_matches_right)[introni] += 1;
255 }
256 (*intron_denominator_right)[introni] += 1;
257 i++;
258
259 } else {
260 i++;
261 }
262 }
263
264 pairs = List_reverse(path);
265 return pairs;
266 }
267
268
269 static int *
get_intronlengths(int * nintrons,List_T pairs,bool bysizep)270 get_intronlengths (int *nintrons, List_T pairs, bool bysizep) {
271 int *intronlengths, length;
272 Intlist_T list = NULL;
273 Pair_T pair;
274 #ifdef DEBUG
275 int i;
276 #endif
277
278 while (pairs != NULL) {
279 pair = List_head(pairs);
280 if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
281 #if 0
282 if (pair->genomejump > pair->queryjump) {
283 list = Intlist_push(list,pair->genomejump);
284 } else {
285 list = Intlist_push(list,pair->queryjump);
286 }
287 #else
288 if ((length = pair->genomejump - pair->queryjump) < 0) {
289 /* cDNA insertion */
290 list = Intlist_push(list,-length);
291 } else {
292 list = Intlist_push(list,length);
293 }
294 #endif
295 }
296 pairs = List_next(pairs);
297 }
298
299 list = Intlist_reverse(list);
300 intronlengths = Intlist_to_array(&(*nintrons),list);
301 debug(
302 printf("%d introns: ",*nintrons);
303 for (i = 0; i < *nintrons; i++) {
304 printf("%d,",intronlengths[i]);
305 }
306 printf("\n");
307 );
308 Intlist_free(&list);
309
310 return intronlengths;
311 }
312
313
314 static void
get_intronprobs(double ** donor_probs,double ** acceptor_probs,int * nintrons,List_T pairs)315 get_intronprobs (double **donor_probs, double **acceptor_probs, int *nintrons, List_T pairs) {
316 Doublelist_T donor_prob_list = NULL, acceptor_prob_list = NULL;
317 Pair_T pair;
318 #ifdef DEBUG
319 int i;
320 #endif
321
322 while (pairs != NULL) {
323 pair = List_head(pairs);
324 if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
325 donor_prob_list = Doublelist_push(donor_prob_list,pair->donor_prob);
326 acceptor_prob_list = Doublelist_push(acceptor_prob_list,pair->acceptor_prob);
327 }
328 pairs = List_next(pairs);
329 }
330
331 donor_prob_list = Doublelist_reverse(donor_prob_list);
332 acceptor_prob_list = Doublelist_reverse(acceptor_prob_list);
333 *donor_probs = Doublelist_to_array(&(*nintrons),donor_prob_list);
334 *acceptor_probs = Doublelist_to_array(&(*nintrons),acceptor_prob_list);
335 debug(
336 printf("%d introns: ",*nintrons);
337 for (i = 0; i < *nintrons; i++) {
338 printf("%f+%f,",(*donor_probs)[i],(*acceptor_probs)[i]);
339 }
340 printf("\n");
341 );
342 Doublelist_free(&acceptor_prob_list);
343 Doublelist_free(&donor_prob_list);
344
345 return;
346 }
347
348
349 static const Except_T length_error = {"Negative exon or intron length"};
350
351 static double
compute_prob(int exonlen,int intronlen,int indexsize)352 compute_prob (int exonlen, int intronlen, int indexsize) {
353 double prob;
354
355 if (exonlen < indexsize) {
356 prob = 1.0;
357 } else {
358 prob = 1 - pow(1.0-pow(4.0,(double) -exonlen),(double) intronlen);
359 }
360 debug(printf("Probability of exon of length %d (indexsize %d) with intron of length %d is %g\n",
361 exonlen,indexsize,intronlen,prob));
362 return prob;
363 }
364
365 #if 0
366 static bool
367 short_exon_byprob (int exonlen, int intronlen, int indexsize, double prob_threshold) {
368 double prob;
369
370 prob = compute_prob(exonlen,intronlen,indexsize);
371 if (prob < prob_threshold) {
372 return false;
373 } else {
374 return true;
375 }
376 }
377 #endif
378
379 #if 0
380 static bool
381 short_exon_bylength (int exonlen, int length_threshold) {
382
383 if (exonlen >= length_threshold) {
384 return false;
385 } else {
386 return true;
387 }
388 }
389 #endif
390
391 static void
zero_net_gap(int * starti,int * startj,int i,int j,int * intronlengths)392 zero_net_gap (int *starti, int *startj, int i, int j, int *intronlengths) {
393 int netgap, bestnetgap = 1000000;
394 int k, l, adji;
395
396 if (i == 0) {
397 adji = 0;
398 } else {
399 adji = i-1;
400 }
401
402 for (k = adji; k < j; k++) {
403 netgap = intronlengths[k];
404 for (l = k+1; l < j; l++) {
405 netgap += intronlengths[l];
406 debug(printf("zero_net_gap: netgap from %d to %d is %d\n",k,l,netgap));
407 if (abs(netgap) < bestnetgap) {
408 bestnetgap = abs(netgap);
409 *starti = k+1;
410 *startj = l;
411 }
412 }
413 }
414
415 debug(printf("zero_net_gap: best result is %d from %d to %d\n",bestnetgap,*starti,*startj));
416
417 if (0 && bestnetgap > ZERONETGAP) {
418 debug(printf("zero_net_gap: not recommending any deletions because bestnetgap %d > zeronetgap %d\n",
419 bestnetgap,ZERONETGAP));
420 *starti = *startj = -1;
421 }
422
423 return;
424 }
425
426
427 static void
find_internal_shorts_by_netgap(bool * deletep,int * exonstatus,int * exonmatches,int nexons,int * intronlengths)428 find_internal_shorts_by_netgap (bool *deletep, int *exonstatus, int *exonmatches, int nexons,
429 int *intronlengths) {
430 int starti, startj, i, j;
431 int exonlen;
432
433 *deletep = false;
434 for (i = 0; i < nexons; i++) {
435 exonstatus[i] = KEEP;
436 }
437
438 /* Mark short middle exons */
439 for (i = 1; i < nexons - 1; i++) {
440 exonlen = exonmatches[i];
441 if (exonlen < SHORTEXONLEN_NETGAP) {
442 exonstatus[i] = MARK;
443 }
444 }
445
446 /* Find internal shorts */
447 i = 0;
448 while (i < nexons) {
449 if (exonstatus[i] == MARK) {
450 j = i;
451 while (j < nexons && exonstatus[j] == MARK) {
452 j++;
453 }
454 debug(printf("Calling zero_net_gap with %d exons\n",j-i));
455 zero_net_gap(&starti,&startj,i,j,intronlengths);
456 if (starti >= 0) {
457 for (j = starti; j <= startj; j++) {
458 *deletep = true;
459 exonstatus[j] = DELETE;
460 }
461 } else if (j - i == 1) {
462 exonstatus[i] = MARK;
463 }
464 i = j;
465 } else {
466 i++;
467 }
468 }
469
470 return;
471 }
472
473
474 static void
find_internal_shorts_by_size(bool * shortp,bool * deletep,int * exonstatus,int * exonmatches,int * exonprotects,int nexons,int * intronlengths,int stage2_indexsize)475 find_internal_shorts_by_size (bool *shortp, bool *deletep, int *exonstatus,
476 int *exonmatches, int *exonprotects, int nexons,
477 int *intronlengths, int stage2_indexsize) {
478 int i;
479 int exonlen, intronlen;
480 double prob;
481
482 *shortp = *deletep = false;
483 for (i = 0; i < nexons; i++) {
484 exonstatus[i] = KEEP;
485 }
486
487 /* Mark short middle exons */
488 for (i = 1; i < nexons - 1; i++) {
489 exonlen = exonmatches[i];
490 intronlen = intronlengths[i-1]+intronlengths[i];
491 if ((double) exonprotects[i] > 0.50 * (double) exonlen) {
492 /* Keep protected exon */
493 } else {
494 prob = compute_prob(exonlen,intronlen,stage2_indexsize); /* Hack: add 4 for possible canonical dinucleotides */
495 if (prob > DELETE_THRESHOLD) {
496 *deletep = true;
497 exonstatus[i] = DELETE;
498 } else if (prob > MARK_THRESHOLD) {
499 *shortp = true;
500 exonstatus[i] = MARK;
501 }
502 }
503 }
504
505 return;
506 }
507
508
509 static bool
bad_intron_p(double donor_prob,double acceptor_prob,int intron_matches_left,int intron_denominator_left,int intron_matches_right,int intron_denominator_right,int total_matches,int total_denominator)510 bad_intron_p (double donor_prob, double acceptor_prob, int intron_matches_left, int intron_denominator_left,
511 int intron_matches_right, int intron_denominator_right, int total_matches, int total_denominator) {
512 double theta_left, theta_right;
513
514 if (donor_prob < 0.9) {
515 debug(printf("Intron with donor_prob %f is bad\n",donor_prob));
516 return true;
517 } else if (acceptor_prob < 0.9) {
518 debug(printf("Intron with acceptor_prob %f is bad\n",acceptor_prob));
519 return true;
520 } else {
521 theta_left = (double) (total_matches - intron_matches_left)/(double) (total_denominator - intron_denominator_left + 1);
522 if (theta_left > 1.0) {
523 theta_left = 1.0;
524 }
525 if (Pbinom(intron_matches_left,intron_denominator_left,theta_left) < 1e-3) {
526 debug(printf("Intron with matches %d/%d is bad with pvalue %f\n",
527 intron_matches_left,intron_denominator_left,Pbinom(intron_matches_left,intron_denominator_left,theta_left)));
528 return true;
529 } else {
530 theta_right = (double) (total_matches - intron_matches_right)/(double) (total_denominator - intron_denominator_right + 1);
531 if (theta_right > 1.0) {
532 theta_right = 1.0;
533 }
534 if (Pbinom(intron_matches_right,intron_denominator_right,theta_right) < 1e-3) {
535 debug(printf("Intron with matches %d/%d is bad with pvalue %f\n",
536 intron_matches_right,intron_denominator_right,Pbinom(intron_matches_right,intron_denominator_right,theta_right)));
537 return true;
538 } else {
539 debug(printf("Intron with matches %d/%d and %d/%d is okay with pvalues %f and %f\n",
540 intron_matches_left,intron_denominator_left,intron_matches_right,intron_denominator_right,
541 Pbinom(intron_matches_left,intron_denominator_left,theta_left),
542 Pbinom(intron_matches_right,intron_denominator_right,theta_right)));
543 return false;
544 }
545 }
546 }
547 }
548
549
550 struct Smoothcell_T {
551 int exoni;
552 /* double pvalue; */
553 int exonmatches;
554 int exonstatus;
555 };
556
557 #if 0
558 static int
559 Smoothcell_cmp (const void *x, const void *y) {
560 struct Smoothcell_T a = * (struct Smoothcell_T *) x;
561 struct Smoothcell_T b = * (struct Smoothcell_T *) y;
562
563 if (a.pvalue < b.pvalue) {
564 return -1;
565 } else if (b.pvalue < a.pvalue) {
566 return +1;
567 } else {
568 return 0;
569 }
570 }
571 #else
572 static int
Smoothcell_cmp(const void * x,const void * y)573 Smoothcell_cmp (const void *x, const void *y) {
574 struct Smoothcell_T a = * (struct Smoothcell_T *) x;
575 struct Smoothcell_T b = * (struct Smoothcell_T *) y;
576
577 if (a.exonmatches < b.exonmatches) {
578 return -1;
579 } else if (b.exonmatches < a.exonmatches) {
580 return +1;
581 } else {
582 return 0;
583 }
584 }
585 #endif
586
587
588
589 static void
find_internal_bads_by_prob(bool * deletep,int * exonstatus,int * exonmatches,int nexons,double * donor_probs,double * acceptor_probs,int * intron_matches_left,int * intron_denominator_left,int * intron_matches_right,int * intron_denominator_right,int total_matches,int total_denominator)590 find_internal_bads_by_prob (bool *deletep, int *exonstatus, int *exonmatches, int nexons,
591 double *donor_probs, double *acceptor_probs,
592 int *intron_matches_left, int *intron_denominator_left,
593 int *intron_matches_right, int *intron_denominator_right,
594 int total_matches, int total_denominator) {
595 struct Smoothcell_T *cells;
596 int i;
597 /* double theta0; */
598 /* int numerator0, denominator0; */
599 bool intron1_bad_p, intron2_bad_p;
600
601 /* double pvalue; */
602 /* double worst_exon_pvalue = 1.0; */
603 /* int worst_exoni = -1; */
604
605 *deletep = false;
606 cells = (struct Smoothcell_T *) MALLOCA(nexons * sizeof(struct Smoothcell_T));
607 for (i = 0; i < nexons; i++) {
608 exonstatus[i] = KEEP;
609 cells[i].exoni = i;
610 cells[i].exonmatches = exonmatches[i];
611 cells[i].exonstatus = KEEP;
612 }
613
614 /* Mark short middle exons */
615 intron1_bad_p = bad_intron_p(donor_probs[0],acceptor_probs[0],intron_matches_left[0],intron_denominator_left[0],
616 intron_matches_right[0],intron_denominator_right[0],total_matches,total_denominator);
617 for (i = 1; i < nexons - 1; i++) {
618 /* exonlen = exonmatches[i]; */
619 intron2_bad_p = bad_intron_p(donor_probs[i],acceptor_probs[i],intron_matches_left[i],intron_denominator_left[i],
620 intron_matches_right[i],intron_denominator_right[i],total_matches,total_denominator);
621 debug(printf("For exon %d, left intron bad %d, right intron bad %d\n",i,intron1_bad_p,intron2_bad_p));
622
623 if (intron1_bad_p == true || intron2_bad_p == true) {
624 #if 0
625 numerator0 = exonmatches[i];
626 denominator0 = exon_denominator[i];
627 theta0 = (double) (total_matches - numerator0 + 1)/(double) (total_denominator - denominator0 + 1);
628 debug(printf(" Checking %d matches, %d denominator, theta %f => pvalue %g\n",
629 numerator0,denominator0,theta0,Pbinom(numerator0,denominator0,theta0)));
630 if ((pvalue = Pbinom(numerator0,denominator0,theta0)) < STRICT_EXON_PVALUE) {
631 cells[i].pvalue = pvalue;
632 cells[i].exonstatus = DELETE;
633 }
634 #else
635 if (exonmatches[i] < 15) {
636 cells[i].exonstatus = DELETE;
637 }
638 #endif
639
640 } else {
641 /* Do nothing */
642 }
643
644 intron1_bad_p = intron2_bad_p;
645 }
646
647 qsort(cells,nexons,sizeof(struct Smoothcell_T),Smoothcell_cmp);
648 i = 0;
649 while (i < nexons && cells[i].exonmatches < 15) {
650 if (cells[i].exonstatus == DELETE) {
651 debug(printf(" Will delete exon %d\n",i));
652 *deletep = true;
653 exonstatus[cells[i].exoni] = DELETE;
654 exonstatus[cells[i].exoni - 1] = KEEP; /* Prevent consecutive deletes */
655 exonstatus[cells[i].exoni + 1] = KEEP; /* Prevent consecutive deletes */
656 }
657 i++;
658 }
659 FREEA(cells);
660
661 return;
662 }
663
664
665
666 #if 0
667 /* For ends, we turn off the indexsize parameter */
668 static void
669 find_end_shorts (bool *deletep, int *exonstatus, int *exonmatches, int nexons, int *intronlengths) {
670 int i;
671 bool shortp;
672
673 *deletep = false;
674 for (i = 0; i < nexons; i++) {
675 exonstatus[i] = KEEP;
676 }
677
678 shortp = true;
679 i = 0;
680 while (i < nexons - 1 && shortp == true) {
681 if (short_exon_bylength(exonmatches[i],SHORTEXONLEN_END) == true &&
682 short_exon_byprob(exonmatches[i],intronlengths[i],/*indexsize*/0,SHORTEXONPROB_END) == true) {
683 *deletep = true;
684 exonstatus[i] = DELETE;
685 } else {
686 shortp = false;
687 }
688 i++;
689 }
690
691 shortp = true;
692 i = nexons - 1;
693 while (i > 0 && shortp == true) {
694 if (short_exon_bylength(exonmatches[i],SHORTEXONLEN_END) == true &&
695 short_exon_byprob(exonmatches[i],intronlengths[i-1],/*indexsize*/0,SHORTEXONPROB_END) == true) {
696 *deletep = true;
697 exonstatus[i] = DELETE;
698 } else {
699 shortp = false;
700 }
701 --i;
702 }
703
704 return;
705 }
706 #endif
707
708
709 #ifdef DEBUG
710 static void
print_exon_status(int * exonstatus,int * exonmatches,int * exonprotects,int nexons)711 print_exon_status (int *exonstatus, int *exonmatches, int *exonprotects, int nexons) {
712 int i;
713
714 if (exonprotects == NULL) {
715 for (i = 0; i < nexons; i++) {
716 if (exonstatus[i] == KEEP) {
717 printf("Long exon %d of %d matches => keep\n",i,exonmatches[i]);
718 } else if (exonstatus[i] == MARK) {
719 printf("Short exon %d of %d matches => mark\n",i,exonmatches[i]);
720 } else if (exonstatus[i] == DELETE) {
721 printf("Exon %d of %d matches => delete\n",i,exonmatches[i]);
722 } else {
723 abort();
724 }
725 }
726 } else {
727 for (i = 0; i < nexons; i++) {
728 if (exonstatus[i] == KEEP) {
729 printf("Long exon %d of %d matches (%d protected) => keep\n",i,exonmatches[i],exonprotects[i]);
730 } else if (exonstatus[i] == MARK) {
731 printf("Short exon %d of %d matches (%d protected) => mark\n",i,exonmatches[i],exonprotects[i]);
732 } else if (exonstatus[i] == DELETE) {
733 printf("Exon %d of %d matches (%d protected) => delete\n",i,exonmatches[i],exonprotects[i]);
734 } else {
735 abort();
736 }
737 }
738 }
739
740 return;
741 }
742 #endif
743
744
745 static List_T
delete_and_mark_exons(List_T pairs,Pairpool_T pairpool,int * exonstatus,bool markp,bool bysizep)746 delete_and_mark_exons (List_T pairs, Pairpool_T pairpool,
747 int *exonstatus, bool markp, bool bysizep) {
748 List_T newpairs = NULL;
749 Pair_T pair;
750 int currstatus, prevstatus = KEEP;
751 int i;
752
753 i = 0;
754 currstatus = exonstatus[i];
755 while (pairs != NULL) {
756 /* pairptr = pairs; */
757 /* pairs = Pairpool_pop(pairs,&pair); */
758 pair = (Pair_T) pairs->first;
759 if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
760 prevstatus = currstatus;
761 currstatus = exonstatus[++i];
762 debug(printf("Gap observed\n"));
763 if (prevstatus != DELETE && currstatus != DELETE) {
764 #ifdef WASTE
765 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
766 #else
767 newpairs = List_transfer_one(newpairs,&pairs);
768 #endif
769 } else {
770 pairs = Pairpool_pop(pairs,&pair);
771 }
772
773 } else if (currstatus == KEEP) {
774 /* debug(printf("Marking position %d as keep\n",pair->querypos)); */
775 if (prevstatus == DELETE) {
776 if (newpairs != NULL) {
777 newpairs = Pairpool_push_gapholder(newpairs,pairpool,/*queryjump*/0,/*genomejump*/0,
778 /*leftpair*/newpairs->first,/*rightpair*/pair,/*knownp*/false);
779 }
780 prevstatus = /*currstatus*/ KEEP;
781 }
782
783 #ifdef WASTE
784 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
785 #else
786 newpairs = List_transfer_one(newpairs,&pairs);
787 #endif
788 } else if (currstatus == MARK) {
789 debug(printf("Marking position %d as short in pair %p\n",pair->querypos,pair));
790 if (prevstatus == DELETE) {
791 if (newpairs != NULL) {
792 newpairs = Pairpool_push_gapholder(newpairs,pairpool,/*queryjump*/0,/*genomejump*/0,
793 /*leftpair*/newpairs->first,/*rightpair*/pair,/*knownp*/false);
794 }
795 prevstatus = /*currstatus*/ MARK;
796 }
797
798 if (markp == true) {
799 pair->shortexonp = true;
800 }
801 #ifdef WASTE
802 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
803 #else
804 newpairs = List_transfer_one(newpairs,&pairs);
805 #endif
806 } else {
807 debug(printf("Marking position %d for deletion\n",pair->querypos));
808 pairs = Pairpool_pop(pairs,&pair);
809 }
810 }
811
812 /* Remove gaps at end */
813 if (newpairs != NULL) {
814 pair = List_head(newpairs);
815 }
816 while (newpairs != NULL && pair->gapp == true) {
817 debug(printf("Popping gap at end\n"));
818 newpairs = Pairpool_pop(newpairs,&pair);
819 }
820
821 /* Remove gaps at beginning */
822 newpairs = List_reverse(newpairs);
823 if (newpairs != NULL) {
824 pair = List_head(newpairs);
825 }
826 while (newpairs != NULL && pair->gapp == true) {
827 debug(printf("Popping gap at beginning\n"));
828 newpairs = Pairpool_pop(newpairs,&pair);
829 }
830
831 debug(printf("Result of delete_and_mark_exons:\n"));
832 debug(Pair_dump_list(newpairs,/*zerobasedp*/true));
833 debug(printf("\n"));
834
835 return newpairs;
836 }
837
838
839 #if 0
840 static List_T
841 mark_exons (List_T pairs,
842 #ifdef WASTE
843 Pairpool_T pairpool,
844 #endif
845 int *exonstatus) {
846 List_T newpairs = NULL, pairptr;
847 Pair_T pair;
848 int currstatus, prevstatus;
849 int i;
850
851 debug(
852 for (i = 0; i < nexons; i++) {
853 if (exonstatus[i] == KEEP) {
854 printf("Long exon %d of %d matches => keep\n",i,exonmatches[i]);
855 } else if (exonstatus[i] == MARK) {
856 printf("Short exon %d of %d matches => mark\n",i,exonmatches[i]);
857 } else if (exonstatus[i] == DELETE) {
858 printf("Exon %d of %d matches => delete\n",i,exonmatches[i]);
859 } else {
860 abort();
861 }
862 }
863 );
864
865 i = 0;
866 currstatus = exonstatus[i];
867 while (pairs != NULL) {
868 pairptr = pairs;
869 pairs = Pairpool_pop(pairs,&pair);
870 if (pair->gapp == true && big_gap_p(pair,/*bysizep*/true) == true) {
871 prevstatus = currstatus;
872 currstatus = exonstatus[++i];
873 debug(printf("Gap observed\n"));
874 if (prevstatus != DELETE && currstatus != DELETE) {
875 #ifdef WASTE
876 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
877 #else
878 newpairs = List_push_existing(newpairs,pairptr);
879 #endif
880 }
881
882 } else if (currstatus == KEEP) {
883 /* debug(printf("Marking position %d as keep\n",pair->querypos)); */
884 #ifdef WASTE
885 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
886 #else
887 newpairs = List_push_existing(newpairs,pairptr);
888 #endif
889 } else if (currstatus == MARK) {
890 debug(printf("Marking position %d as short in pair %p\n",pair->querypos,pair));
891 pair->shortexonp = true;
892
893 #ifdef WASTE
894 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
895 #else
896 newpairs = List_push_existing(newpairs,pairptr);
897 #endif
898 } else {
899 /* Normally would delete */
900 #ifdef WASTE
901 newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
902 #else
903 newpairs = List_push_existing(newpairs,pairptr);
904 #endif
905 }
906 }
907
908 debug(printf("Result of mark_exons:\n"));
909 debug(Pair_dump_list(newpairs,/*zerobasedp*/true));
910 debug(printf("\n"));
911
912 return List_reverse(newpairs);
913 }
914 #endif
915
916
917
918 void
Smooth_reset(List_T pairs)919 Smooth_reset (List_T pairs) {
920 List_T p;
921 Pair_T pair;
922
923 for (p = pairs; p != NULL; p = List_next(p)) {
924 pair = List_head(p);
925 pair->shortexonp = false;
926 }
927 return;
928 }
929
930
931
932 /* Needed for low-identity alignments */
933 /* Assumes pairs are from 1..querylength. Reverses the pairs to be querylength..1 */
934 List_T
Smooth_pairs_by_netgap(bool * deletep,List_T pairs,Pairpool_T pairpool)935 Smooth_pairs_by_netgap (bool *deletep, List_T pairs, Pairpool_T pairpool) {
936 int *exonstatus;
937 int *exonlengths, *exonmatches, *exonprotects, *intronlengths, nexons, nintrons;
938 #ifdef DEBUG
939 int i;
940 #endif
941
942 *deletep = false;
943 /* smooth_reset(pairs); */
944 if (pairs != NULL) {
945 /* Remove internal shorts */
946 exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/false);
947 intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/false);
948
949 debug(
950 printf("Beginning of smoothing. Initial structure:\n");
951 for (i = 0; i < nexons-1; i++) {
952 printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
953 printf("Intron %d of length %d\n",i,intronlengths[i]);
954 }
955 printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
956 );
957
958 debug(printf("\nFind internal shorts\n"));
959 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
960 find_internal_shorts_by_netgap(&(*deletep),exonstatus,exonmatches,nexons,intronlengths);
961 debug(printf("\nRemove internal shorts\n"));
962 if (*deletep == true) {
963 debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
964 pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/false,/*bysizep*/false);
965 }
966
967 #if 0
968 /* This is not correct */
969 debug(
970 printf("After removing internal shorts:\n");
971 for (i = 0; i < nexons-1; i++) {
972 printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
973 printf("Intron %d of length %d\n",i,intronlengths[i]);
974 }
975 printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
976 );
977 #endif
978
979 FREEA(exonstatus);
980 FREE(intronlengths);
981 FREE(exonprotects);
982 FREE(exonmatches);
983 FREE(exonlengths);
984
985 debug(printf("Ending of smoothing\n\n"));
986 }
987
988 return pairs;
989 }
990
991
992 /* Assumes pairs are from 1..querylength. Reverses the pairs to be querylength..1 */
993 List_T
Smooth_pairs_by_size(bool * shortp,bool * deletep,List_T pairs,Pairpool_T pairpool,int stage2_indexsize)994 Smooth_pairs_by_size (bool *shortp, bool *deletep, List_T pairs, Pairpool_T pairpool, int stage2_indexsize) {
995 int *exonstatus;
996 int *exonlengths, *exonmatches, *exonprotects, *intronlengths, nexons, nintrons;
997 bool delete2p;
998 #ifdef DEBUG
999 int i;
1000 #endif
1001
1002 *shortp = *deletep = false;
1003
1004 #if 0
1005 /* Should be handled instead by trim_noncanonical_end5 and trim_noncanonical_end3 */
1006 /* smooth_reset(pairs); */
1007 if (pairs != NULL) {
1008 /* Trim ends */
1009 exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1010 intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1011
1012 debug(printf("\nFind end shorts\n"));
1013 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1014 find_end_shorts(&delete1p,exonstatus,exonmatches,nexons,intronlengths);
1015 if (delete1p == true) {
1016 *deletep = true;
1017 debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
1018 pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/false,/*bysizep*/true);
1019 }
1020
1021 FREEA(exonstatus);
1022
1023 FREE(intronlengths);
1024 FREE(exonprotects);
1025 FREE(exonmatches);
1026 FREE(exonlengths);
1027 }
1028 #endif
1029
1030 if (pairs != NULL) {
1031 /* Remove internal shorts */
1032 exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1033 intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1034
1035 debug(
1036 printf("Beginning of smoothing. Initial structure:\n");
1037 for (i = 0; i < nexons-1; i++) {
1038 printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
1039 printf("Intron %d of length %d\n",i,intronlengths[i]);
1040 }
1041 printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
1042 );
1043
1044 debug(printf("\nFind internal shorts\n"));
1045 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1046 find_internal_shorts_by_size(&(*shortp),&delete2p,exonstatus,exonmatches,exonprotects,nexons,intronlengths,stage2_indexsize);
1047 debug(printf("\nRemove internal shorts\n"));
1048 if (delete2p == true) {
1049 *deletep = true;
1050 }
1051 if (delete2p == true || *shortp == true) {
1052 debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
1053 pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/true,/*bysizep*/true);
1054 }
1055
1056 debug(
1057 printf("After removing internal shorts:\n");
1058 for (i = 0; i < nexons-1; i++) {
1059 printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1060 printf("Intron %d of length %d\n",i,intronlengths[i]);
1061 }
1062 printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1063 );
1064
1065 FREEA(exonstatus);
1066 FREE(intronlengths);
1067 FREE(exonprotects);
1068 FREE(exonmatches);
1069 FREE(exonlengths);
1070
1071 }
1072
1073 debug(printf("Ending of smoothing\n\n"));
1074
1075 return pairs;
1076 }
1077
1078
1079 #if 0
1080 List_T
1081 Smooth_mark_short_exons (List_T pairs, Pairpool_T pairpool, int stage2_indexsize) {
1082 int *exonstatus;
1083 int *exonlengths, *exonmatches, *intronlengths, nexons, nintrons;
1084 bool shortp;
1085 bool delete1p, delete2p;
1086 #ifdef DEBUG
1087 int i;
1088 #endif
1089
1090 shortp = false;
1091 /* smooth_reset(pairs); */
1092 if (pairs != NULL) {
1093 /* Trim ends */
1094 exonlengths = get_exonlengths(&exonmatches,&nexons,pairs,/*bysizep*/true);
1095 intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1096
1097 debug(printf("\nFind end shorts\n"));
1098 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1099 find_end_shorts(&delete1p,exonstatus,exonmatches,nexons,intronlengths);
1100 pairs = mark_exons(pairs,
1101 #ifdef WASTE
1102 pairpool,
1103 #endif
1104 exonstatus);
1105
1106 FREEA(exonstatus);
1107
1108 FREE(intronlengths);
1109 FREE(exonmatches);
1110 FREE(exonlengths);
1111 }
1112
1113 if (pairs != NULL) {
1114 /* Find internal shorts */
1115 exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1116 intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1117
1118 debug(
1119 printf("Beginning of smoothing. Initial structure:\n");
1120 for (i = 0; i < nexons-1; i++) {
1121 printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
1122 printf("Intron %d of length %d\n",i,intronlengths[i]);
1123 }
1124 printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
1125 );
1126
1127 debug(printf("\nFind internal shorts\n"));
1128 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1129 find_internal_shorts_by_size(&shortp,&delete2p,exonstatus,exonmatches,nexons,intronlengths,stage2_indexsize);
1130 debug(printf("\nMark internal shorts\n"));
1131 pairs = mark_exons(pairs,
1132 #ifdef WASTE
1133 pairpool,
1134 #endif
1135 exonstatus);
1136
1137 debug(
1138 printf("After marking internal shorts:\n");
1139 for (i = 0; i < nexons-1; i++) {
1140 printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1141 printf("Intron %d of length %d\n",i,intronlengths[i]);
1142 }
1143 printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1144 );
1145
1146 FREEA(exonstatus);
1147 FREE(intronlengths);
1148 FREE(exonmatches);
1149 FREE(exonlengths);
1150
1151 }
1152
1153 debug(printf("End of Smooth_mark_short_exons\n\n"));
1154
1155 return pairs;
1156 }
1157 #endif
1158
1159
1160
1161 List_T
Smooth_pairs_by_intronprobs(bool * badp,List_T pairs,Pairpool_T pairpool)1162 Smooth_pairs_by_intronprobs (bool *badp, List_T pairs, Pairpool_T pairpool) {
1163 int *exonstatus;
1164 int *exonmatches, nexons, nintrons;
1165 int *intron_matches_left, *intron_denominator_left, *intron_matches_right, *intron_denominator_right;
1166 double *donor_probs, *acceptor_probs;
1167 int total_matches, total_denominator;
1168 #ifdef DEBUG
1169 int *intronlengths;
1170 int i;
1171 #endif
1172
1173 debug(Pair_dump_list(pairs,true));
1174
1175 *badp = false;
1176 /* smooth_reset(pairs); */
1177 if (pairs != NULL) {
1178 /* Remove internal shorts */
1179 get_intronprobs(&donor_probs,&acceptor_probs,&nintrons,pairs);
1180 if (nintrons > 0) {
1181 get_exonmatches(&total_matches,&total_denominator,&exonmatches,&nexons,pairs);
1182 debug(intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/false));
1183
1184 pairs = get_intron_neighborhoods(&intron_matches_left,&intron_denominator_left,
1185 &intron_matches_right,&intron_denominator_right,nintrons,pairs);
1186
1187 debug(
1188 printf("Beginning of smoothing. Initial structure:\n");
1189 for (i = 0; i < nexons-1; i++) {
1190 printf("Exon %d with %d matches and %d denominator\n",i,exonmatches[i],exon_denominator[i]);
1191 printf("Intron %d, length %d, with probs %f+%f and matches %d/%d and %d/%d\n",
1192 i,intronlengths[i],donor_probs[i],acceptor_probs[i],intron_matches_left[i],intron_denominator_left[i],
1193 intron_matches_right[i],intron_denominator_right[i]);
1194 }
1195 printf("Exon %d with %d matches and %d denominator\n",nexons-1,exonmatches[nexons-1],exon_denominator[nexons-1]);
1196 );
1197
1198 debug(printf("\nFind internal bads\n"));
1199 exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1200 find_internal_bads_by_prob(&(*badp),exonstatus,exonmatches,nexons,
1201 donor_probs,acceptor_probs,
1202 intron_matches_left,intron_denominator_left,
1203 intron_matches_right,intron_denominator_right,
1204 total_matches,total_denominator);
1205 debug(printf("\nRemove internal bads\n"));
1206 if (*badp == false) {
1207 debug(printf("No internal bads\n"));
1208 } else {
1209 debug(print_exon_status(exonstatus,exonmatches,/*exonprotects*/NULL,nexons));
1210 pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/true,/*bysizep*/false);
1211 }
1212
1213 #if 0
1214 /* This is not correct */
1215 debug(
1216 printf("After removing internal shorts:\n");
1217 for (i = 0; i < nexons-1; i++) {
1218 printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1219 printf("Intron %d of length %d\n",i,intronlengths[i]);
1220 }
1221 printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1222 );
1223 #endif
1224
1225 debug(FREE(intronlengths));
1226 FREEA(exonstatus);
1227 FREE(intron_denominator_right);
1228 FREE(intron_matches_right);
1229 FREE(intron_denominator_left);
1230 FREE(intron_matches_left);
1231 FREE(exonmatches);
1232
1233 debug(printf("Ending of smoothing\n\n"));
1234 }
1235
1236 FREE(acceptor_probs);
1237 FREE(donor_probs);
1238 }
1239
1240 return pairs;
1241 }
1242