1 static char rcsid[] = "$Id: smooth.c 218183 2019-01-17 13:03:20Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "smooth.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h>		/* For pow(), log() */
10 #include "bool.h"
11 #include "mem.h"
12 #include "except.h"
13 #include "comp.h"
14 #include "pair.h"
15 #include "pairdef.h"
16 #include "intlist.h"
17 #include "doublelist.h"
18 #include "pbinom.h"
19 
20 
21 /* Will examine internal exons smaller than this prior to single gaps */
22 #define ZERONETGAP 9
23 #define SHORTEXONLEN_NETGAP 15
24 
25 /* Will delete/mark internal exons smaller than this for dual genome gap */
26 #define DELETE_THRESHOLD 0.1
27 #define MARK_THRESHOLD 1e-7
28 
29 /* Will delete internal exons smaller than this at ends */
30 #define SHORTEXONLEN_END 10
31 
32 
33 #define LOOSE_EXON_PVALUE 0.05
34 #define STRICT_EXON_PVALUE 1e-4
35 
36 
37 #ifdef GSNAP
38 /* Allow more intron predictions in ends of short reads */
39 #define SHORTEXONPROB_END 0.10
40 #else
41 #define SHORTEXONPROB_END 0.05
42 #endif
43 
44 #define BIGGAP 9
45 
46 
47 #ifdef DEBUG
48 #define debug(x) x
49 #else
50 #define debug(x)
51 #endif
52 
53 
54 typedef enum {KEEP, DELETE, MARK} Exonstatus_T;
55 
56 
57 static bool
big_gap_p(Pair_T pair,bool bysizep)58 big_gap_p (Pair_T pair, bool bysizep) {
59 
60 #if 0
61   /* No longer following the logic below */
62   /* When bysizep is true, single gaps should have been solved already
63      (in pass 2), so any gap remaining must be a poor one (must have
64      been removed because of a negative score), and we should consider
65      all gaps, regardless of size.  When bysizep is false, we must be
66      doing initial smoothing, before any single gaps have been solved,
67      so we want to consider only big gaps.  This change was motivated
68      by alignment of mouse HER2 against human genome, where a 58/58
69      section was deleted in stage 3, but subsequent smoothing failed
70      to recognize the short exon between this section and the next
71      intron. */
72 #endif
73 
74   if (bysizep == true) {
75     return true;
76   } else if (pair->genomejump > pair->queryjump) {
77     return (pair->genomejump - pair->queryjump) > BIGGAP ? true : false;
78   } else {
79     return false;
80   }
81 }
82 
83 
84 static int *
get_exonlengths(int ** exonmatches,int ** exonprotects,int * nexons,List_T pairs,bool bysizep)85 get_exonlengths (int **exonmatches, int **exonprotects, int *nexons, List_T pairs, bool bysizep) {
86   int *exonlengths;
87   Intlist_T list = NULL, matchlist = NULL, protectlist = NULL;
88   Pair_T firstpair, pair;
89   int querypos, nmatches = 0, nprotected = 0;
90 #ifdef DEBUG
91   int i;
92 #endif
93 
94   firstpair = List_head(pairs);
95   querypos = firstpair->querypos;
96 
97   while (pairs != NULL) {
98     pair = List_head(pairs);
99     if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
100       list = Intlist_push(list,querypos-firstpair->querypos+1);
101       matchlist = Intlist_push(matchlist,nmatches);
102       protectlist = Intlist_push(protectlist,nprotected);
103 
104       pairs = List_next(pairs);
105       if (pairs != NULL) {
106 	firstpair = List_head(pairs);
107 	querypos = firstpair->querypos;
108 	nmatches = nprotected = 0;
109       }
110     } else {
111       querypos = pair->querypos;
112       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
113 	nmatches++;
114       }
115       if (pair->protectedp == true) {
116 	nprotected++;
117       }
118       pairs = List_next(pairs);
119     }
120   }
121 
122   list = Intlist_push(list,querypos-firstpair->querypos+1);
123   list = Intlist_reverse(list);
124   matchlist = Intlist_push(matchlist,nmatches);
125   matchlist = Intlist_reverse(matchlist);
126   protectlist = Intlist_push(protectlist,nprotected);
127   protectlist = Intlist_reverse(protectlist);
128 
129   exonlengths = Intlist_to_array(&(*nexons),list);
130   *exonmatches = Intlist_to_array(&(*nexons),matchlist);
131   *exonprotects = Intlist_to_array(&(*nexons),protectlist);
132   debug(
133 	printf("%d exons: ",*nexons);
134 	for (i = 0; i < *nexons; i++) {
135 	  printf("%d (%d matches),",exonlengths[i],(*exonmatches)[i]);
136 	}
137 	printf("\n");
138 	);
139   Intlist_free(&list);
140   Intlist_free(&matchlist);
141   Intlist_free(&protectlist);
142 
143   return exonlengths;
144 }
145 
146 
147 static void
get_exonmatches(int * total_matches,int * total_denominator,int ** exonmatches,int * nexons,List_T pairs)148 get_exonmatches (int *total_matches, int *total_denominator, int **exonmatches,
149 		 int *nexons, List_T pairs) {
150   Intlist_T matchlist = NULL;
151   Pair_T pair;
152   int nmatches = 0, nmismatches = 0;
153 #ifdef DEBUG
154   int i;
155 #endif
156 
157   *total_matches = *total_denominator = 0;
158 
159   /* firstpair = List_head(pairs); */
160   /* querypos = firstpair->querypos; */
161 
162   while (pairs != NULL) {
163     pair = List_head(pairs);
164     if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
165       matchlist = Intlist_push(matchlist,nmatches);
166       /* denominatorlist = Intlist_push(denominatorlist,nmatches + nmismatches); */
167       *total_matches += nmatches;
168       *total_denominator += (nmatches + nmismatches);
169 
170       pairs = List_next(pairs);
171       if (pairs != NULL) {
172 	/* firstpair = List_head(pairs); */
173 	/* querypos = firstpair->querypos; */
174 	nmatches = nmismatches = 0;
175       }
176     } else {
177       /* querypos = pair->querypos; */
178       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
179 	nmatches++;
180       } else {
181 	nmismatches++;
182       }
183       pairs = List_next(pairs);
184     }
185   }
186 
187   matchlist = Intlist_push(matchlist,nmatches);
188   matchlist = Intlist_reverse(matchlist);
189   /* denominatorlist = Intlist_push(denominatorlist,nmatches + nmismatches); */
190   /* denominatorlist = Intlist_reverse(denominatorlist); */
191   *total_matches += nmatches;
192   *total_denominator += (nmatches + nmismatches);
193 
194   *exonmatches = Intlist_to_array(&(*nexons),matchlist);
195   /* *exon_denominator = Intlist_to_array(&(*nexons),denominatorlist); */
196   debug(
197 	printf("%d exons: ",*nexons);
198 	for (i = 0; i < *nexons; i++) {
199 	  printf("%d matches, %d denominator,",(*exonmatches)[i],(*exon_denominator)[i]);
200 	}
201 	printf("\n");
202 	);
203   /* Intlist_free(&denominatorlist); */
204   Intlist_free(&matchlist);
205 
206   return;
207 }
208 
209 
210 
211 static List_T
get_intron_neighborhoods(int ** intron_matches_left,int ** intron_denominator_left,int ** intron_matches_right,int ** intron_denominator_right,int nintrons,List_T pairs)212 get_intron_neighborhoods (int **intron_matches_left, int **intron_denominator_left,
213 			  int **intron_matches_right, int **intron_denominator_right,
214 			  int nintrons, List_T pairs) {
215   List_T path, p;
216   Pair_T pair;
217   int introni;
218   int i;
219 
220   *intron_matches_left = (int *) CALLOC(nintrons,sizeof(int));
221   *intron_denominator_left = (int *) CALLOC(nintrons,sizeof(int));
222   *intron_matches_right = (int *) CALLOC(nintrons,sizeof(int));
223   *intron_denominator_right = (int *) CALLOC(nintrons,sizeof(int));
224 
225   introni = -1;
226   for (p = pairs; p != NULL; p = List_next(p)) {
227     pair = List_head(p);
228     if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
229       introni += 1;
230       i = 0;
231 
232     } else if (introni >= 0 && i < 25) {
233       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
234 	(*intron_matches_left)[introni] += 1;
235       }
236       (*intron_denominator_left)[introni] += 1;
237       i++;
238 
239     } else {
240       i++;
241     }
242   }
243 
244   path = List_reverse(pairs);
245   introni = nintrons;
246   for (p = path; p != NULL; p = List_next(p)) {
247     pair = List_head(p);
248     if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
249       introni -= 1;
250       i = 0;
251 
252     } else if (introni < nintrons && i < 25) {
253       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
254 	(*intron_matches_right)[introni] += 1;
255       }
256       (*intron_denominator_right)[introni] += 1;
257       i++;
258 
259     } else {
260       i++;
261     }
262   }
263 
264   pairs = List_reverse(path);
265   return pairs;
266 }
267 
268 
269 static int *
get_intronlengths(int * nintrons,List_T pairs,bool bysizep)270 get_intronlengths (int *nintrons, List_T pairs, bool bysizep) {
271   int *intronlengths, length;
272   Intlist_T list = NULL;
273   Pair_T pair;
274 #ifdef DEBUG
275   int i;
276 #endif
277 
278   while (pairs != NULL) {
279     pair = List_head(pairs);
280     if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
281 #if 0
282       if (pair->genomejump > pair->queryjump) {
283 	list = Intlist_push(list,pair->genomejump);
284       } else {
285 	list = Intlist_push(list,pair->queryjump);
286       }
287 #else
288       if ((length = pair->genomejump - pair->queryjump) < 0) {
289 	/* cDNA insertion */
290 	list = Intlist_push(list,-length);
291       } else {
292 	list = Intlist_push(list,length);
293       }
294 #endif
295     }
296     pairs = List_next(pairs);
297   }
298 
299   list = Intlist_reverse(list);
300   intronlengths = Intlist_to_array(&(*nintrons),list);
301   debug(
302 	printf("%d introns: ",*nintrons);
303 	for (i = 0; i < *nintrons; i++) {
304 	  printf("%d,",intronlengths[i]);
305 	}
306 	printf("\n");
307 	);
308   Intlist_free(&list);
309 
310   return intronlengths;
311 }
312 
313 
314 static void
get_intronprobs(double ** donor_probs,double ** acceptor_probs,int * nintrons,List_T pairs)315 get_intronprobs (double **donor_probs, double **acceptor_probs, int *nintrons, List_T pairs) {
316   Doublelist_T donor_prob_list = NULL, acceptor_prob_list = NULL;
317   Pair_T pair;
318 #ifdef DEBUG
319   int i;
320 #endif
321 
322   while (pairs != NULL) {
323     pair = List_head(pairs);
324     if (pair->gapp == true && big_gap_p(pair,/*bysizep*/false) == true) {
325       donor_prob_list = Doublelist_push(donor_prob_list,pair->donor_prob);
326       acceptor_prob_list = Doublelist_push(acceptor_prob_list,pair->acceptor_prob);
327     }
328     pairs = List_next(pairs);
329   }
330 
331   donor_prob_list = Doublelist_reverse(donor_prob_list);
332   acceptor_prob_list = Doublelist_reverse(acceptor_prob_list);
333   *donor_probs = Doublelist_to_array(&(*nintrons),donor_prob_list);
334   *acceptor_probs = Doublelist_to_array(&(*nintrons),acceptor_prob_list);
335   debug(
336 	printf("%d introns: ",*nintrons);
337 	for (i = 0; i < *nintrons; i++) {
338 	  printf("%f+%f,",(*donor_probs)[i],(*acceptor_probs)[i]);
339 	}
340 	printf("\n");
341 	);
342   Doublelist_free(&acceptor_prob_list);
343   Doublelist_free(&donor_prob_list);
344 
345   return;
346 }
347 
348 
349 static const Except_T length_error = {"Negative exon or intron length"};
350 
351 static double
compute_prob(int exonlen,int intronlen,int indexsize)352 compute_prob (int exonlen, int intronlen, int indexsize) {
353   double prob;
354 
355   if (exonlen < indexsize) {
356     prob = 1.0;
357   } else {
358     prob = 1 - pow(1.0-pow(4.0,(double) -exonlen),(double) intronlen);
359   }
360   debug(printf("Probability of exon of length %d (indexsize %d) with intron of length %d is %g\n",
361 	       exonlen,indexsize,intronlen,prob));
362   return prob;
363 }
364 
365 #if 0
366 static bool
367 short_exon_byprob (int exonlen, int intronlen, int indexsize, double prob_threshold) {
368   double prob;
369 
370   prob = compute_prob(exonlen,intronlen,indexsize);
371   if (prob < prob_threshold) {
372     return false;
373   } else {
374     return true;
375   }
376 }
377 #endif
378 
379 #if 0
380 static bool
381 short_exon_bylength (int exonlen, int length_threshold) {
382 
383   if (exonlen >= length_threshold) {
384     return false;
385   } else {
386     return true;
387   }
388 }
389 #endif
390 
391 static void
zero_net_gap(int * starti,int * startj,int i,int j,int * intronlengths)392 zero_net_gap (int *starti, int *startj, int i, int j, int *intronlengths) {
393   int netgap, bestnetgap = 1000000;
394   int k, l, adji;
395 
396   if (i == 0) {
397     adji = 0;
398   } else {
399     adji = i-1;
400   }
401 
402   for (k = adji; k < j; k++) {
403     netgap = intronlengths[k];
404     for (l = k+1; l < j; l++) {
405       netgap += intronlengths[l];
406       debug(printf("zero_net_gap: netgap from %d to %d is %d\n",k,l,netgap));
407       if (abs(netgap) < bestnetgap) {
408 	bestnetgap = abs(netgap);
409 	*starti = k+1;
410 	*startj = l;
411       }
412     }
413   }
414 
415   debug(printf("zero_net_gap: best result is %d from %d to %d\n",bestnetgap,*starti,*startj));
416 
417   if (0 && bestnetgap > ZERONETGAP) {
418     debug(printf("zero_net_gap: not recommending any deletions because bestnetgap %d > zeronetgap %d\n",
419 		 bestnetgap,ZERONETGAP));
420     *starti = *startj = -1;
421   }
422 
423   return;
424 }
425 
426 
427 static void
find_internal_shorts_by_netgap(bool * deletep,int * exonstatus,int * exonmatches,int nexons,int * intronlengths)428 find_internal_shorts_by_netgap (bool *deletep, int *exonstatus, int *exonmatches, int nexons,
429 				int *intronlengths) {
430   int starti, startj, i, j;
431   int exonlen;
432 
433   *deletep = false;
434   for (i = 0; i < nexons; i++) {
435     exonstatus[i] = KEEP;
436   }
437 
438   /* Mark short middle exons */
439   for (i = 1; i < nexons - 1; i++) {
440     exonlen = exonmatches[i];
441     if (exonlen < SHORTEXONLEN_NETGAP) {
442       exonstatus[i] = MARK;
443     }
444   }
445 
446   /* Find internal shorts */
447   i = 0;
448   while (i < nexons) {
449     if (exonstatus[i] == MARK) {
450       j = i;
451       while (j < nexons && exonstatus[j] == MARK) {
452 	j++;
453       }
454       debug(printf("Calling zero_net_gap with %d exons\n",j-i));
455       zero_net_gap(&starti,&startj,i,j,intronlengths);
456       if (starti >= 0) {
457 	for (j = starti; j <= startj; j++) {
458 	  *deletep = true;
459 	  exonstatus[j] = DELETE;
460 	}
461       } else if (j - i == 1) {
462 	exonstatus[i] = MARK;
463       }
464       i = j;
465     } else {
466       i++;
467     }
468   }
469 
470   return;
471 }
472 
473 
474 static void
find_internal_shorts_by_size(bool * shortp,bool * deletep,int * exonstatus,int * exonmatches,int * exonprotects,int nexons,int * intronlengths,int stage2_indexsize)475 find_internal_shorts_by_size (bool *shortp, bool *deletep, int *exonstatus,
476 			      int *exonmatches, int *exonprotects, int nexons,
477 			      int *intronlengths, int stage2_indexsize) {
478   int i;
479   int exonlen, intronlen;
480   double prob;
481 
482   *shortp = *deletep = false;
483   for (i = 0; i < nexons; i++) {
484     exonstatus[i] = KEEP;
485   }
486 
487   /* Mark short middle exons */
488   for (i = 1; i < nexons - 1; i++) {
489     exonlen = exonmatches[i];
490     intronlen = intronlengths[i-1]+intronlengths[i];
491     if ((double) exonprotects[i] > 0.50 * (double) exonlen) {
492       /* Keep protected exon */
493     } else {
494       prob = compute_prob(exonlen,intronlen,stage2_indexsize); /* Hack: add 4 for possible canonical dinucleotides */
495       if (prob > DELETE_THRESHOLD) {
496 	*deletep = true;
497 	exonstatus[i] = DELETE;
498       } else if (prob > MARK_THRESHOLD) {
499 	*shortp = true;
500 	exonstatus[i] = MARK;
501       }
502     }
503   }
504 
505   return;
506 }
507 
508 
509 static bool
bad_intron_p(double donor_prob,double acceptor_prob,int intron_matches_left,int intron_denominator_left,int intron_matches_right,int intron_denominator_right,int total_matches,int total_denominator)510 bad_intron_p (double donor_prob, double acceptor_prob, int intron_matches_left, int intron_denominator_left,
511 	      int intron_matches_right, int intron_denominator_right, int total_matches, int total_denominator) {
512   double theta_left, theta_right;
513 
514   if (donor_prob < 0.9) {
515     debug(printf("Intron with donor_prob %f is bad\n",donor_prob));
516     return true;
517   } else if (acceptor_prob < 0.9) {
518     debug(printf("Intron with acceptor_prob %f is bad\n",acceptor_prob));
519     return true;
520   } else {
521     theta_left = (double) (total_matches - intron_matches_left)/(double) (total_denominator - intron_denominator_left + 1);
522     if (theta_left > 1.0) {
523       theta_left = 1.0;
524     }
525     if (Pbinom(intron_matches_left,intron_denominator_left,theta_left) < 1e-3) {
526       debug(printf("Intron with matches %d/%d is bad with pvalue %f\n",
527 		   intron_matches_left,intron_denominator_left,Pbinom(intron_matches_left,intron_denominator_left,theta_left)));
528       return true;
529     } else {
530       theta_right = (double) (total_matches - intron_matches_right)/(double) (total_denominator - intron_denominator_right + 1);
531       if (theta_right > 1.0) {
532 	theta_right = 1.0;
533       }
534       if (Pbinom(intron_matches_right,intron_denominator_right,theta_right) < 1e-3) {
535 	debug(printf("Intron with matches %d/%d is bad with pvalue %f\n",
536 		     intron_matches_right,intron_denominator_right,Pbinom(intron_matches_right,intron_denominator_right,theta_right)));
537 	return true;
538       } else {
539 	debug(printf("Intron with matches %d/%d and %d/%d is okay with pvalues %f and %f\n",
540 		     intron_matches_left,intron_denominator_left,intron_matches_right,intron_denominator_right,
541 		     Pbinom(intron_matches_left,intron_denominator_left,theta_left),
542 		     Pbinom(intron_matches_right,intron_denominator_right,theta_right)));
543 	return false;
544       }
545     }
546   }
547 }
548 
549 
550 struct Smoothcell_T {
551   int exoni;
552   /* double pvalue; */
553   int exonmatches;
554   int exonstatus;
555 };
556 
557 #if 0
558 static int
559 Smoothcell_cmp (const void *x, const void *y) {
560   struct Smoothcell_T a = * (struct Smoothcell_T *) x;
561   struct Smoothcell_T b = * (struct Smoothcell_T *) y;
562 
563   if (a.pvalue < b.pvalue) {
564     return -1;
565   } else if (b.pvalue < a.pvalue) {
566     return +1;
567   } else {
568     return 0;
569   }
570 }
571 #else
572 static int
Smoothcell_cmp(const void * x,const void * y)573 Smoothcell_cmp (const void *x, const void *y) {
574   struct Smoothcell_T a = * (struct Smoothcell_T *) x;
575   struct Smoothcell_T b = * (struct Smoothcell_T *) y;
576 
577   if (a.exonmatches < b.exonmatches) {
578     return -1;
579   } else if (b.exonmatches < a.exonmatches) {
580     return +1;
581   } else {
582     return 0;
583   }
584 }
585 #endif
586 
587 
588 
589 static void
find_internal_bads_by_prob(bool * deletep,int * exonstatus,int * exonmatches,int nexons,double * donor_probs,double * acceptor_probs,int * intron_matches_left,int * intron_denominator_left,int * intron_matches_right,int * intron_denominator_right,int total_matches,int total_denominator)590 find_internal_bads_by_prob (bool *deletep, int *exonstatus, int *exonmatches, int nexons,
591 			    double *donor_probs, double *acceptor_probs,
592 			    int *intron_matches_left, int *intron_denominator_left,
593 			    int *intron_matches_right, int *intron_denominator_right,
594 			    int total_matches, int total_denominator) {
595   struct Smoothcell_T *cells;
596   int i;
597   /* double theta0; */
598   /* int numerator0, denominator0; */
599   bool intron1_bad_p, intron2_bad_p;
600 
601   /* double pvalue; */
602   /* double worst_exon_pvalue = 1.0; */
603   /* int worst_exoni = -1; */
604 
605   *deletep = false;
606   cells = (struct Smoothcell_T *) MALLOCA(nexons * sizeof(struct Smoothcell_T));
607   for (i = 0; i < nexons; i++) {
608     exonstatus[i] = KEEP;
609     cells[i].exoni = i;
610     cells[i].exonmatches = exonmatches[i];
611     cells[i].exonstatus = KEEP;
612   }
613 
614   /* Mark short middle exons */
615   intron1_bad_p = bad_intron_p(donor_probs[0],acceptor_probs[0],intron_matches_left[0],intron_denominator_left[0],
616 			       intron_matches_right[0],intron_denominator_right[0],total_matches,total_denominator);
617   for (i = 1; i < nexons - 1; i++) {
618     /* exonlen = exonmatches[i]; */
619     intron2_bad_p = bad_intron_p(donor_probs[i],acceptor_probs[i],intron_matches_left[i],intron_denominator_left[i],
620 				 intron_matches_right[i],intron_denominator_right[i],total_matches,total_denominator);
621     debug(printf("For exon %d, left intron bad %d, right intron bad %d\n",i,intron1_bad_p,intron2_bad_p));
622 
623     if (intron1_bad_p == true || intron2_bad_p == true) {
624 #if 0
625       numerator0 = exonmatches[i];
626       denominator0 = exon_denominator[i];
627       theta0 = (double) (total_matches - numerator0 + 1)/(double) (total_denominator - denominator0 + 1);
628       debug(printf("  Checking %d matches, %d denominator, theta %f => pvalue %g\n",
629 		   numerator0,denominator0,theta0,Pbinom(numerator0,denominator0,theta0)));
630       if ((pvalue = Pbinom(numerator0,denominator0,theta0)) < STRICT_EXON_PVALUE) {
631 	cells[i].pvalue = pvalue;
632 	cells[i].exonstatus = DELETE;
633       }
634 #else
635       if (exonmatches[i] < 15) {
636 	cells[i].exonstatus = DELETE;
637       }
638 #endif
639 
640     } else {
641       /* Do nothing */
642     }
643 
644     intron1_bad_p = intron2_bad_p;
645   }
646 
647   qsort(cells,nexons,sizeof(struct Smoothcell_T),Smoothcell_cmp);
648   i = 0;
649   while (i < nexons && cells[i].exonmatches < 15) {
650     if (cells[i].exonstatus == DELETE) {
651       debug(printf("  Will delete exon %d\n",i));
652       *deletep = true;
653       exonstatus[cells[i].exoni] = DELETE;
654       exonstatus[cells[i].exoni - 1] = KEEP; /* Prevent consecutive deletes */
655       exonstatus[cells[i].exoni + 1] = KEEP; /* Prevent consecutive deletes */
656     }
657     i++;
658   }
659   FREEA(cells);
660 
661   return;
662 }
663 
664 
665 
666 #if 0
667 /* For ends, we turn off the indexsize parameter */
668 static void
669 find_end_shorts (bool *deletep, int *exonstatus, int *exonmatches, int nexons, int *intronlengths) {
670   int i;
671   bool shortp;
672 
673   *deletep = false;
674   for (i = 0; i < nexons; i++) {
675     exonstatus[i] = KEEP;
676   }
677 
678   shortp = true;
679   i = 0;
680   while (i < nexons - 1 && shortp == true) {
681     if (short_exon_bylength(exonmatches[i],SHORTEXONLEN_END) == true &&
682 	short_exon_byprob(exonmatches[i],intronlengths[i],/*indexsize*/0,SHORTEXONPROB_END) == true) {
683       *deletep = true;
684       exonstatus[i] = DELETE;
685     } else {
686       shortp = false;
687     }
688     i++;
689   }
690 
691   shortp = true;
692   i = nexons - 1;
693   while (i > 0 && shortp == true) {
694     if (short_exon_bylength(exonmatches[i],SHORTEXONLEN_END) == true &&
695 	short_exon_byprob(exonmatches[i],intronlengths[i-1],/*indexsize*/0,SHORTEXONPROB_END) == true) {
696       *deletep = true;
697       exonstatus[i] = DELETE;
698     } else {
699       shortp = false;
700     }
701     --i;
702   }
703 
704   return;
705 }
706 #endif
707 
708 
709 #ifdef DEBUG
710 static void
print_exon_status(int * exonstatus,int * exonmatches,int * exonprotects,int nexons)711 print_exon_status (int *exonstatus, int *exonmatches, int *exonprotects, int nexons) {
712   int i;
713 
714   if (exonprotects == NULL) {
715     for (i = 0; i < nexons; i++) {
716       if (exonstatus[i] == KEEP) {
717 	printf("Long exon %d of %d matches => keep\n",i,exonmatches[i]);
718       } else if (exonstatus[i] == MARK) {
719 	printf("Short exon %d of %d matches => mark\n",i,exonmatches[i]);
720       } else if (exonstatus[i] == DELETE) {
721 	printf("Exon %d of %d matches => delete\n",i,exonmatches[i]);
722       } else {
723 	abort();
724       }
725     }
726   } else {
727     for (i = 0; i < nexons; i++) {
728       if (exonstatus[i] == KEEP) {
729 	printf("Long exon %d of %d matches (%d protected) => keep\n",i,exonmatches[i],exonprotects[i]);
730       } else if (exonstatus[i] == MARK) {
731 	printf("Short exon %d of %d matches (%d protected) => mark\n",i,exonmatches[i],exonprotects[i]);
732       } else if (exonstatus[i] == DELETE) {
733 	printf("Exon %d of %d matches (%d protected) => delete\n",i,exonmatches[i],exonprotects[i]);
734       } else {
735 	abort();
736       }
737     }
738   }
739 
740   return;
741 }
742 #endif
743 
744 
745 static List_T
delete_and_mark_exons(List_T pairs,Pairpool_T pairpool,int * exonstatus,bool markp,bool bysizep)746 delete_and_mark_exons (List_T pairs, Pairpool_T pairpool,
747 		       int *exonstatus, bool markp, bool bysizep) {
748   List_T newpairs = NULL;
749   Pair_T pair;
750   int currstatus, prevstatus = KEEP;
751   int i;
752 
753   i = 0;
754   currstatus = exonstatus[i];
755   while (pairs != NULL) {
756     /* pairptr = pairs; */
757     /* pairs = Pairpool_pop(pairs,&pair); */
758     pair = (Pair_T) pairs->first;
759     if (pair->gapp == true && big_gap_p(pair,bysizep) == true) {
760       prevstatus = currstatus;
761       currstatus = exonstatus[++i];
762       debug(printf("Gap observed\n"));
763       if (prevstatus != DELETE && currstatus != DELETE) {
764 #ifdef WASTE
765 	newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
766 #else
767 	newpairs = List_transfer_one(newpairs,&pairs);
768 #endif
769       } else {
770 	pairs = Pairpool_pop(pairs,&pair);
771       }
772 
773     } else if (currstatus == KEEP) {
774       /* debug(printf("Marking position %d as keep\n",pair->querypos)); */
775       if (prevstatus == DELETE) {
776 	if (newpairs != NULL) {
777 	  newpairs = Pairpool_push_gapholder(newpairs,pairpool,/*queryjump*/0,/*genomejump*/0,
778 					     /*leftpair*/newpairs->first,/*rightpair*/pair,/*knownp*/false);
779 	}
780 	prevstatus = /*currstatus*/ KEEP;
781       }
782 
783 #ifdef WASTE
784       newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
785 #else
786       newpairs = List_transfer_one(newpairs,&pairs);
787 #endif
788     } else if (currstatus == MARK) {
789       debug(printf("Marking position %d as short in pair %p\n",pair->querypos,pair));
790       if (prevstatus == DELETE) {
791 	if (newpairs != NULL) {
792 	  newpairs = Pairpool_push_gapholder(newpairs,pairpool,/*queryjump*/0,/*genomejump*/0,
793 					     /*leftpair*/newpairs->first,/*rightpair*/pair,/*knownp*/false);
794 	}
795 	prevstatus = /*currstatus*/ MARK;
796       }
797 
798       if (markp == true) {
799 	pair->shortexonp = true;
800       }
801 #ifdef WASTE
802       newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
803 #else
804       newpairs = List_transfer_one(newpairs,&pairs);
805 #endif
806     } else {
807       debug(printf("Marking position %d for deletion\n",pair->querypos));
808       pairs = Pairpool_pop(pairs,&pair);
809     }
810   }
811 
812   /* Remove gaps at end */
813   if (newpairs != NULL) {
814     pair = List_head(newpairs);
815   }
816   while (newpairs != NULL && pair->gapp == true) {
817     debug(printf("Popping gap at end\n"));
818     newpairs = Pairpool_pop(newpairs,&pair);
819   }
820 
821   /* Remove gaps at beginning */
822   newpairs = List_reverse(newpairs);
823   if (newpairs != NULL) {
824     pair = List_head(newpairs);
825   }
826   while (newpairs != NULL && pair->gapp == true) {
827     debug(printf("Popping gap at beginning\n"));
828     newpairs = Pairpool_pop(newpairs,&pair);
829   }
830 
831   debug(printf("Result of delete_and_mark_exons:\n"));
832   debug(Pair_dump_list(newpairs,/*zerobasedp*/true));
833   debug(printf("\n"));
834 
835   return newpairs;
836 }
837 
838 
839 #if 0
840 static List_T
841 mark_exons (List_T pairs,
842 #ifdef WASTE
843 	    Pairpool_T pairpool,
844 #endif
845 	    int *exonstatus) {
846   List_T newpairs = NULL, pairptr;
847   Pair_T pair;
848   int currstatus, prevstatus;
849   int i;
850 
851   debug(
852 	for (i = 0; i < nexons; i++) {
853 	  if (exonstatus[i] == KEEP) {
854 	    printf("Long exon %d of %d matches => keep\n",i,exonmatches[i]);
855 	  } else if (exonstatus[i] == MARK) {
856 	    printf("Short exon %d of %d matches => mark\n",i,exonmatches[i]);
857 	  } else if (exonstatus[i] == DELETE) {
858 	    printf("Exon %d of %d matches => delete\n",i,exonmatches[i]);
859 	  } else {
860 	    abort();
861 	  }
862 	}
863 	);
864 
865   i = 0;
866   currstatus = exonstatus[i];
867   while (pairs != NULL) {
868     pairptr = pairs;
869     pairs = Pairpool_pop(pairs,&pair);
870     if (pair->gapp == true && big_gap_p(pair,/*bysizep*/true) == true) {
871       prevstatus = currstatus;
872       currstatus = exonstatus[++i];
873       debug(printf("Gap observed\n"));
874       if (prevstatus != DELETE && currstatus != DELETE) {
875 #ifdef WASTE
876 	newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
877 #else
878 	newpairs = List_push_existing(newpairs,pairptr);
879 #endif
880       }
881 
882     } else if (currstatus == KEEP) {
883       /* debug(printf("Marking position %d as keep\n",pair->querypos)); */
884 #ifdef WASTE
885       newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
886 #else
887       newpairs = List_push_existing(newpairs,pairptr);
888 #endif
889     } else if (currstatus == MARK) {
890       debug(printf("Marking position %d as short in pair %p\n",pair->querypos,pair));
891       pair->shortexonp = true;
892 
893 #ifdef WASTE
894       newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
895 #else
896       newpairs = List_push_existing(newpairs,pairptr);
897 #endif
898     } else {
899       /* Normally would delete */
900 #ifdef WASTE
901       newpairs = Pairpool_push_existing(newpairs,pairpool,pair);
902 #else
903       newpairs = List_push_existing(newpairs,pairptr);
904 #endif
905     }
906   }
907 
908   debug(printf("Result of mark_exons:\n"));
909   debug(Pair_dump_list(newpairs,/*zerobasedp*/true));
910   debug(printf("\n"));
911 
912   return List_reverse(newpairs);
913 }
914 #endif
915 
916 
917 
918 void
Smooth_reset(List_T pairs)919 Smooth_reset (List_T pairs) {
920   List_T p;
921   Pair_T pair;
922 
923   for (p = pairs; p != NULL; p = List_next(p)) {
924     pair = List_head(p);
925     pair->shortexonp = false;
926   }
927   return;
928 }
929 
930 
931 
932 /* Needed for low-identity alignments */
933 /* Assumes pairs are from 1..querylength.  Reverses the pairs to be querylength..1 */
934 List_T
Smooth_pairs_by_netgap(bool * deletep,List_T pairs,Pairpool_T pairpool)935 Smooth_pairs_by_netgap (bool *deletep, List_T pairs, Pairpool_T pairpool) {
936   int *exonstatus;
937   int *exonlengths, *exonmatches, *exonprotects, *intronlengths, nexons, nintrons;
938 #ifdef DEBUG
939   int i;
940 #endif
941 
942   *deletep = false;
943   /* smooth_reset(pairs); */
944   if (pairs != NULL) {
945     /* Remove internal shorts */
946     exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/false);
947     intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/false);
948 
949     debug(
950 	  printf("Beginning of smoothing.  Initial structure:\n");
951 	  for (i = 0; i < nexons-1; i++) {
952 	    printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
953 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
954 	  }
955 	  printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
956 	  );
957 
958     debug(printf("\nFind internal shorts\n"));
959     exonstatus = (int *) MALLOCA(nexons * sizeof(int));
960     find_internal_shorts_by_netgap(&(*deletep),exonstatus,exonmatches,nexons,intronlengths);
961     debug(printf("\nRemove internal shorts\n"));
962     if (*deletep == true) {
963       debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
964       pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/false,/*bysizep*/false);
965     }
966 
967 #if 0
968     /* This is not correct */
969     debug(
970 	  printf("After removing internal shorts:\n");
971 	  for (i = 0; i < nexons-1; i++) {
972 	    printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
973 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
974 	  }
975 	  printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
976 	  );
977 #endif
978 
979     FREEA(exonstatus);
980     FREE(intronlengths);
981     FREE(exonprotects);
982     FREE(exonmatches);
983     FREE(exonlengths);
984 
985     debug(printf("Ending of smoothing\n\n"));
986   }
987 
988   return pairs;
989 }
990 
991 
992 /* Assumes pairs are from 1..querylength.  Reverses the pairs to be querylength..1 */
993 List_T
Smooth_pairs_by_size(bool * shortp,bool * deletep,List_T pairs,Pairpool_T pairpool,int stage2_indexsize)994 Smooth_pairs_by_size (bool *shortp, bool *deletep, List_T pairs, Pairpool_T pairpool, int stage2_indexsize) {
995   int *exonstatus;
996   int *exonlengths, *exonmatches, *exonprotects, *intronlengths, nexons, nintrons;
997   bool delete2p;
998 #ifdef DEBUG
999   int i;
1000 #endif
1001 
1002   *shortp = *deletep = false;
1003 
1004 #if 0
1005   /* Should be handled instead by trim_noncanonical_end5 and trim_noncanonical_end3 */
1006   /* smooth_reset(pairs); */
1007   if (pairs != NULL) {
1008     /* Trim ends */
1009     exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1010     intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1011 
1012     debug(printf("\nFind end shorts\n"));
1013     exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1014     find_end_shorts(&delete1p,exonstatus,exonmatches,nexons,intronlengths);
1015     if (delete1p == true) {
1016       *deletep = true;
1017       debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
1018       pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/false,/*bysizep*/true);
1019     }
1020 
1021     FREEA(exonstatus);
1022 
1023     FREE(intronlengths);
1024     FREE(exonprotects);
1025     FREE(exonmatches);
1026     FREE(exonlengths);
1027   }
1028 #endif
1029 
1030   if (pairs != NULL) {
1031     /* Remove internal shorts */
1032     exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1033     intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1034 
1035     debug(
1036 	  printf("Beginning of smoothing.  Initial structure:\n");
1037 	  for (i = 0; i < nexons-1; i++) {
1038 	    printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
1039 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
1040 	  }
1041 	  printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
1042 	  );
1043 
1044     debug(printf("\nFind internal shorts\n"));
1045     exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1046     find_internal_shorts_by_size(&(*shortp),&delete2p,exonstatus,exonmatches,exonprotects,nexons,intronlengths,stage2_indexsize);
1047     debug(printf("\nRemove internal shorts\n"));
1048     if (delete2p == true) {
1049       *deletep = true;
1050     }
1051     if (delete2p == true || *shortp == true) {
1052       debug(print_exon_status(exonstatus,exonmatches,exonprotects,nexons));
1053       pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/true,/*bysizep*/true);
1054     }
1055 
1056     debug(
1057 	  printf("After removing internal shorts:\n");
1058 	  for (i = 0; i < nexons-1; i++) {
1059 	    printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1060 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
1061 	  }
1062 	  printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1063 	  );
1064 
1065     FREEA(exonstatus);
1066     FREE(intronlengths);
1067     FREE(exonprotects);
1068     FREE(exonmatches);
1069     FREE(exonlengths);
1070 
1071   }
1072 
1073   debug(printf("Ending of smoothing\n\n"));
1074 
1075   return pairs;
1076 }
1077 
1078 
1079 #if 0
1080 List_T
1081 Smooth_mark_short_exons (List_T pairs, Pairpool_T pairpool, int stage2_indexsize) {
1082   int *exonstatus;
1083   int *exonlengths, *exonmatches, *intronlengths, nexons, nintrons;
1084   bool shortp;
1085   bool delete1p, delete2p;
1086 #ifdef DEBUG
1087   int i;
1088 #endif
1089 
1090   shortp = false;
1091   /* smooth_reset(pairs); */
1092   if (pairs != NULL) {
1093     /* Trim ends */
1094     exonlengths = get_exonlengths(&exonmatches,&nexons,pairs,/*bysizep*/true);
1095     intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1096 
1097     debug(printf("\nFind end shorts\n"));
1098     exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1099     find_end_shorts(&delete1p,exonstatus,exonmatches,nexons,intronlengths);
1100     pairs = mark_exons(pairs,
1101 #ifdef WASTE
1102 		       pairpool,
1103 #endif
1104 		       exonstatus);
1105 
1106     FREEA(exonstatus);
1107 
1108     FREE(intronlengths);
1109     FREE(exonmatches);
1110     FREE(exonlengths);
1111   }
1112 
1113   if (pairs != NULL) {
1114     /* Find internal shorts */
1115     exonlengths = get_exonlengths(&exonmatches,&exonprotects,&nexons,pairs,/*bysizep*/true);
1116     intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/true);
1117 
1118     debug(
1119 	  printf("Beginning of smoothing.  Initial structure:\n");
1120 	  for (i = 0; i < nexons-1; i++) {
1121 	    printf("Exon %d of length %d (%d matches, %d protects)\n",i,exonlengths[i],exonmatches[i],exonprotects[i]);
1122 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
1123 	  }
1124 	  printf("Exon %d of length %d (%d matches)\n",nexons-1,exonlengths[nexons-1],exonmatches[nexons-1]);
1125 	  );
1126 
1127     debug(printf("\nFind internal shorts\n"));
1128     exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1129     find_internal_shorts_by_size(&shortp,&delete2p,exonstatus,exonmatches,nexons,intronlengths,stage2_indexsize);
1130     debug(printf("\nMark internal shorts\n"));
1131     pairs = mark_exons(pairs,
1132 #ifdef WASTE
1133 		       pairpool,
1134 #endif
1135 		       exonstatus);
1136 
1137     debug(
1138 	  printf("After marking internal shorts:\n");
1139 	  for (i = 0; i < nexons-1; i++) {
1140 	    printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1141 	    printf("Intron %d of length %d\n",i,intronlengths[i]);
1142 	  }
1143 	  printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1144 	  );
1145 
1146     FREEA(exonstatus);
1147     FREE(intronlengths);
1148     FREE(exonmatches);
1149     FREE(exonlengths);
1150 
1151   }
1152 
1153   debug(printf("End of Smooth_mark_short_exons\n\n"));
1154 
1155   return pairs;
1156 }
1157 #endif
1158 
1159 
1160 
1161 List_T
Smooth_pairs_by_intronprobs(bool * badp,List_T pairs,Pairpool_T pairpool)1162 Smooth_pairs_by_intronprobs (bool *badp, List_T pairs, Pairpool_T pairpool) {
1163   int *exonstatus;
1164   int *exonmatches, nexons, nintrons;
1165   int *intron_matches_left, *intron_denominator_left, *intron_matches_right, *intron_denominator_right;
1166   double *donor_probs, *acceptor_probs;
1167   int total_matches, total_denominator;
1168 #ifdef DEBUG
1169   int *intronlengths;
1170   int i;
1171 #endif
1172 
1173   debug(Pair_dump_list(pairs,true));
1174 
1175   *badp = false;
1176   /* smooth_reset(pairs); */
1177   if (pairs != NULL) {
1178     /* Remove internal shorts */
1179     get_intronprobs(&donor_probs,&acceptor_probs,&nintrons,pairs);
1180     if (nintrons > 0) {
1181       get_exonmatches(&total_matches,&total_denominator,&exonmatches,&nexons,pairs);
1182       debug(intronlengths = get_intronlengths(&nintrons,pairs,/*bysizep*/false));
1183 
1184       pairs = get_intron_neighborhoods(&intron_matches_left,&intron_denominator_left,
1185 				       &intron_matches_right,&intron_denominator_right,nintrons,pairs);
1186 
1187       debug(
1188 	    printf("Beginning of smoothing.  Initial structure:\n");
1189 	    for (i = 0; i < nexons-1; i++) {
1190 	      printf("Exon %d with %d matches and %d denominator\n",i,exonmatches[i],exon_denominator[i]);
1191 	      printf("Intron %d, length %d, with probs %f+%f and matches %d/%d and %d/%d\n",
1192 		     i,intronlengths[i],donor_probs[i],acceptor_probs[i],intron_matches_left[i],intron_denominator_left[i],
1193 		     intron_matches_right[i],intron_denominator_right[i]);
1194 	    }
1195 	    printf("Exon %d with %d matches and %d denominator\n",nexons-1,exonmatches[nexons-1],exon_denominator[nexons-1]);
1196 	    );
1197 
1198       debug(printf("\nFind internal bads\n"));
1199       exonstatus = (int *) MALLOCA(nexons * sizeof(int));
1200       find_internal_bads_by_prob(&(*badp),exonstatus,exonmatches,nexons,
1201 				 donor_probs,acceptor_probs,
1202 				 intron_matches_left,intron_denominator_left,
1203 				 intron_matches_right,intron_denominator_right,
1204 				 total_matches,total_denominator);
1205       debug(printf("\nRemove internal bads\n"));
1206       if (*badp == false) {
1207 	debug(printf("No internal bads\n"));
1208       } else {
1209 	debug(print_exon_status(exonstatus,exonmatches,/*exonprotects*/NULL,nexons));
1210 	pairs = delete_and_mark_exons(pairs,pairpool,exonstatus,/*markp*/true,/*bysizep*/false);
1211       }
1212 
1213 #if 0
1214       /* This is not correct */
1215       debug(
1216 	    printf("After removing internal shorts:\n");
1217 	    for (i = 0; i < nexons-1; i++) {
1218 	      printf("Exon %d of length %d (%d matches)\n",i,exonlengths[i],exonmatches[i]);
1219 	      printf("Intron %d of length %d\n",i,intronlengths[i]);
1220 	    }
1221 	    printf("Exon %d of length %d\n",nexons-1,exonlengths[nexons-1]);
1222 	    );
1223 #endif
1224 
1225       debug(FREE(intronlengths));
1226       FREEA(exonstatus);
1227       FREE(intron_denominator_right);
1228       FREE(intron_matches_right);
1229       FREE(intron_denominator_left);
1230       FREE(intron_matches_left);
1231       FREE(exonmatches);
1232 
1233       debug(printf("Ending of smoothing\n\n"));
1234     }
1235 
1236     FREE(acceptor_probs);
1237     FREE(donor_probs);
1238   }
1239 
1240   return pairs;
1241 }
1242