1 static char rcsid[] = "$Id: gregion.c 218147 2019-01-16 21:28:41Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "gregion.h"
7 #include <stdlib.h>
8 #include "assert.h"
9 #include "mem.h"
10 #include "indexdb.h"		/* For SUFFICIENT_SUPPORT */
11 #include "univinterval.h"
12 #include "chrnum.h"
13 
14 #define USE_CLEAN 1
15 
16 #ifdef USE_CLEAN
17 #include "uinttable_rh.h"
18 #endif
19 
20 
21 #define MAX_GENOMICLENGTH 2000000
22 #define MAX_NCHRS_FOR_ALLOCA 100
23 #define MAX_GREGIONS_FOR_ALLOCA 100
24 
25 
26 #define EXTRA_SHORTEND  30000
27 #define EXTRA_LONGEND   100000
28 
29 #ifdef DEBUG
30 #define debug(x) x
31 #else
32 #define debug(x)
33 #endif
34 
35 /* filter_clean */
36 #ifdef DEBUG2
37 #define debug2(x) x
38 #else
39 #define debug2(x)
40 #endif
41 
42 /* Sufficient support */
43 #ifdef DEBUG5
44 #define debug5(x) x
45 #else
46 #define debug5(x)
47 #endif
48 
49 
50 #define T Gregion_T
51 struct T {
52   int nexons;
53 
54   Chrpos_T genomiclength;
55   bool plusp;
56   int genestrand;
57 
58   Chrpos_T extension5;
59   Chrpos_T extension3;
60   bool extendedp;
61 
62   Chrnum_T chrnum;
63 
64   Chrpos_T chrstart;
65   Chrpos_T chrend;
66 
67   Chrpos_T extentstart;
68   Chrpos_T extentend;
69 
70   Univcoord_T chroffset;	/* This is for chr, not the segment */
71   Univcoord_T chrhigh;
72   Chrpos_T chrlength;
73 
74   int querystart;		/* Used only for maponly mode */
75   int queryend;			/* Used only for maponly mode */
76   int matchsize;		/* Used only for maponly mode */
77 
78   int trimstart;
79   int trimend;
80   bool sufficient_support_p;
81 
82   double weight;		/* Product of match weights */
83   int support;
84 
85   int ncovered;
86   int source;			/* Oligoindex source in stage 2 */
87 
88   /* For cleaning */
89 
90 
91   int total_up;
92   int total_down;
93   Chrpos_T bestprev_ceiling;
94   Chrpos_T bestprev_floor;
95   int score_ceiling;
96   int score_floor;
97 
98 #ifdef USE_CLEAN
99   bool bounded_low_p;
100   bool bounded_high_p;
101 #endif
102 };
103 
104 
105 
106 void
Gregion_print(T this)107 Gregion_print (T this) {
108 
109 #if 1
110   /* Off for debugging */
111   printf(" %d..%d ",this->querystart,this->queryend);
112 #endif
113   printf("%u %u %u %u #%d:%u..%u  length:%u  weight:%.2f  support:%d",
114 	 this->extentstart,this->extentend,this->chrstart,this->chrend,
115 	 this->chrnum,this->chrstart,this->chrend,this->genomiclength,
116 	 this->weight,this->support);
117 #ifdef USE_CLEAN
118   printf("  bounded_low:%d, bounded_high:%d",this->bounded_low_p,this->bounded_high_p);
119 #endif
120 
121   printf("\n");
122 
123   return;
124 }
125 
126 
127 void
Gregion_free(T * old)128 Gregion_free (T *old) {
129   FREE(*old);
130   return;
131 }
132 
133 Univcoord_T
Gregion_genomicstart(T this)134 Gregion_genomicstart (T this) {
135   return this->chroffset + this->chrstart;
136 }
137 
138 Univcoord_T
Gregion_genomicend(T this)139 Gregion_genomicend (T this) {
140   return this->chroffset + this->chrend;
141 }
142 
143 Chrpos_T
Gregion_chrstart(T this)144 Gregion_chrstart (T this) {
145   return this->chrstart;
146 }
147 
148 Chrpos_T
Gregion_chrend(T this)149 Gregion_chrend (T this) {
150   return this->chrend;
151 }
152 
153 Chrpos_T
Gregion_genomiclength(T this)154 Gregion_genomiclength (T this) {
155   return this->genomiclength;
156 }
157 
158 bool
Gregion_plusp(T this)159 Gregion_plusp (T this) {
160   return this->plusp;
161 }
162 
163 bool
Gregion_revcompp(T this)164 Gregion_revcompp (T this) {
165   return !(this->plusp);
166 }
167 
168 int
Gregion_genestrand(T this)169 Gregion_genestrand (T this) {
170   return this->genestrand;
171 }
172 
173 Chrnum_T
Gregion_chrnum(T this)174 Gregion_chrnum (T this) {
175   return this->chrnum;
176 }
177 
178 /* Used only for debugging.  String is allocated and should be freed. */
179 char *
Gregion_chr(T this,Univ_IIT_T chromosome_iit)180 Gregion_chr (T this, Univ_IIT_T chromosome_iit) {
181   return Chrnum_to_string(this->chrnum,chromosome_iit);
182 }
183 
184 Univcoord_T
Gregion_chroffset(T this)185 Gregion_chroffset (T this) {
186   return this->chroffset;
187 }
188 
189 Univcoord_T
Gregion_chrhigh(T this)190 Gregion_chrhigh (T this) {
191   return this->chrhigh;
192 }
193 
194 Chrpos_T
Gregion_chrlength(T this)195 Gregion_chrlength (T this) {
196   return this->chrlength;
197 }
198 
199 int
Gregion_querystart(T this)200 Gregion_querystart (T this) {
201   return this->querystart;
202 }
203 
204 int
Gregion_queryend(T this)205 Gregion_queryend (T this) {
206   return this->queryend;
207 }
208 
209 int
Gregion_matchsize(T this)210 Gregion_matchsize (T this) {
211   return this->matchsize;
212 }
213 
214 double
Gregion_weight(T this)215 Gregion_weight (T this) {
216   return this->weight;
217 }
218 
219 int
Gregion_support(T this)220 Gregion_support (T this) {
221   return this->support;
222 }
223 
224 bool
Gregion_extendedp(T this)225 Gregion_extendedp (T this) {
226   return this->extendedp;
227 }
228 
229 void
Gregion_set_ncovered(T this,int ncovered,int source)230 Gregion_set_ncovered (T this, int ncovered, int source) {
231   this->ncovered = ncovered;
232   this->source = source;
233   return;
234 }
235 
236 int
Gregion_ncovered(T this)237 Gregion_ncovered (T this) {
238   return this->ncovered;
239 }
240 
241 
242 
243 T
Gregion_new(int nexons,Univcoord_T genomicstart,Univcoord_T genomicend,bool plusp,int genestrand,Univ_IIT_T chromosome_iit,int querystart,int queryend,int querylength,int matchsize,int trimstart,int trimend,int circular_typeint)244 Gregion_new (int nexons, Univcoord_T genomicstart, Univcoord_T genomicend,
245 	     bool plusp, int genestrand, Univ_IIT_T chromosome_iit, int querystart, int queryend, int querylength,
246 	     int matchsize, int trimstart, int trimend, int circular_typeint) {
247   T new = (T) MALLOC(sizeof(*new));
248 
249   debug(printf("Creating gregion with genomicstart %u, genomicend %u, trimstart %d, trimend %d\n",
250 	       genomicstart,genomicend,trimstart,trimend));
251 
252   new->nexons = nexons;
253   if (chromosome_iit == NULL) {
254     /* Should not reach here, since user_genomicseg does not call stage 1 */
255     new->chrnum = 0;
256     new->chroffset = 0U;
257     new->chrlength = 0U;
258   } else {
259     new->chrnum = Univ_IIT_get_one(chromosome_iit,genomicstart,genomicstart);
260     /* new->chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,new->chrnum)); */
261     /* new->chrhigh = Univinterval_high(Univ_IIT_interval(chromosome_iit,new->chrnum)); */
262     Univ_IIT_interval_bounds(&new->chroffset,&new->chrhigh,&new->chrlength,
263 			     chromosome_iit,new->chrnum,circular_typeint);
264   }
265 
266   assert(genomicstart < genomicend);
267   new->genomiclength = genomicend - genomicstart;
268 
269   new->chrstart = genomicstart - new->chroffset;
270   new->chrend = new->chrstart + new->genomiclength;
271 
272   new->plusp = plusp;
273   new->genestrand = genestrand;
274 
275   if (plusp == true) {
276     if (new->chrstart < (Chrpos_T) querystart) {
277       /* Extends past beginning of genome */
278       new->extentstart = 0;
279     } else {
280       new->extentstart = new->chrstart - querystart;
281     }
282     new->extentend = new->chrend + (querylength - queryend);
283   } else {
284     if (new->chrstart < (Chrpos_T) (querylength - queryend)) {
285       /* Extends past beginning of genome */
286       new->extentstart = 0;
287     } else {
288       new->extentstart = new->chrstart - (querylength - queryend);
289     }
290     new->extentend = new->chrend + querystart;
291   }
292 
293   new->extension5 = 0U;
294   new->extension3 = 0U;
295   new->extendedp = false;
296 
297   new->querystart = querystart;
298   new->queryend = queryend;
299   new->matchsize = matchsize;
300 
301   new->trimstart = trimstart;
302   new->trimend = trimend;
303 
304 #ifdef PMAP
305   debug5(printf("  Testing bound5+extension5 = %d - %u < %d + %d, bound3+extension3 = %d + %u > %d - %d\n",
306 		new->querystart,new->extension5,trimstart,SUFFICIENT_SUPPORT,
307 		new->queryend,new->extension3,trimend,SUFFICIENT_SUPPORT));
308   if (new->querystart - new->extension5 < trimstart + SUFFICIENT_SUPPORT &&
309       new->queryend + new->extension3 > trimend - SUFFICIENT_SUPPORT) {
310     new->sufficient_support_p = true;
311   } else {
312     new->sufficient_support_p = false;
313   }
314 #else
315   debug5(printf("  Testing bound5+extension5 = %d - %u < %d + %d, bound3 = %d + %u > %d - %d\n",
316 		new->querystart,new->extension5,trimstart,SUFFICIENT_SUPPORT,
317 		new->queryend,new->extension3,trimend,SUFFICIENT_SUPPORT));
318   if (new->querystart - new->extension5 < (Chrpos_T) (trimstart + SUFFICIENT_SUPPORT) &&
319       new->queryend + new->extension3 > (Chrpos_T) (trimend - SUFFICIENT_SUPPORT)) {
320     new->sufficient_support_p = true;
321   } else {
322     new->sufficient_support_p = false;
323   }
324 #endif
325 
326   new->weight = 0.0;
327   new->support = queryend - querystart + matchsize;
328 
329   new->total_up = 0;
330   new->total_down = 0;
331   new->bestprev_ceiling = 0;
332   new->bestprev_floor = (Chrpos_T) -1;
333   new->score_ceiling = 0;
334   new->score_floor = 0;
335 
336 #ifdef USE_CLEAN
337   new->bounded_low_p = false;
338   new->bounded_high_p = false;
339 #endif
340 
341   return new;
342 }
343 
344 
345 T
Gregion_new_from_matches(Match_T match5,Match_T match3,int genestrand,Univ_IIT_T chromosome_iit,int querylength,int matchsize,int trimstart,int trimend,int circular_typeint)346 Gregion_new_from_matches (Match_T match5, Match_T match3, int genestrand, Univ_IIT_T chromosome_iit,
347 			  int querylength, int matchsize, int trimstart, int trimend, int circular_typeint) {
348   T gregion;
349   Univcoord_T genomicstart, genomicend;
350   int querystart, queryend;
351 
352   querystart = Match_querypos(match5);
353   queryend = Match_querypos(match3);
354 
355   if (Match_forwardp(match5)) {
356     genomicstart = Match_position(match5);
357     genomicend = Match_position(match3) + 1U;
358     /* chrnum = Match_chrnum(match5); */
359 
360   } else {
361     genomicstart = Match_position(match3);
362     genomicend = Match_position(match5) + 1U;
363     /* chrnum = Match_chrnum(match3); */
364   }
365 
366 #if 0
367   if (chromosome_iit == NULL) {
368     chroffset = 0U;
369   } else {
370     chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,chrnum));
371   }
372 #endif
373 
374   debug(printf("Coordinates are %u .. %u\n",genomicstart,genomicend));
375 
376   gregion = Gregion_new(/*nexons*/0,genomicstart,genomicend,Match_forwardp(match5),genestrand,
377 			chromosome_iit,querystart,queryend,querylength,matchsize,trimstart,trimend,
378 			circular_typeint);
379 
380   gregion->weight = Match_weight(match5) * Match_weight(match3);
381   Match_incr_npairings(match5);
382   Match_incr_npairings(match3);
383 
384   return gregion;
385 }
386 
387 
388 #if 0
389 static int
390 weight_cmp (const void *x, const void *y) {
391   T a = * (T *) x;
392   T b = * (T *) y;
393 
394   if (a->weight > b->weight) {
395     return -1;
396   } else if (a->weight < b->weight) {
397     return +1;
398   } else if (a->support > b->support) {
399     return -1;
400   } else if (a->support < b->support) {
401     return +1;
402   } else if (a->genomiclength > b->genomiclength) {
403     return -1;
404   } else if (a->genomiclength < b->genomiclength) {
405     return +1;
406   } else {
407     return 0;
408   }
409 }
410 #endif
411 
412 
413 static int
support_cmp(const void * x,const void * y)414 support_cmp (const void *x, const void *y) {
415   T a = * (T *) x;
416   T b = * (T *) y;
417 
418   if (a->support > b->support) {
419     /* Larger support should beat smaller support */
420     return -1;
421   } else if (a->support < b->support) {
422     return +1;
423 
424   } else if (a->genomiclength < b->genomiclength) {
425     /* For the same support, prefer more compact regions */
426     return -1;
427   } else if (a->genomiclength > b->genomiclength) {
428     return +1;
429   } else {
430     return 0;
431   }
432 }
433 
434 
435 
436 /* Not intended for qsort.  Returns 0 when not comparable. */
437 static bool
gregion_overlap_p(T x,T y)438 gregion_overlap_p (T x, T y) {
439   Univcoord_T x_genomicstart, x_genomicend, y_genomicstart, y_genomicend;
440   Univcoord_T overlap;
441   double fraction;
442   bool plusp;
443 
444   if (x->plusp != y->plusp) {
445     debug(printf("Case 0 => different strands\n"));
446     return false;			/* Different strands */
447 
448   } else {
449     plusp = x->plusp;
450     x_genomicstart = x->chroffset + x->chrstart;
451     x_genomicend = x->chroffset + x->chrend;
452     y_genomicstart = y->chroffset + y->chrstart;
453     y_genomicend = y->chroffset + y->chrend;
454 
455     if (y_genomicstart > x_genomicend || x_genomicstart > y_genomicend) {
456       debug(printf("Case 0: x %u..%u, y %u..%u => no overlap\n",
457 		   x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
458       /*
459       /-- x --/ /-- y --/ or /-- y --/ /-- x --/
460       */
461       return false;		/* No overlap */
462 
463     } else if (y_genomicstart < x_genomicstart) {
464       if (y_genomicend < x_genomicend) {
465 	debug(printf("Case 1: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
466 
467 	/*
468 	    /-- x --/
469 	/-- y --/
470 	*/
471 
472 	if (plusp == true && (y->querystart >= x->querystart || y->queryend >= x->queryend)) {
473 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
474 		       x->querystart,x->queryend,y->querystart,y->queryend));
475 	  return false;
476 
477 	} else if (plusp == false && (y->querystart <= x->querystart || y->queryend <= x->queryend)) {
478 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
479 		       x->querystart,x->queryend,y->querystart,y->queryend));
480 	  return false;
481 
482 	} else {
483 	  overlap = y_genomicend - x_genomicstart;
484 	  if (y_genomicend - y_genomicstart < x_genomicend - x_genomicstart) {
485 	    fraction = (double) overlap/(double) (y_genomicend - y_genomicstart);
486 	  } else {
487 	    fraction = (double) overlap/(double) (x_genomicend - x_genomicstart);
488 	  }
489 	  debug(printf(" => fraction %f",fraction));
490 	  if (fraction > 0.5) {
491 	    debug(printf(" => overlap\n"));
492 	    return true;
493 	  } else {
494 	    debug(printf(" => no overlap\n"));
495 	    return false;
496 	  }
497 	}
498 
499       } else if (y_genomicend > x_genomicend) {
500 	debug(printf("Case 2: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
501 
502 	/*
503 	    /-- x --/
504 	/----- y -----/
505 	*/
506 
507 	/* Note: Like case 4, both branches are true for plus and minus */
508 	if (plusp == true && (y->querystart >= x->querystart || y->queryend <= x->queryend)) {
509 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
510 		       x->querystart,x->queryend,y->querystart,y->queryend));
511 	  return false;
512 
513 	} else if (plusp == false && (y->querystart >= x->querystart || y->queryend <= x->queryend)) {
514 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
515 		       x->querystart,x->queryend,y->querystart,y->queryend));
516 	  return false;
517 
518 	} else {
519 	  debug(printf(" => subsumption\n"));
520 	  return true;
521 	}
522 
523       } else {
524 	/* y_genomicend == x_genomicend */
525 	debug(printf("Case 3: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
526 
527 	/*
528 	    /-- x --/
529 	/----- y ---/
530 	*/
531 
532 	if (plusp == true && (y->querystart >= x->querystart || y->queryend != x->queryend)) {
533 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
534 		       x->querystart,x->queryend,y->querystart,y->queryend));
535 	  return false;
536 
537 	} else if (plusp == false && (y->querystart != x->querystart || y->queryend <= x->queryend)) {
538 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
539 		       x->querystart,x->queryend,y->querystart,y->queryend));
540 	  return false;
541 
542 	} else {
543 	  debug(printf(" => subsumption\n"));
544 	  return true;
545 	}
546 
547       }
548 
549     } else if (y_genomicstart > x_genomicstart) {
550       if (y_genomicend < x_genomicend) {
551 	debug(printf("Case 4: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
552 
553 	/*
554 	/----- x -----/
555 	  /-- y --/
556 	*/
557 
558 	/* Note: Like case 2, both branches are true for plus and minus */
559 	if (plusp == true && (y->querystart <= x->querystart || y->queryend >= x->queryend)) {
560 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
561 		       x->querystart,x->queryend,y->querystart,y->queryend));
562 	  return false;
563 
564 	} else if (plusp == false && (y->querystart <= x->querystart || y->queryend >= x->queryend)) {
565 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
566 		       x->querystart,x->queryend,y->querystart,y->queryend));
567 	  return false;
568 
569 	} else {
570 	  debug(printf(" => subsumption\n"));
571 	  return true;
572 	}
573 
574       } else if (y_genomicend > x_genomicend) {
575 	debug(printf("Case 5: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
576 
577 	/*
578 	/-- x --/
579 	  /-- y --/
580 	*/
581 
582 	if (plusp == true && (y->querystart <= x->querystart || y->queryend <= x->queryend)) {
583 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
584 		       x->querystart,x->queryend,y->querystart,y->queryend));
585 	  return false;
586 
587 	} else if (plusp == false && (y->querystart >= x->querystart || y->queryend >= x->queryend)) {
588 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
589 		       x->querystart,x->queryend,y->querystart,y->queryend));
590 	  return false;
591 
592 	} else {
593 	  overlap = x_genomicend - y_genomicstart;
594 	  if (y_genomicend - y_genomicstart < x_genomicend - x_genomicstart) {
595 	    fraction = (double) overlap/(double) (y_genomicend - y_genomicstart);
596 	  } else {
597 	    fraction = (double) overlap/(double) (x_genomicend - x_genomicstart);
598 	  }
599 	  debug(printf(" => fraction %f",fraction));
600 	  if (fraction > 0.5) {
601 	    debug(printf(" => overlap\n"));
602 	    return true;
603 	  } else {
604 	    debug(printf(" => no overlap\n"));
605 	    return false;
606 	  }
607 	}
608 
609       } else {
610 	/* y_genomicend == x_genomicend */
611 	debug(printf("Case 6: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
612 
613 	/*
614 	/-- x ----/
615 	  /-- y --/
616 	*/
617 
618 	if (plusp == true && (y->querystart <= x->querystart || y->queryend != x->queryend)) {
619 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
620 		       x->querystart,x->queryend,y->querystart,y->queryend));
621 	  return false;
622 
623 	} else if (plusp == false && (y->querystart != x->querystart || y->queryend >= x->queryend)) {
624 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
625 		       x->querystart,x->queryend,y->querystart,y->queryend));
626 	  return false;
627 
628 	} else {
629 	  debug(printf(" => subsumption\n"));
630 	  return true;
631 	}
632 
633       }
634 
635     } else {
636       /* y_genomicstart == x_genomicstart */
637 
638       if (y_genomicend < x_genomicend) {
639 	debug(printf("Case 7: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
640 
641 	/*
642         /----- x --/
643 	/-- y --/
644 	*/
645 
646 	if (plusp == true && (y->querystart != x->querystart || y->queryend >= x->queryend)) {
647 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
648 		       x->querystart,x->queryend,y->querystart,y->queryend));
649 	  return false;
650 
651 	} else if (plusp == false && (y->querystart <= x->querystart || y->queryend != x->queryend)) {
652 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
653 		       x->querystart,x->queryend,y->querystart,y->queryend));
654 	  return false;
655 
656 	} else {
657 	  debug(printf(" => subsumption\n"));
658 	  return true;
659 	}
660 
661       } else if (y_genomicend > x_genomicend) {
662 	debug(printf("Case 8: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
663 
664 	/*
665 	    /-- x --/
666 	    /-- y -----/
667 	*/
668 
669 	if (plusp == true && (y->querystart != x->querystart || y->queryend <= x->queryend)) {
670 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
671 		       x->querystart,x->queryend,y->querystart,y->queryend));
672 	  return false;
673 
674 	} else if (plusp == false && (y->querystart >= x->querystart || y->queryend != x->queryend)) {
675 	  debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
676 		       x->querystart,x->queryend,y->querystart,y->queryend));
677 	  return false;
678 
679 	} else {
680 	  debug(printf(" => subsumption\n"));
681 	  return true;
682 	}
683 
684       } else {
685 	/* y_genomicend == x_genomicend */
686 	debug(printf("Case 9: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
687 
688 	/*
689 	    /-- x --/
690 	    /-- y --/
691 	*/
692 
693 	debug(printf(" => equality\n"));
694 	return true;
695       }
696     }
697   }
698 }
699 
700 
701 List_T
Gregion_filter_unique(List_T gregionlist)702 Gregion_filter_unique (List_T gregionlist) {
703   List_T unique = NULL;
704   T x, y, gregion, *array;
705   int n, i, j;
706   bool *eliminate;
707 #ifdef DEBUG
708   List_T p;
709 #endif
710 
711   n = List_length(gregionlist);
712   if (n == 0) {
713     return NULL;
714   }
715 
716   debug(
717 	for (p = gregionlist, i = 0; p != NULL; p = p->rest, i++) {
718 	  gregion = (T) p->first;
719 	  printf("  Initial %d: %d..%d #%d:%u-%u (plusp = %d)\n",
720 		 i,gregion->querystart,gregion->queryend,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
721 	}
722 	);
723 
724   if (n < MAX_GREGIONS_FOR_ALLOCA) {
725     eliminate = (bool *) CALLOCA(n,sizeof(bool));
726     array = (T *) MALLOCA(n * sizeof(T));
727     List_fill_array_and_free((void **) array,&gregionlist);
728   } else {
729     eliminate = (bool *) CALLOC(n,sizeof(bool));
730     array = (T *) List_to_array(gregionlist,NULL);
731     List_free(&gregionlist);
732   }
733 
734   /* qsort(array,n,sizeof(T),weight_cmp); */
735   qsort(array,n,sizeof(T),support_cmp);
736 
737   debug(
738 	for (i = 0; i < n; i++) {
739 	  gregion = array[i];
740 	  printf("  After sorting by support %d: %d..%d #%d:%u-%u (plusp = %d)\n",
741 		 i,gregion->querystart,gregion->queryend,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
742 	}
743 	);
744 
745   for (i = 0; i < n; i++) {
746     x = array[i];
747     for (j = i+1; j < n; j++) {
748       y = array[j];
749 #ifdef DEBUG
750       printf("Comparing these regions %d and %d:\n",i,j);
751       printf("   ");
752       Gregion_print(x);
753       printf("   ");
754       Gregion_print(y);
755       printf("\n");
756 #endif
757       if (gregion_overlap_p(x,y) == true) {
758 	debug(printf(" => Found overlap.  Eliminating %d\n\n",j));
759 	eliminate[j] = true;
760       }
761     }
762   }
763 
764   for (i = n-1; i >= 0; i--) {
765     gregion = array[i];
766     if (eliminate[i] == false) {
767       debug(printf("  Keeping #%d:%u-%u (plusp = %d)\n",
768 		   gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp));
769       unique = List_push(unique,(void *) gregion);
770     } else {
771       debug(printf("  Eliminating #%d:%u-%u (plusp = %d)\n",
772 		   gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp));
773       /*
774       if (gregion->match5 != NULL) {
775 	Match_decr_npairings(gregion->match5);
776 	Match_decr_npairings(gregion->match3);
777       }
778       */
779       Gregion_free(&gregion);
780     }
781   }
782 
783   if (n < MAX_GREGIONS_FOR_ALLOCA) {
784     FREEA(eliminate);
785     FREEA(array);
786   } else {
787     FREE(eliminate);
788     FREE(array);
789   }
790 
791   debug(
792 	for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
793 	  gregion = (T) p->first;
794 	  printf("  Final %d: #%d:%u-%u (plusp = %d)\n",
795 		 i,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
796 	}
797 	);
798 
799   return unique;
800 }
801 
802 
803 List_T
Gregion_filter_support(List_T gregionlist,int boundary,double pct_max,int diff_max)804 Gregion_filter_support (List_T gregionlist, int boundary, double pct_max, int diff_max) {
805   List_T good = NULL, p;
806   int threshold, maxsupport = 0;
807   T gregion;
808 
809   if (gregionlist == NULL) {
810     return NULL;
811   }
812 
813   for (p = gregionlist; p != NULL; p = List_next(p)) {
814     gregion = (T) List_head(p);
815     if (gregion->support > maxsupport) {
816       maxsupport = gregion->support;
817     }
818   }
819 
820   if (maxsupport > boundary) {
821     /* Use diff_max */
822     for (p = gregionlist; p != NULL; p = List_next(p)) {
823       gregion = (T) List_head(p);
824       if (maxsupport - gregion->support < diff_max) {
825 	good = List_push(good,gregion);
826       } else {
827 	Gregion_free(&gregion);
828       }
829     }
830   } else {
831     threshold = (int) ((double) maxsupport * pct_max);
832     /* Use threshold only */
833     for (p = gregionlist; p != NULL; p = List_next(p)) {
834       gregion = (T) List_head(p);
835       if (gregion->support >= threshold) {
836 	good = List_push(good,gregion);
837       } else {
838 	Gregion_free(&gregion);
839       }
840     }
841   }
842 
843   List_free(&gregionlist);
844   return List_reverse(good);
845 }
846 
847 
848 double
Gregion_best_weight(List_T gregionlist)849 Gregion_best_weight (List_T gregionlist) {
850   double best_weight = 0.0;
851   T gregion;
852   List_T p;
853 
854   for (p = gregionlist; p != NULL; p = List_next(p)) {
855     gregion = (T) List_head(p);
856     if (gregion->weight > best_weight) {
857       best_weight = gregion->weight;
858     }
859   }
860 
861   return best_weight;
862 }
863 
864 
865 bool
Gregion_sufficient_support(T this)866 Gregion_sufficient_support (T this) {
867   return this->sufficient_support_p;
868 }
869 
870 
871 void
Gregion_extend(T this,Chrpos_T extension5,Chrpos_T extension3,int querylength)872 Gregion_extend (T this, Chrpos_T extension5, Chrpos_T extension3, int querylength) {
873   Chrpos_T left, right;
874   int extra_end;
875 
876   debug(printf("Entering Gregion_extend with extension5 %u and extension3 %u\n",extension5,extension3));
877   debug(printf("  #%d:%u..%u\n",this->chrnum,this->chrstart,this->chrlength));
878   this->extension5 = extension5;
879   this->extension3 = extension3;
880   this->extendedp = true;
881 
882 
883   if (this->nexons == 1 || Gregion_sufficient_support(this) == true || this->support < 100) {
884     extra_end = EXTRA_SHORTEND;
885 #if 0
886     /* Should no longer be necessary for known splicesites */
887     if (extra_end < min_extra_end) {
888       extra_end = min_extra_end;
889     }
890 #endif
891     if (this->plusp == true) {
892       left = extension5 + querylength + extra_end;
893       right = extension3 + querylength + extra_end;
894     } else {
895       left = extension3 + querylength + extra_end;
896       right = extension5 + querylength + extra_end;
897     }
898   } else {
899     extra_end = EXTRA_LONGEND;
900 #if 0
901     /* Should no longer be necessary for known splicesites */
902     if (extra_end < min_extra_end) {
903       extra_end = min_extra_end;
904     }
905 #endif
906     if (this->plusp == true) {
907       left = extension5 + extra_end;
908       right = extension3 + extra_end;
909     } else {
910       left = extension3 + extra_end;
911       right = extension5 + extra_end;
912     }
913   }
914 
915   /* printf("chrstart %u vs left %u\n",this->chrstart,left); */
916   if (this->chrstart < left) {
917     /* At beginning of chromosome */
918     this->chrstart = 0U;
919   } else {
920     this->chrstart -= left;
921   }
922 
923   /* printf("chroffset %u + chrend %u + right %u vs chrhigh %u\n",this->chroffset,this->chrend,right,this->chrhigh); */
924   if (this->chroffset + this->chrend + right >= this->chrhigh) {
925     /* At end of chromosome */
926     this->chrend = this->chrlength - 1;
927   } else {
928     this->chrend += right;
929   }
930 
931   /* Prevent very large genomic segments */
932   if (this->chrend > this->chrstart + MAX_GENOMICLENGTH) {
933     this->chrend = this->chrstart + MAX_GENOMICLENGTH;
934   }
935 
936   this->genomiclength = this->chrend - this->chrstart + 1U;
937   /* printf("chrstart %u, chrend %u, genomiclength %u\n",this->chrstart,this->chrend,this->genomiclength); */
938 
939 #ifdef PMAP
940   debug5(printf("  Testing bound5+extension5 = %d - %u < %d + %d, bound3+extension3 = %d + %u > %d - %d\n",
941 		this->querystart,this->extension5,this->trimstart,SUFFICIENT_SUPPORT,
942 		this->queryend,this->extension3,this->trimend,SUFFICIENT_SUPPORT));
943   if (this->querystart - this->extension5 < this->trimstart + SUFFICIENT_SUPPORT &&
944       this->queryend + this->extension3 > this->trimend - SUFFICIENT_SUPPORT) {
945     this->sufficient_support_p = true;
946   } else {
947     this->sufficient_support_p = false;
948   }
949 #else
950   debug5(printf("  Testing bound5+extension5 = %d - %u < %d + %d, bound3 = %d + %u > %d - %d\n",
951 		this->querystart,this->extension5,this->trimstart,SUFFICIENT_SUPPORT,
952 		this->queryend,this->extension3,this->trimend,SUFFICIENT_SUPPORT));
953   if (this->querystart - this->extension5 < (Chrpos_T) (this->trimstart + SUFFICIENT_SUPPORT) &&
954       this->queryend + this->extension3 > (Chrpos_T) (this->trimend - SUFFICIENT_SUPPORT)) {
955     this->sufficient_support_p = true;
956   } else {
957     this->sufficient_support_p = false;
958   }
959 #endif
960 
961   debug(printf("  #%d:%u..%u\n",this->chrnum,this->chrstart,this->chrend));
962 
963   return;
964 }
965 
966 
967 int
Gregion_cmp(const void * a,const void * b)968 Gregion_cmp (const void *a, const void *b) {
969   T x = * (T *) a;
970   T y = * (T *) b;
971 
972   if (x->ncovered > y->ncovered) {
973     return -1;
974   } else if (y->ncovered > x->ncovered) {
975     return +1;
976   } else {
977     return 0;
978   }
979 }
980 
981 
982 /************************************************************************
983  *  Filtering, based on spliceclean
984  ************************************************************************/
985 
986 #ifdef USE_CLEAN
987 
988 #define MAXLOOKBACK 60
989 
990 static void
compute_total_up(List_T gregions)991 compute_total_up (List_T gregions) {
992   long int total_up;
993   List_T p;
994   Gregion_T gregion;
995 
996   total_up = 0;
997   for (p = gregions; p != NULL; p = List_next(p)) {
998     gregion = (Gregion_T) List_head(p);
999     total_up += 1;
1000     gregion->total_up = total_up;
1001   }
1002 
1003   return;
1004 }
1005 
1006 static void
compute_total_down(List_T gregions)1007 compute_total_down (List_T gregions) {
1008   long int total_down;
1009   List_T p;
1010   Gregion_T gregion;
1011 
1012   total_down = 0;
1013   for (p = gregions; p != NULL; p = List_next(p)) {
1014     gregion = (Gregion_T) List_head(p);
1015     total_down += 1;
1016     gregion->total_down = total_down;
1017   }
1018 
1019   return;
1020 }
1021 
1022 
1023 static int
Gregion_low_descending_cmp(const void * a,const void * b)1024 Gregion_low_descending_cmp (const void *a, const void *b) {
1025   Gregion_T x = * (Gregion_T *) a;
1026   Gregion_T y = * (Gregion_T *) b;
1027 
1028   if (x->extentstart > y->extentstart) {
1029     return -1;
1030   } else if (y->extentstart > x->extentstart) {
1031     return +1;
1032   } else {
1033     return 0;
1034   }
1035 }
1036 
1037 static int
Gregion_high_ascending_cmp(const void * a,const void * b)1038 Gregion_high_ascending_cmp (const void *a, const void *b) {
1039   Gregion_T x = * (Gregion_T *) a;
1040   Gregion_T y = * (Gregion_T *) b;
1041 
1042   if (x->extentend < y->extentend) {
1043     return -1;
1044   } else if (y->extentend < x->extentend) {
1045     return +1;
1046   } else {
1047     return 0;
1048   }
1049 }
1050 
1051 static List_T
Gregion_sort_low_descending(List_T gregions)1052 Gregion_sort_low_descending (List_T gregions) {
1053   List_T sorted = NULL;
1054   Gregion_T *array;
1055   int n, i;
1056 
1057   if ((n = List_length(gregions)) == 0) {
1058     return (List_T) NULL;
1059 
1060   } else if (n < MAX_GREGIONS_FOR_ALLOCA) {
1061     array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T));
1062     List_fill_array((void **) array,gregions);
1063 
1064     qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp);
1065     for (i = n-1; i >= 0; i--) {
1066       sorted = List_push(sorted,array[i]);
1067     }
1068 
1069     FREEA(array);
1070     return sorted;
1071 
1072   } else {
1073     array = (Gregion_T *) List_to_array(gregions,NULL);
1074 
1075     qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp);
1076     for (i = n-1; i >= 0; i--) {
1077       sorted = List_push(sorted,array[i]);
1078     }
1079 
1080     FREE(array);
1081     return sorted;
1082   }
1083 }
1084 
1085 
1086 
1087 static List_T
Gregion_sort_high_ascending(List_T gregions)1088 Gregion_sort_high_ascending (List_T gregions) {
1089   List_T sorted = NULL;
1090   Gregion_T *array;
1091   int n, i;
1092 
1093   if ((n = List_length(gregions)) == 0) {
1094     return (List_T) NULL;
1095 
1096   } else if (n < MAX_GREGIONS_FOR_ALLOCA) {
1097     array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T));
1098     List_fill_array((void **) array,gregions);
1099 
1100     qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp);
1101     for (i = n-1; i >= 0; i--) {
1102       sorted = List_push(sorted,array[i]);
1103     }
1104 
1105     FREEA(array);
1106     return sorted;
1107 
1108   } else {
1109     array = (Gregion_T *) List_to_array(gregions,NULL);
1110 
1111     qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp);
1112     for (i = n-1; i >= 0; i--) {
1113       sorted = List_push(sorted,array[i]);
1114     }
1115 
1116     FREE(array);
1117     return sorted;
1118   }
1119 }
1120 
1121 
1122 
1123 typedef struct Base_T *Base_T;
1124 struct Base_T {
1125   Chrpos_T prevpos;
1126   Chrpos_T minextent;
1127   Chrpos_T maxextent;
1128   List_T gregions;
1129 
1130   bool usedp;
1131 };
1132 
1133 #if 0
1134 static void
1135 Base_free (Base_T *old) {
1136   if ((*old)->gregions != NULL) {
1137     List_free(&(*old)->gregions);
1138   }
1139   FREE(*old);
1140   return;
1141 }
1142 #endif
1143 
1144 
1145 static Base_T
Base_new()1146 Base_new () {
1147   Base_T new = (Base_T) MALLOC(sizeof(*new));
1148 
1149   new->minextent = -1U;
1150   new->maxextent = -1U;
1151   new->gregions = (List_T) NULL;
1152   new->usedp = false;
1153 
1154   return new;
1155 }
1156 
1157 
1158 
1159 /* Assumes gregions are arranged low to high */
1160 static Gregion_T
apply_ceiling(List_T gregions,Chrpos_T ceiling)1161 apply_ceiling (List_T gregions, Chrpos_T ceiling) {
1162   List_T p;
1163   Gregion_T prevgregion = NULL, gregion;
1164 
1165   for (p = gregions; p != NULL; p = List_next(p)) {
1166     gregion = (Gregion_T) List_head(p);
1167     if (gregion->extentend > ceiling) {
1168       return prevgregion;
1169     }
1170     prevgregion = gregion;
1171   }
1172 
1173   return prevgregion;
1174 }
1175 
1176 /* Assumes gregions are arranged high to low */
1177 static Gregion_T
apply_floor(List_T gregions,Chrpos_T floor)1178 apply_floor (List_T gregions, Chrpos_T floor) {
1179   List_T p;
1180   Gregion_T prevgregion = NULL, gregion;
1181 
1182   for (p = gregions; p != NULL; p = List_next(p)) {
1183     gregion = (Gregion_T) List_head(p);
1184     if (gregion->extentstart < floor) {
1185       return prevgregion;
1186     }
1187     prevgregion = gregion;
1188   }
1189 
1190   return prevgregion;
1191 }
1192 
1193 
1194 static Chrpos_T
compute_ceilings(Uinttable_T low_basetable)1195 compute_ceilings (Uinttable_T low_basetable) {
1196   Chrpos_T ceiling, bestprevpos, prevpos;
1197   long int bestscore, score;
1198   int nlookback;
1199   Chrpos_T *keys;
1200   Base_T base, prevbase;
1201   Gregion_T gregion, prevgregion;
1202 #ifdef DEBUG2
1203   Gregion_T bestprevgregion;
1204 #endif
1205   List_T p;
1206   int n, i;
1207 
1208   n = Uinttable_length(low_basetable);
1209   keys = Uinttable_keys(low_basetable,/*sortp*/true);
1210   debug2(printf("low_basetable has %d entries\n",n));
1211 
1212   prevpos = 0U;
1213   for (i = 0; i < n; i++) {
1214     base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1215     base->gregions = Gregion_sort_high_ascending(base->gregions);
1216     compute_total_up(base->gregions);
1217 
1218     base->prevpos = prevpos;
1219     prevpos = keys[i];
1220   }
1221 
1222   /* Initialize minbaselow */
1223   base = (Base_T) Uinttable_get(low_basetable,keys[0]);
1224   for (p = base->gregions; p != NULL; p = List_next(p)) {
1225     gregion = (Gregion_T) List_head(p);
1226     gregion->score_ceiling = gregion->total_up;
1227     gregion->bestprev_ceiling = 0U;
1228   }
1229 
1230   for (i = 1; i < n; i++) {
1231     base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1232 
1233     debug2(printf("At base %u, have %d gregions\n",keys[i],List_length(base->gregions)));
1234 
1235     for (p = base->gregions; p != NULL; p = List_next(p)) {
1236       gregion = (Gregion_T) List_head(p);
1237 
1238       bestscore = 0;
1239       bestprevpos = keys[0];
1240       debug2(bestprevgregion = NULL);
1241 
1242       prevpos = base->prevpos;
1243       nlookback = 0;
1244 
1245       ceiling = gregion->extentend;
1246       debug2(printf("  Gregion %u",gregion->extentend));
1247       while (prevpos >= keys[0] && nlookback < MAXLOOKBACK) {
1248 	prevbase = (Base_T) Uinttable_get(low_basetable,prevpos);
1249 	if ((prevgregion = apply_ceiling(prevbase->gregions,ceiling)) != NULL) {
1250 	  debug2(printf(" ... prev:%u score:%d+%d = %d",
1251 			prevpos,prevgregion->score_ceiling,gregion->total_up,
1252 			prevgregion->score_ceiling+gregion->total_up));
1253 
1254 	  if ((score = prevgregion->score_ceiling + gregion->total_up) > bestscore) {
1255 	    debug2(printf("*"));
1256 	    bestscore = score;
1257 	    bestprevpos = prevpos;
1258 #ifdef DEBUG2
1259 	    bestprevgregion = prevgregion;
1260 #endif
1261 	  }
1262 	}
1263 
1264 	prevpos = prevbase->prevpos;
1265 	nlookback++;
1266       }
1267       debug2(printf("\n"));
1268 
1269       gregion->score_ceiling = bestscore;
1270       gregion->bestprev_ceiling = bestprevpos;
1271 
1272 #ifdef DEBUG2
1273       if (bestprevgregion == NULL) {
1274 	printf(" no prevgregion, bestprev is %u, score is %d\n",
1275 	       bestprevpos,gregion->score_ceiling);
1276       } else {
1277 	printf(" bestprev is pos %u, gregion %u, score is %d\n",
1278 	       bestprevpos,bestprevgregion->extentend,gregion->score_ceiling);
1279       }
1280 #endif
1281     }
1282   }
1283 
1284   /* Get best overall gregion */
1285   bestscore = 0;
1286   bestprevpos = 0U;
1287 
1288   for (i = 0; i < n; i++) {
1289     base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1290     for (p = base->gregions; p != NULL; p = List_next(p)) {
1291       gregion = (Gregion_T) List_head(p);
1292 
1293       if (gregion->score_ceiling > bestscore) {
1294 	bestscore = gregion->score_ceiling;
1295 	bestprevpos = keys[i];
1296       }
1297     }
1298   }
1299 
1300   FREE(keys);
1301 
1302   return bestprevpos;
1303 }
1304 
1305 
1306 
1307 static Chrpos_T
compute_floors(Uinttable_T high_basetable)1308 compute_floors (Uinttable_T high_basetable) {
1309   Chrpos_T floor, bestprevpos, prevpos;
1310   long int bestscore, score;
1311   int nlookback;
1312   Chrpos_T *keys;
1313   Base_T base, prevbase;
1314   Gregion_T gregion, prevgregion;
1315 #ifdef DEBUG2
1316   Gregion_T bestprevgregion;
1317 #endif
1318   List_T p;
1319   int n, i;
1320 
1321   n = Uinttable_length(high_basetable);
1322   keys = Uinttable_keys(high_basetable,/*sortp*/true);
1323   debug2(printf("high_basetable has %d entries\n",n));
1324 
1325   prevpos = (Chrpos_T) -1U;
1326   for (i = n-1; i >= 0; --i) {
1327     base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1328     base->gregions = Gregion_sort_low_descending(base->gregions);
1329     compute_total_down(base->gregions);
1330 
1331     base->prevpos = prevpos;
1332     prevpos = keys[i];
1333   }
1334 
1335   /* Initialize maxbasehigh */
1336   base = (Base_T) Uinttable_get(high_basetable,keys[n-1]);
1337   for (p = base->gregions; p != NULL; p = List_next(p)) {
1338     gregion = (Gregion_T) List_head(p);
1339     gregion->score_floor = gregion->total_down;
1340     gregion->bestprev_floor = (Chrpos_T) -1U;
1341   }
1342 
1343   for (i = n-2; i >= 0; --i) {
1344     base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1345 
1346     debug2(printf("At base %u, have %d gregions\n",keys[i],List_length(base->gregions)));
1347     for (p = base->gregions; p != NULL; p = List_next(p)) {
1348       gregion = (Gregion_T) List_head(p);
1349 
1350       bestscore = 0;
1351       bestprevpos = keys[n-1];
1352       debug2(bestprevgregion = NULL);
1353 
1354       prevpos = base->prevpos;
1355       nlookback = 0;
1356 
1357       floor = gregion->extentstart /*+1*/;
1358       debug2(printf("  Gregion %u",gregion->extentstart));
1359       while (prevpos <= keys[n-1] && nlookback < MAXLOOKBACK) {
1360 	prevbase = (Base_T) Uinttable_get(high_basetable,prevpos);
1361 	if ((prevgregion = apply_floor(prevbase->gregions,floor)) != NULL) {
1362 	  debug2(printf(" ... prev:%u, score:%d+%d = %d",
1363 			prevpos,prevgregion->score_floor,gregion->total_down,
1364 			prevgregion->score_floor+gregion->total_down));
1365 
1366 	  if ((score = prevgregion->score_floor + gregion->total_down) > bestscore) {
1367 	    debug2(printf("*"));
1368 	    bestscore = score;
1369 	    bestprevpos = prevpos;
1370 #ifdef DEBUG2
1371 	    bestprevgregion = prevgregion;
1372 #endif
1373 	  }
1374 	}
1375 
1376 	prevpos = prevbase->prevpos;
1377 	nlookback++;
1378       }
1379       debug2(printf("\n"));
1380 
1381       gregion->score_floor = bestscore;
1382       gregion->bestprev_floor = bestprevpos;
1383 
1384 #ifdef DEBUG2
1385       if (bestprevgregion == NULL) {
1386 	printf(" no prevgregion, bestprev is %u, score is %d\n",
1387 	       bestprevpos,gregion->score_floor);
1388       } else {
1389 	printf(" bestprev is pos %u, gregion %u, score is %d\n",
1390 	       bestprevpos,bestprevgregion->extentstart,gregion->score_floor);
1391       }
1392 #endif
1393     }
1394   }
1395 
1396   /* Get best overall gregion */
1397   bestscore = 0;
1398   bestprevpos = (Chrpos_T) -1U;
1399 
1400   for (i = n-1; i >= 0; --i) {
1401     base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1402     for (p = base->gregions; p != NULL; p = List_next(p)) {
1403       gregion = (Gregion_T) List_head(p);
1404 
1405       if (gregion->score_floor > bestscore) {
1406 	bestscore = gregion->score_floor;
1407 	bestprevpos = keys[i];
1408       }
1409     }
1410   }
1411 
1412   FREE(keys);
1413 
1414   return bestprevpos;
1415 }
1416 
1417 
1418 static void
traceback_ceilings(Uinttable_T low_basetable,Chrpos_T prevpos)1419 traceback_ceilings (Uinttable_T low_basetable, Chrpos_T prevpos) {
1420   Chrpos_T ceiling;
1421   Gregion_T end_gregion;
1422   Base_T base, prevbase;
1423   Chrpos_T *keys;
1424   int n, i;
1425 
1426   n = Uinttable_length(low_basetable);
1427   keys = Uinttable_keys(low_basetable,/*sortp*/true);
1428 
1429   ceiling = (Chrpos_T) -1U;
1430 
1431   i = n-1;
1432   while (prevpos > keys[0]) {
1433     debug2(printf("traceback from endpos %u, back to %u\n",keys[i],prevpos));
1434     while (/*startpos*/keys[i] > prevpos) {
1435       base = (Base_T) Uinttable_get(low_basetable,/*startpos*/keys[i]);
1436       base->maxextent = ceiling;
1437       debug2(printf("At low %u, maxextent is %u\n",/*startpos*/keys[i],ceiling));
1438       i--;
1439     }
1440 
1441     prevbase = (Base_T) Uinttable_get(low_basetable,prevpos);
1442     if ((end_gregion = apply_ceiling(prevbase->gregions,ceiling)) == NULL) {
1443       prevpos = keys[0];	/* Ends loop */
1444     } else {
1445       ceiling = end_gregion->extentend /*-1*/;
1446       prevpos = end_gregion->bestprev_ceiling;
1447     }
1448   }
1449 
1450   debug2(printf("End of loop\n"));
1451   while (i >= 0) {
1452     base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1453     base->maxextent = ceiling;
1454     debug2(printf("At low %u, maxextent is %u\n",keys[i],ceiling));
1455     i--;
1456   }
1457 
1458   FREE(keys);
1459 
1460   return;
1461 }
1462 
1463 
1464 static void
traceback_floors(Uinttable_T high_basetable,Chrpos_T prevpos)1465 traceback_floors (Uinttable_T high_basetable, Chrpos_T prevpos) {
1466   Chrpos_T floor;
1467   Gregion_T start_gregion;
1468   Base_T base, prevbase;
1469   Chrpos_T *keys;
1470   int n, i;
1471 
1472   n = Uinttable_length(high_basetable);
1473   keys = Uinttable_keys(high_basetable,/*sortp*/true);
1474 
1475   floor = 0U;
1476 
1477   i = 0;
1478   while (prevpos < keys[n-1]) {
1479     debug2(printf("traceback from startpos %u, forward to %u\n",keys[i],prevpos));
1480     while (/*endpos*/keys[i] < prevpos) {
1481       base = (Base_T) Uinttable_get(high_basetable,/*endpos*/keys[i]);
1482       base->minextent = floor;
1483       debug2(printf("At high %u, minextent is %u\n",/*endpos*/keys[i],floor));
1484       i++;
1485     }
1486 
1487     prevbase = (Base_T) Uinttable_get(high_basetable,prevpos);
1488     if ((start_gregion = apply_floor(prevbase->gregions,floor)) == NULL) {
1489       prevpos = keys[n-1];	/* Ends loop */
1490     } else {
1491       floor = start_gregion->extentstart /*+1*/;
1492       prevpos = start_gregion->bestprev_floor;
1493     }
1494   }
1495 
1496   debug2(printf("End of loop\n"));
1497   while (i < n) {
1498     base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1499     base->minextent = floor;
1500     debug2(printf("At high %u, minextent is %u\n",/*endpos*/keys[i],floor));
1501     i++;
1502   }
1503 
1504   FREE(keys);
1505 
1506   return;
1507 }
1508 
1509 
1510 static void
bound_gregions(Uinttable_T low_basetable,Uinttable_T high_basetable)1511 bound_gregions (Uinttable_T low_basetable, Uinttable_T high_basetable) {
1512   Chrpos_T minextent, maxextent;
1513   Base_T base_low, base_high;
1514   Gregion_T gregion;
1515   List_T p;
1516   Chrpos_T *keys;
1517   int n, i;
1518 
1519   n = Uinttable_length(low_basetable);
1520   keys = Uinttable_keys(low_basetable,/*sortp*/true);
1521 
1522   for (i = 0; i < n; i++) {
1523     base_low = (Base_T) Uinttable_get(low_basetable,keys[i]);
1524     maxextent = base_low->maxextent;
1525     for (p = base_low->gregions; p != NULL; p = List_next(p)) {
1526       gregion = (Gregion_T) List_head(p);
1527       base_high = (Base_T) Uinttable_get(high_basetable,gregion->extentend);
1528       minextent = base_high->minextent;
1529       if (gregion->extentstart < minextent || gregion->extentend > maxextent) {
1530 	debug2(printf("Not bounded low: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1531 	gregion->bounded_low_p = false;
1532       } else {
1533 	debug2(printf("Bounded low: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1534 	gregion->bounded_low_p = true;
1535       }
1536     }
1537   }
1538 
1539   FREE(keys);
1540 
1541 
1542   n = Uinttable_length(high_basetable);
1543   keys = Uinttable_keys(high_basetable,/*sortp*/true);
1544 
1545   for (i = n-1; i >= 0; i--) {
1546     base_high = (Base_T) Uinttable_get(high_basetable,keys[i]);
1547     minextent = base_high->minextent;
1548     for (p = base_high->gregions; p != NULL; p = List_next(p)) {
1549       gregion = (Gregion_T) List_head(p);
1550       base_low = (Base_T) Uinttable_get(low_basetable,gregion->extentstart);
1551       maxextent = base_low->maxextent;
1552       if (gregion->extentstart < minextent || gregion->extentend > maxextent) {
1553 	debug2(printf("Not bounded high: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1554 	gregion->bounded_high_p = false;
1555       } else {
1556 	debug2(printf("Bounded high: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1557 	gregion->bounded_high_p = true;
1558       }
1559     }
1560   }
1561 
1562   FREE(keys);
1563 
1564   return;
1565 }
1566 
1567 
1568 /* Called only if USE_CLEAN is defined in stage1.c */
1569 void
Gregion_filter_clean(List_T gregionlist,int nchrs)1570 Gregion_filter_clean (List_T gregionlist, int nchrs) {
1571   Uinttable_T *low_basetables, *high_basetables, basetable;
1572   Base_T base;
1573   Chrpos_T prevpos;
1574   Chrnum_T chrnum;
1575 
1576   List_T p;
1577   T gregion;
1578   int n;
1579 #if 0
1580   List_T unique = NULL;
1581   T x, y, *array;
1582   int i, j;
1583   bool *eliminate;
1584 #endif
1585 #ifdef DEBUG
1586   int i;
1587 #endif
1588 
1589   if ((n = List_length(gregionlist)) == 0) {
1590     return;
1591   }
1592 
1593   debug(
1594 	for (p = gregionlist, i = 0; p != NULL; p = p->rest, i++) {
1595 	  gregion = (T) p->first;
1596 	  printf("  Initial %d: %d..%d %u-%u (plusp = %d)\n",
1597 		 i,gregion->querystart,gregion->queryend,gregion->extentstart,gregion->extentend,gregion->plusp);
1598 	}
1599 	);
1600 
1601   if (nchrs < MAX_NCHRS_FOR_ALLOCA) {
1602     low_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T));
1603     high_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T));
1604   } else {
1605     low_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T));
1606     high_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T));
1607   }
1608 
1609   for (p = gregionlist; p != NULL; p = List_next(p)) {
1610     gregion = (T) List_head(p);
1611 
1612     if (low_basetables[gregion->chrnum] == NULL) {
1613       low_basetables[gregion->chrnum] = Uinttable_new(n,/*save_contents_p*/false);
1614       high_basetables[gregion->chrnum] = Uinttable_new(n,/*save_contents_p*/false);
1615     }
1616 
1617     basetable = low_basetables[gregion->chrnum];
1618     if ((base = (Base_T) Uinttable_get(basetable,gregion->extentstart)) == NULL) {
1619       base = Base_new();
1620       Uinttable_put(basetable,gregion->extentstart,(void *) base);
1621     }
1622     base->gregions = List_push(base->gregions,(void *) gregion);
1623 
1624     basetable = high_basetables[gregion->chrnum];
1625     if ((base = (Base_T) Uinttable_get(basetable,gregion->extentend)) == NULL) {
1626       base = Base_new();
1627       Uinttable_put(basetable,gregion->extentend,(void *) base);
1628     }
1629     base->gregions = List_push(base->gregions,(void *) gregion);
1630   }
1631 
1632   for (chrnum = 0; chrnum <= nchrs; chrnum++) {
1633     if ((basetable = low_basetables[chrnum]) != NULL) {
1634       debug2(printf("Processing gregions for chrnum %d\n",chrnum));
1635       prevpos = compute_ceilings(basetable);
1636       traceback_ceilings(basetable,prevpos);
1637 
1638       basetable = high_basetables[chrnum];
1639       prevpos = compute_floors(basetable);
1640       traceback_floors(basetable,prevpos);
1641 
1642       bound_gregions(low_basetables[chrnum],high_basetables[chrnum]);
1643     }
1644   }
1645 
1646   for (p = gregionlist; p != NULL; p = List_next(p)) {
1647     gregion = (T) List_head(p);
1648 
1649     if (low_basetables[gregion->chrnum] == NULL) {
1650       Uinttable_free(&(low_basetables[gregion->chrnum]));
1651       Uinttable_free(&(high_basetables[gregion->chrnum]));
1652     }
1653   }
1654 
1655   if (nchrs < MAX_NCHRS_FOR_ALLOCA) {
1656     FREEA(high_basetables);
1657     FREEA(low_basetables);
1658   } else {
1659     FREE(high_basetables);
1660     FREE(low_basetables);
1661   }
1662 
1663 #if 0
1664   eliminate = (bool *) CALLOC(n,sizeof(bool));
1665 
1666   /* Not necessary if false is zero */
1667   /*
1668   for (i = 0; i < n; i++) {
1669     eliminate[i] = false;
1670   }
1671   */
1672 
1673   array = (T *) List_to_array(gregionlist,NULL);
1674   List_free(&gregionlist);
1675   qsort(array,n,sizeof(T),weight_cmp);
1676 
1677   for (i = 0; i < n; i++) {
1678     x = array[i];
1679     for (j = i+1; j < n; j++) {
1680       y = array[j];
1681       if (gregion_overlap_p(x,y) == true) {
1682 	eliminate[j] = true;
1683       }
1684     }
1685   }
1686 
1687   for (i = n-1; i >= 0; i--) {
1688     gregion = array[i];
1689     if (eliminate[i] == false) {
1690       debug(printf("  Keeping %u-%u (plusp = %d)\n",
1691 		   gregion->extentstart,gregion->extentend,gregion->plusp));
1692       unique = List_push(unique,(void *) gregion);
1693     } else {
1694       debug(printf("  Eliminating %u-%u (plusp = %d)\n",
1695 		   gregion->extentstart,gregion->extentend,gregion->plusp));
1696       /*
1697       if (gregion->match5 != NULL) {
1698 	Match_decr_npairings(gregion->match5);
1699 	Match_decr_npairings(gregion->match3);
1700       }
1701       */
1702       Gregion_free(&gregion);
1703     }
1704   }
1705 
1706   FREE(eliminate);
1707   FREE(array);
1708 
1709   debug(
1710 	for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
1711 	  gregion = (T) p->first;
1712 	  printf("  Final %d: %u-%u (plusp = %d)\n",
1713 		 i,gregion->extentstart,gregion->extentend,gregion->plusp);
1714 	}
1715 	);
1716 
1717   return unique;
1718 
1719 #endif
1720 
1721   return;
1722 }
1723 
1724 #endif
1725 
1726