1 static char rcsid[] = "$Id: gregion.c 218147 2019-01-16 21:28:41Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "gregion.h"
7 #include <stdlib.h>
8 #include "assert.h"
9 #include "mem.h"
10 #include "indexdb.h" /* For SUFFICIENT_SUPPORT */
11 #include "univinterval.h"
12 #include "chrnum.h"
13
14 #define USE_CLEAN 1
15
16 #ifdef USE_CLEAN
17 #include "uinttable_rh.h"
18 #endif
19
20
21 #define MAX_GENOMICLENGTH 2000000
22 #define MAX_NCHRS_FOR_ALLOCA 100
23 #define MAX_GREGIONS_FOR_ALLOCA 100
24
25
26 #define EXTRA_SHORTEND 30000
27 #define EXTRA_LONGEND 100000
28
29 #ifdef DEBUG
30 #define debug(x) x
31 #else
32 #define debug(x)
33 #endif
34
35 /* filter_clean */
36 #ifdef DEBUG2
37 #define debug2(x) x
38 #else
39 #define debug2(x)
40 #endif
41
42 /* Sufficient support */
43 #ifdef DEBUG5
44 #define debug5(x) x
45 #else
46 #define debug5(x)
47 #endif
48
49
50 #define T Gregion_T
51 struct T {
52 int nexons;
53
54 Chrpos_T genomiclength;
55 bool plusp;
56 int genestrand;
57
58 Chrpos_T extension5;
59 Chrpos_T extension3;
60 bool extendedp;
61
62 Chrnum_T chrnum;
63
64 Chrpos_T chrstart;
65 Chrpos_T chrend;
66
67 Chrpos_T extentstart;
68 Chrpos_T extentend;
69
70 Univcoord_T chroffset; /* This is for chr, not the segment */
71 Univcoord_T chrhigh;
72 Chrpos_T chrlength;
73
74 int querystart; /* Used only for maponly mode */
75 int queryend; /* Used only for maponly mode */
76 int matchsize; /* Used only for maponly mode */
77
78 int trimstart;
79 int trimend;
80 bool sufficient_support_p;
81
82 double weight; /* Product of match weights */
83 int support;
84
85 int ncovered;
86 int source; /* Oligoindex source in stage 2 */
87
88 /* For cleaning */
89
90
91 int total_up;
92 int total_down;
93 Chrpos_T bestprev_ceiling;
94 Chrpos_T bestprev_floor;
95 int score_ceiling;
96 int score_floor;
97
98 #ifdef USE_CLEAN
99 bool bounded_low_p;
100 bool bounded_high_p;
101 #endif
102 };
103
104
105
106 void
Gregion_print(T this)107 Gregion_print (T this) {
108
109 #if 1
110 /* Off for debugging */
111 printf(" %d..%d ",this->querystart,this->queryend);
112 #endif
113 printf("%u %u %u %u #%d:%u..%u length:%u weight:%.2f support:%d",
114 this->extentstart,this->extentend,this->chrstart,this->chrend,
115 this->chrnum,this->chrstart,this->chrend,this->genomiclength,
116 this->weight,this->support);
117 #ifdef USE_CLEAN
118 printf(" bounded_low:%d, bounded_high:%d",this->bounded_low_p,this->bounded_high_p);
119 #endif
120
121 printf("\n");
122
123 return;
124 }
125
126
127 void
Gregion_free(T * old)128 Gregion_free (T *old) {
129 FREE(*old);
130 return;
131 }
132
133 Univcoord_T
Gregion_genomicstart(T this)134 Gregion_genomicstart (T this) {
135 return this->chroffset + this->chrstart;
136 }
137
138 Univcoord_T
Gregion_genomicend(T this)139 Gregion_genomicend (T this) {
140 return this->chroffset + this->chrend;
141 }
142
143 Chrpos_T
Gregion_chrstart(T this)144 Gregion_chrstart (T this) {
145 return this->chrstart;
146 }
147
148 Chrpos_T
Gregion_chrend(T this)149 Gregion_chrend (T this) {
150 return this->chrend;
151 }
152
153 Chrpos_T
Gregion_genomiclength(T this)154 Gregion_genomiclength (T this) {
155 return this->genomiclength;
156 }
157
158 bool
Gregion_plusp(T this)159 Gregion_plusp (T this) {
160 return this->plusp;
161 }
162
163 bool
Gregion_revcompp(T this)164 Gregion_revcompp (T this) {
165 return !(this->plusp);
166 }
167
168 int
Gregion_genestrand(T this)169 Gregion_genestrand (T this) {
170 return this->genestrand;
171 }
172
173 Chrnum_T
Gregion_chrnum(T this)174 Gregion_chrnum (T this) {
175 return this->chrnum;
176 }
177
178 /* Used only for debugging. String is allocated and should be freed. */
179 char *
Gregion_chr(T this,Univ_IIT_T chromosome_iit)180 Gregion_chr (T this, Univ_IIT_T chromosome_iit) {
181 return Chrnum_to_string(this->chrnum,chromosome_iit);
182 }
183
184 Univcoord_T
Gregion_chroffset(T this)185 Gregion_chroffset (T this) {
186 return this->chroffset;
187 }
188
189 Univcoord_T
Gregion_chrhigh(T this)190 Gregion_chrhigh (T this) {
191 return this->chrhigh;
192 }
193
194 Chrpos_T
Gregion_chrlength(T this)195 Gregion_chrlength (T this) {
196 return this->chrlength;
197 }
198
199 int
Gregion_querystart(T this)200 Gregion_querystart (T this) {
201 return this->querystart;
202 }
203
204 int
Gregion_queryend(T this)205 Gregion_queryend (T this) {
206 return this->queryend;
207 }
208
209 int
Gregion_matchsize(T this)210 Gregion_matchsize (T this) {
211 return this->matchsize;
212 }
213
214 double
Gregion_weight(T this)215 Gregion_weight (T this) {
216 return this->weight;
217 }
218
219 int
Gregion_support(T this)220 Gregion_support (T this) {
221 return this->support;
222 }
223
224 bool
Gregion_extendedp(T this)225 Gregion_extendedp (T this) {
226 return this->extendedp;
227 }
228
229 void
Gregion_set_ncovered(T this,int ncovered,int source)230 Gregion_set_ncovered (T this, int ncovered, int source) {
231 this->ncovered = ncovered;
232 this->source = source;
233 return;
234 }
235
236 int
Gregion_ncovered(T this)237 Gregion_ncovered (T this) {
238 return this->ncovered;
239 }
240
241
242
243 T
Gregion_new(int nexons,Univcoord_T genomicstart,Univcoord_T genomicend,bool plusp,int genestrand,Univ_IIT_T chromosome_iit,int querystart,int queryend,int querylength,int matchsize,int trimstart,int trimend,int circular_typeint)244 Gregion_new (int nexons, Univcoord_T genomicstart, Univcoord_T genomicend,
245 bool plusp, int genestrand, Univ_IIT_T chromosome_iit, int querystart, int queryend, int querylength,
246 int matchsize, int trimstart, int trimend, int circular_typeint) {
247 T new = (T) MALLOC(sizeof(*new));
248
249 debug(printf("Creating gregion with genomicstart %u, genomicend %u, trimstart %d, trimend %d\n",
250 genomicstart,genomicend,trimstart,trimend));
251
252 new->nexons = nexons;
253 if (chromosome_iit == NULL) {
254 /* Should not reach here, since user_genomicseg does not call stage 1 */
255 new->chrnum = 0;
256 new->chroffset = 0U;
257 new->chrlength = 0U;
258 } else {
259 new->chrnum = Univ_IIT_get_one(chromosome_iit,genomicstart,genomicstart);
260 /* new->chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,new->chrnum)); */
261 /* new->chrhigh = Univinterval_high(Univ_IIT_interval(chromosome_iit,new->chrnum)); */
262 Univ_IIT_interval_bounds(&new->chroffset,&new->chrhigh,&new->chrlength,
263 chromosome_iit,new->chrnum,circular_typeint);
264 }
265
266 assert(genomicstart < genomicend);
267 new->genomiclength = genomicend - genomicstart;
268
269 new->chrstart = genomicstart - new->chroffset;
270 new->chrend = new->chrstart + new->genomiclength;
271
272 new->plusp = plusp;
273 new->genestrand = genestrand;
274
275 if (plusp == true) {
276 if (new->chrstart < (Chrpos_T) querystart) {
277 /* Extends past beginning of genome */
278 new->extentstart = 0;
279 } else {
280 new->extentstart = new->chrstart - querystart;
281 }
282 new->extentend = new->chrend + (querylength - queryend);
283 } else {
284 if (new->chrstart < (Chrpos_T) (querylength - queryend)) {
285 /* Extends past beginning of genome */
286 new->extentstart = 0;
287 } else {
288 new->extentstart = new->chrstart - (querylength - queryend);
289 }
290 new->extentend = new->chrend + querystart;
291 }
292
293 new->extension5 = 0U;
294 new->extension3 = 0U;
295 new->extendedp = false;
296
297 new->querystart = querystart;
298 new->queryend = queryend;
299 new->matchsize = matchsize;
300
301 new->trimstart = trimstart;
302 new->trimend = trimend;
303
304 #ifdef PMAP
305 debug5(printf(" Testing bound5+extension5 = %d - %u < %d + %d, bound3+extension3 = %d + %u > %d - %d\n",
306 new->querystart,new->extension5,trimstart,SUFFICIENT_SUPPORT,
307 new->queryend,new->extension3,trimend,SUFFICIENT_SUPPORT));
308 if (new->querystart - new->extension5 < trimstart + SUFFICIENT_SUPPORT &&
309 new->queryend + new->extension3 > trimend - SUFFICIENT_SUPPORT) {
310 new->sufficient_support_p = true;
311 } else {
312 new->sufficient_support_p = false;
313 }
314 #else
315 debug5(printf(" Testing bound5+extension5 = %d - %u < %d + %d, bound3 = %d + %u > %d - %d\n",
316 new->querystart,new->extension5,trimstart,SUFFICIENT_SUPPORT,
317 new->queryend,new->extension3,trimend,SUFFICIENT_SUPPORT));
318 if (new->querystart - new->extension5 < (Chrpos_T) (trimstart + SUFFICIENT_SUPPORT) &&
319 new->queryend + new->extension3 > (Chrpos_T) (trimend - SUFFICIENT_SUPPORT)) {
320 new->sufficient_support_p = true;
321 } else {
322 new->sufficient_support_p = false;
323 }
324 #endif
325
326 new->weight = 0.0;
327 new->support = queryend - querystart + matchsize;
328
329 new->total_up = 0;
330 new->total_down = 0;
331 new->bestprev_ceiling = 0;
332 new->bestprev_floor = (Chrpos_T) -1;
333 new->score_ceiling = 0;
334 new->score_floor = 0;
335
336 #ifdef USE_CLEAN
337 new->bounded_low_p = false;
338 new->bounded_high_p = false;
339 #endif
340
341 return new;
342 }
343
344
345 T
Gregion_new_from_matches(Match_T match5,Match_T match3,int genestrand,Univ_IIT_T chromosome_iit,int querylength,int matchsize,int trimstart,int trimend,int circular_typeint)346 Gregion_new_from_matches (Match_T match5, Match_T match3, int genestrand, Univ_IIT_T chromosome_iit,
347 int querylength, int matchsize, int trimstart, int trimend, int circular_typeint) {
348 T gregion;
349 Univcoord_T genomicstart, genomicend;
350 int querystart, queryend;
351
352 querystart = Match_querypos(match5);
353 queryend = Match_querypos(match3);
354
355 if (Match_forwardp(match5)) {
356 genomicstart = Match_position(match5);
357 genomicend = Match_position(match3) + 1U;
358 /* chrnum = Match_chrnum(match5); */
359
360 } else {
361 genomicstart = Match_position(match3);
362 genomicend = Match_position(match5) + 1U;
363 /* chrnum = Match_chrnum(match3); */
364 }
365
366 #if 0
367 if (chromosome_iit == NULL) {
368 chroffset = 0U;
369 } else {
370 chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,chrnum));
371 }
372 #endif
373
374 debug(printf("Coordinates are %u .. %u\n",genomicstart,genomicend));
375
376 gregion = Gregion_new(/*nexons*/0,genomicstart,genomicend,Match_forwardp(match5),genestrand,
377 chromosome_iit,querystart,queryend,querylength,matchsize,trimstart,trimend,
378 circular_typeint);
379
380 gregion->weight = Match_weight(match5) * Match_weight(match3);
381 Match_incr_npairings(match5);
382 Match_incr_npairings(match3);
383
384 return gregion;
385 }
386
387
388 #if 0
389 static int
390 weight_cmp (const void *x, const void *y) {
391 T a = * (T *) x;
392 T b = * (T *) y;
393
394 if (a->weight > b->weight) {
395 return -1;
396 } else if (a->weight < b->weight) {
397 return +1;
398 } else if (a->support > b->support) {
399 return -1;
400 } else if (a->support < b->support) {
401 return +1;
402 } else if (a->genomiclength > b->genomiclength) {
403 return -1;
404 } else if (a->genomiclength < b->genomiclength) {
405 return +1;
406 } else {
407 return 0;
408 }
409 }
410 #endif
411
412
413 static int
support_cmp(const void * x,const void * y)414 support_cmp (const void *x, const void *y) {
415 T a = * (T *) x;
416 T b = * (T *) y;
417
418 if (a->support > b->support) {
419 /* Larger support should beat smaller support */
420 return -1;
421 } else if (a->support < b->support) {
422 return +1;
423
424 } else if (a->genomiclength < b->genomiclength) {
425 /* For the same support, prefer more compact regions */
426 return -1;
427 } else if (a->genomiclength > b->genomiclength) {
428 return +1;
429 } else {
430 return 0;
431 }
432 }
433
434
435
436 /* Not intended for qsort. Returns 0 when not comparable. */
437 static bool
gregion_overlap_p(T x,T y)438 gregion_overlap_p (T x, T y) {
439 Univcoord_T x_genomicstart, x_genomicend, y_genomicstart, y_genomicend;
440 Univcoord_T overlap;
441 double fraction;
442 bool plusp;
443
444 if (x->plusp != y->plusp) {
445 debug(printf("Case 0 => different strands\n"));
446 return false; /* Different strands */
447
448 } else {
449 plusp = x->plusp;
450 x_genomicstart = x->chroffset + x->chrstart;
451 x_genomicend = x->chroffset + x->chrend;
452 y_genomicstart = y->chroffset + y->chrstart;
453 y_genomicend = y->chroffset + y->chrend;
454
455 if (y_genomicstart > x_genomicend || x_genomicstart > y_genomicend) {
456 debug(printf("Case 0: x %u..%u, y %u..%u => no overlap\n",
457 x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
458 /*
459 /-- x --/ /-- y --/ or /-- y --/ /-- x --/
460 */
461 return false; /* No overlap */
462
463 } else if (y_genomicstart < x_genomicstart) {
464 if (y_genomicend < x_genomicend) {
465 debug(printf("Case 1: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
466
467 /*
468 /-- x --/
469 /-- y --/
470 */
471
472 if (plusp == true && (y->querystart >= x->querystart || y->queryend >= x->queryend)) {
473 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
474 x->querystart,x->queryend,y->querystart,y->queryend));
475 return false;
476
477 } else if (plusp == false && (y->querystart <= x->querystart || y->queryend <= x->queryend)) {
478 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
479 x->querystart,x->queryend,y->querystart,y->queryend));
480 return false;
481
482 } else {
483 overlap = y_genomicend - x_genomicstart;
484 if (y_genomicend - y_genomicstart < x_genomicend - x_genomicstart) {
485 fraction = (double) overlap/(double) (y_genomicend - y_genomicstart);
486 } else {
487 fraction = (double) overlap/(double) (x_genomicend - x_genomicstart);
488 }
489 debug(printf(" => fraction %f",fraction));
490 if (fraction > 0.5) {
491 debug(printf(" => overlap\n"));
492 return true;
493 } else {
494 debug(printf(" => no overlap\n"));
495 return false;
496 }
497 }
498
499 } else if (y_genomicend > x_genomicend) {
500 debug(printf("Case 2: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
501
502 /*
503 /-- x --/
504 /----- y -----/
505 */
506
507 /* Note: Like case 4, both branches are true for plus and minus */
508 if (plusp == true && (y->querystart >= x->querystart || y->queryend <= x->queryend)) {
509 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
510 x->querystart,x->queryend,y->querystart,y->queryend));
511 return false;
512
513 } else if (plusp == false && (y->querystart >= x->querystart || y->queryend <= x->queryend)) {
514 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
515 x->querystart,x->queryend,y->querystart,y->queryend));
516 return false;
517
518 } else {
519 debug(printf(" => subsumption\n"));
520 return true;
521 }
522
523 } else {
524 /* y_genomicend == x_genomicend */
525 debug(printf("Case 3: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
526
527 /*
528 /-- x --/
529 /----- y ---/
530 */
531
532 if (plusp == true && (y->querystart >= x->querystart || y->queryend != x->queryend)) {
533 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
534 x->querystart,x->queryend,y->querystart,y->queryend));
535 return false;
536
537 } else if (plusp == false && (y->querystart != x->querystart || y->queryend <= x->queryend)) {
538 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
539 x->querystart,x->queryend,y->querystart,y->queryend));
540 return false;
541
542 } else {
543 debug(printf(" => subsumption\n"));
544 return true;
545 }
546
547 }
548
549 } else if (y_genomicstart > x_genomicstart) {
550 if (y_genomicend < x_genomicend) {
551 debug(printf("Case 4: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
552
553 /*
554 /----- x -----/
555 /-- y --/
556 */
557
558 /* Note: Like case 2, both branches are true for plus and minus */
559 if (plusp == true && (y->querystart <= x->querystart || y->queryend >= x->queryend)) {
560 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
561 x->querystart,x->queryend,y->querystart,y->queryend));
562 return false;
563
564 } else if (plusp == false && (y->querystart <= x->querystart || y->queryend >= x->queryend)) {
565 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
566 x->querystart,x->queryend,y->querystart,y->queryend));
567 return false;
568
569 } else {
570 debug(printf(" => subsumption\n"));
571 return true;
572 }
573
574 } else if (y_genomicend > x_genomicend) {
575 debug(printf("Case 5: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
576
577 /*
578 /-- x --/
579 /-- y --/
580 */
581
582 if (plusp == true && (y->querystart <= x->querystart || y->queryend <= x->queryend)) {
583 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
584 x->querystart,x->queryend,y->querystart,y->queryend));
585 return false;
586
587 } else if (plusp == false && (y->querystart >= x->querystart || y->queryend >= x->queryend)) {
588 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
589 x->querystart,x->queryend,y->querystart,y->queryend));
590 return false;
591
592 } else {
593 overlap = x_genomicend - y_genomicstart;
594 if (y_genomicend - y_genomicstart < x_genomicend - x_genomicstart) {
595 fraction = (double) overlap/(double) (y_genomicend - y_genomicstart);
596 } else {
597 fraction = (double) overlap/(double) (x_genomicend - x_genomicstart);
598 }
599 debug(printf(" => fraction %f",fraction));
600 if (fraction > 0.5) {
601 debug(printf(" => overlap\n"));
602 return true;
603 } else {
604 debug(printf(" => no overlap\n"));
605 return false;
606 }
607 }
608
609 } else {
610 /* y_genomicend == x_genomicend */
611 debug(printf("Case 6: x %u..%u, y %u..%u\n",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
612
613 /*
614 /-- x ----/
615 /-- y --/
616 */
617
618 if (plusp == true && (y->querystart <= x->querystart || y->queryend != x->queryend)) {
619 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
620 x->querystart,x->queryend,y->querystart,y->queryend));
621 return false;
622
623 } else if (plusp == false && (y->querystart != x->querystart || y->queryend >= x->queryend)) {
624 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
625 x->querystart,x->queryend,y->querystart,y->queryend));
626 return false;
627
628 } else {
629 debug(printf(" => subsumption\n"));
630 return true;
631 }
632
633 }
634
635 } else {
636 /* y_genomicstart == x_genomicstart */
637
638 if (y_genomicend < x_genomicend) {
639 debug(printf("Case 7: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
640
641 /*
642 /----- x --/
643 /-- y --/
644 */
645
646 if (plusp == true && (y->querystart != x->querystart || y->queryend >= x->queryend)) {
647 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
648 x->querystart,x->queryend,y->querystart,y->queryend));
649 return false;
650
651 } else if (plusp == false && (y->querystart <= x->querystart || y->queryend != x->queryend)) {
652 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
653 x->querystart,x->queryend,y->querystart,y->queryend));
654 return false;
655
656 } else {
657 debug(printf(" => subsumption\n"));
658 return true;
659 }
660
661 } else if (y_genomicend > x_genomicend) {
662 debug(printf("Case 8: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
663
664 /*
665 /-- x --/
666 /-- y -----/
667 */
668
669 if (plusp == true && (y->querystart != x->querystart || y->queryend <= x->queryend)) {
670 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
671 x->querystart,x->queryend,y->querystart,y->queryend));
672 return false;
673
674 } else if (plusp == false && (y->querystart >= x->querystart || y->queryend != x->queryend)) {
675 debug(printf(" => query coords not consistent: x %d..%d, y %d..%d\n",
676 x->querystart,x->queryend,y->querystart,y->queryend));
677 return false;
678
679 } else {
680 debug(printf(" => subsumption\n"));
681 return true;
682 }
683
684 } else {
685 /* y_genomicend == x_genomicend */
686 debug(printf("Case 9: x %u..%u, y %u..%u",x_genomicstart,x_genomicend,y_genomicstart,y_genomicend));
687
688 /*
689 /-- x --/
690 /-- y --/
691 */
692
693 debug(printf(" => equality\n"));
694 return true;
695 }
696 }
697 }
698 }
699
700
701 List_T
Gregion_filter_unique(List_T gregionlist)702 Gregion_filter_unique (List_T gregionlist) {
703 List_T unique = NULL;
704 T x, y, gregion, *array;
705 int n, i, j;
706 bool *eliminate;
707 #ifdef DEBUG
708 List_T p;
709 #endif
710
711 n = List_length(gregionlist);
712 if (n == 0) {
713 return NULL;
714 }
715
716 debug(
717 for (p = gregionlist, i = 0; p != NULL; p = p->rest, i++) {
718 gregion = (T) p->first;
719 printf(" Initial %d: %d..%d #%d:%u-%u (plusp = %d)\n",
720 i,gregion->querystart,gregion->queryend,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
721 }
722 );
723
724 if (n < MAX_GREGIONS_FOR_ALLOCA) {
725 eliminate = (bool *) CALLOCA(n,sizeof(bool));
726 array = (T *) MALLOCA(n * sizeof(T));
727 List_fill_array_and_free((void **) array,&gregionlist);
728 } else {
729 eliminate = (bool *) CALLOC(n,sizeof(bool));
730 array = (T *) List_to_array(gregionlist,NULL);
731 List_free(&gregionlist);
732 }
733
734 /* qsort(array,n,sizeof(T),weight_cmp); */
735 qsort(array,n,sizeof(T),support_cmp);
736
737 debug(
738 for (i = 0; i < n; i++) {
739 gregion = array[i];
740 printf(" After sorting by support %d: %d..%d #%d:%u-%u (plusp = %d)\n",
741 i,gregion->querystart,gregion->queryend,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
742 }
743 );
744
745 for (i = 0; i < n; i++) {
746 x = array[i];
747 for (j = i+1; j < n; j++) {
748 y = array[j];
749 #ifdef DEBUG
750 printf("Comparing these regions %d and %d:\n",i,j);
751 printf(" ");
752 Gregion_print(x);
753 printf(" ");
754 Gregion_print(y);
755 printf("\n");
756 #endif
757 if (gregion_overlap_p(x,y) == true) {
758 debug(printf(" => Found overlap. Eliminating %d\n\n",j));
759 eliminate[j] = true;
760 }
761 }
762 }
763
764 for (i = n-1; i >= 0; i--) {
765 gregion = array[i];
766 if (eliminate[i] == false) {
767 debug(printf(" Keeping #%d:%u-%u (plusp = %d)\n",
768 gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp));
769 unique = List_push(unique,(void *) gregion);
770 } else {
771 debug(printf(" Eliminating #%d:%u-%u (plusp = %d)\n",
772 gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp));
773 /*
774 if (gregion->match5 != NULL) {
775 Match_decr_npairings(gregion->match5);
776 Match_decr_npairings(gregion->match3);
777 }
778 */
779 Gregion_free(&gregion);
780 }
781 }
782
783 if (n < MAX_GREGIONS_FOR_ALLOCA) {
784 FREEA(eliminate);
785 FREEA(array);
786 } else {
787 FREE(eliminate);
788 FREE(array);
789 }
790
791 debug(
792 for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
793 gregion = (T) p->first;
794 printf(" Final %d: #%d:%u-%u (plusp = %d)\n",
795 i,gregion->chrnum,gregion->chrstart,gregion->chrend,gregion->plusp);
796 }
797 );
798
799 return unique;
800 }
801
802
803 List_T
Gregion_filter_support(List_T gregionlist,int boundary,double pct_max,int diff_max)804 Gregion_filter_support (List_T gregionlist, int boundary, double pct_max, int diff_max) {
805 List_T good = NULL, p;
806 int threshold, maxsupport = 0;
807 T gregion;
808
809 if (gregionlist == NULL) {
810 return NULL;
811 }
812
813 for (p = gregionlist; p != NULL; p = List_next(p)) {
814 gregion = (T) List_head(p);
815 if (gregion->support > maxsupport) {
816 maxsupport = gregion->support;
817 }
818 }
819
820 if (maxsupport > boundary) {
821 /* Use diff_max */
822 for (p = gregionlist; p != NULL; p = List_next(p)) {
823 gregion = (T) List_head(p);
824 if (maxsupport - gregion->support < diff_max) {
825 good = List_push(good,gregion);
826 } else {
827 Gregion_free(&gregion);
828 }
829 }
830 } else {
831 threshold = (int) ((double) maxsupport * pct_max);
832 /* Use threshold only */
833 for (p = gregionlist; p != NULL; p = List_next(p)) {
834 gregion = (T) List_head(p);
835 if (gregion->support >= threshold) {
836 good = List_push(good,gregion);
837 } else {
838 Gregion_free(&gregion);
839 }
840 }
841 }
842
843 List_free(&gregionlist);
844 return List_reverse(good);
845 }
846
847
848 double
Gregion_best_weight(List_T gregionlist)849 Gregion_best_weight (List_T gregionlist) {
850 double best_weight = 0.0;
851 T gregion;
852 List_T p;
853
854 for (p = gregionlist; p != NULL; p = List_next(p)) {
855 gregion = (T) List_head(p);
856 if (gregion->weight > best_weight) {
857 best_weight = gregion->weight;
858 }
859 }
860
861 return best_weight;
862 }
863
864
865 bool
Gregion_sufficient_support(T this)866 Gregion_sufficient_support (T this) {
867 return this->sufficient_support_p;
868 }
869
870
871 void
Gregion_extend(T this,Chrpos_T extension5,Chrpos_T extension3,int querylength)872 Gregion_extend (T this, Chrpos_T extension5, Chrpos_T extension3, int querylength) {
873 Chrpos_T left, right;
874 int extra_end;
875
876 debug(printf("Entering Gregion_extend with extension5 %u and extension3 %u\n",extension5,extension3));
877 debug(printf(" #%d:%u..%u\n",this->chrnum,this->chrstart,this->chrlength));
878 this->extension5 = extension5;
879 this->extension3 = extension3;
880 this->extendedp = true;
881
882
883 if (this->nexons == 1 || Gregion_sufficient_support(this) == true || this->support < 100) {
884 extra_end = EXTRA_SHORTEND;
885 #if 0
886 /* Should no longer be necessary for known splicesites */
887 if (extra_end < min_extra_end) {
888 extra_end = min_extra_end;
889 }
890 #endif
891 if (this->plusp == true) {
892 left = extension5 + querylength + extra_end;
893 right = extension3 + querylength + extra_end;
894 } else {
895 left = extension3 + querylength + extra_end;
896 right = extension5 + querylength + extra_end;
897 }
898 } else {
899 extra_end = EXTRA_LONGEND;
900 #if 0
901 /* Should no longer be necessary for known splicesites */
902 if (extra_end < min_extra_end) {
903 extra_end = min_extra_end;
904 }
905 #endif
906 if (this->plusp == true) {
907 left = extension5 + extra_end;
908 right = extension3 + extra_end;
909 } else {
910 left = extension3 + extra_end;
911 right = extension5 + extra_end;
912 }
913 }
914
915 /* printf("chrstart %u vs left %u\n",this->chrstart,left); */
916 if (this->chrstart < left) {
917 /* At beginning of chromosome */
918 this->chrstart = 0U;
919 } else {
920 this->chrstart -= left;
921 }
922
923 /* printf("chroffset %u + chrend %u + right %u vs chrhigh %u\n",this->chroffset,this->chrend,right,this->chrhigh); */
924 if (this->chroffset + this->chrend + right >= this->chrhigh) {
925 /* At end of chromosome */
926 this->chrend = this->chrlength - 1;
927 } else {
928 this->chrend += right;
929 }
930
931 /* Prevent very large genomic segments */
932 if (this->chrend > this->chrstart + MAX_GENOMICLENGTH) {
933 this->chrend = this->chrstart + MAX_GENOMICLENGTH;
934 }
935
936 this->genomiclength = this->chrend - this->chrstart + 1U;
937 /* printf("chrstart %u, chrend %u, genomiclength %u\n",this->chrstart,this->chrend,this->genomiclength); */
938
939 #ifdef PMAP
940 debug5(printf(" Testing bound5+extension5 = %d - %u < %d + %d, bound3+extension3 = %d + %u > %d - %d\n",
941 this->querystart,this->extension5,this->trimstart,SUFFICIENT_SUPPORT,
942 this->queryend,this->extension3,this->trimend,SUFFICIENT_SUPPORT));
943 if (this->querystart - this->extension5 < this->trimstart + SUFFICIENT_SUPPORT &&
944 this->queryend + this->extension3 > this->trimend - SUFFICIENT_SUPPORT) {
945 this->sufficient_support_p = true;
946 } else {
947 this->sufficient_support_p = false;
948 }
949 #else
950 debug5(printf(" Testing bound5+extension5 = %d - %u < %d + %d, bound3 = %d + %u > %d - %d\n",
951 this->querystart,this->extension5,this->trimstart,SUFFICIENT_SUPPORT,
952 this->queryend,this->extension3,this->trimend,SUFFICIENT_SUPPORT));
953 if (this->querystart - this->extension5 < (Chrpos_T) (this->trimstart + SUFFICIENT_SUPPORT) &&
954 this->queryend + this->extension3 > (Chrpos_T) (this->trimend - SUFFICIENT_SUPPORT)) {
955 this->sufficient_support_p = true;
956 } else {
957 this->sufficient_support_p = false;
958 }
959 #endif
960
961 debug(printf(" #%d:%u..%u\n",this->chrnum,this->chrstart,this->chrend));
962
963 return;
964 }
965
966
967 int
Gregion_cmp(const void * a,const void * b)968 Gregion_cmp (const void *a, const void *b) {
969 T x = * (T *) a;
970 T y = * (T *) b;
971
972 if (x->ncovered > y->ncovered) {
973 return -1;
974 } else if (y->ncovered > x->ncovered) {
975 return +1;
976 } else {
977 return 0;
978 }
979 }
980
981
982 /************************************************************************
983 * Filtering, based on spliceclean
984 ************************************************************************/
985
986 #ifdef USE_CLEAN
987
988 #define MAXLOOKBACK 60
989
990 static void
compute_total_up(List_T gregions)991 compute_total_up (List_T gregions) {
992 long int total_up;
993 List_T p;
994 Gregion_T gregion;
995
996 total_up = 0;
997 for (p = gregions; p != NULL; p = List_next(p)) {
998 gregion = (Gregion_T) List_head(p);
999 total_up += 1;
1000 gregion->total_up = total_up;
1001 }
1002
1003 return;
1004 }
1005
1006 static void
compute_total_down(List_T gregions)1007 compute_total_down (List_T gregions) {
1008 long int total_down;
1009 List_T p;
1010 Gregion_T gregion;
1011
1012 total_down = 0;
1013 for (p = gregions; p != NULL; p = List_next(p)) {
1014 gregion = (Gregion_T) List_head(p);
1015 total_down += 1;
1016 gregion->total_down = total_down;
1017 }
1018
1019 return;
1020 }
1021
1022
1023 static int
Gregion_low_descending_cmp(const void * a,const void * b)1024 Gregion_low_descending_cmp (const void *a, const void *b) {
1025 Gregion_T x = * (Gregion_T *) a;
1026 Gregion_T y = * (Gregion_T *) b;
1027
1028 if (x->extentstart > y->extentstart) {
1029 return -1;
1030 } else if (y->extentstart > x->extentstart) {
1031 return +1;
1032 } else {
1033 return 0;
1034 }
1035 }
1036
1037 static int
Gregion_high_ascending_cmp(const void * a,const void * b)1038 Gregion_high_ascending_cmp (const void *a, const void *b) {
1039 Gregion_T x = * (Gregion_T *) a;
1040 Gregion_T y = * (Gregion_T *) b;
1041
1042 if (x->extentend < y->extentend) {
1043 return -1;
1044 } else if (y->extentend < x->extentend) {
1045 return +1;
1046 } else {
1047 return 0;
1048 }
1049 }
1050
1051 static List_T
Gregion_sort_low_descending(List_T gregions)1052 Gregion_sort_low_descending (List_T gregions) {
1053 List_T sorted = NULL;
1054 Gregion_T *array;
1055 int n, i;
1056
1057 if ((n = List_length(gregions)) == 0) {
1058 return (List_T) NULL;
1059
1060 } else if (n < MAX_GREGIONS_FOR_ALLOCA) {
1061 array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T));
1062 List_fill_array((void **) array,gregions);
1063
1064 qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp);
1065 for (i = n-1; i >= 0; i--) {
1066 sorted = List_push(sorted,array[i]);
1067 }
1068
1069 FREEA(array);
1070 return sorted;
1071
1072 } else {
1073 array = (Gregion_T *) List_to_array(gregions,NULL);
1074
1075 qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp);
1076 for (i = n-1; i >= 0; i--) {
1077 sorted = List_push(sorted,array[i]);
1078 }
1079
1080 FREE(array);
1081 return sorted;
1082 }
1083 }
1084
1085
1086
1087 static List_T
Gregion_sort_high_ascending(List_T gregions)1088 Gregion_sort_high_ascending (List_T gregions) {
1089 List_T sorted = NULL;
1090 Gregion_T *array;
1091 int n, i;
1092
1093 if ((n = List_length(gregions)) == 0) {
1094 return (List_T) NULL;
1095
1096 } else if (n < MAX_GREGIONS_FOR_ALLOCA) {
1097 array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T));
1098 List_fill_array((void **) array,gregions);
1099
1100 qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp);
1101 for (i = n-1; i >= 0; i--) {
1102 sorted = List_push(sorted,array[i]);
1103 }
1104
1105 FREEA(array);
1106 return sorted;
1107
1108 } else {
1109 array = (Gregion_T *) List_to_array(gregions,NULL);
1110
1111 qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp);
1112 for (i = n-1; i >= 0; i--) {
1113 sorted = List_push(sorted,array[i]);
1114 }
1115
1116 FREE(array);
1117 return sorted;
1118 }
1119 }
1120
1121
1122
1123 typedef struct Base_T *Base_T;
1124 struct Base_T {
1125 Chrpos_T prevpos;
1126 Chrpos_T minextent;
1127 Chrpos_T maxextent;
1128 List_T gregions;
1129
1130 bool usedp;
1131 };
1132
1133 #if 0
1134 static void
1135 Base_free (Base_T *old) {
1136 if ((*old)->gregions != NULL) {
1137 List_free(&(*old)->gregions);
1138 }
1139 FREE(*old);
1140 return;
1141 }
1142 #endif
1143
1144
1145 static Base_T
Base_new()1146 Base_new () {
1147 Base_T new = (Base_T) MALLOC(sizeof(*new));
1148
1149 new->minextent = -1U;
1150 new->maxextent = -1U;
1151 new->gregions = (List_T) NULL;
1152 new->usedp = false;
1153
1154 return new;
1155 }
1156
1157
1158
1159 /* Assumes gregions are arranged low to high */
1160 static Gregion_T
apply_ceiling(List_T gregions,Chrpos_T ceiling)1161 apply_ceiling (List_T gregions, Chrpos_T ceiling) {
1162 List_T p;
1163 Gregion_T prevgregion = NULL, gregion;
1164
1165 for (p = gregions; p != NULL; p = List_next(p)) {
1166 gregion = (Gregion_T) List_head(p);
1167 if (gregion->extentend > ceiling) {
1168 return prevgregion;
1169 }
1170 prevgregion = gregion;
1171 }
1172
1173 return prevgregion;
1174 }
1175
1176 /* Assumes gregions are arranged high to low */
1177 static Gregion_T
apply_floor(List_T gregions,Chrpos_T floor)1178 apply_floor (List_T gregions, Chrpos_T floor) {
1179 List_T p;
1180 Gregion_T prevgregion = NULL, gregion;
1181
1182 for (p = gregions; p != NULL; p = List_next(p)) {
1183 gregion = (Gregion_T) List_head(p);
1184 if (gregion->extentstart < floor) {
1185 return prevgregion;
1186 }
1187 prevgregion = gregion;
1188 }
1189
1190 return prevgregion;
1191 }
1192
1193
1194 static Chrpos_T
compute_ceilings(Uinttable_T low_basetable)1195 compute_ceilings (Uinttable_T low_basetable) {
1196 Chrpos_T ceiling, bestprevpos, prevpos;
1197 long int bestscore, score;
1198 int nlookback;
1199 Chrpos_T *keys;
1200 Base_T base, prevbase;
1201 Gregion_T gregion, prevgregion;
1202 #ifdef DEBUG2
1203 Gregion_T bestprevgregion;
1204 #endif
1205 List_T p;
1206 int n, i;
1207
1208 n = Uinttable_length(low_basetable);
1209 keys = Uinttable_keys(low_basetable,/*sortp*/true);
1210 debug2(printf("low_basetable has %d entries\n",n));
1211
1212 prevpos = 0U;
1213 for (i = 0; i < n; i++) {
1214 base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1215 base->gregions = Gregion_sort_high_ascending(base->gregions);
1216 compute_total_up(base->gregions);
1217
1218 base->prevpos = prevpos;
1219 prevpos = keys[i];
1220 }
1221
1222 /* Initialize minbaselow */
1223 base = (Base_T) Uinttable_get(low_basetable,keys[0]);
1224 for (p = base->gregions; p != NULL; p = List_next(p)) {
1225 gregion = (Gregion_T) List_head(p);
1226 gregion->score_ceiling = gregion->total_up;
1227 gregion->bestprev_ceiling = 0U;
1228 }
1229
1230 for (i = 1; i < n; i++) {
1231 base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1232
1233 debug2(printf("At base %u, have %d gregions\n",keys[i],List_length(base->gregions)));
1234
1235 for (p = base->gregions; p != NULL; p = List_next(p)) {
1236 gregion = (Gregion_T) List_head(p);
1237
1238 bestscore = 0;
1239 bestprevpos = keys[0];
1240 debug2(bestprevgregion = NULL);
1241
1242 prevpos = base->prevpos;
1243 nlookback = 0;
1244
1245 ceiling = gregion->extentend;
1246 debug2(printf(" Gregion %u",gregion->extentend));
1247 while (prevpos >= keys[0] && nlookback < MAXLOOKBACK) {
1248 prevbase = (Base_T) Uinttable_get(low_basetable,prevpos);
1249 if ((prevgregion = apply_ceiling(prevbase->gregions,ceiling)) != NULL) {
1250 debug2(printf(" ... prev:%u score:%d+%d = %d",
1251 prevpos,prevgregion->score_ceiling,gregion->total_up,
1252 prevgregion->score_ceiling+gregion->total_up));
1253
1254 if ((score = prevgregion->score_ceiling + gregion->total_up) > bestscore) {
1255 debug2(printf("*"));
1256 bestscore = score;
1257 bestprevpos = prevpos;
1258 #ifdef DEBUG2
1259 bestprevgregion = prevgregion;
1260 #endif
1261 }
1262 }
1263
1264 prevpos = prevbase->prevpos;
1265 nlookback++;
1266 }
1267 debug2(printf("\n"));
1268
1269 gregion->score_ceiling = bestscore;
1270 gregion->bestprev_ceiling = bestprevpos;
1271
1272 #ifdef DEBUG2
1273 if (bestprevgregion == NULL) {
1274 printf(" no prevgregion, bestprev is %u, score is %d\n",
1275 bestprevpos,gregion->score_ceiling);
1276 } else {
1277 printf(" bestprev is pos %u, gregion %u, score is %d\n",
1278 bestprevpos,bestprevgregion->extentend,gregion->score_ceiling);
1279 }
1280 #endif
1281 }
1282 }
1283
1284 /* Get best overall gregion */
1285 bestscore = 0;
1286 bestprevpos = 0U;
1287
1288 for (i = 0; i < n; i++) {
1289 base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1290 for (p = base->gregions; p != NULL; p = List_next(p)) {
1291 gregion = (Gregion_T) List_head(p);
1292
1293 if (gregion->score_ceiling > bestscore) {
1294 bestscore = gregion->score_ceiling;
1295 bestprevpos = keys[i];
1296 }
1297 }
1298 }
1299
1300 FREE(keys);
1301
1302 return bestprevpos;
1303 }
1304
1305
1306
1307 static Chrpos_T
compute_floors(Uinttable_T high_basetable)1308 compute_floors (Uinttable_T high_basetable) {
1309 Chrpos_T floor, bestprevpos, prevpos;
1310 long int bestscore, score;
1311 int nlookback;
1312 Chrpos_T *keys;
1313 Base_T base, prevbase;
1314 Gregion_T gregion, prevgregion;
1315 #ifdef DEBUG2
1316 Gregion_T bestprevgregion;
1317 #endif
1318 List_T p;
1319 int n, i;
1320
1321 n = Uinttable_length(high_basetable);
1322 keys = Uinttable_keys(high_basetable,/*sortp*/true);
1323 debug2(printf("high_basetable has %d entries\n",n));
1324
1325 prevpos = (Chrpos_T) -1U;
1326 for (i = n-1; i >= 0; --i) {
1327 base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1328 base->gregions = Gregion_sort_low_descending(base->gregions);
1329 compute_total_down(base->gregions);
1330
1331 base->prevpos = prevpos;
1332 prevpos = keys[i];
1333 }
1334
1335 /* Initialize maxbasehigh */
1336 base = (Base_T) Uinttable_get(high_basetable,keys[n-1]);
1337 for (p = base->gregions; p != NULL; p = List_next(p)) {
1338 gregion = (Gregion_T) List_head(p);
1339 gregion->score_floor = gregion->total_down;
1340 gregion->bestprev_floor = (Chrpos_T) -1U;
1341 }
1342
1343 for (i = n-2; i >= 0; --i) {
1344 base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1345
1346 debug2(printf("At base %u, have %d gregions\n",keys[i],List_length(base->gregions)));
1347 for (p = base->gregions; p != NULL; p = List_next(p)) {
1348 gregion = (Gregion_T) List_head(p);
1349
1350 bestscore = 0;
1351 bestprevpos = keys[n-1];
1352 debug2(bestprevgregion = NULL);
1353
1354 prevpos = base->prevpos;
1355 nlookback = 0;
1356
1357 floor = gregion->extentstart /*+1*/;
1358 debug2(printf(" Gregion %u",gregion->extentstart));
1359 while (prevpos <= keys[n-1] && nlookback < MAXLOOKBACK) {
1360 prevbase = (Base_T) Uinttable_get(high_basetable,prevpos);
1361 if ((prevgregion = apply_floor(prevbase->gregions,floor)) != NULL) {
1362 debug2(printf(" ... prev:%u, score:%d+%d = %d",
1363 prevpos,prevgregion->score_floor,gregion->total_down,
1364 prevgregion->score_floor+gregion->total_down));
1365
1366 if ((score = prevgregion->score_floor + gregion->total_down) > bestscore) {
1367 debug2(printf("*"));
1368 bestscore = score;
1369 bestprevpos = prevpos;
1370 #ifdef DEBUG2
1371 bestprevgregion = prevgregion;
1372 #endif
1373 }
1374 }
1375
1376 prevpos = prevbase->prevpos;
1377 nlookback++;
1378 }
1379 debug2(printf("\n"));
1380
1381 gregion->score_floor = bestscore;
1382 gregion->bestprev_floor = bestprevpos;
1383
1384 #ifdef DEBUG2
1385 if (bestprevgregion == NULL) {
1386 printf(" no prevgregion, bestprev is %u, score is %d\n",
1387 bestprevpos,gregion->score_floor);
1388 } else {
1389 printf(" bestprev is pos %u, gregion %u, score is %d\n",
1390 bestprevpos,bestprevgregion->extentstart,gregion->score_floor);
1391 }
1392 #endif
1393 }
1394 }
1395
1396 /* Get best overall gregion */
1397 bestscore = 0;
1398 bestprevpos = (Chrpos_T) -1U;
1399
1400 for (i = n-1; i >= 0; --i) {
1401 base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1402 for (p = base->gregions; p != NULL; p = List_next(p)) {
1403 gregion = (Gregion_T) List_head(p);
1404
1405 if (gregion->score_floor > bestscore) {
1406 bestscore = gregion->score_floor;
1407 bestprevpos = keys[i];
1408 }
1409 }
1410 }
1411
1412 FREE(keys);
1413
1414 return bestprevpos;
1415 }
1416
1417
1418 static void
traceback_ceilings(Uinttable_T low_basetable,Chrpos_T prevpos)1419 traceback_ceilings (Uinttable_T low_basetable, Chrpos_T prevpos) {
1420 Chrpos_T ceiling;
1421 Gregion_T end_gregion;
1422 Base_T base, prevbase;
1423 Chrpos_T *keys;
1424 int n, i;
1425
1426 n = Uinttable_length(low_basetable);
1427 keys = Uinttable_keys(low_basetable,/*sortp*/true);
1428
1429 ceiling = (Chrpos_T) -1U;
1430
1431 i = n-1;
1432 while (prevpos > keys[0]) {
1433 debug2(printf("traceback from endpos %u, back to %u\n",keys[i],prevpos));
1434 while (/*startpos*/keys[i] > prevpos) {
1435 base = (Base_T) Uinttable_get(low_basetable,/*startpos*/keys[i]);
1436 base->maxextent = ceiling;
1437 debug2(printf("At low %u, maxextent is %u\n",/*startpos*/keys[i],ceiling));
1438 i--;
1439 }
1440
1441 prevbase = (Base_T) Uinttable_get(low_basetable,prevpos);
1442 if ((end_gregion = apply_ceiling(prevbase->gregions,ceiling)) == NULL) {
1443 prevpos = keys[0]; /* Ends loop */
1444 } else {
1445 ceiling = end_gregion->extentend /*-1*/;
1446 prevpos = end_gregion->bestprev_ceiling;
1447 }
1448 }
1449
1450 debug2(printf("End of loop\n"));
1451 while (i >= 0) {
1452 base = (Base_T) Uinttable_get(low_basetable,keys[i]);
1453 base->maxextent = ceiling;
1454 debug2(printf("At low %u, maxextent is %u\n",keys[i],ceiling));
1455 i--;
1456 }
1457
1458 FREE(keys);
1459
1460 return;
1461 }
1462
1463
1464 static void
traceback_floors(Uinttable_T high_basetable,Chrpos_T prevpos)1465 traceback_floors (Uinttable_T high_basetable, Chrpos_T prevpos) {
1466 Chrpos_T floor;
1467 Gregion_T start_gregion;
1468 Base_T base, prevbase;
1469 Chrpos_T *keys;
1470 int n, i;
1471
1472 n = Uinttable_length(high_basetable);
1473 keys = Uinttable_keys(high_basetable,/*sortp*/true);
1474
1475 floor = 0U;
1476
1477 i = 0;
1478 while (prevpos < keys[n-1]) {
1479 debug2(printf("traceback from startpos %u, forward to %u\n",keys[i],prevpos));
1480 while (/*endpos*/keys[i] < prevpos) {
1481 base = (Base_T) Uinttable_get(high_basetable,/*endpos*/keys[i]);
1482 base->minextent = floor;
1483 debug2(printf("At high %u, minextent is %u\n",/*endpos*/keys[i],floor));
1484 i++;
1485 }
1486
1487 prevbase = (Base_T) Uinttable_get(high_basetable,prevpos);
1488 if ((start_gregion = apply_floor(prevbase->gregions,floor)) == NULL) {
1489 prevpos = keys[n-1]; /* Ends loop */
1490 } else {
1491 floor = start_gregion->extentstart /*+1*/;
1492 prevpos = start_gregion->bestprev_floor;
1493 }
1494 }
1495
1496 debug2(printf("End of loop\n"));
1497 while (i < n) {
1498 base = (Base_T) Uinttable_get(high_basetable,keys[i]);
1499 base->minextent = floor;
1500 debug2(printf("At high %u, minextent is %u\n",/*endpos*/keys[i],floor));
1501 i++;
1502 }
1503
1504 FREE(keys);
1505
1506 return;
1507 }
1508
1509
1510 static void
bound_gregions(Uinttable_T low_basetable,Uinttable_T high_basetable)1511 bound_gregions (Uinttable_T low_basetable, Uinttable_T high_basetable) {
1512 Chrpos_T minextent, maxextent;
1513 Base_T base_low, base_high;
1514 Gregion_T gregion;
1515 List_T p;
1516 Chrpos_T *keys;
1517 int n, i;
1518
1519 n = Uinttable_length(low_basetable);
1520 keys = Uinttable_keys(low_basetable,/*sortp*/true);
1521
1522 for (i = 0; i < n; i++) {
1523 base_low = (Base_T) Uinttable_get(low_basetable,keys[i]);
1524 maxextent = base_low->maxextent;
1525 for (p = base_low->gregions; p != NULL; p = List_next(p)) {
1526 gregion = (Gregion_T) List_head(p);
1527 base_high = (Base_T) Uinttable_get(high_basetable,gregion->extentend);
1528 minextent = base_high->minextent;
1529 if (gregion->extentstart < minextent || gregion->extentend > maxextent) {
1530 debug2(printf("Not bounded low: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1531 gregion->bounded_low_p = false;
1532 } else {
1533 debug2(printf("Bounded low: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1534 gregion->bounded_low_p = true;
1535 }
1536 }
1537 }
1538
1539 FREE(keys);
1540
1541
1542 n = Uinttable_length(high_basetable);
1543 keys = Uinttable_keys(high_basetable,/*sortp*/true);
1544
1545 for (i = n-1; i >= 0; i--) {
1546 base_high = (Base_T) Uinttable_get(high_basetable,keys[i]);
1547 minextent = base_high->minextent;
1548 for (p = base_high->gregions; p != NULL; p = List_next(p)) {
1549 gregion = (Gregion_T) List_head(p);
1550 base_low = (Base_T) Uinttable_get(low_basetable,gregion->extentstart);
1551 maxextent = base_low->maxextent;
1552 if (gregion->extentstart < minextent || gregion->extentend > maxextent) {
1553 debug2(printf("Not bounded high: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1554 gregion->bounded_high_p = false;
1555 } else {
1556 debug2(printf("Bounded high: #%d:%u..%u\n",gregion->chrnum,gregion->extentstart,gregion->extentend));
1557 gregion->bounded_high_p = true;
1558 }
1559 }
1560 }
1561
1562 FREE(keys);
1563
1564 return;
1565 }
1566
1567
1568 /* Called only if USE_CLEAN is defined in stage1.c */
1569 void
Gregion_filter_clean(List_T gregionlist,int nchrs)1570 Gregion_filter_clean (List_T gregionlist, int nchrs) {
1571 Uinttable_T *low_basetables, *high_basetables, basetable;
1572 Base_T base;
1573 Chrpos_T prevpos;
1574 Chrnum_T chrnum;
1575
1576 List_T p;
1577 T gregion;
1578 int n;
1579 #if 0
1580 List_T unique = NULL;
1581 T x, y, *array;
1582 int i, j;
1583 bool *eliminate;
1584 #endif
1585 #ifdef DEBUG
1586 int i;
1587 #endif
1588
1589 if ((n = List_length(gregionlist)) == 0) {
1590 return;
1591 }
1592
1593 debug(
1594 for (p = gregionlist, i = 0; p != NULL; p = p->rest, i++) {
1595 gregion = (T) p->first;
1596 printf(" Initial %d: %d..%d %u-%u (plusp = %d)\n",
1597 i,gregion->querystart,gregion->queryend,gregion->extentstart,gregion->extentend,gregion->plusp);
1598 }
1599 );
1600
1601 if (nchrs < MAX_NCHRS_FOR_ALLOCA) {
1602 low_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T));
1603 high_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T));
1604 } else {
1605 low_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T));
1606 high_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T));
1607 }
1608
1609 for (p = gregionlist; p != NULL; p = List_next(p)) {
1610 gregion = (T) List_head(p);
1611
1612 if (low_basetables[gregion->chrnum] == NULL) {
1613 low_basetables[gregion->chrnum] = Uinttable_new(n,/*save_contents_p*/false);
1614 high_basetables[gregion->chrnum] = Uinttable_new(n,/*save_contents_p*/false);
1615 }
1616
1617 basetable = low_basetables[gregion->chrnum];
1618 if ((base = (Base_T) Uinttable_get(basetable,gregion->extentstart)) == NULL) {
1619 base = Base_new();
1620 Uinttable_put(basetable,gregion->extentstart,(void *) base);
1621 }
1622 base->gregions = List_push(base->gregions,(void *) gregion);
1623
1624 basetable = high_basetables[gregion->chrnum];
1625 if ((base = (Base_T) Uinttable_get(basetable,gregion->extentend)) == NULL) {
1626 base = Base_new();
1627 Uinttable_put(basetable,gregion->extentend,(void *) base);
1628 }
1629 base->gregions = List_push(base->gregions,(void *) gregion);
1630 }
1631
1632 for (chrnum = 0; chrnum <= nchrs; chrnum++) {
1633 if ((basetable = low_basetables[chrnum]) != NULL) {
1634 debug2(printf("Processing gregions for chrnum %d\n",chrnum));
1635 prevpos = compute_ceilings(basetable);
1636 traceback_ceilings(basetable,prevpos);
1637
1638 basetable = high_basetables[chrnum];
1639 prevpos = compute_floors(basetable);
1640 traceback_floors(basetable,prevpos);
1641
1642 bound_gregions(low_basetables[chrnum],high_basetables[chrnum]);
1643 }
1644 }
1645
1646 for (p = gregionlist; p != NULL; p = List_next(p)) {
1647 gregion = (T) List_head(p);
1648
1649 if (low_basetables[gregion->chrnum] == NULL) {
1650 Uinttable_free(&(low_basetables[gregion->chrnum]));
1651 Uinttable_free(&(high_basetables[gregion->chrnum]));
1652 }
1653 }
1654
1655 if (nchrs < MAX_NCHRS_FOR_ALLOCA) {
1656 FREEA(high_basetables);
1657 FREEA(low_basetables);
1658 } else {
1659 FREE(high_basetables);
1660 FREE(low_basetables);
1661 }
1662
1663 #if 0
1664 eliminate = (bool *) CALLOC(n,sizeof(bool));
1665
1666 /* Not necessary if false is zero */
1667 /*
1668 for (i = 0; i < n; i++) {
1669 eliminate[i] = false;
1670 }
1671 */
1672
1673 array = (T *) List_to_array(gregionlist,NULL);
1674 List_free(&gregionlist);
1675 qsort(array,n,sizeof(T),weight_cmp);
1676
1677 for (i = 0; i < n; i++) {
1678 x = array[i];
1679 for (j = i+1; j < n; j++) {
1680 y = array[j];
1681 if (gregion_overlap_p(x,y) == true) {
1682 eliminate[j] = true;
1683 }
1684 }
1685 }
1686
1687 for (i = n-1; i >= 0; i--) {
1688 gregion = array[i];
1689 if (eliminate[i] == false) {
1690 debug(printf(" Keeping %u-%u (plusp = %d)\n",
1691 gregion->extentstart,gregion->extentend,gregion->plusp));
1692 unique = List_push(unique,(void *) gregion);
1693 } else {
1694 debug(printf(" Eliminating %u-%u (plusp = %d)\n",
1695 gregion->extentstart,gregion->extentend,gregion->plusp));
1696 /*
1697 if (gregion->match5 != NULL) {
1698 Match_decr_npairings(gregion->match5);
1699 Match_decr_npairings(gregion->match3);
1700 }
1701 */
1702 Gregion_free(&gregion);
1703 }
1704 }
1705
1706 FREE(eliminate);
1707 FREE(array);
1708
1709 debug(
1710 for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
1711 gregion = (T) p->first;
1712 printf(" Final %d: %u-%u (plusp = %d)\n",
1713 i,gregion->extentstart,gregion->extentend,gregion->plusp);
1714 }
1715 );
1716
1717 return unique;
1718
1719 #endif
1720
1721 return;
1722 }
1723
1724 #endif
1725
1726