1 static char rcsid[] = "$Id: stage1.c 218185 2019-01-17 13:05:01Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "stage1.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h> /* For memset */
10
11 #include "bool.h"
12 #include "mem.h"
13 #include "reader.h"
14 #include "block.h"
15 #ifdef PMAP
16 #include "oligop.h"
17 #else
18 #include "oligo.h"
19 #endif
20 #include "intlist.h"
21 #include "matchdef.h"
22 #include "match.h"
23 #include "gregion.h"
24 #include "orderstat.h"
25
26
27 /* Restoring old scan_ends algorithm, removed on 2012-05-14, to get
28 ends correctly (e.g., NM_004449) */
29 #define SCAN_ENDS 1
30
31
32 /* Need to limit ninrange in find_range, or else we get bogged down in
33 repeats */
34 #define MAX_NINRANGE 100
35
36
37 #define MAX_INDELS 15
38 #define MIN_REPEAT 6
39 #define MAXENTRIES 100
40 #define MIN_MATCH_WEIGHT 0.05 /* For connectablep */
41
42 #define SUFFICIENT_FIRST_WEIGHT 0.50
43
44 #define PCT_MAX_SUPPORT 0.70
45 #define DIFF_MAX_SUPPORT 200
46 #define BOUNDARY_SUPPORT 666 /* DIFF_MAXSUPPORT/(1-PCT_MAX_SUPPORT) */
47
48 #ifdef PMAP
49 #define VERYSHORTSEQLEN 30
50 #define SHORTSEQLEN 90
51 #define SINGLEEXONLENGTH 30 /* per nt */
52 #define SLOPE 2400 /* genomic bp per aa */
53 #define NOEXTENDLEN 6 /* in aa */
54 #else
55 #define VERYSHORTSEQLEN 60
56 #define SHORTSEQLEN 90
57 #define SINGLEEXONLENGTH 90 /* per nt */
58 #define SLOPE 800 /* genomic bp per nt */
59 #define NOEXTENDLEN 6 /* in nt */
60 #endif
61
62 #define HIGHINTRONLENGTH 100000U /* Used in find_best_path */
63
64 #define INTRONLEN_PENALTY 10000 /* Used in find_best_path on diagonal segments. 1 point for each length */
65
66 #define MAX_ATTEMPTS_UNIT 100 /* This parameter is used to decide when to switch from 24-mers to 18-mers */
67
68 #define MAX_FILL_IN 200
69 #define MAX_DANGLING_PCT 0.33
70
71 #define SAMPLERUN 50
72
73 #define COLLAPSE_DISTANCE 12
74 #define SUBOPTIMAL 36
75
76 #define MAX_GREGIONS_PRE_UNIQUE 1000
77 #define MAX_GREGIONS_POST_UNIQUE 100
78
79 #define BINARY_FOLDDIFF 8
80
81 #define MAXEXONS 3
82
83 /* Once a match at a genomic location has PROMISCUOUS matches locally,
84 it is unlikely that further matches will help define that candidate
85 segment. Allowing PROMISCUOUS to be greater than 1 allows more
86 candidate segments to be passed to stage 2. However, if
87 PROMISCUOUS is set too high, then a single pair of matches may make
88 the algorithm miss the correct spot. For example, PROMISCUOUS of 9
89 or greater will miss the spliced form of BM994949, on chr4, and get
90 a pseudogene that contains a poly-T on chr15. */
91
92 #define PROMISCUOUS 4
93
94 /* Debugging of scanning for 24-mers */
95 #ifdef DEBUG
96 #define debug(x) x
97 static Univ_IIT_T global_chromosome_iit;
98 static char *queryuc_ptr;
99 #else
100 #define debug(x)
101 #endif
102
103 /* final results */
104 #ifdef DEBUG0
105 #define debug0(x) x
106 #else
107 #define debug0(x)
108 #endif
109
110 /* show positions. Should also specify DEBUG */
111 #ifdef DEBUG1
112 #define debug1(x) x
113 #else
114 #define debug1(x)
115 #endif
116
117 /* Debugging of 12-mer extension at ends */
118 #ifdef DEBUG2
119 #define debug2(x) x
120 #else
121 #define debug2(x)
122 #endif
123
124 /* Detailed view of scanning for 24-mers */
125 #ifdef DEBUG3
126 #define debug3(x) x
127 #else
128 #define debug3(x)
129 #endif
130
131 /* Double/triple matching, including binary search */
132 #ifdef DEBUG4
133 #define debug4(x) x
134 #else
135 #define debug4(x)
136 #endif
137
138 /* connectable_p */
139 #ifdef DEBUG5
140 #define debug5(x) x
141 #else
142 #define debug5(x)
143 #endif
144
145
146 /* collapse_diagonals and find_segments */
147 #ifdef DEBUG6
148 #define debug6(x) x
149 #else
150 #define debug6(x)
151 #endif
152
153 /* find_segments_heap */
154 #ifdef DEBUG7
155 #define debug7(x) x
156 #else
157 #define debug7(x)
158 #endif
159
160 /* compute_paths */
161 #ifdef DEBUG8
162 #define debug8(x) x
163 #else
164 #define debug8(x)
165 #endif
166
167 /* find_good_paths */
168 #ifdef DEBUG9
169 #define debug9(x) x
170 #else
171 #define debug9(x)
172 #endif
173
174
175 #ifdef PMAP
176 static Width_T index1part_aa;
177 static int leftreadshift;
178 static Chrpos_T maxextension;
179 static Chrpos_T maxtotallen_bound;
180 static int circular_typeint;
181
182 void
Stage1_setup(Width_T index1part_aa_in,Chrpos_T maxextension_in,Chrpos_T maxtotallen_bound_in,int circular_typeint_in)183 Stage1_setup (Width_T index1part_aa_in, Chrpos_T maxextension_in, Chrpos_T maxtotallen_bound_in,
184 int circular_typeint_in) {
185 index1part_aa = index1part_aa_in;
186 leftreadshift = 32 - 2*index1part_aa_in; /* chars are shifted into left of a 32 bit word */
187 maxextension = maxextension_in;
188 maxtotallen_bound = maxtotallen_bound_in;
189 circular_typeint = circular_typeint_in;
190 return;
191 }
192
193 #else
194 static Width_T index1part;
195 static int leftreadshift;
196 static Chrpos_T maxextension;
197 static Chrpos_T maxtotallen_bound;
198 static Oligospace_T oligobase_mask;
199 static int circular_typeint;
200
201 void
Stage1_setup(Width_T index1part_in,Chrpos_T maxextension_in,Chrpos_T maxtotallen_bound_in,int circular_typeint_in)202 Stage1_setup (Width_T index1part_in, Chrpos_T maxextension_in, Chrpos_T maxtotallen_bound_in,
203 int circular_typeint_in) {
204 index1part = index1part_in;
205 #ifdef HAVE_64_BIT
206 leftreadshift = 64 - 2*index1part_in; /* chars are shifted into left of a 64-bit word */
207 oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part_in);
208 #else
209 leftreadshift = 32 - 2*index1part_in; /* chars are shifted into left of a 32-bit word */
210 oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part_in);
211 #endif
212 maxextension = maxextension_in;
213 maxtotallen_bound = maxtotallen_bound_in;
214 circular_typeint = circular_typeint_in;
215 return;
216 }
217 #endif
218
219
220
221 #define T Stage1_T
222 struct T {
223 int trimstart;
224 int trimend;
225 int trimlength;
226 int maxtotallen;
227 int querylength;
228 int maxentries;
229
230 Width_T oligosize;
231
232 Block_T block5;
233 Block_T block3;
234 List_T matches5;
235 List_T matches3;
236 Univcoord_T **plus_positions;
237 Univcoord_T **minus_positions;
238 int *plus_npositions;
239 int *minus_npositions;
240 bool *plus_matchedp; /* For identify_matches */
241 bool *minus_matchedp;
242 bool *processedp; /* For Block_process_oligo */
243 bool *validp; /* For Block_process_oligo */
244 int querystart; /* Changes during computation */
245 int queryend; /* Changes during computation */
246
247 #ifdef PMAP
248 Oligospace_T *oligos;
249 #else
250 Oligospace_T *forward_oligos;
251 Oligospace_T *revcomp_oligos;
252 #endif
253 };
254
255
256 #ifdef CHECK
257 static void
Stage1_check(T this)258 Stage1_check (T this) {
259 int i, npositions, j;
260
261 for (i = 0; i < this->querylength; i++) {
262 npositions = this->plus_npositions[i];
263 if (npositions == 0 && this->plus_positions[i] != NULL) {
264 fprintf(stderr,"npositions = 0, this->plus_positions[i] != NULL\n");
265 abort();
266 } else if (npositions != 0 && this->plus_positions[i] == NULL) {
267 fprintf(stderr,"npositions != 0, this->plus_positions[i] == NULL\n");
268 abort();
269 } else {
270 for (j = 0; j < npositions; j++) {
271 if (this->plus_positions[i][j] == 0U) {
272 fprintf(stderr,"this->plus_positions[i][i] == 0\n");
273 abort();
274 }
275 }
276 }
277
278 npositions = this->minus_npositions[i];
279 if (npositions == 0 && this->minus_positions[i] != NULL) {
280 fprintf(stderr,"npositions = 0, this->minus_positions[i] != NULL\n");
281 abort();
282 } else if (npositions != 0 && this->minus_positions[i] == NULL) {
283 fprintf(stderr,"npositions != 0, this->minus_positions[i] == NULL\n");
284 abort();
285 } else {
286 for (j = 0; j < npositions; j++) {
287 if (this->minus_positions[i][j] == 0U) {
288 fprintf(stderr,"this->minus_positions[i][i] == 0\n");
289 abort();
290 }
291 }
292 }
293 }
294
295 return;
296
297 }
298 #endif
299
300
301
302 static T
Stage1_new(Sequence_T queryuc,int maxtotallen,int maxentries)303 Stage1_new (Sequence_T queryuc, int maxtotallen, int maxentries) {
304 T new = (T) MALLOC(sizeof(*new));
305 Reader_T reader;
306
307 #ifdef PMAP
308 new->querylength = Sequence_fulllength_given(queryuc);
309 #else
310 new->querylength = Sequence_fulllength(queryuc);
311 #endif
312
313 new->querystart = Sequence_trim_start(queryuc);
314 new->queryend = Sequence_trim_end(queryuc);
315 new->trimstart = new->querystart;
316 new->trimend = new->queryend;
317 new->trimlength = Sequence_trimlength(queryuc);
318
319 new->maxtotallen = maxtotallen;
320 new->maxentries = maxentries;
321 #ifdef PMAP
322 new->oligosize = index1part_aa;
323 #else
324 new->oligosize = index1part;
325 #endif
326
327 #if 0
328 debug(Sequence_print(stdout,queryuc,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/true));
329 #endif
330 #ifdef PMAP
331 reader = Reader_new(Sequence_fullpointer(queryuc),new->querystart,new->queryend);
332 new->block5 = Block_new(FIVE,/*oligosize*/index1part_aa,reader,3*new->querylength);
333 new->block3 = Block_new(THREE,/*oligosize*/index1part_aa,reader,3*new->querylength);
334 #else
335 reader = Reader_new(Sequence_fullpointer(queryuc),new->querystart,new->queryend);
336 new->block5 = Block_new(FIVE,/*oligosize*/index1part,leftreadshift,reader,new->querylength);
337 new->block3 = Block_new(THREE,/*oligosize*/index1part,leftreadshift,reader,new->querylength);
338 #endif
339 new->matches5 = NULL;
340 new->matches3 = NULL;
341
342 new->plus_positions = (Univcoord_T **) CALLOC(new->querylength,sizeof(Univcoord_T *));
343 new->minus_positions = (Univcoord_T **) CALLOC(new->querylength,sizeof(Univcoord_T *));
344 new->plus_npositions = (int *) CALLOC(new->querylength,sizeof(int));
345 new->minus_npositions = (int *) CALLOC(new->querylength,sizeof(int));
346 new->processedp = (bool *) CALLOC(new->querylength,sizeof(bool));
347
348 new->plus_matchedp = (bool *) CALLOC(new->querylength,sizeof(bool));
349 new->minus_matchedp = (bool *) CALLOC(new->querylength,sizeof(bool));
350
351 new->validp = (bool *) NULL;
352 #ifdef PMAP
353 new->oligos = (Oligospace_T *) NULL;
354 #else
355 new->forward_oligos = (Oligospace_T *) NULL;
356 new->revcomp_oligos = (Oligospace_T *) NULL;
357 #endif
358
359 return new;
360 }
361
362 static void
Stage1_free(T * old)363 Stage1_free (T *old) {
364 Reader_T reader;
365 int i;
366
367 /* Stage1_check(*old); */
368
369 if ((*old)->validp != NULL) {
370 FREE((*old)->validp);
371 #ifdef PMAP
372 FREE((*old)->oligos);
373 #else
374 FREE((*old)->revcomp_oligos);
375 FREE((*old)->forward_oligos);
376 #endif
377 }
378
379 for (i = 0; i < (*old)->querylength; i++) {
380 if ((*old)->plus_positions[i] != NULL) {
381 FREE((*old)->plus_positions[i]);
382 }
383 if ((*old)->minus_positions[i] != NULL) {
384 FREE((*old)->minus_positions[i]);
385 }
386 }
387
388 FREE((*old)->plus_positions);
389 FREE((*old)->minus_positions);
390 FREE((*old)->plus_npositions);
391 FREE((*old)->minus_npositions);
392 FREE((*old)->processedp);
393
394 FREE((*old)->plus_matchedp);
395 FREE((*old)->minus_matchedp);
396
397 reader = Block_reader((*old)->block5);
398 Reader_free(&reader);
399
400 Block_free(&(*old)->block3);
401 Block_free(&(*old)->block5);
402
403 FREE(*old);
404 return;
405 }
406
407
408
409 /* This procedure is called a lot. Replacing calls to Match_chrnum and so on with direct calls to Match_T object */
410 static bool
connectable_p(Match_T match5,Match_T match3,int maxtotallen)411 connectable_p (Match_T match5, Match_T match3, int maxtotallen) {
412 Univcoord_T position5, position3;
413 int querypos5, querypos3, exonlen;
414 bool forwardp5, forwardp3;
415
416 debug5(printf("Comparing #%d:%u at querypos %d with #%d:%u at querypos %d\n",
417 Match_chrnum(match5),Match_chrpos(match5),Match_querypos(match5),
418 Match_chrnum(match3),Match_chrpos(match3),Match_querypos(match3)));
419
420 if (match5->chrnum != match3->chrnum) {
421 debug5(printf("No, different chromosomes\n\n"));
422 return false;
423 } else {
424 querypos5 = match5->querypos;
425 querypos3 = match3->querypos;
426 exonlen = querypos3 - querypos5;
427 #if 0
428 if (exonlen < matchinterval) {
429 debug5(printf("No, exonlen == 0\n\n"));
430 return false;
431 } else {
432 #endif
433 position5 = match5->position;
434 position3 = match3->position;
435 /* intronlen = abs(position3 - position5) - exonlen; -- shouldn't subtract unsigned ints */
436 if (position3 > position5) {
437 /* intronlen = position3 - position5 - exonlen; -- Don't subtract into a signed int */
438 /* The check below is equivalent to intronlen > maxtotallen */
439 if (position3 > (Univcoord_T) maxtotallen + position5 + (Univcoord_T) exonlen) {
440 debug5(printf("No, intron too long (%u > %u + %u + %u)\n\n",
441 position3,maxtotallen,position5,exonlen));
442 return false;
443 }
444 } else {
445 /* intronlen = position5 - position3 - exonlen; -- Don't subtract into a signed int */
446 /* The check below is equivalent to intronlen > maxtotallen */
447 if (position5 > (Univcoord_T) maxtotallen + position3 + (Univcoord_T) exonlen) {
448 debug5(printf("No, intron too long (%u > %u + %u + %u)\n\n",
449 position5,maxtotallen,position3,exonlen));
450 return false;
451 }
452 }
453 forwardp5 = match5->forwardp;
454 forwardp3 = match3->forwardp;
455
456 if (forwardp5 != forwardp3) {
457 debug5(printf("No, forwardp different\n\n"));
458 return false;
459 } else if (forwardp5 == true && position3 < position5) {
460 debug5(printf("No, forwardp true, but positions wrong\n\n"));
461 return false;
462 } else if (forwardp5 == false && position5 < position3) {
463 debug5(printf("No, forwardp false, but positions wrong\n\n"));
464 return false;
465 } else if (match5->weight < MIN_MATCH_WEIGHT || match3->weight < MIN_MATCH_WEIGHT) {
466 debug5(printf("Yes, but weights are too small: %.1f %.1f\n\n",
467 match5->weight,match3->weight));
468 return false;
469 } else {
470 debug5(printf("Yes\n\n"));
471 return true;
472 }
473 #if 0
474 }
475 #endif
476 }
477 }
478
479
480
481 /* This procedure is called a lot. Replacing calls to Match_npairings and so on with direct calls to Match_T object */
482 /* Updates a list of Stage1_T objects */
483 static List_T
pair_up(bool * foundpairp,List_T gregionlist,Width_T matchsize,List_T newmatches5,List_T newmatches3,List_T matches5,List_T matches3,int genestrand,Univ_IIT_T chromosome_iit,int querylength,int maxtotallen,int trimstart,int trimend,int trimlength)484 pair_up (bool *foundpairp, List_T gregionlist, Width_T matchsize,
485 List_T newmatches5, List_T newmatches3, List_T matches5, List_T matches3,
486 int genestrand, Univ_IIT_T chromosome_iit, int querylength,
487 int maxtotallen, int trimstart, int trimend, int trimlength) {
488 List_T p, q, s, new_gregions = NULL;
489 Match_T match5, match3;
490 /* Width_T matchinterval; */
491 #ifdef DEBUG
492 Gregion_T gregion;
493 #endif
494
495 /* matchinterval = matchsize - oligosize; */
496
497 /* Do N vs N */
498 for (q = newmatches5; q != NULL; q = q->rest) {
499 match5 = (Match_T) q->first;
500 if (match5->npairings < PROMISCUOUS) {
501 for (s = newmatches3; s != NULL; s = s->rest) {
502 match3 = (Match_T) s->first;
503 if (match3->npairings < PROMISCUOUS) {
504 if (connectable_p(match5,match3,maxtotallen)) {
505 #if 1
506 if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
507 #endif
508 new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
509 querylength,matchsize,trimstart,trimend,
510 circular_typeint));
511 #if 1
512 }
513 #endif
514 }
515 }
516 }
517 }
518 }
519
520 /* Do N vs (N-1..1) */
521 for (q = newmatches5; q != NULL; q = q->rest) {
522 match5 = (Match_T) q->first;
523 if (match5->npairings < PROMISCUOUS) {
524 for (s = matches3; s != NULL; s = s->rest) {
525 match3 = (Match_T) s->first;
526 if (match3->npairings < PROMISCUOUS) {
527 if (connectable_p(match5,match3,maxtotallen)) {
528 #if 1
529 if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
530 #endif
531 new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
532 querylength,matchsize,trimstart,trimend,
533 circular_typeint));
534 #if 1
535 }
536 #endif
537 }
538 }
539 }
540 }
541 }
542
543 /* Do (N-1..1) vs N */
544 for (q = matches5; q != NULL; q = q->rest) {
545 match5 = (Match_T) q->first;
546 if (match5->npairings < PROMISCUOUS) {
547 for (s = newmatches3; s != NULL; s = s->rest) {
548 match3 = (Match_T) s->first;
549 if (match3->npairings < PROMISCUOUS) {
550 if (connectable_p(match5,match3,maxtotallen)) {
551 #if 1
552 if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
553 #endif
554 new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
555 querylength,matchsize,trimstart,trimend,
556 circular_typeint));
557 #if 1
558 }
559 #endif
560 }
561 }
562 }
563 }
564 }
565
566 if (new_gregions == NULL) {
567 debug(printf("--No new gregions found\n"));
568 *foundpairp = false;
569
570 } else {
571 debug(printf("--%d new gregions found before uniq\n",List_length(new_gregions)));
572 debug(
573 for (p = new_gregions; p != NULL; p = p->rest) {
574 gregion = (Gregion_T) List_head(p);
575 printf("Before uniq: ");
576 Gregion_print(gregion);
577 }
578 );
579
580 #if 0
581 new_gregions = Gregion_filter_unique(new_gregions);
582 #endif
583
584 debug(printf("--%d new gregions found after uniq\n",List_length(new_gregions)));
585 *foundpairp = true;
586 for (p = new_gregions; p != NULL; p = p->rest) {
587 debug(
588 gregion = (Gregion_T) List_head(p);
589 printf("After uniq: ");
590 Gregion_print(gregion);
591 );
592 gregionlist = List_push(gregionlist,(Gregion_T) List_head(p));
593 }
594 List_free(&new_gregions);
595 }
596
597 return gregionlist;
598 }
599
600
601 #if 0
602 static bool
603 repetitivep (Oligospace_T oligo, int oligosize) {
604 int i;
605
606 oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
607 for (querypos = 0; querypos <= this->querylength - this->oligobase; querypos++) {
608 if (this->validp[querypos] == true) {
609 oligos[k++] = this->forward_oligos[querypos];
610 }
611 }
612 qsort(oligos,k,sizeof(Oligospace_T),Oligospace_compare);
613
614 for (i = 0, j = 1; j < k; i++, j++) {
615 if (oligos[j] == oligos[i]) {
616 debug(printf("Found repetition of oligo %06X == %06X\n",oligos[i],oligos[j]));
617 FREE(oligos);
618 return true;
619 }
620 }
621
622 FREE(oligos);
623 return false;
624 }
625 #endif
626
627
628 static List_T
identify_singles(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions,int npositions,Width_T matchsize,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)629 identify_singles (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
630 int querylength, Univcoord_T *positions, int npositions,
631 #ifdef DEBUG
632 Width_T matchsize,
633 #endif
634 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
635 Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
636 List_T newmatches = NULL, p;
637 Match_T match;
638 Univcoord_T position;
639 int i = 0, nentries = 0;
640 bool donep = false;
641 double weight;
642
643 if (npositions == 0) {
644 *nnew = 0;
645 *overflowp = false;
646 return matches;
647 } else {
648 position = positions[0];
649 }
650
651 Matchpool_save(matchpool);
652
653 while (donep == false) {
654 if (position >= chrsubset_start && position < chrsubset_end) {
655 if (++nentries > maxentries) {
656 donep = true;
657 } else {
658 newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
659 position+positionadj,chromosome_iit);
660 }
661 }
662
663 /* Advance */
664 if (++i >= npositions) {
665 donep = true;
666 } else {
667 position = positions[i];
668 }
669 }
670
671 if (nentries > maxentries) {
672 /* Too many entries. Not unique enough in genome */
673 *nnew = 0;
674 *overflowp = true;
675 debug(printf(" Singles overflow at %d\n",merstart));
676
677 /* Not necessary to free */
678 Matchpool_restore(matchpool);
679
680 } else {
681 *nnew = nentries;
682 *overflowp = false;
683 if (nentries > 0) {
684 weight = 1.0/(double) nentries;
685 for (p = newmatches; p != NULL; p = p->rest) {
686 match = (Match_T) p->first;
687 Match_set_weight(match,weight);
688
689 debug(
690 Match_print(match,chromosome_iit);
691 printf("\n");
692 );
693 }
694 }
695 matches = Matchpool_transfer(matches,newmatches);
696 }
697
698 return matches;
699 }
700
701
702 static int
binary_search(int lowi,int highi,Univcoord_T * positions,Univcoord_T goal)703 binary_search (int lowi, int highi, Univcoord_T *positions, Univcoord_T goal) {
704 bool foundp = false;
705 int middlei;
706
707 #ifdef NOBINARY
708 return lowi;
709 #endif
710
711 if (goal == 0U) {
712 return lowi;
713 }
714
715 while (!foundp && lowi < highi) {
716 middlei = lowi + ((highi - lowi) / 2);
717 debug4(printf(" binary: %d:%u %d:%u %d:%u vs. %u\n",
718 lowi,positions[lowi],middlei,positions[middlei],
719 highi,positions[highi],goal));
720 if (goal < positions[middlei]) {
721 highi = middlei;
722 } else if (goal > positions[middlei]) {
723 lowi = middlei + 1;
724 } else {
725 foundp = true;
726 }
727 }
728
729 if (foundp == true) {
730 debug4(printf("returning %d\n\n",middlei));
731 return middlei;
732 } else {
733 debug4(printf("returning %d\n\n",highi));
734 return highi;
735 }
736 }
737
738
739 /* positions0 should be left of positions1, and expecteddist should be
740 * positive. This procedure assumes that the entries in position0 and
741 * positions1 are in order. They are assumed not to have duplicates, which
742 * are removed by Indexdb_write_positions and Indexdb_read. */
743 static List_T
identify_doubles(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions0,int npositions0,Univcoord_T * positions1,int npositions1,Width_T matchsize,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)744 identify_doubles (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
745 int querylength, Univcoord_T *positions0, int npositions0,
746 Univcoord_T *positions1, int npositions1,
747 #ifdef DEBUG
748 Width_T matchsize,
749 #endif
750 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
751 Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
752 List_T newmatches = NULL, p;
753 Match_T match;
754 Univcoord_T expected0, position0, expected1, position1;
755 int i = 0, j = 0, nentries = 0;
756 bool donep = false;
757 double weight;
758
759 /*
760 debug(printf("Entering identify_doubles with merstart = %d, positionadj = %u, expecteddist = %d\n",
761 merstart,positionadj,expecteddist));
762 */
763 if (npositions0 == 0) {
764 /* debug(printf("Leaving identify_doubles because npositions0 == 0\n\n")); */
765 *nnew = 0;
766 *overflowp = false;
767 return matches;
768 } else {
769 position0 = positions0[0];
770 expected1 = position0 /*+ expecteddist*/;
771 if (npositions1 == 0) {
772 /* debug(printf("Leaving identify_doubles because npositions1 == 0\n\n")); */
773 *nnew = 0;
774 *overflowp = false;
775 return matches;
776 } else {
777 position1 = positions1[0];
778 #if 0
779 if (position1 < expecteddist) {
780 expected0 = 0U;
781 } else {
782 #endif
783 expected0 = position1 /*- expecteddist*/;
784 #if 0
785 }
786 #endif
787 }
788 }
789
790 Matchpool_save(matchpool);
791
792 while (donep == false) {
793 debug4(printf(" %d:%u %d:%u\n",i,position0,j,position1));
794 debug4(
795 if (abs(position1-position0) < 100) {
796 printf("Close: %u %u\n",position0,position1);
797 }
798 );
799
800 if (expected1 < position1) {
801 /* Advance position0 */
802 if (npositions0 - i < BINARY_FOLDDIFF*(npositions1 - j)) {
803 i++;
804 } else {
805 debug4(printf(" remaining positions: %d and %d => binary\n",npositions0-i,npositions1-j));
806 i = binary_search(i+1,npositions0,positions0,expected0);
807 }
808 if (i >= npositions0) {
809 donep = true;
810 } else {
811 position0 = positions0[i];
812 expected1 = position0 /*+ expecteddist*/;
813 }
814
815 } else if (expected1 > position1) {
816 /* Advance position1 */
817 if (npositions1 - j < BINARY_FOLDDIFF*(npositions0 - i)) {
818 j++;
819 } else {
820 debug4(printf(" remaining positions: %d and %d => binary\n",npositions0-i,npositions1-j));
821 j = binary_search(j+1,npositions1,positions1,expected1);
822 }
823 if (j >= npositions1) {
824 donep = true;
825 } else {
826 position1 = positions1[j];
827 #if 0
828 if (position1 < expecteddist) {
829 expected0 = 0U;
830 } else {
831 #endif
832 expected0 = position1 /*- expecteddist*/;
833 #if 0
834 }
835 #endif
836 }
837
838 } else {
839 if (position0 >= chrsubset_start && position0 < chrsubset_end) {
840 if (++nentries > maxentries) {
841 donep = true;
842 } else {
843 newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
844 position0+positionadj,chromosome_iit);
845 }
846 }
847
848 /* Advance both */
849 if (++i >= npositions0) {
850 donep = true;
851 } else {
852 position0 = positions0[i];
853 expected1 = position0 /*+ expecteddist*/;
854 if (++j >= npositions1) {
855 donep = true;
856 } else {
857 position1 = positions1[j];
858 expected0 = position1 /*- expecteddist*/;
859 }
860 }
861 }
862 }
863
864 if (nentries > maxentries) {
865 /* Too many entries. Not unique enough in genome */
866 *nnew = 0;
867 *overflowp = true;
868 /* debug(printf(" Doubles overflow at %d\n\n",querypos1)); */
869
870 /* Not necessary to free */
871 Matchpool_restore(matchpool);
872
873 } else {
874 /* debug(printf("Leaving identify_doubles with %d matches\n\n",List_length(newmatches))); */
875 *nnew = nentries;
876 *overflowp = false;
877 if (nentries > 0) {
878 weight = 1.0/(double) nentries;
879
880 debug(if (newmatches != NULL) {
881 printf("Entering identify_doubles with merstart = %d, positionadj = %u\n",
882 merstart,positionadj);
883 });
884
885 for (p = newmatches; p != NULL; p = p->rest) {
886 match = (Match_T) p->first;
887 Match_set_weight(match,weight);
888 debug(
889 Match_print(match,chromosome_iit);
890 printf("\n");
891 );
892 }
893 }
894 matches = Matchpool_transfer(matches,newmatches);
895 }
896
897 return matches;
898 }
899
900
901 /* Involves search of three lists. This procedure assumes that the
902 * entries in positions0, positions1, and positions2 are in order. They are assumed
903 * not to have duplicates, which are removed by
904 * Indexdb_write_positions and Indexdb_read. */
905 static List_T
identify_triples(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions0,int npositions0,Univcoord_T * positions1,int npositions1,Univcoord_T * positions2,int npositions2,Width_T matchsize,int expecteddist1,int expecteddist2,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)906 identify_triples (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
907 int querylength, Univcoord_T *positions0, int npositions0,
908 Univcoord_T *positions1, int npositions1, Univcoord_T *positions2, int npositions2,
909 #ifdef DEBUG
910 Width_T matchsize,
911 #endif
912 int expecteddist1, int expecteddist2, Univ_IIT_T chromosome_iit,
913 Univcoord_T chrsubset_start, Univcoord_T chrsubset_end, Matchpool_T matchpool,
914 bool forwardp, bool fivep, int maxentries) {
915 List_T newmatches = NULL, p;
916 Match_T match;
917 Univcoord_T position0, expected1, position1, expected2, position2;
918 int i = 0, j = 0, k = 0, nentries = 0;
919 int low2, middle2, high2;
920 bool donep = false, foundp;
921 double weight;
922 expecteddist1 = expecteddist2 = 0;
923
924 if (npositions0 == 0) {
925 *nnew = 0;
926 *overflowp = false;
927 return matches;
928 } else {
929 position0 = positions0[0];
930 expected1 = position0 + expecteddist1;
931 if (npositions1 == 0) {
932 *nnew = 0;
933 *overflowp = false;
934 return matches;
935 } else {
936 position1 = positions1[0];
937 expected2 = position1 + expecteddist2;
938 if (npositions2 == 0) {
939 *nnew = 0;
940 *overflowp = false;
941 return matches;
942 } else {
943 position2 = positions2[0];
944 }
945 }
946 }
947
948
949 Matchpool_save(matchpool);
950
951 while (donep == false) {
952 debug4(printf(" %d:%u %d:%u %d:%u\n",
953 i,position0,j,position1,k,position2));
954 if (expected1 < position1) {
955 /* Advance position0 */
956 if (++i >= npositions0) {
957 donep = true;
958 } else {
959 position0 = positions0[i];
960 expected1 = position0 + expecteddist1;
961 }
962 } else if (expected1 > position1) {
963 /* Advance position1 */
964 if (++j >= npositions1) {
965 donep = true;
966 } else {
967 position1 = positions1[j];
968 expected2 = position1 + expecteddist2;
969 }
970 } else if (expected2 < position2) {
971 /* Advance position1 */
972 if (++j >= npositions1) {
973 donep = true;
974 } else {
975 position1 = positions1[j];
976 expected2 = position1 + expecteddist2;
977 }
978 } else if (expected2 > position2) {
979 /* Binary search */
980 low2 = k+1;
981 high2 = npositions2;
982 foundp = false;
983 debug4(
984 for (middle2 = low2; middle2 < high2; middle2++) {
985 printf(" --%d:%u %d:%u %d:%u\n",
986 i,position0,j,position1,middle2,
987 positions2[middle2]);
988 }
989 );
990
991 while (!foundp && low2 < high2) {
992 middle2 = (low2+high2)/2;
993 debug4(printf(" **%d:%u %d:%u %d:%u vs. %u\n",
994 low2,positions2[low2],
995 middle2,positions2[middle2],
996 high2,positions2[high2],
997 expected2));
998 if (expected2 < positions2[middle2]) {
999 high2 = middle2;
1000 } else if (expected2 > positions2[middle2]) {
1001 low2 = middle2 + 1;
1002 } else {
1003 foundp = true;
1004 }
1005 }
1006 if (foundp == true) {
1007 k = middle2;
1008 position2 = positions2[k];
1009 } else if ((k = high2) >= npositions2) {
1010 donep = true;
1011 } else {
1012 position2 = positions2[k];
1013 }
1014
1015 } else {
1016 debug4(printf("Success at position %u\n",positions0[i]));
1017 if (position0 >= chrsubset_start && position0 < chrsubset_end) {
1018 if (++nentries > maxentries) {
1019 donep = true;
1020 } else {
1021 newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
1022 position0+positionadj,chromosome_iit);
1023 }
1024 }
1025
1026 /* Advance all */
1027 if (++i >= npositions0) {
1028 donep = true;
1029 } else {
1030 position0 = positions0[i];
1031 expected1 = position0 + expecteddist1;
1032 if (++j >= npositions1) {
1033 donep = true;
1034 } else {
1035 position1 = positions1[j];
1036 expected2 = position1 + expecteddist2;
1037 if (++k >= npositions2) {
1038 donep = true;
1039 } else {
1040 position2 = positions2[k];
1041 }
1042 }
1043 }
1044 }
1045 }
1046
1047 if (nentries > maxentries) {
1048 *nnew = 0;
1049 *overflowp = true;
1050 debug(printf(" Triples overflow at %d\n",merstart));
1051 Matchpool_restore(matchpool);
1052 } else {
1053 *nnew = nentries;
1054 *overflowp = false;
1055 if (nentries > 0) {
1056 weight = 1.0/(double) nentries;
1057 for (p = newmatches; p != NULL; p = p->rest) {
1058 match = (Match_T) p->first;
1059 Match_set_weight(match,weight);
1060 }
1061 }
1062 matches = Matchpool_transfer(matches,newmatches);
1063 }
1064
1065 return matches;
1066 }
1067
1068
1069 /************************************************************************
1070
1071 merstart
1072 5', forward: prevpos (pos0) querypos (pos1)
1073 5', revcomp: prevpos (pos1) querypos (pos0)
1074
1075 3', forward: querypos (pos0) prevpos (pos1)
1076 3', revcomp: querypos (pos1) prevpos (pos0)
1077
1078 ************************************************************************/
1079
1080 static List_T
identify_matches(int * nnew,bool * overflowp,List_T matches,int querypos,int querylength,Width_T oligosize,Width_T matchinterval,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)1081 identify_matches (int *nnew, bool *overflowp, List_T matches, int querypos, int querylength,
1082 Width_T oligosize, Width_T matchinterval,
1083 Univcoord_T **plus_positions, int *plus_npositions,
1084 Univcoord_T **minus_positions, int *minus_npositions,
1085 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1086 Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
1087 int prevpos, middlepos;
1088 int pos0, pos1;
1089 Univcoord_T **positions, positionadj = 0U;
1090 int *npositions;
1091 Width_T matchsize, merstart;
1092
1093 #ifdef PMAP
1094 matchsize = matchinterval + index1part_aa;
1095 #else
1096 matchsize = matchinterval + index1part;
1097 #endif
1098
1099 if (fivep == true) {
1100 prevpos = querypos - matchinterval;
1101 middlepos = querypos - oligosize;
1102 merstart = prevpos;
1103 } else {
1104 prevpos = querypos + matchinterval;
1105 middlepos = querypos + oligosize;
1106 merstart = querypos;
1107 }
1108 if (forwardp == fivep) {
1109 pos0 = prevpos;
1110 pos1 = querypos;
1111 } else {
1112 pos0 = querypos;
1113 pos1 = prevpos;
1114 }
1115
1116 if (forwardp == true) {
1117 positions = plus_positions;
1118 npositions = plus_npositions;
1119 } else {
1120 positions = minus_positions;
1121 npositions = minus_npositions;
1122 #ifndef PMAP
1123 /* Need this adjustment because gmapindex has only one
1124 idxpositions file for both strands, where position always
1125 points to lowest coordinate of the 12-mer */
1126 positionadj = matchsize - 1U;
1127 #endif
1128 }
1129
1130 if (matchsize == oligosize) {
1131 return identify_singles(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1132 positions[pos0],npositions[pos0],
1133 #ifdef DEBUG
1134 matchsize,
1135 #endif
1136 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1137 forwardp,fivep,maxentries);
1138 } else if (matchsize <= 2*oligosize) {
1139 /* debug(printf("Using positions at %d and %d\n",pos0,pos1)); */
1140 return identify_doubles(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1141 positions[pos0],npositions[pos0],
1142 positions[pos1],npositions[pos1],
1143 #ifdef DEBUG
1144 matchsize,
1145 #endif
1146 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1147 forwardp,fivep,maxentries);
1148 } else if (matchsize == 3*oligosize) {
1149 /* This branch hasn't been tested very well */
1150 debug(printf("Using positions at %d and %d and %d\n",pos0,middlepos,pos1));
1151 return identify_triples(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1152 positions[pos0],npositions[pos0],
1153 positions[middlepos],npositions[middlepos],
1154 positions[pos1],npositions[pos1],
1155 #ifdef DEBUG
1156 matchsize,
1157 #endif
1158 #ifdef PMAP
1159 oligosize*3,oligosize*3,
1160 #else
1161 oligosize,oligosize,
1162 #endif
1163 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1164 forwardp,fivep,maxentries);
1165 } else {
1166 abort();
1167 }
1168 }
1169
1170
1171 /* Need to change these procedures from n*m to n+m */
1172 static List_T
find_5prime_matches(int * nnew,List_T matches5,T this,Width_T matchsize,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int querystart,int querylength,int maxentries,bool pairedp)1173 find_5prime_matches (int *nnew, List_T matches5, T this, Width_T matchsize,
1174 Univcoord_T **plus_positions, int *plus_npositions,
1175 Univcoord_T **minus_positions, int *minus_npositions,
1176 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1177 Matchpool_T matchpool, int querystart, int querylength,
1178 int maxentries, bool pairedp) {
1179 Width_T merstart; /* Note: negative values must be allowed */
1180 int nnewplus = 0, nnewminus = 0;
1181 bool overflowp;
1182 Width_T matchinterval;
1183
1184 #ifdef PMAP
1185 matchinterval = matchsize - index1part_aa;
1186 /*matchinterval_nt = matchinterval*3; */
1187 #else
1188 matchinterval = matchsize - index1part;
1189 /* matchinterval_nt = matchinterval; */
1190 #endif
1191
1192 if ((merstart = querystart - matchinterval) < 0) {
1193 debug3(printf("Quitting because %d - %d < 0\n",querystart,matchinterval));
1194 *nnew = 0;
1195 } else {
1196 debug3(printf("Identifying 5' matches for %d-mer at %d and %d. maxentries=%d.\n",
1197 matchsize,merstart,querystart,maxentries));
1198
1199 debug3(printf(" Previous status of plus_matchedp at %d is %d\n",merstart,this->plus_matchedp[merstart]));
1200 if (pairedp == true || this->plus_matchedp[merstart] == false) {
1201 matches5 = identify_matches(&nnewplus,&overflowp,matches5,querystart,querylength,
1202 this->oligosize,matchinterval,
1203 plus_positions,plus_npositions,minus_positions,minus_npositions,
1204 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1205 /*forwardp*/true,/*fivep*/true,maxentries);
1206 if (overflowp == false) {
1207 debug3(printf(" No overflow so setting plus_matchedp at %d to true\n",merstart));
1208 this->plus_matchedp[merstart] = true;
1209 }
1210 }
1211
1212 debug3(printf(" Previous status of minus_matchedp at %d is %d\n",merstart,this->minus_matchedp[merstart]));
1213 if (pairedp == true || this->minus_matchedp[merstart] == false) {
1214 matches5 = identify_matches(&nnewminus,&overflowp,matches5,querystart,querylength,
1215 this->oligosize,matchinterval,
1216 plus_positions,plus_npositions,minus_positions,minus_npositions,
1217 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1218 /*forwardp*/false,/*fivep*/true,maxentries);
1219 if (overflowp == false) {
1220 debug3(printf(" No overflow so setting minus_matchedp at %d to true\n",merstart));
1221 this->minus_matchedp[merstart] = true;
1222 }
1223 }
1224
1225 *nnew = nnewplus + nnewminus;
1226 }
1227
1228 return matches5;
1229 }
1230
1231 static List_T
find_3prime_matches(int * nnew,List_T matches3,T this,Width_T matchsize,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int queryend,int querylength,int maxentries,bool pairedp)1232 find_3prime_matches (int *nnew, List_T matches3, T this, Width_T matchsize,
1233 Univcoord_T **plus_positions, int *plus_npositions,
1234 Univcoord_T **minus_positions, int *minus_npositions,
1235 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1236 Matchpool_T matchpool, int queryend, int querylength, int maxentries, bool pairedp) {
1237 Width_T merstart;
1238 int nnewplus = 0, nnewminus = 0;
1239 bool overflowp;
1240 Width_T matchinterval;
1241
1242 #ifdef PMAP
1243 matchinterval = matchsize - index1part_aa;
1244 /* matchinterval_nt = matchinterval*3; */
1245 #else
1246 matchinterval = matchsize - index1part;
1247 /* matchinterval_nt = matchinterval; */
1248 #endif
1249
1250 if (queryend + matchsize > querylength) {
1251 debug3(printf("Quitting because %d + %d > %d\n",queryend,matchsize,querylength));
1252 *nnew = 0;
1253 } else {
1254 merstart = queryend;
1255 debug3(printf("Identifying 3' matches for %d-mer at %d and %d. maxentries=%d\n",
1256 matchsize,queryend,queryend+matchinterval,maxentries));
1257
1258 /* If maxentries > 0, then we are working on pairs. Otherwise, check for a successful previous check */
1259 debug3(printf(" Previous status of plus_matchedp at %d is %d\n",merstart,this->plus_matchedp[merstart]));
1260 if (pairedp == true || this->plus_matchedp[merstart] == false) {
1261
1262 matches3 = identify_matches(&nnewplus,&overflowp,matches3,queryend,querylength,
1263 this->oligosize,matchinterval,
1264 plus_positions,plus_npositions,minus_positions,minus_npositions,
1265 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1266 /*forwardp*/true,/*fivep*/false,maxentries);
1267 if (overflowp == false) {
1268 debug3(printf(" No overflow so setting plus_matchedp at %d to true\n",merstart));
1269 this->plus_matchedp[merstart] = true;
1270 }
1271 }
1272
1273 debug3(printf(" Previous status of minus_matchedp at %d is %d\n",merstart,this->minus_matchedp[merstart]));
1274 if (pairedp == true || this->minus_matchedp[merstart] == false) {
1275 matches3 = identify_matches(&nnewminus,&overflowp,matches3,queryend,querylength,
1276 this->oligosize,matchinterval,
1277 plus_positions,plus_npositions,minus_positions,minus_npositions,
1278 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1279 /*forwardp*/false,/*fivep*/false,maxentries);
1280 if (overflowp == false) {
1281 debug3(printf(" No overflow so setting minus_matchedp at %d to true\n",merstart));
1282 this->minus_matchedp[merstart] = true;
1283 }
1284 }
1285
1286 *nnew = nnewplus + nnewminus;
1287 }
1288
1289 return matches3;
1290 }
1291
1292
1293 /*
1294 static bool
1295 check_fraction_paired (List_T matches5, List_T matches3) {
1296 List_T p;
1297 int npaired5 = 0, nsingle5 = 0, npaired3 = 0, nsingle3 = 0;
1298 double fracpaired5, fracpaired3;
1299
1300 for (p = matches5; p != NULL; p = p->rest) {
1301 if (Match_pairedp((Match_T) List_head(p)) == true) {
1302 npaired5++;
1303 } else {
1304 nsingle5++;
1305 }
1306 }
1307 for (p = matches3; p != NULL; p = p->rest) {
1308 if (Match_pairedp((Match_T) List_head(p)) == true) {
1309 npaired3++;
1310 } else {
1311 nsingle3++;
1312 }
1313 }
1314
1315 debug(printf("npaired5: %d, nsingle5: %d, npaired3: %d, nsingle3: %d\n",
1316 npaired5,nsingle5,npaired3,nsingle3);
1317 );
1318 if (npaired5+nsingle5 == 0) {
1319 fracpaired5 = (double) (npaired5+1)/(double) (npaired5+nsingle5+1);
1320 } else {
1321 fracpaired5 = (double) npaired5/(double) (npaired5+nsingle5);
1322 }
1323 if (npaired3+nsingle3 == 0) {
1324 fracpaired3 = (double) npaired3/(double) (npaired3+nsingle3+1);
1325 } else {
1326 fracpaired3 = (double) npaired3/(double) (npaired3+nsingle3);
1327 }
1328 if (fracpaired5 > 0.75 || fracpaired3 > 0.75) {
1329 return true;
1330 } else {
1331 return false;
1332 }
1333 }
1334 */
1335
1336 /* Previously had parameter stage1_stutter_sizelimit */
1337 static List_T
stutter(List_T gregionlist,T this,Width_T matchsize,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits)1338 stutter (List_T gregionlist, T this, Width_T matchsize,
1339 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1340 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1341 Matchpool_T matchpool, int stutterhits) {
1342 List_T newmatches5 = NULL, newmatches3 = NULL;
1343 int stutterdist5 = 0, stutterdist3 = 0, maxbases, start5, start3;
1344 bool foundpairp;
1345 double n5hits = 0.0, n3hits = 0.0;
1346 int nnew;
1347 #ifdef DEBUG1
1348 int i;
1349 #endif
1350
1351 start5 = Block_querypos(this->block5);
1352 start3 = Block_querypos(this->block3);
1353
1354 #if 0
1355 maxbases = stuttercycles*this->matchsize; /* (matchsize = matchinterval + INDEX1PART_AA/INDEX1PART); */
1356 if (maxbases > (start3 - start5)/2) {
1357 maxbases = (start3 - start5)/2;
1358 }
1359 #else
1360 maxbases = (start3 - start5)/2;
1361 #endif
1362
1363
1364 debug(printf("*** Beginning stutter. maxbases = %d, stutterhits = %d ***\n",maxbases,stutterhits));
1365
1366 while (Block_next_5(this->block5,genestrand) == true &&
1367 stutterdist5 < maxbases && n5hits < stutterhits) {
1368 this->querystart = Block_querypos(this->block5);
1369 if (this->processedp[this->querystart] == false) {
1370 Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1371 &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1372 this->block5,indexdb_fwd,indexdb_rev/*,stage1_stutter_sizelimit*/);
1373 debug(printf("stutter: Processing 5' position %d: %d plus, %d minus",
1374 this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1375 debug1(
1376 for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1377 printf(" +%u",this->plus_positions[this->querystart][i]);
1378 }
1379 for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1380 printf(" -%u",this->minus_positions[this->querystart][i]);
1381 }
1382 );
1383 debug(printf("\n"));
1384
1385 this->processedp[this->querystart] = true;
1386 }
1387 newmatches5 = find_5prime_matches(&nnew,newmatches5,this,matchsize,
1388 this->plus_positions,this->plus_npositions,
1389 this->minus_positions,this->minus_npositions,
1390 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1391 this->querystart,this->querylength,this->maxentries,true);
1392
1393 stutterdist5 = Block_querypos(this->block5) - start5;
1394 if (nnew > 0) {
1395 n5hits += 1.0/(double) (1 + nnew);
1396 debug(printf("n5hits = %.1f, stutterdist5 = %d\n",n5hits,stutterdist5));
1397 }
1398 }
1399
1400 while (Block_next_3(this->block3,genestrand) == true &&
1401 stutterdist3 < maxbases && n3hits < stutterhits) {
1402 this->queryend = Block_querypos(this->block3);
1403 if (this->processedp[this->queryend] == false) {
1404 Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1405 &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1406 this->block3,indexdb_fwd,indexdb_rev/*,stage1_stutter_sizelimit*/);
1407 debug(printf("stutter: Processing 3' position %d: %d plus, %d minus",
1408 this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1409 debug1(
1410 for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1411 printf(" +%u",this->plus_positions[this->queryend][i]);
1412 }
1413 for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1414 printf(" -%u",this->minus_positions[this->queryend][i]);
1415 }
1416 );
1417 debug(printf("\n"));
1418 this->processedp[this->queryend] = true;
1419 }
1420 newmatches3 = find_3prime_matches(&nnew,newmatches3,this,matchsize,
1421 this->plus_positions,this->plus_npositions,
1422 this->minus_positions,this->minus_npositions,
1423 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1424 this->queryend,this->querylength,this->maxentries,true);
1425
1426 stutterdist3 = start3 - Block_querypos(this->block3);
1427 if (nnew > 0) {
1428 n3hits += 1.0/(double) (1 + nnew);
1429 debug(printf("n3hits = %.1f, stutterdist3 = %d\n",n3hits,stutterdist3));
1430 }
1431 }
1432
1433 debug(printf("*** Ending stutter ***\n"));
1434
1435 gregionlist = pair_up(&foundpairp,gregionlist,matchsize,newmatches5,newmatches3,
1436 this->matches5,this->matches3,genestrand,chromosome_iit,
1437 this->querylength,this->maxtotallen,
1438 this->trimstart,this->trimend,this->trimlength);
1439
1440 this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1441 this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
1442
1443 return gregionlist;
1444 }
1445
1446
1447 /* Tries to find matches on the 5' end to unpaired hits from the 3' end */
1448 /* Previously had parameer stage1_fillin_sizelimit */
1449 static List_T
fill_in_5(List_T gregionlist,T this,Width_T matchsize,List_T dangling3,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1450 fill_in_5 (List_T gregionlist, T this, Width_T matchsize, List_T dangling3,
1451 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1452 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1453 Matchpool_T matchpool) {
1454 List_T newmatches5 = NULL;
1455 int fillindist5 = 0, maxbases, start5;
1456 bool foundpairp = false;
1457 int nnew;
1458 #ifdef DEBUG1
1459 int i;
1460 #endif
1461
1462 start5 = Block_querypos(this->block5);
1463 maxbases = MAX_FILL_IN;
1464 if (maxbases > this->querylength/2 - start5) {
1465 maxbases = this->querylength/2 - start5;
1466 }
1467 debug(printf("*** Beginning fill_in_5. maxbases = %d ***\n",maxbases));
1468
1469 while (Block_next_5(this->block5,genestrand) == true &&
1470 fillindist5 < maxbases && foundpairp == false) {
1471 this->querystart = Block_querypos(this->block5);
1472 if (this->processedp[this->querystart] == false) {
1473 Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1474 &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1475 this->block5,indexdb_fwd,indexdb_rev/*,stage1_fillin_sizelimit*/);
1476 debug(printf("fill_in_5: Processing 5' position %d: %d plus, %d minus",
1477 this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1478 debug1(
1479 for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1480 printf(" +%u",this->plus_positions[this->querystart][i]);
1481 }
1482 for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1483 printf(" -%u",this->minus_positions[this->querystart][i]);
1484 }
1485 );
1486 debug(printf("\n"));
1487 this->processedp[this->querystart] = true;
1488 }
1489 newmatches5 = find_5prime_matches(&nnew,newmatches5,this,matchsize,
1490 this->plus_positions,this->plus_npositions,
1491 this->minus_positions,this->minus_npositions,
1492 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1493 this->querystart,this->querylength,this->maxentries,true);
1494 fillindist5 = Block_querypos(this->block5) - start5;
1495
1496 if (nnew > 0) {
1497 gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1498 newmatches5,/*newmatches3*/(List_T) NULL,
1499 (List_T) NULL,dangling3,genestrand,chromosome_iit,
1500 this->querylength,this->maxtotallen,
1501 this->trimstart,this->trimend,this->trimlength);
1502 debug(printf(" Foundpairp = %d\n",foundpairp));
1503 }
1504 }
1505
1506 /* Mark newmatches5 as being pairedp if they match non-dangling matches3 */
1507 gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1508 newmatches5,/*newmatches3*/(List_T) NULL,
1509 (List_T) NULL,this->matches3,genestrand,chromosome_iit,
1510 this->querylength,this->maxtotallen,
1511 this->trimstart,this->trimend,this->trimlength);
1512
1513 this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1514
1515 debug(printf("*** Ending fill_in_5 ***\n"));
1516 return gregionlist;
1517 }
1518
1519
1520 /* Tries to find matches on the 5' end to unpaired hits from the 3' end */
1521 /* Previously had parameer stage1_fillin_sizelimit */
1522 static List_T
fill_in_3(List_T gregionlist,T this,Width_T matchsize,List_T dangling5,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1523 fill_in_3 (List_T gregionlist, T this, Width_T matchsize, List_T dangling5,
1524 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1525 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1526 Matchpool_T matchpool) {
1527 List_T newmatches3 = NULL;
1528 int fillindist3 = 0, maxbases, start3;
1529 bool foundpairp = false;
1530 int nnew;
1531 #ifdef DEBUG1
1532 int i;
1533 #endif
1534
1535 start3 = Block_querypos(this->block3);
1536 maxbases = MAX_FILL_IN;
1537 if (maxbases > start3 - this->querylength/2) {
1538 maxbases = start3 - this->querylength/2;
1539 }
1540 debug(printf("*** Beginning fill_in_3. maxbases = %d ***\n",maxbases));
1541
1542 while (Block_next_3(this->block3,genestrand) == true &&
1543 fillindist3 < maxbases && foundpairp == false) {
1544 this->queryend = Block_querypos(this->block3);
1545 if (this->processedp[this->queryend] == false) {
1546 Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1547 &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1548 this->block3,indexdb_fwd,indexdb_rev/*,stage1_fillin_sizelimit*/);
1549 debug(printf("fill_in_3: Processing 3' position %d: %d plus, %d minus",
1550 this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1551 debug1(
1552 for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1553 printf(" +%u",this->plus_positions[this->queryend][i]);
1554 }
1555 for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1556 printf(" -%u",this->minus_positions[this->queryend][i]);
1557 }
1558 );
1559 debug(printf("\n"));
1560 this->processedp[this->queryend] = true;
1561 }
1562 newmatches3 = find_3prime_matches(&nnew,newmatches3,this,matchsize,
1563 this->plus_positions,this->plus_npositions,
1564 this->minus_positions,this->minus_npositions,
1565 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1566 this->queryend,this->querylength,this->maxentries,true);
1567 fillindist3 = start3 - Block_querypos(this->block3);
1568
1569 if (nnew > 0) {
1570 gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1571 /*newmatches5*/(List_T) NULL,newmatches3,
1572 dangling5,(List_T) NULL,genestrand,chromosome_iit,
1573 this->querylength,this->maxtotallen,
1574 this->trimstart,this->trimend,this->trimlength);
1575 debug(printf(" Foundpairp = %d\n",foundpairp));
1576 }
1577 }
1578
1579 /* Mark newmatches3 as being pairedp if they match non-dangling matches5 */
1580 gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1581 /*newmatches5*/(List_T) NULL,newmatches3,
1582 this->matches5,(List_T) NULL,genestrand,chromosome_iit,
1583 this->querylength,this->maxtotallen,
1584 this->trimstart,this->trimend,this->trimlength);
1585
1586 this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
1587
1588 debug(printf("*** Ending fill_in_3 ***\n"));
1589 return gregionlist;
1590 }
1591
1592
1593 #if 0
1594 static void
1595 sample (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int nskip) {
1596 int mod5 = 0, mod3 = 0;
1597 #ifdef DEBUG1
1598 int i;
1599 #endif
1600
1601 Block_restore(this->block5);
1602 Block_restore(this->block3);
1603
1604 while (Block_next_5(this->block5,genestrand) == true) {
1605 this->querystart = Block_querypos(this->block5);
1606 if (this->processedp[this->querystart] == false) {
1607 Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1608 &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1609 this->block5,indexdb_fwd,indexdb_rev/*,stage1_sample_sizelimit*/);
1610 debug(printf("sample: Processing 5' position %d: %d plus, %d minus",
1611 this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1612 debug1(
1613 for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1614 printf(" +%u",this->plus_positions[this->querystart][i]);
1615 }
1616 for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1617 printf(" -%u",this->minus_positions[this->querystart][i]);
1618 }
1619 );
1620 debug(printf("\n"));
1621 this->processedp[this->querystart] = true;
1622
1623 if (++mod5 % SAMPLERUN == 0) {
1624 mod5 = 0;
1625 Block_skip(this->block5,nskip);
1626 }
1627 }
1628 }
1629
1630 while (Block_next_3(this->block3,genestrand) == true) {
1631 this->queryend = Block_querypos(this->block3);
1632 if (this->processedp[this->queryend] == false) {
1633 Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1634 &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1635 this->block3,indexdb_fwd,indexdb_rev/*,stage1_sample_sizelimit*/);
1636 debug(printf("sample: Processing 3' position %d: %d plus, %d minus",
1637 this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1638 debug1(
1639 for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1640 printf(" +%u",this->plus_positions[this->queryend][i]);
1641 }
1642 for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1643 printf(" -%u",this->minus_positions[this->queryend][i]);
1644 }
1645 );
1646 debug(printf("\n"));
1647 this->processedp[this->queryend] = true;
1648
1649 if (++mod3 % SAMPLERUN == 0) {
1650 mod3 = 0;
1651 Block_skip(this->block3,nskip);
1652 }
1653 }
1654 }
1655
1656 return;
1657 }
1658 #endif
1659
1660
1661 static Univcoord_T *
find_range(int ** querypositions,int * ninrange,int starti,int endi,Univcoord_T ** positions,int * npositions,Univcoord_T leftbound,Univcoord_T rightbound)1662 find_range (int **querypositions, int *ninrange, int starti, int endi,
1663 Univcoord_T **positions, int *npositions, Univcoord_T leftbound, Univcoord_T rightbound) {
1664 Univcoord_T *range;
1665 int i;
1666 int querypos;
1667
1668 if (rightbound < leftbound) {
1669 abort();
1670 }
1671
1672 *ninrange = 0;
1673 for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
1674 i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
1675 while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
1676 debug2(printf("At querypos %d, found position %u in (%u,%u)\n",querypos,positions[querypos][i],leftbound,rightbound));
1677 (*ninrange)++;
1678 i++;
1679 }
1680 }
1681
1682 if (*ninrange == 0) {
1683 *querypositions = (int *) NULL;
1684 return (Univcoord_T *) NULL;
1685 } else {
1686 *querypositions = (int *) CALLOC(*ninrange,sizeof(int));
1687 range = (Univcoord_T *) CALLOC(*ninrange,sizeof(Univcoord_T));
1688 }
1689
1690 *ninrange = 0;
1691 for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
1692 i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
1693 while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
1694 (*querypositions)[*ninrange] = querypos;
1695 range[*ninrange] = positions[querypos][i];
1696 (*ninrange)++;
1697 i++;
1698 }
1699 }
1700
1701 return range;
1702 }
1703
1704 static void
find_extensions(Univcoord_T * extension5,Univcoord_T * extension3,T this,Gregion_T gregion,Chrpos_T maxtotallen,bool continuousp)1705 find_extensions (Univcoord_T *extension5, Univcoord_T *extension3, T this,
1706 Gregion_T gregion, Chrpos_T maxtotallen, bool continuousp) {
1707 int *querypositions, querystart, queryend, ninrange, i, j, lastj;
1708 Univcoord_T *range, leftbound, rightbound, best_start, best_end, expectedi, expectedj;
1709 int best_concentration, concentration;
1710 Chrpos_T maxintronlen5, maxintronlen3;
1711 #ifdef DEBUG2
1712 int best_querystart, best_queryend;
1713 #endif
1714
1715 querystart = Gregion_querystart(gregion);
1716 queryend = Gregion_queryend(gregion);
1717 debug2(printf("Entering find_extensions with querystart = %d, queryend = %d, querylength %d\n",
1718 querystart,queryend,this->querylength));
1719
1720 debug2(printf("continuousp = %d, this->trimlength %d (vs %d), querystart %d (vs %d)\n",
1721 continuousp,this->trimlength,SINGLEEXONLENGTH,querystart,NOEXTENDLEN));
1722 if (continuousp == true || this->trimlength < SINGLEEXONLENGTH || querystart < NOEXTENDLEN) {
1723 maxintronlen5 = querystart + 20;
1724 } else {
1725 maxintronlen5 = maxextension;
1726 }
1727
1728 debug2(printf("continuousp = %d, this->trimlength %d (vs %d), this->trimlength - queryend %d (vs %d)\n",
1729 continuousp,this->trimlength,SINGLEEXONLENGTH,this->trimlength - queryend,NOEXTENDLEN));
1730 if (continuousp == true || this->trimlength < SINGLEEXONLENGTH || this->trimlength - queryend < NOEXTENDLEN) {
1731 maxintronlen3 = this->querylength - queryend + 20;
1732 } else {
1733 maxintronlen3 = maxextension;
1734 }
1735 debug2(printf("Set maxintronlen5 = %u, maxintronlen3 = %u\n",maxintronlen5,maxintronlen3));
1736
1737 if (Gregion_plusp(gregion) == true) {
1738 rightbound = Gregion_genomicstart(gregion); /* was Match_position(match5) */
1739 if (rightbound < maxintronlen5) {
1740 leftbound = 0U;
1741 } else {
1742 leftbound = rightbound - maxintronlen5;
1743 }
1744 range = find_range(&querypositions,&ninrange,0,querystart-1,
1745 this->plus_positions,this->plus_npositions,
1746 leftbound,rightbound);
1747
1748 best_concentration = 0;
1749 best_start = Gregion_genomicstart(gregion); /* was Match_position(match5) */
1750 debug2(best_querystart = querystart);
1751 for (i = 0; i < ninrange; i++) {
1752 debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1753 i,range[i],querypositions[i],Gregion_genomicstart(gregion)-range[i]));
1754 if (Gregion_genomicstart(gregion) > range[i] + maxtotallen) {
1755 debug2(printf(" => Too far away, so skipping\n"));
1756 } else {
1757 concentration = 1;
1758 lastj = i;
1759 for (j = i+1; j < ninrange; j++) {
1760 debug2(printf(" %u@%d",range[j],querypositions[j]));
1761 expectedj = range[i] + querypositions[j] - querypositions[i];
1762 if (range[j] + 20 > expectedj && range[j] < expectedj + 20) {
1763 concentration++;
1764 lastj = j;
1765 debug2(printf("*"));
1766 }
1767 }
1768 debug2(printf("\nConcentration is %d\n\n",concentration));
1769 if (concentration > best_concentration ||
1770 (concentration == best_concentration && range[i] > best_start)) {
1771 best_concentration = concentration;
1772 best_start = range[i];
1773 debug2(best_querystart = querypositions[i]);
1774 }
1775 }
1776 }
1777 FREE(querypositions);
1778 FREE(range);
1779
1780 *extension5 = Gregion_genomicstart(gregion) - best_start; /* was Match_position(match5) */
1781
1782 } else {
1783 leftbound = Gregion_genomicend(gregion); /* was Match_position(match5) */
1784 rightbound = leftbound + maxintronlen5;
1785 range = find_range(&querypositions,&ninrange,0,querystart-1,
1786 this->minus_positions,this->minus_npositions,
1787 leftbound,rightbound);
1788
1789 best_concentration = 0;
1790 best_start = Gregion_genomicend(gregion); /* was Match_position(match5) */
1791 debug2(best_querystart = querystart);
1792 for (i = 0; i < ninrange; i++) {
1793 debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1794 i,range[i],querypositions[i],range[i]-Gregion_genomicend(gregion)));
1795 if (range[i] > Gregion_genomicend(gregion) + maxtotallen) {
1796 debug2(printf(" => Too far away, so skipping\n"));
1797 } else {
1798 concentration = 1;
1799 lastj = i;
1800 for (j = i+1; j < ninrange; j++) {
1801 debug2(printf(" %u@%d",range[j],querypositions[j]));
1802 expectedi = range[j] + querypositions[j] - querypositions[i];
1803 if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
1804 concentration++;
1805 lastj = j;
1806 debug2(printf("*"));
1807 }
1808 }
1809 debug2(printf("\nConcentration is %d\n\n",concentration));
1810 if (concentration > best_concentration ||
1811 (concentration == best_concentration && range[i] < best_start)) {
1812 best_concentration = concentration;
1813 best_start = range[i];
1814 debug2(best_querystart = querypositions[i]);
1815 }
1816 }
1817 }
1818 FREE(querypositions);
1819 FREE(range);
1820
1821 *extension5 = best_start - Gregion_genomicend(gregion); /* was - Match_position(match5) */
1822 }
1823
1824 debug2(printf("5' extension, from positions, best_querystart = %d\n",best_querystart));
1825 debug2(printf("5' extension, From positions, extension5 = %d\n",*extension5));
1826
1827 if (Gregion_plusp(gregion) == true) {
1828 leftbound = Gregion_genomicend(gregion); /* was Match_position(match3) */
1829 rightbound = leftbound + maxintronlen3;
1830 range = find_range(&querypositions,&ninrange,queryend+this->oligosize+1,this->querylength-1,
1831 this->plus_positions,this->plus_npositions,
1832 leftbound,rightbound);
1833
1834 best_concentration = 0;
1835 best_end = Gregion_genomicend(gregion); /* was Match_position(match3); */
1836 debug2(best_queryend = queryend);
1837 for (i = 0; i < ninrange; i++) {
1838 debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1839 i,range[i],querypositions[i],range[i]-Gregion_genomicend(gregion)));
1840 if (range[i] > Gregion_genomicend(gregion) + maxtotallen) {
1841 debug2(printf(" => Too far away, so skipping\n"));
1842 } else {
1843 concentration = 1;
1844 lastj = i;
1845 for (j = i+1; j < ninrange; j++) {
1846 debug2(printf(" %u@%d",range[j],querypositions[j]));
1847 expectedj = range[i] + querypositions[j] - querypositions[i];
1848 if (range[j] + 20 > expectedj && range[j] < expectedj + 20) {
1849 concentration++;
1850 lastj = j;
1851 debug2(printf("*"));
1852 }
1853 }
1854 debug2(printf("\nConcentration is %d\n\n",concentration));
1855 if (concentration > best_concentration ||
1856 (concentration == best_concentration && range[lastj] < best_end)) {
1857 best_concentration = concentration;
1858 best_end = range[lastj];
1859 debug2(best_queryend = querypositions[lastj]);
1860 }
1861 }
1862 }
1863 FREE(querypositions);
1864 FREE(range);
1865
1866 *extension3 = best_end - Gregion_genomicend(gregion); /* was - Match_position(match3); */
1867
1868 } else {
1869 rightbound = Gregion_genomicstart(gregion); /* was Match_position(match3); */
1870 if (rightbound < maxintronlen3) {
1871 leftbound = 0U;
1872 } else {
1873 leftbound = rightbound - maxintronlen3;
1874 }
1875 range = find_range(&querypositions,&ninrange,queryend+this->oligosize+1,this->querylength-1,
1876 this->minus_positions,this->minus_npositions,
1877 leftbound,rightbound);
1878
1879 best_concentration = 0;
1880 best_end = Gregion_genomicstart(gregion); /* was Match_position(match3); */
1881 debug2(best_queryend = queryend);
1882 for (i = 0; i < ninrange; i++) {
1883 debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1884 i,range[i],querypositions[i],Gregion_genomicstart(gregion)-range[i]));
1885 if (Gregion_genomicstart(gregion) > range[i] + maxtotallen) {
1886 debug2(printf(" => Too far away, so skipping\n"));
1887 } else {
1888 concentration = 1;
1889 lastj = i;
1890 for (j = i+1; j < ninrange; j++) {
1891 debug2(printf(" %u@%d",range[j],querypositions[j]));
1892 expectedi = range[j] + querypositions[j] - querypositions[i];
1893 if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
1894 concentration++;
1895 lastj = j;
1896 debug2(printf("*"));
1897 }
1898 }
1899 debug2(printf("\nConcentration is %d\n\n",concentration));
1900 if (concentration > best_concentration ||
1901 (concentration == best_concentration && range[lastj] > best_end)) {
1902 best_concentration = concentration;
1903 best_end = range[lastj];
1904 debug2(best_queryend = querypositions[lastj]);
1905 }
1906 }
1907 }
1908 FREE(querypositions);
1909 FREE(range);
1910
1911 *extension3 = Gregion_genomicstart(gregion) - best_end; /* was Match_position(match3) */
1912 }
1913
1914 debug2(printf("3' extension, from positions, best_queryend = %d\n",best_queryend));
1915 debug2(printf("3' extension, from positions, extension3 = %d\n",*extension3));
1916
1917 return;
1918 }
1919
1920
1921 /* Previously had parameter stage1_firstpair_sizelimit */
1922 static List_T
find_first_pair(bool * foundpairp,List_T gregionlist,T this,Width_T matchsize,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1923 find_first_pair (bool *foundpairp, List_T gregionlist, T this, Width_T matchsize,
1924 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1925 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1926 Matchpool_T matchpool) {
1927 List_T newmatches5 = NULL, newmatches3 = NULL;
1928 bool donep = false;
1929 int nnew;
1930 double n5hits = 0.0, n3hits = 0.0;
1931 int nblocks5 = 0, nblocks3 = 0;
1932 #ifdef DEBUG1
1933 int i;
1934 #endif
1935
1936 Block_reset_ends(this->block5);
1937 Block_reset_ends(this->block3);
1938
1939 debug(printf("*** Starting stage1 with matchsize %d ***\n",matchsize));
1940 debug(printf(" querystart = %d, queryend = %d\n",Block_querypos(this->block5),Block_querypos(this->block3)));
1941
1942 *foundpairp = false;
1943 while (!donep && (*foundpairp) == false) {
1944 if (n5hits <= n3hits) {
1945 if (Block_next_5(this->block5,genestrand) == false) {
1946 donep = true;
1947 } else {
1948 this->querystart = Block_querypos(this->block5);
1949 if (this->processedp[this->querystart] == false) {
1950 Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1951 &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1952 this->block5,indexdb_fwd,indexdb_rev/*,stage1_firstpair_sizelimit*/);
1953 #ifdef PMAP
1954 debug(printf("find_first_pair: Processing 5' position %d (aaindex %u): %d plus, %d minus",
1955 this->querystart,Block_aaindex(this->block5),
1956 this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1957 #else
1958 debug(printf("find_first_pair: Processing 5' position %d (forward %06X, revcomp %06X): %d plus, %d minus",
1959 this->querystart,Block_forward(this->block5),Block_revcomp(this->block5),
1960 this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1961 #endif
1962 debug1(
1963 for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1964 printf(" +%u",this->plus_positions[this->querystart][i]);
1965 }
1966 for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1967 printf(" -%u",this->minus_positions[this->querystart][i]);
1968 }
1969 );
1970 debug(printf("\n"));
1971 this->processedp[this->querystart] = true;
1972 nblocks5++;
1973 }
1974 newmatches5 = find_5prime_matches(&nnew,NULL,this,matchsize,
1975 this->plus_positions,this->plus_npositions,
1976 this->minus_positions,this->minus_npositions,chromosome_iit,
1977 chrsubset_start,chrsubset_end,matchpool,this->querystart,
1978 this->querylength,this->maxentries,true);
1979 if (nnew > 0) {
1980 n5hits += 1.0/(double) (1 + nnew);
1981 debug(printf(" n5hits: %.1f, n3hits: %.1f\n",n5hits,n3hits));
1982 gregionlist = pair_up(&(*foundpairp),gregionlist,matchsize,
1983 newmatches5,/*newmatches3*/NULL,
1984 this->matches5,this->matches3,genestrand,chromosome_iit,
1985 this->querylength,this->maxtotallen,
1986 this->trimstart,this->trimend,this->trimlength);
1987 this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1988 newmatches5 = NULL;
1989 }
1990 }
1991
1992 } else {
1993 if (Block_next_3(this->block3,genestrand) == false) {
1994 donep = true;
1995 } else {
1996 this->queryend = Block_querypos(this->block3);
1997 if (this->processedp[this->queryend] == false) {
1998 Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1999 &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
2000 this->block3,indexdb_fwd,indexdb_rev/*,stage1_firstpair_sizelimit*/);
2001 #ifdef PMAP
2002 debug(printf("find_first_pair: Processing 3' position %d (aaindex %u): %d plus, %d minus",
2003 this->queryend,Block_aaindex(this->block3),
2004 this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
2005 #else
2006 debug(printf("find_first_pair: Processing 3' position %d (forward %06X, revcomp %06X): %d plus, %d minus",
2007 this->queryend,Block_forward(this->block3),Block_revcomp(this->block3),
2008 this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
2009 #endif
2010 debug1(
2011 for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
2012 printf(" +%u",this->plus_positions[this->queryend][i]);
2013 }
2014 for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
2015 printf(" -%u",this->minus_positions[this->queryend][i]);
2016 }
2017 );
2018 debug(printf("\n"));
2019 this->processedp[this->queryend] = true;
2020 nblocks3++;
2021 }
2022 newmatches3 = find_3prime_matches(&nnew,NULL,this,matchsize,
2023 this->plus_positions,this->plus_npositions,
2024 this->minus_positions,this->minus_npositions,
2025 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2026 this->queryend,this->querylength,this->maxentries,true);
2027 if (nnew > 0) {
2028 n3hits += 1.0/(double) (1 + nnew);
2029 debug(printf(" n5hits: %.1f, n3hits: %.1f\n",n5hits,n3hits));
2030 gregionlist = pair_up(&(*foundpairp),gregionlist,matchsize,
2031 /*newmatches5*/NULL,newmatches3,
2032 this->matches5,this->matches3,genestrand,chromosome_iit,
2033 this->querylength,this->maxtotallen,
2034 this->trimstart,this->trimend,this->trimlength);
2035 this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
2036 newmatches3 = NULL;
2037 }
2038 }
2039 }
2040 }
2041
2042 return gregionlist;
2043 }
2044
2045
2046 /************************************************************************/
2047
2048
2049 /* A dangling match is one that has not been paired up with a match
2050 from the other end of the cDNA sequence. A significant fraction of
2051 dangling matches may indicate the need to continue finding more
2052 matches from the other end. Conversely, the absence of dangling
2053 matches on both ends indicates that no other possibilities exist in
2054 the genome. */
2055
2056 static int
count_dangling(List_T matches)2057 count_dangling (List_T matches) {
2058 int ndangling = 0;
2059 Match_T match;
2060 List_T p;
2061
2062 for (p = matches; p != NULL; p = p->rest) {
2063 match = (Match_T) p->first;
2064 if (Match_npairings(match) == 0) {
2065 ndangling++;
2066 }
2067 }
2068 return ndangling;
2069 }
2070
2071 static double
dangling_pct(List_T matches)2072 dangling_pct (List_T matches) {
2073 double ndangling = 0.0, denominator = 0.0;
2074 Match_T match;
2075 List_T p;
2076 bool weightp = false;
2077
2078 for (p = matches; p != NULL; p = p->rest) {
2079 match = (Match_T) p->first;
2080 if (Match_npairings(match) == 0) {
2081 ndangling += Match_weight(match);
2082 }
2083 if (Match_has_weight_p(match) == true) {
2084 denominator += Match_weight(match);
2085 weightp = true;
2086 }
2087 }
2088 if (weightp == false) {
2089 return 0.0;
2090 } else {
2091 debug(printf("(%f/%f) ",ndangling,denominator));
2092 return ndangling/denominator;
2093 }
2094 }
2095
2096 static List_T
get_dangling(List_T matches,Matchpool_T matchpool)2097 get_dangling (List_T matches, Matchpool_T matchpool) {
2098 List_T dangling = NULL, p;
2099 Match_T match;
2100
2101 for (p = matches; p != NULL; p = p->rest) {
2102 match = (Match_T) p->first;
2103 if (Match_npairings(match) == 0) {
2104 dangling = Matchpool_push_existing(dangling,matchpool,match);
2105 }
2106 }
2107 return dangling;
2108 }
2109
2110
2111 /************************************************************************
2112 * Procedures for solving using a heap
2113 ************************************************************************/
2114
2115 static void
read_oligos(T this,Sequence_T queryuc,int genestrand)2116 read_oligos (T this, Sequence_T queryuc, int genestrand) {
2117 Reader_T reader;
2118 int querypos;
2119 Oligostate_T last_state = INIT;
2120 #ifdef PMAP
2121 Oligospace_T aaindex = 0U;
2122 #else
2123 Oligospace_T forward = 0U, revcomp = 0U;
2124 #endif
2125
2126 this->validp = (bool *) CALLOC(this->querylength,sizeof(bool));
2127 #ifdef PMAP
2128 this->oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2129 reader = Reader_new(Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/this->querylength);
2130 /* Prevents us from processing invalid query 12-mers */
2131 for (querypos = 0; querypos <= this->querylength - index1part_aa; querypos++) {
2132 this->validp[querypos] = false;
2133 }
2134 #else
2135 this->forward_oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2136 this->revcomp_oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2137 reader = Reader_new(Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/this->querylength);
2138 debug(printf("oligobase is %d, oligobase_mask is %08X\n",index1part,oligobase_mask));
2139 /* Prevents us from processing invalid query 12-mers */
2140 for (querypos = 0; querypos <= this->querylength - index1part; querypos++) {
2141 this->validp[querypos] = false;
2142 }
2143 #endif
2144
2145 /* Note: leftshifting is done here, rather than in Oligo_lookup */
2146 while ((last_state = Oligo_next_5(last_state,&querypos,
2147 #ifdef PMAP
2148 &aaindex,
2149 #else
2150 &forward,&revcomp,
2151 #endif
2152 reader,genestrand)) != DONE) {
2153
2154 this->validp[querypos] = true;
2155 #ifdef PMAP
2156 this->oligos[querypos] = aaindex;
2157 #else
2158 this->forward_oligos[querypos] = forward & oligobase_mask;
2159 this->revcomp_oligos[querypos] = (revcomp >> leftreadshift) & oligobase_mask;
2160 #endif
2161
2162 #if 0
2163 if (this->forward_oligos[querypos] == 0U || this->revcomp_oligos[querypos] == 0U) {
2164 debug(printf("Position %d is poly-A or poly-T\n",querypos));
2165 this->polyat[querypos] = true;
2166 }
2167 #endif
2168
2169 }
2170
2171 Reader_free(&reader);
2172 return;
2173 }
2174
2175
2176 struct Rep_T {
2177 Oligospace_T oligo;
2178 int querypos;
2179 };
2180
2181 static int
Rep_compare(const void * a,const void * b)2182 Rep_compare (const void *a, const void *b) {
2183 struct Rep_T x = * (struct Rep_T *) a;
2184 struct Rep_T y = * (struct Rep_T *) b;
2185
2186 if (x.oligo < y.oligo) {
2187 return -1;
2188 } else if (y.oligo < x.oligo) {
2189 return +1;
2190 } else if (x.querypos < y.querypos) {
2191 return -1;
2192 } else if (y.querypos < x.querypos) {
2193 return +1;
2194 } else {
2195 return 0;
2196 }
2197 }
2198
2199 static void
identify_repeated_oligos(T this,int oligobase,int querylength)2200 identify_repeated_oligos (T this, int oligobase, int querylength) {
2201 struct Rep_T *reps;
2202 int querypos, i, j, k, n = 0, min, max;
2203
2204 debug(printf("Starting identify_repeated_oligos\n"));
2205
2206 if (querylength < MAX_QUERYLENGTH_STACK) {
2207 reps = (struct Rep_T *) MALLOCA(this->querylength * sizeof(struct Rep_T));
2208 } else {
2209 reps = (struct Rep_T *) MALLOC(this->querylength * sizeof(struct Rep_T));
2210 }
2211
2212 for (querypos = 0; querypos <= this->querylength - oligobase; querypos++) {
2213 if (this->validp[querypos] == true) {
2214 #ifdef PMAP
2215 reps[n].oligo = this->oligos[querypos];
2216 #else
2217 reps[n].oligo = this->forward_oligos[querypos];
2218 #endif
2219 reps[n].querypos = querypos;
2220 n++;
2221 }
2222 }
2223
2224 qsort(reps,n,sizeof(struct Rep_T),Rep_compare);
2225 #if 0
2226 /* Test for thread safety */
2227 for (i = 0; i < n-1; i++) {
2228 if (Rep_compare(&(reps[i]),&(reps[i+1])) > 0) {
2229 abort();
2230 }
2231 }
2232 #endif
2233
2234 for (i = 0, j = 1; j < n; i++, j++) {
2235 if (reps[j].oligo == reps[i].oligo) {
2236 #ifdef PMAP
2237 debug(printf("Found repetition of oligo %u at %d and %d\n",
2238 reps[i].oligo,reps[i].querypos,reps[j].querypos));
2239 #else
2240 debug(printf("Found repetition of oligo %06X at %d and %d\n",
2241 reps[i].oligo,reps[i].querypos,reps[j].querypos));
2242 #endif
2243
2244 if (reps[j].querypos - reps[i].querypos <= MIN_REPEAT) {
2245 #if 0
2246 this->validp[reps[j].querypos] = false;
2247 this->validp[reps[i].querypos] = false;
2248 this->processedp[reps[j].querypos] = true;
2249 this->processedp[reps[i].querypos] = true;
2250 #else
2251 /* Wiping out everything around the repeated oligos */
2252 if ((min = reps[i].querypos - oligobase) < 0) {
2253 min = 0;
2254 }
2255 if ((max = reps[i].querypos + oligobase) > querylength) {
2256 max = querylength;
2257 }
2258 for (k = min; k < max; k++) {
2259 this->validp[k] = false;
2260 this->processedp[k] = true;
2261 }
2262
2263 if ((min = reps[j].querypos - oligobase) < 0) {
2264 min = 0;
2265 }
2266 if ((max = reps[j].querypos + oligobase) > querylength) {
2267 max = querylength;
2268 }
2269 for (k = min; k < max; k++) {
2270 this->validp[k] = false;
2271 this->processedp[k] = true;
2272 }
2273 #endif
2274
2275 }
2276 }
2277 }
2278
2279 if (querylength < MAX_QUERYLENGTH_STACK) {
2280 FREEA(reps);
2281 } else {
2282 FREE(reps);
2283 }
2284
2285 return;
2286 }
2287
2288
2289 #if 0
2290 static void
2291 check_old_new (Univcoord_T *old_positions, int old_npositions, Univcoord_T *new_positions, int new_npositions) {
2292 int i;
2293
2294 if (new_npositions != old_npositions) {
2295 abort();
2296 } else {
2297 for (i = 0; i < new_npositions; i++) {
2298 if (new_positions[i] != old_positions[i]) {
2299 abort();
2300 }
2301 }
2302 }
2303 return;
2304 }
2305 #endif
2306
2307
2308 /* Whether to use indexdb_size_threshold */
2309 /* #define SIZELIMIT 1 */
2310
2311 #ifdef PMAP
2312 static void
sample_oligos(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2313 sample_oligos (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2314 int querypos;
2315
2316 for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2317 #if 0
2318 if (this->validp[querypos] == true && this->processedp[querypos] == true) {
2319 printf("sample_oligos: Querypos %d, oligo is %06X\n",querypos,this->oligos[querypos]);
2320 plus_positions = Indexdb_read_with_diagterm(&plus_npositions,indexdb_fwd,this->oligos[querypos],
2321 /*diagterm*/querylength-querypos);
2322 check_old_new(this->plus_positions[querypos],this->plus_npositions[querypos],plus_positions,plus_npositions);
2323
2324 minus_positions = Indexdb_read_with_diagterm(&minus_npositions,indexdb_rev,this->oligos[querypos],
2325 /*diagterm*/querypos + index1part_aa*3);
2326 check_old_new(this->minus_positions[querypos],this->minus_npositions[querypos],minus_positions,minus_npositions);
2327
2328 }
2329 #endif
2330
2331 /* FORMULA */
2332 if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2333 this->plus_positions[querypos] =
2334 Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->oligos[querypos],
2335 /*diagterm*/querylength-querypos);
2336 this->minus_positions[querypos] =
2337 Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->oligos[querypos],
2338 /*diagterm*/querypos + index1part_aa*3);
2339 debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2340 querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2341 this->processedp[querypos] = true;
2342 }
2343 }
2344
2345 return;
2346 }
2347
2348 /* This procedure is equivalent to sample_oligos */
2349 static void
sample_oligos_nolimit(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2350 sample_oligos_nolimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2351 int querypos;
2352
2353 for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2354 /* FORMULA */
2355 if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2356 this->plus_positions[querypos] =
2357 Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->oligos[querypos],
2358 /*diagterm*/querylength-querypos);
2359
2360 this->minus_positions[querypos] =
2361 Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->oligos[querypos],
2362 /*diagterm*/querypos + index1part_aa*3);
2363
2364 debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2365 querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2366 this->processedp[querypos] = true;
2367 }
2368 }
2369
2370 return;
2371 }
2372
2373 #else
2374
2375 #if 0
2376 /* Needed to limit number of nregions */
2377 static void
2378 sample_oligos_sizelimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase,
2379 int stage1_sample_sizelimit) {
2380 int querypos;
2381
2382 for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2383 /* FORMULA */
2384 if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2385 this->plus_positions[querypos] =
2386 Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2387 /*diagterm*/querylength-querypos);
2388 this->minus_positions[querypos] =
2389 Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2390 /*diagterm*/querypos + index1part);
2391 debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2392 querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2393 this->processedp[querypos] = true;
2394 }
2395 }
2396
2397 return;
2398 }
2399 #endif
2400
2401
2402 static void
sample_oligos_nolimit(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2403 sample_oligos_nolimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2404 int querypos;
2405
2406 for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2407 /* FORMULA */
2408 if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2409 this->plus_positions[querypos] =
2410 Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2411 /*diagterm*/querylength-querypos);
2412
2413 this->minus_positions[querypos] =
2414 Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2415 /*diagterm*/querypos + index1part);
2416
2417 debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2418 querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2419 this->processedp[querypos] = true;
2420 }
2421 }
2422
2423 return;
2424 }
2425 #endif
2426
2427
2428 #if 0
2429 static void
2430 sample_oligos_findlimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2431 int querypos;
2432
2433 int *counts_plus, *counts_minus, threshold;
2434 int n;
2435
2436 n = querylength-oligobase;
2437 counts_plus = (int *) CALLOC(n,sizeof(int));
2438 counts_minus = (int *) CALLOC(n,sizeof(int));
2439
2440 for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2441 /* FORMULA */
2442 if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2443 this->plus_positions[querypos] =
2444 Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2445 /*diagterm*/querylength-querypos);
2446 counts_plus[querypos] = this->plus_npositions[querypos];
2447
2448 this->minus_positions[querypos] =
2449 Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2450 /*diagterm*/querypos + index1part);
2451 counts_minus[querypos] = this->minus_npositions[querypos];
2452
2453 debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2454 querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2455 this->processedp[querypos] = true;
2456 }
2457 }
2458
2459 /* Compute own size thresholds here */
2460 threshold = Orderstat_int_pct_inplace(counts_plus,n,/*percentile*/0.90);
2461 for (querypos = 0; querypos < n; querypos++) {
2462 if (counts_plus[querypos] > threshold) {
2463 this->plus_npositions[querypos] = 0;
2464 }
2465 }
2466
2467 threshold = Orderstat_int_pct_inplace(counts_minus,n,/*percentile*/0.90);
2468 for (querypos = 0; querypos < n; querypos++) {
2469 if (counts_minus[querypos] > threshold) {
2470 this->minus_npositions[querypos] = 0;
2471 }
2472 }
2473
2474 return;
2475 }
2476 #endif
2477
2478
2479
2480
2481 #define PARENT(i) (i >> 1)
2482 #define LEFT(i) (i << 1)
2483 #define RIGHT(i) ((i << 1) | 1)
2484
2485
2486 typedef struct Batch_T *Batch_T;
2487 struct Batch_T {
2488 int querypos;
2489 int ndiagonals;
2490 Univcoord_T diagonal;
2491 Univcoord_T *diagonals;
2492 };
2493
2494 static void
Batch_init(Batch_T batch,int querypos,Univcoord_T * diagonals,int ndiagonals)2495 Batch_init (Batch_T batch, int querypos, Univcoord_T *diagonals, int ndiagonals) {
2496
2497 batch->querypos = querypos;
2498 batch->diagonals = diagonals;
2499 batch->diagonal = *diagonals;
2500 batch->ndiagonals = ndiagonals;
2501
2502 return;
2503 }
2504
2505
2506 static void
min_heap_insert(Batch_T * heap,int * heapsize,Batch_T batch)2507 min_heap_insert (Batch_T *heap, int *heapsize, Batch_T batch) {
2508 int querypos, i;
2509 Univcoord_T diagonal;
2510
2511 /* debug0(printf("Inserting into heap: diagonal %u at querypos %d\n",diagonal,querypos)); */
2512
2513 querypos = batch->querypos;
2514 diagonal = batch->diagonal;
2515
2516 i = ++(*heapsize);
2517 /* sort primarily by diagonal, then by querypos */
2518 while (i > 1 && (heap[PARENT(i)]->diagonal > diagonal ||
2519 (heap[PARENT(i)]->diagonal == diagonal && heap[PARENT(i)]->querypos > querypos))) {
2520 heap[i] = heap[PARENT(i)];
2521 i = PARENT(i);
2522 }
2523 heap[i] = batch;
2524
2525 return;
2526 }
2527
2528 /* A diagonal segment */
2529 typedef struct Segment_T *Segment_T;
2530 struct Segment_T {
2531 Univcoord_T diagonal;
2532 int chrnum;
2533 int querypos5;
2534 int querypos3;
2535 int median;
2536 int score;
2537 int noligomers; /* Needed to speed up single_mismatches */
2538 };
2539
2540
2541
2542 /* Reduces diagonals to handle small indels */
2543 static void
collapse_diagonals(Univcoord_T ** diagonals,int * ndiagonals,int oligobase,int querylength)2544 collapse_diagonals (Univcoord_T **diagonals, int *ndiagonals, int oligobase, int querylength) {
2545 Batch_T batch, *heap, sentinel;
2546 struct Batch_T sentinel_struct, *batchpool;
2547 int maxsegments, heapsize = 0, i;
2548 int parenti, smallesti, righti;
2549 int querypos;
2550 Univcoord_T diagonal, base_diagonal, last_diagonal;
2551 #ifdef DEBUG6
2552 int j;
2553 #endif
2554
2555 debug6(printf("*** Starting collapse_diagonals ***\n"));
2556
2557 /* Set up batches */
2558 maxsegments = 0;
2559
2560 /* Batch_T can be large, so don't use alloca */
2561 batchpool = (struct Batch_T *) MALLOC((querylength-oligobase+1) * sizeof(struct Batch_T));
2562 heap = (Batch_T *) MALLOC((2*querylength+1+1) * sizeof(Batch_T));
2563
2564 for (querypos = 0, i = 0; querypos <= querylength - oligobase; querypos++) {
2565 if (ndiagonals[querypos] > 0) {
2566 maxsegments += ndiagonals[querypos];
2567 #ifdef PMAP
2568 debug6(printf("Adding batch for querypos %d with %d diagonals and aaindex %u\n",
2569 querypos,ndiagonals[querypos],oligos[querypos]));
2570 #else
2571 debug6(printf("Adding batch for querypos %d with %d diagonals and oligo %06X\n",
2572 querypos,ndiagonals[querypos],oligos[querypos]));
2573 #endif
2574 batch = &(batchpool[i++]);
2575 Batch_init(batch,querypos,diagonals[querypos],ndiagonals[querypos]);
2576 min_heap_insert(heap,&heapsize,batch);
2577 }
2578 }
2579
2580 if (maxsegments == 0) {
2581 FREE(heap);
2582 FREE(batchpool);
2583 return;
2584 }
2585
2586 /* Set up rest of heap */
2587 sentinel_struct.querypos = querylength; /* infinity */
2588 sentinel_struct.diagonal = (Univcoord_T) -1; /* infinity */
2589 sentinel = &sentinel_struct;
2590 for (i = heapsize+1; i <= 2*heapsize+1; i++) {
2591 heap[i] = sentinel;
2592 }
2593
2594
2595 /* Initialize loop */
2596 batch = heap[1];
2597 base_diagonal = last_diagonal = diagonal = batch->diagonal;
2598 debug6(printf("diagonal %u\n",diagonal));
2599
2600 if (--batch->ndiagonals <= 0) {
2601 /* Use last entry in heap for insertion */
2602 batch = heap[heapsize];
2603 heap[heapsize--] = sentinel;
2604
2605 } else {
2606 /* Use this batch for insertion (same querypos) */
2607 batch->diagonal = *(++batch->diagonals);
2608 }
2609
2610 /* heapify */
2611 diagonal = batch->diagonal;
2612 parenti = 1;
2613 smallesti = 2 + (heap[3]->diagonal < heap[2]->diagonal);
2614 /* Note that diagonal will never exceed a sentinel diagonal */
2615 while (diagonal > heap[smallesti]->diagonal) {
2616 heap[parenti] = heap[smallesti];
2617 parenti = smallesti;
2618 smallesti = LEFT(parenti);
2619 righti = smallesti+1;
2620 smallesti += (heap[righti]->diagonal < heap[smallesti]->diagonal);
2621 }
2622 heap[parenti] = batch;
2623
2624
2625 /* Continue after initialization */
2626 while (heapsize > 0) {
2627 batch = heap[1];
2628 diagonal = batch->diagonal;
2629 debug6(printf("diagonal %u\n",diagonal));
2630
2631 #if 0
2632 /* Compare against last_diagonal, not base_diagonal to allow a chain */
2633 if (diagonal <= last_diagonal + COLLAPSE_DISTANCE) {
2634 /* Compress. Modifies diagonals. */
2635 debug6(printf(" collapsing to %u\n",base_diagonal));
2636 *batch->diagonals = base_diagonal;
2637 } else {
2638 /* New base diagonal */
2639 base_diagonal = diagonal;
2640 }
2641 last_diagonal = diagonal;
2642 #else
2643 /* Compare against base_diagonal to ignore chains */
2644 if (diagonal == base_diagonal) {
2645 /* Do nothing */
2646 } else if (diagonal <= base_diagonal + COLLAPSE_DISTANCE) {
2647 /* Compress. Modifies diagonals. */
2648 debug6(printf(" collapsing to %u\n",base_diagonal));
2649 *batch->diagonals = base_diagonal;
2650 } else {
2651 /* New base diagonal */
2652 base_diagonal = diagonal;
2653 }
2654 #endif
2655
2656 if (--batch->ndiagonals <= 0) {
2657 /* Use last entry in heap for insertion */
2658 batch = heap[heapsize];
2659 heap[heapsize--] = sentinel;
2660
2661 } else {
2662 /* Use this batch for insertion (same querypos) */
2663 batch->diagonal = *(++batch->diagonals);
2664 }
2665
2666 /* heapify */
2667 diagonal = batch->diagonal;
2668 parenti = 1;
2669 smallesti = 2 + (heap[3]->diagonal < heap[2]->diagonal);
2670 /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2671 while (diagonal > heap[smallesti]->diagonal) {
2672 heap[parenti] = heap[smallesti];
2673 parenti = smallesti;
2674 smallesti = LEFT(parenti);
2675 righti = smallesti+1;
2676 smallesti += (heap[righti]->diagonal < heap[smallesti]->diagonal);
2677 }
2678 heap[parenti] = batch;
2679
2680 }
2681
2682 /* Terminate loop. */
2683 FREE(heap);
2684 FREE(batchpool);
2685
2686 return;
2687 }
2688
2689
2690 static struct Segment_T *
find_segments(int * nsegments,Univcoord_T ** diagonals,int * ndiagonals,int oligobase,int querylength,int threshold_score)2691 find_segments (int *nsegments, Univcoord_T **diagonals, int *ndiagonals,
2692 int oligobase, int querylength, int threshold_score) {
2693 struct Segment_T *segments, *ptr;
2694 Batch_T batch, *heap, sentinel;
2695 struct Batch_T sentinel_struct, *batchpool;
2696 int maxsegments, heapsize = 0, i;
2697 int parenti, smallesti, righti;
2698 int querypos, last_querypos;
2699 Univcoord_T diagonal, last_diagonal;
2700 int *ordered;
2701 #ifdef DEBUG6
2702 int j;
2703 #endif
2704
2705 debug6(printf("*** Starting find_segments ***\n"));
2706
2707 /* Set up batches */
2708 maxsegments = 0;
2709
2710 /* Batch_T can be large, so don't use alloca */
2711 batchpool = (struct Batch_T *) MALLOC((querylength-oligobase+1) * sizeof(struct Batch_T));
2712 heap = (Batch_T *) MALLOC((2*querylength+1+1) * sizeof(Batch_T));
2713
2714 for (querypos = 0, i = 0; querypos <= querylength - oligobase; querypos++) {
2715 if (ndiagonals[querypos] > 0) {
2716 maxsegments += ndiagonals[querypos];
2717 #ifdef PMAP
2718 debug6(printf("Adding batch for querypos %d with %d diagonals and aaindex %u\n",
2719 querypos,ndiagonals[querypos],oligos[querypos]));
2720 #else
2721 debug6(printf("Adding batch for querypos %d with %d diagonals and oligo %06X\n",
2722 querypos,ndiagonals[querypos],oligos[querypos]));
2723 #endif
2724 batch = &(batchpool[i++]);
2725 Batch_init(batch,querypos,diagonals[querypos],ndiagonals[querypos]);
2726 min_heap_insert(heap,&heapsize,batch);
2727 }
2728 }
2729
2730 if (maxsegments == 0) {
2731 FREE(heap);
2732 FREE(batchpool);
2733 *nsegments = 0;
2734 return (struct Segment_T *) NULL;
2735 } else {
2736 ptr = segments = (struct Segment_T *) CALLOC(maxsegments,sizeof(struct Segment_T));
2737 ordered = (int *) CALLOC(i,sizeof(int));
2738 }
2739
2740 /* Set up rest of heap */
2741 sentinel_struct.querypos = querylength; /* infinity */
2742 sentinel_struct.diagonal = (Univcoord_T) -1; /* infinity */
2743 sentinel = &sentinel_struct;
2744 for (i = heapsize+1; i <= 2*heapsize+1; i++) {
2745 heap[i] = sentinel;
2746 }
2747
2748
2749 /* Initialize loop */
2750 batch = heap[1];
2751 last_querypos = querypos = batch->querypos;
2752 last_diagonal = diagonal = batch->diagonal;
2753 debug6(printf("diagonal %u, querypos %d\n",diagonal,querypos));
2754 #ifdef PMAP
2755 ptr->score = 3*oligobase;
2756 #else
2757 ptr->score = oligobase;
2758 #endif
2759 ptr->querypos5 = ptr->querypos3 = querypos;
2760 ordered[0] = querypos;
2761 ptr->noligomers = 1;
2762
2763 if (--batch->ndiagonals <= 0) {
2764 /* Use last entry in heap for insertion */
2765 batch = heap[heapsize];
2766 querypos = batch->querypos;
2767 heap[heapsize--] = sentinel;
2768
2769 } else {
2770 /* Use this batch for insertion (same querypos) */
2771 batch->diagonal = *(++batch->diagonals);
2772 }
2773
2774 /* heapify */
2775 diagonal = batch->diagonal;
2776 parenti = 1;
2777 smallesti = 2 + ((heap[3]->diagonal < heap[2]->diagonal) |
2778 ((heap[3]->diagonal == heap[2]->diagonal) &
2779 (heap[3]->querypos < heap[2]->querypos)));
2780 /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2781 while (diagonal > heap[smallesti]->diagonal ||
2782 (diagonal == heap[smallesti]->diagonal &&
2783 querypos > heap[smallesti]->querypos)) {
2784 heap[parenti] = heap[smallesti];
2785 parenti = smallesti;
2786 smallesti = LEFT(parenti);
2787 righti = smallesti+1;
2788 smallesti += ((heap[righti]->diagonal < heap[smallesti]->diagonal) |
2789 ((heap[righti]->diagonal == heap[smallesti]->diagonal) &
2790 (heap[righti]->querypos < heap[smallesti]->querypos)));
2791 }
2792 heap[parenti] = batch;
2793
2794
2795 /* Continue after initializaation */
2796 while (heapsize > 0) {
2797 batch = heap[1];
2798 querypos = batch->querypos;
2799 diagonal = batch->diagonal;
2800 debug6(printf("diagonal %u, querypos %d\n",diagonal,querypos));
2801
2802 if (diagonal == last_diagonal) {
2803 /* Continuation of exact match or substitution */
2804 #ifdef PMAP
2805 if (querypos <= last_querypos + oligobase) {
2806 ptr->score += 3*(querypos - last_querypos);
2807 } else {
2808 ptr->score += 3*oligobase;
2809 }
2810 #else
2811 if (querypos <= last_querypos + oligobase) {
2812 ptr->score += querypos - last_querypos;
2813 } else {
2814 ptr->score += oligobase;
2815 }
2816 #endif
2817
2818 if (querypos > last_querypos) {
2819 ordered[ptr->noligomers++] = querypos;
2820 }
2821 ptr->querypos3 = querypos;
2822
2823 } else {
2824 /* End of diagonal */
2825 ptr->diagonal = last_diagonal;
2826
2827 if (ptr->score >= threshold_score) {
2828 /* The above test allows shorter matches on either end to find splicing, and requires longer matches internally */
2829 debug6(printf(" => %c, score %d, noligomers %d, querypos %d..%d, median %d, position %u..%u\n",
2830 plusp ? '+' : '-',ptr->score,ptr->noligomers,ptr->querypos5,ptr->querypos3,
2831 (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2,
2832 plusp ? ptr->diagonal + ptr->querypos5 - querylength : ptr->diagonal - ptr->querypos5,
2833 plusp ? ptr->diagonal + ptr->querypos3 - querylength : ptr->diagonal - ptr->querypos3));
2834 ptr->median = (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2;
2835 ptr->querypos3 += oligobase;
2836 ptr++;
2837 }
2838
2839 ptr->querypos5 = ptr->querypos3 = querypos;
2840 ordered[0] = querypos;
2841 last_diagonal = diagonal;
2842
2843 #ifdef PMAP
2844 ptr->score = 3*oligobase;
2845 #else
2846 ptr->score = oligobase;
2847 #endif
2848 ptr->noligomers = 1;
2849 }
2850 last_querypos = querypos;
2851
2852
2853 if (--batch->ndiagonals <= 0) {
2854 /* Use last entry in heap for insertion */
2855 batch = heap[heapsize];
2856 querypos = batch->querypos;
2857 heap[heapsize--] = sentinel;
2858
2859 } else {
2860 /* Use this batch for insertion (same querypos) */
2861 batch->diagonal = *(++batch->diagonals);
2862 }
2863
2864 /* heapify */
2865 diagonal = batch->diagonal;
2866 parenti = 1;
2867 debug7(printf("Comparing left %d/right %d: %u @ %d and %u @ %d\n",
2868 2,3,heap[3]->diagonal,heap[3]->querypos,heap[4]->diagonal,heap[4]->querypos));
2869 smallesti = 2 + ((heap[3]->diagonal < heap[2]->diagonal) |
2870 ((heap[3]->diagonal == heap[2]->diagonal) &
2871 (heap[3]->querypos < heap[2]->querypos)));
2872 /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2873 while (diagonal > heap[smallesti]->diagonal ||
2874 (diagonal == heap[smallesti]->diagonal &&
2875 querypos > heap[smallesti]->querypos)) {
2876 heap[parenti] = heap[smallesti];
2877 parenti = smallesti;
2878 smallesti = LEFT(parenti);
2879 righti = smallesti+1;
2880 debug7(printf("Comparing left %d/right %d (heapsize %d): %u @ %d and %u @ %d\n",
2881 righti,smallesti,heapsize,heap[righti]->diagonal,
2882 heap[righti]->querypos,heap[smallesti]->diagonal,
2883 heap[smallesti]->querypos));
2884 smallesti += ((heap[righti]->diagonal < heap[smallesti]->diagonal) |
2885 ((heap[righti]->diagonal == heap[smallesti]->diagonal) &
2886 (heap[righti]->querypos < heap[smallesti]->querypos)));
2887 }
2888 heap[parenti] = batch;
2889
2890 }
2891
2892 /* Terminate loop. */
2893 ptr->diagonal = last_diagonal;
2894 if (ptr->score >= threshold_score) {
2895 /* The above test allows shorter matches on either end to find splicing, and requires longer matches internally */
2896 debug6(printf(" => %c, score %d, noligomers %d, querypos %d..%d, median %d, position %u..%u\n",
2897 plusp ? '+' : '-',ptr->score,ptr->noligomers,ptr->querypos5,ptr->querypos3,
2898 (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2,
2899 plusp ? ptr->diagonal + ptr->querypos5 - querylength : ptr->diagonal - ptr->querypos5,
2900 plusp ? ptr->diagonal + ptr->querypos3 - querylength : ptr->diagonal - ptr->querypos3));
2901 ptr->median = (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2;
2902 ptr->querypos3 += oligobase;
2903 ptr++; /* Needed to get correct value for nsegments below */
2904 }
2905
2906 FREE(heap);
2907 FREE(batchpool);
2908
2909 *nsegments = ptr - segments;
2910 debug6(for (j = 0; j < *nsegments; j++) {
2911 printf("diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
2912 segments[j].diagonal,segments[j].querypos5,segments[j].querypos3,
2913 segments[j].score,segments[j].noligomers);
2914 if (segments[j].querypos3 < segments[j].querypos5) {
2915 abort();
2916 }
2917 });
2918
2919 FREE(ordered);
2920 return segments;
2921 }
2922
2923
2924 static void
compute_paths(int ** prev,int ** scores,struct Segment_T * segments,int nsegments,int maxexons,bool plusp)2925 compute_paths (int **prev, int **scores, struct Segment_T *segments, int nsegments, int maxexons, bool plusp) {
2926 int *bestscore, score, intronadj, overlapadj;
2927 struct Segment_T *segmentj;
2928 int k, j, i, *besti;
2929 int medianj;
2930 Univcoord_T diagonalj, intronlength;
2931
2932 if (nsegments == 0) {
2933 return;
2934 } else {
2935 for (i = 1; i <= maxexons; i++) {
2936 prev[i] = (int *) CALLOC(nsegments,sizeof(int)); /* Return value */
2937 scores[i] = (int *) CALLOC(nsegments,sizeof(int)); /* Return value */
2938 }
2939 bestscore = (int *) MALLOCA((maxexons+1) * sizeof(int));
2940 besti = (int *) MALLOCA((maxexons+1) * sizeof(int));
2941 }
2942
2943 if (plusp == true) {
2944 for (j = 0; j < nsegments; j++) {
2945 segmentj = &(segments[j]);
2946 diagonalj = segmentj->diagonal;
2947 medianj = segmentj->median;
2948
2949 /* k = 1 */
2950 scores[1][j] = segmentj->score;
2951 prev[1][j] = -1;
2952
2953 for (k = 2; k <= maxexons; k++) {
2954 bestscore[k] = 0;
2955 besti[k] = -1;
2956 }
2957
2958 i = j-1;
2959 while (i >= 0 && segments[i].diagonal + HIGHINTRONLENGTH > diagonalj) {
2960 /* for plus, median should be ascending */
2961 if (segments[i].median < medianj) {
2962 intronlength = diagonalj - segments[i].diagonal;
2963 intronadj = intronlength/INTRONLEN_PENALTY;
2964 overlapadj = 0;
2965 if (segmentj->querypos5 < segments[i].querypos3) {
2966 #ifdef PMAP
2967 overlapadj = 3*(segments[i].querypos3 - segmentj->querypos5);
2968 #else
2969 overlapadj = (segments[i].querypos3 - segmentj->querypos5);
2970 #endif
2971 }
2972
2973 if (segmentj->score > intronadj + overlapadj) {
2974 /* Require that score be monotonically increasing */
2975 for (k = 2; k <= maxexons; k++) {
2976 if ((score = scores[k-1][i] - intronadj - overlapadj) > bestscore[k]) {
2977 bestscore[k] = score;
2978 besti[k] = i;
2979 }
2980 }
2981
2982 /* For k = maxexons, also check against the same level */
2983 if ((score = scores[maxexons][i] - intronadj - overlapadj) > bestscore[maxexons]) {
2984 bestscore[maxexons] = score;
2985 besti[maxexons] = i;
2986 }
2987 }
2988
2989 }
2990 i--;
2991 }
2992
2993 for (k = 2; k <= maxexons; k++) {
2994 scores[k][j] = bestscore[k] + segmentj->score;
2995 prev[k][j] = besti[k];
2996 debug8(printf("+#%d, nexons %d: diagonal %u, querypos %d..%d, median %d, score %u from previous #%d\n",
2997 j,k,diagonalj,segmentj->querypos5,segmentj->querypos3,medianj,scores[k][j],besti[k]));
2998 }
2999 }
3000
3001 } else {
3002 for (j = 0; j < nsegments; j++) {
3003 segmentj = &(segments[j]);
3004 diagonalj = segmentj->diagonal;
3005 medianj = segmentj->median;
3006
3007 /* k = 1 */
3008 scores[1][j] = segmentj->score;
3009 prev[1][j] = -1;
3010
3011 for (k = 2; k <= maxexons; k++) {
3012 bestscore[k] = 0;
3013 besti[k] = -1;
3014 }
3015
3016 i = j-1;
3017 while (i >= 0 && segments[i].diagonal + HIGHINTRONLENGTH > diagonalj) {
3018 /* for minus, median should be descending */
3019 if (segments[i].median > medianj) {
3020 intronlength = diagonalj - segments[i].diagonal;
3021 intronadj = intronlength/INTRONLEN_PENALTY;
3022 overlapadj = 0;
3023 if (segments[i].querypos5 < segmentj->querypos3) {
3024 #ifdef PMAP
3025 overlapadj = 3*(segmentj->querypos3 - segments[i].querypos5);
3026 #else
3027 overlapadj = (segmentj->querypos3 - segments[i].querypos5);
3028 #endif
3029 }
3030
3031 if (segmentj->score > intronadj + overlapadj) {
3032 /* Require that score be monotonically increasing */
3033 for (k = 2; k <= maxexons; k++) {
3034 if ((score = scores[k-1][i] - intronadj - overlapadj) > bestscore[k]) {
3035 bestscore[k] = score;
3036 besti[k] = i;
3037 }
3038 }
3039
3040 /* For k = maxexons, also check against the same level */
3041 if ((score = scores[maxexons][i] - intronadj - overlapadj) > bestscore[maxexons]) {
3042 bestscore[maxexons] = score;
3043 besti[maxexons] = i;
3044 }
3045 }
3046
3047 }
3048 i--;
3049 }
3050
3051 for (k = 2; k <= maxexons; k++) {
3052 scores[k][j] = bestscore[k] + segmentj->score;
3053 prev[k][j] = besti[k];
3054 debug8(printf("-#%d, nexons %d: diagonal %u, querypos %d..%d, median %d, score %u from previous #%d\n",
3055 j,k,diagonalj,segmentj->querypos5,segmentj->querypos3,medianj,scores[k][j],besti[k]));
3056 }
3057 }
3058 }
3059
3060 FREEA(besti);
3061 FREEA(bestscore);
3062
3063 return;
3064 }
3065
3066
3067 static void
find_best_scores(int * nthbest,int * bestscores,int ** plus_scores,int plus_nscores,int ** minus_scores,int minus_nscores,int maxexons,int n)3068 find_best_scores (int *nthbest, int *bestscores, int **plus_scores, int plus_nscores,
3069 int **minus_scores, int minus_nscores, int maxexons,
3070 int n) {
3071 int ninserted = 0, k, j, i;
3072 int *heap, newscore;
3073 int parenti, smallesti, righti;
3074
3075 memset(bestscores,0,(maxexons+1) * sizeof(int));
3076 memset(nthbest,0,(maxexons+1) * sizeof(int));
3077
3078 heap = (int *) MALLOCA((2*n+1+1) * sizeof(int));
3079
3080 for (k = 1; k <= maxexons; k++) {
3081 /* Initialize heap */
3082 heap[1] = 0;
3083 for (i = 2; i < 2*n+1+1; i++) {
3084 heap[i] = 1000000000;
3085 }
3086
3087 /* plus */
3088 for (j = 0; j < plus_nscores; j++) {
3089 if ((newscore = plus_scores[k][j]) > heap[1]) {
3090 if (ninserted < n) {
3091 /* put new score at bottom and heapify up */
3092 i = ++ninserted;
3093 while (i > 1 && heap[PARENT(i)] > newscore) {
3094 heap[i] = heap[PARENT(i)];
3095 i = PARENT(i);
3096 }
3097 heap[i] = newscore;
3098
3099 } else {
3100 /* replace nth largest with new score and heapify down */
3101 parenti = 1;
3102 smallesti = 2 + (heap[3] < heap[2]);
3103 while (newscore > heap[smallesti]) {
3104 heap[parenti] = heap[smallesti];
3105 parenti = smallesti;
3106 smallesti = LEFT(parenti);
3107 righti = smallesti+1;
3108 smallesti += heap[righti] < heap[smallesti];
3109 }
3110 heap[parenti] = newscore;
3111 }
3112 }
3113 }
3114
3115 /* minus */
3116 for (j = 0; j < minus_nscores; j++) {
3117 if ((newscore = minus_scores[k][j]) > heap[1]) {
3118 if (ninserted < n) {
3119 /* put new score at bottom and heapify up */
3120 i = ++ninserted;
3121 while (i > 1 && heap[PARENT(i)] > newscore) {
3122 heap[i] = heap[PARENT(i)];
3123 i = PARENT(i);
3124 }
3125 heap[i] = newscore;
3126
3127 } else {
3128 /* replace nth largest with new score and heapify down */
3129 parenti = 1;
3130 smallesti = 2 + (heap[3] < heap[2]);
3131 while (newscore > heap[smallesti]) {
3132 heap[parenti] = heap[smallesti];
3133 parenti = smallesti;
3134 smallesti = LEFT(parenti);
3135 righti = smallesti+1;
3136 smallesti += heap[righti] < heap[smallesti];
3137 }
3138 heap[parenti] = newscore;
3139 }
3140 }
3141 }
3142
3143 nthbest[k] = heap[1];
3144 bestscores[k] = 0;
3145 for (j = 1; j <= ninserted; j++) {
3146 if (heap[j] > bestscores[k]) {
3147 bestscores[k] = heap[j];
3148 }
3149 }
3150 debug8(printf("For %d exons, best score is %d, %dth best is %d\n",bestscores[k],n,nthbest[k]));
3151 }
3152
3153 FREEA(heap);
3154 return;
3155 }
3156
3157
3158 static void
find_best_scores_nonstranded(int * nthbest,int * bestscores,int ** plus_scores_fwd,int plus_nscores_fwd,int ** minus_scores_fwd,int minus_nscores_fwd,int ** plus_scores_rev,int plus_nscores_rev,int ** minus_scores_rev,int minus_nscores_rev,int maxexons,int n)3159 find_best_scores_nonstranded (int *nthbest, int *bestscores, int **plus_scores_fwd, int plus_nscores_fwd,
3160 int **minus_scores_fwd, int minus_nscores_fwd,
3161 int **plus_scores_rev, int plus_nscores_rev,
3162 int **minus_scores_rev, int minus_nscores_rev,
3163 int maxexons, int n) {
3164 int ninserted = 0, k, j, i;
3165 int *heap, newscore;
3166 int parenti, smallesti, righti;
3167
3168 memset(bestscores,0,(maxexons+1) * sizeof(int));
3169 memset(nthbest,0,(maxexons+1) * sizeof(int));
3170 heap = (int *) MALLOCA((2*n+1+1) * sizeof(int));
3171
3172 for (k = 1; k <= maxexons; k++) {
3173 /* Initialize heap */
3174 heap[1] = 0;
3175 for (i = 2; i < 2*n+1+1; i++) {
3176 heap[i] = 1000000000;
3177 }
3178
3179 /* plus_fwd */
3180 for (j = 0; j < plus_nscores_fwd; j++) {
3181 if ((newscore = plus_scores_fwd[k][j]) > heap[1]) {
3182 if (ninserted < n) {
3183 /* put new score at bottom and heapify up */
3184 i = ++ninserted;
3185 while (i > 1 && heap[PARENT(i)] > newscore) {
3186 heap[i] = heap[PARENT(i)];
3187 i = PARENT(i);
3188 }
3189 heap[i] = newscore;
3190
3191 } else {
3192 /* replace nth largest with new score and heapify down */
3193 parenti = 1;
3194 smallesti = 2 + (heap[3] < heap[2]);
3195 while (newscore > heap[smallesti]) {
3196 heap[parenti] = heap[smallesti];
3197 parenti = smallesti;
3198 smallesti = LEFT(parenti);
3199 righti = smallesti+1;
3200 smallesti += heap[righti] < heap[smallesti];
3201 }
3202 heap[parenti] = newscore;
3203 }
3204 }
3205 }
3206
3207 /* minus_fwd */
3208 for (j = 0; j < minus_nscores_fwd; j++) {
3209 if ((newscore = minus_scores_fwd[k][j]) > heap[1]) {
3210 if (ninserted < n) {
3211 /* put new score at bottom and heapify up */
3212 i = ++ninserted;
3213 while (i > 1 && heap[PARENT(i)] > newscore) {
3214 heap[i] = heap[PARENT(i)];
3215 i = PARENT(i);
3216 }
3217 heap[i] = newscore;
3218
3219 } else {
3220 /* replace nth largest with new score and heapify down */
3221 parenti = 1;
3222 smallesti = 2 + (heap[3] < heap[2]);
3223 while (newscore > heap[smallesti]) {
3224 heap[parenti] = heap[smallesti];
3225 parenti = smallesti;
3226 smallesti = LEFT(parenti);
3227 righti = smallesti+1;
3228 smallesti += heap[righti] < heap[smallesti];
3229 }
3230 heap[parenti] = newscore;
3231 }
3232 }
3233 }
3234
3235 /* plus_rev */
3236 for (j = 0; j < plus_nscores_rev; j++) {
3237 if ((newscore = plus_scores_rev[k][j]) > heap[1]) {
3238 if (ninserted < n) {
3239 /* put new score at bottom and heapify up */
3240 i = ++ninserted;
3241 while (i > 1 && heap[PARENT(i)] > newscore) {
3242 heap[i] = heap[PARENT(i)];
3243 i = PARENT(i);
3244 }
3245 heap[i] = newscore;
3246
3247 } else {
3248 /* replace nth largest with new score and heapify down */
3249 parenti = 1;
3250 smallesti = 2 + (heap[3] < heap[2]);
3251 while (newscore > heap[smallesti]) {
3252 heap[parenti] = heap[smallesti];
3253 parenti = smallesti;
3254 smallesti = LEFT(parenti);
3255 righti = smallesti+1;
3256 smallesti += heap[righti] < heap[smallesti];
3257 }
3258 heap[parenti] = newscore;
3259 }
3260 }
3261 }
3262
3263 /* minus_rev */
3264 for (j = 0; j < minus_nscores_rev; j++) {
3265 if ((newscore = minus_scores_rev[k][j]) > heap[1]) {
3266 if (ninserted < n) {
3267 /* put new score at bottom and heapify up */
3268 i = ++ninserted;
3269 while (i > 1 && heap[PARENT(i)] > newscore) {
3270 heap[i] = heap[PARENT(i)];
3271 i = PARENT(i);
3272 }
3273 heap[i] = newscore;
3274
3275 } else {
3276 /* replace nth largest with new score and heapify down */
3277 parenti = 1;
3278 smallesti = 2 + (heap[3] < heap[2]);
3279 while (newscore > heap[smallesti]) {
3280 heap[parenti] = heap[smallesti];
3281 parenti = smallesti;
3282 smallesti = LEFT(parenti);
3283 righti = smallesti+1;
3284 smallesti += heap[righti] < heap[smallesti];
3285 }
3286 heap[parenti] = newscore;
3287 }
3288 }
3289 }
3290
3291 nthbest[k] = heap[1];
3292 bestscores[k] = 0;
3293 for (j = 1; j <= ninserted; j++) {
3294 if (heap[j] > bestscores[k]) {
3295 bestscores[k] = heap[j];
3296 }
3297 }
3298 debug8(printf("For %d exons, best score is %d, %dth best is %d\n",bestscores[k],n,(*nthbest)[k]));
3299 }
3300
3301 FREEA(heap);
3302 return;
3303 }
3304
3305
3306
3307 static List_T
find_good_paths(List_T gregionlist,int nexons,int * prev,int * scores,struct Segment_T * segments,int nsegments,int threshold_score,Univ_IIT_T chromosome_iit,int querylength,int trimstart,int trimend,bool plusp,int genestrand)3308 find_good_paths (List_T gregionlist, int nexons, int *prev, int *scores,
3309 struct Segment_T *segments, int nsegments, int threshold_score,
3310 Univ_IIT_T chromosome_iit, int querylength,
3311 int trimstart, int trimend, bool plusp, int genestrand) {
3312 int bestj, besti;
3313 int querystart, queryend;
3314 Univcoord_T genomicstart, genomicend;
3315
3316 debug9(printf("Starting find_good_paths with %d segments and threshold_score %d\n",nsegments,threshold_score));
3317
3318 for (bestj = nsegments-1; bestj >= 0; bestj--) {
3319 debug9(printf("#%d: diagonal %u\tpathscore %d\tprev %d\n",
3320 bestj,segments[bestj].diagonal,scores[bestj],prev[bestj]));
3321 if (scores[bestj] >= threshold_score) {
3322 /* Traceback and wipe out scores along the way, so we don't use them again */
3323 besti = bestj;
3324 debug9(printf("*diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
3325 segments[besti].diagonal,segments[besti].querypos5,segments[besti].querypos3,
3326 segments[besti].score,segments[besti].noligomers));
3327 while (prev[besti] >= 0) {
3328 besti = prev[besti];
3329 /* scores[besti] = 0; -- wipe out */
3330 debug9(printf("*diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
3331 segments[besti].diagonal,segments[besti].querypos5,segments[besti].querypos3,
3332 segments[besti].score,segments[besti].noligomers));
3333 }
3334 if (plusp == true) {
3335 querystart = segments[besti].querypos5;
3336 queryend = segments[bestj].querypos3;
3337 if (segments[besti].diagonal + querystart > (Univcoord_T) querylength) {
3338 genomicstart = segments[besti].diagonal + querystart - querylength;
3339 genomicend = segments[bestj].diagonal + queryend - querylength;
3340 debug9(printf("Pushing gregion for %d plus: %d..%d: %u..%u, score=%d\n",
3341 bestj,querystart,queryend,genomicstart,genomicend,scores[bestj]));
3342 #ifdef PMAP
3343 gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/true,genestrand,
3344 chromosome_iit,querystart,queryend,querylength,
3345 /*matchsize*/index1part_aa,trimstart,trimend,
3346 circular_typeint));
3347 #else
3348 gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/true,genestrand,
3349 chromosome_iit,querystart,queryend,querylength,
3350 /*matchsize*/index1part,trimstart,trimend,
3351 circular_typeint));
3352 #endif
3353 }
3354 } else if (plusp == false) {
3355 querystart = segments[bestj].querypos5;
3356 queryend = segments[besti].querypos3;
3357 if (segments[besti].diagonal > (Univcoord_T) queryend) {
3358 genomicstart = segments[besti].diagonal - queryend;
3359 genomicend = segments[bestj].diagonal - querystart;
3360 debug9(printf("Pushing gregion for %d minus: %d..%d: %u..%u, score=%d\n",
3361 bestj,querystart,queryend,genomicstart,genomicend,scores[bestj]));
3362 #ifdef PMAP
3363 gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/false,genestrand,
3364 chromosome_iit,querystart,queryend,querylength,
3365 /*matchsize*/index1part_aa,trimstart,trimend,
3366 circular_typeint));
3367 #else
3368 gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/false,genestrand,
3369 chromosome_iit,querystart,queryend,querylength,
3370 /*matchsize*/index1part,trimstart,trimend,
3371 circular_typeint));
3372 #endif
3373 }
3374 } else {
3375 fprintf(stderr,"Got strange value %d for plusp\n",plusp);
3376 abort();
3377 }
3378 }
3379 }
3380
3381 debug9(printf("Returning %d gregions\n",List_length(gregionlist)));
3382 return gregionlist;
3383 }
3384
3385
3386
3387 /* Probably don't want to iterate on 18-mers or shorter oligomers. If
3388 we can't find a matching pair of 24-mers, it must be cross-species,
3389 so we should rely on our other method */
3390
3391 static List_T
scan_ends(bool * shortseqp,bool second_pass_p,List_T oldlist,T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,bool iteratep)3392 scan_ends (bool *shortseqp, bool second_pass_p, List_T oldlist, T this,
3393 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
3394 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3395 Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, bool iteratep) {
3396 List_T newlist = NULL;
3397 double dangling5_pct, dangling3_pct;
3398 List_T dangling5, dangling3;
3399 bool foundpairp = false, loopp = true;
3400 Width_T matchsize;
3401
3402 debug(printf("Starting scan_ends\n"));
3403
3404 #ifdef PMAP
3405 matchsize = this->oligosize + this->oligosize;
3406 #else
3407 if (index1part < 12) {
3408 matchsize = index1part + index1part;
3409 } else {
3410 /* Originally set for 12-mers, but now allow for 13, 14, and 15-mers */
3411 matchsize = index1part + 12;
3412 }
3413 #endif
3414
3415 /* Handle short query sequences */
3416 if (second_pass_p == true) {
3417 /* Use standard matchsize. Can get higher specificity with longer matchsizes. */
3418 } else {
3419 /* Use small matchsize. Can get higher sensitivity with smaller matchsizes. */
3420 *shortseqp = false;
3421 while (matchsize > this->querylength/4) {
3422 *shortseqp = true;
3423 matchsize -= 6;
3424 }
3425 }
3426 if (matchsize < this->oligosize) {
3427 matchsize = this->oligosize;
3428 }
3429
3430 while (loopp && matchsize >= this->oligosize && foundpairp == false) {
3431 newlist = find_first_pair(&foundpairp,newlist,this,matchsize,
3432 indexdb_fwd,indexdb_rev,genestrand,chromosome_iit,
3433 chrsubset_start,chrsubset_end,matchpool);
3434 if (matchsize == this->oligosize) {
3435 loopp = false;
3436 } else if (foundpairp == false) {
3437 #ifdef PMAP
3438 matchsize -= this->oligosize;
3439 #else
3440 matchsize -= 6; /* Originally set for 12-mers to be this->oligosize/2, but now allow for 13..15-mers */
3441 if (matchsize < this->oligosize) {
3442 /* Allows for 24, 18, 12..15 when we have 12..15-mers */
3443 matchsize = this->oligosize;
3444 }
3445 #endif
3446 stutterhits *= 2;
3447 }
3448 if (iteratep == false) {
3449 loopp = false;
3450 }
3451 }
3452
3453 if (foundpairp == false) {
3454 debug(printf("*** No pair found ***\n"));
3455
3456 diagnostic->firstpair_found_5 = 0;
3457 diagnostic->firstpair_found_3 = 0;
3458 diagnostic->stutter_searched_5 = diagnostic->firstpair_found_5;
3459 diagnostic->stutter_searched_3 = diagnostic->firstpair_found_3;
3460 diagnostic->stutter_nmatchpairs = 0;
3461
3462 diagnostic->stutter_matches_5 = List_length(this->matches5);
3463 diagnostic->stutter_matches_3 = List_length(this->matches3);
3464 diagnostic->dangling_5 = count_dangling(this->matches5);
3465 diagnostic->dangling_3 = count_dangling(this->matches3);
3466 diagnostic->dangling_nmatchpairs = 0;
3467
3468 } else {
3469 debug(printf("*** Found pair ***\n"));
3470 diagnostic->firstpair_found_5 = Block_querypos(this->block5);
3471 diagnostic->firstpair_found_3 = Block_querypos(this->block3);
3472
3473 newlist = stutter(newlist,this,matchsize,indexdb_fwd,indexdb_rev,
3474 genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool,stutterhits);
3475
3476 diagnostic->stutter_searched_5 = Block_querypos(this->block5);
3477 diagnostic->stutter_searched_3 = Block_querypos(this->block3);
3478 diagnostic->stutter_nmatchpairs = List_length(newlist);
3479
3480 diagnostic->stutter_matches_5 = List_length(this->matches5);
3481 diagnostic->stutter_matches_3 = List_length(this->matches3);
3482 diagnostic->dangling_5 = count_dangling(this->matches5);
3483 diagnostic->dangling_3 = count_dangling(this->matches3);
3484
3485 dangling5_pct = dangling_pct(this->matches5);
3486 dangling3_pct = dangling_pct(this->matches3);
3487 debug(printf("Dangling on 5' end: %d/%d\n",
3488 count_dangling(this->matches5),List_length(this->matches5)));
3489 debug(printf("Dangling on 3' end: %d/%d\n",
3490 count_dangling(this->matches3),List_length(this->matches3)));
3491
3492 if (dangling5_pct > MAX_DANGLING_PCT) {
3493 dangling5 = get_dangling(this->matches5,matchpool);
3494 newlist = fill_in_3(newlist,this,matchsize,dangling5,indexdb_fwd,indexdb_rev,
3495 genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool);
3496 /* Not necessary to free */
3497 /* List_free(&dangling5); */
3498 }
3499
3500 if (dangling3_pct > MAX_DANGLING_PCT) {
3501 dangling3 = get_dangling(this->matches3,matchpool);
3502 newlist = fill_in_5(newlist,this,matchsize,dangling3,indexdb_fwd,indexdb_rev,
3503 genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool);
3504 /* Not necessary to free */
3505 /* List_free(&dangling3); */
3506 }
3507
3508 #if 0
3509 newlist = Gregion_filter_unique(newlist);
3510 #endif
3511 diagnostic->dangling_nmatchpairs = List_length(newlist);
3512 }
3513
3514 debug(printf("Returning %d elements in newlist\n",List_length(newlist)));
3515 return List_append(newlist,oldlist);
3516 }
3517
3518
3519 #if 0
3520 static bool
3521 sufficient_gregion_p (List_T gregionlist) {
3522 List_T p;
3523 Gregion_T gregion;
3524
3525 debug(printf("Checking for sufficient gregion on %d gregions\n",List_length(gregionlist)));
3526 for (p = gregionlist; p != NULL; p = List_next(p)) {
3527 gregion = (Gregion_T) List_head(p);
3528 if (Gregion_sufficient_support(gregion) == true) {
3529 debug(printf("Sufficient support found\n"));
3530 return true;
3531 }
3532 }
3533
3534 debug(printf("Sufficient support not found\n"));
3535 return false;
3536 }
3537 #endif
3538
3539
3540 List_T
Stage1_compute(bool * lowidentityp,Sequence_T queryuc,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,Stopwatch_T stopwatch,int nbest)3541 Stage1_compute (bool *lowidentityp, Sequence_T queryuc,
3542 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
3543 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3544 Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, Stopwatch_T stopwatch,
3545 int nbest) {
3546 List_T gregionlist = NULL, p, q;
3547 T this = NULL;
3548 int trimlength, trimstart, trimend, maxentries, i;
3549 /* Width_T matchsize; */
3550 Chrpos_T maxtotallen;
3551 Univcoord_T extension5, extension3;
3552 Gregion_T gregion;
3553 bool shortseqp;
3554
3555 struct Segment_T *plus_segments = NULL, *minus_segments = NULL;
3556 int plus_nsegments = 0, minus_nsegments = 0, maxexons, k;
3557 int nthbest[MAXEXONS+1], bestscores[MAXEXONS+1];
3558 int *plus_prev[MAXEXONS+1], *minus_prev[MAXEXONS+1], *plus_scores[MAXEXONS+1], *minus_scores[MAXEXONS+1];
3559
3560
3561 *lowidentityp = false;
3562 #ifdef DEBUG
3563 global_chromosome_iit = chromosome_iit;
3564 #endif
3565
3566 #ifdef PMAP
3567 if (Sequence_fulllength_given(queryuc) < index1part_aa) {
3568 return (List_T) NULL;
3569 }
3570 #else
3571 if (Sequence_fulllength(queryuc) < index1part) {
3572 return (List_T) NULL;
3573 }
3574 #endif
3575
3576
3577 Stopwatch_start(stopwatch);
3578 diagnostic->sampling_rounds = 0;
3579 diagnostic->sampling_nskip = 0;
3580
3581 debug(queryuc_ptr = Sequence_fullpointer(queryuc));
3582
3583 /* Don't multiply trimlength by 3 in PMAP */
3584 trimlength = Sequence_trimlength(queryuc);
3585 trimstart = Sequence_trim_start(queryuc);
3586 trimend = Sequence_trim_end(queryuc);
3587 debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3588
3589 if (trimlength <= SINGLEEXONLENGTH) {
3590 maxtotallen = 40 + trimlength;
3591 } else {
3592 maxtotallen = trimlength*SLOPE;
3593 if (maxtotallen < 10000) {
3594 maxtotallen = 10000;
3595 } else if (maxtotallen > maxtotallen_bound) {
3596 maxtotallen = maxtotallen_bound;
3597 }
3598 }
3599
3600 debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3601
3602 /* Scan ends (find first pair and stutter) */
3603 #ifdef PMAP
3604 /* matchsize = index1part_aa + index1part_aa; */
3605 #else
3606 /* matchsize = 24 */ /* Was index1part + index1part. matchsize now set in scan_ends */;
3607 #endif
3608 maxentries = MAXENTRIES;
3609
3610 debug(printf("Finding first pair with maxentries = %d\n",maxentries));
3611
3612 this = Stage1_new(queryuc,maxtotallen,maxentries);
3613 read_oligos(this,queryuc,genestrand);
3614 #ifdef PMAP
3615 identify_repeated_oligos(this,/*oligobase*/index1part_aa,this->querylength);
3616 #else
3617 identify_repeated_oligos(this,/*oligobase*/index1part,this->querylength);
3618 #endif
3619 #ifdef SCAN_ENDS
3620 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this,
3621 indexdb_fwd,indexdb_rev,/*genestrand*/0,chromosome_iit,chrsubset_start,chrsubset_end,
3622 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3623 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this->matches5),dangling_pct(this->matches3)));
3624 if (shortseqp == true) {
3625 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this,
3626 indexdb_fwd,indexdb_rev,/*genestrand*/0,chromosome_iit,chrsubset_start,chrsubset_end,
3627 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3628 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this->matches5),dangling_pct(this->matches3)));
3629 }
3630 #endif
3631
3632 if (gregionlist == NULL) {
3633 /* Don't use dangling to determine lowidentityp */
3634 *lowidentityp = true;
3635 }
3636
3637 debug(printf("\nAfter scan_ends:\n"));
3638 debug(
3639 for (p = gregionlist; p != NULL; p = List_next(p)) {
3640 gregion = (Gregion_T) List_head(p);
3641 Gregion_print(gregion);
3642 }
3643 );
3644 debug(printf("End of scan_ends.\n\n"));
3645
3646 /* Perform sampling, if necessary */
3647 if (gregionlist == NULL ||
3648 (Gregion_best_weight(gregionlist) < SUFFICIENT_FIRST_WEIGHT &&
3649 dangling_pct(this->matches5) > MAX_DANGLING_PCT &&
3650 dangling_pct(this->matches3) > MAX_DANGLING_PCT)) {
3651
3652 #if 0
3653 /* Start over by clearing gregionlist */
3654 for (p = gregionlist; p != NULL; p = List_next(p)) {
3655 gregion = (Gregion_T) List_head(p);
3656 Gregion_free(&gregion);
3657 }
3658 List_free(&gregionlist);
3659 gregionlist = (List_T) NULL;
3660 #endif
3661
3662 debug(printf("Starting sample_oligos\n"));
3663 #ifdef PMAP
3664 sample_oligos(this,indexdb_fwd,indexdb_rev,this->querylength,/*oligobase*/index1part_aa);
3665 collapse_diagonals(this->plus_positions,this->plus_npositions,
3666 /*oligobase*/index1part_aa,this->querylength);
3667 plus_segments = find_segments(&plus_nsegments,this->plus_positions,this->plus_npositions,
3668 /*oligobase*/index1part_aa,this->querylength,
3669 /*threshold_score*/27);
3670 collapse_diagonals(this->minus_positions,this->minus_npositions,
3671 /*oligobase*/index1part_aa,this->querylength);
3672 minus_segments = find_segments(&minus_nsegments,this->minus_positions,this->minus_npositions,
3673 /*oligobase*/index1part_aa,this->querylength,
3674 /*threshold_score*/27);
3675 #else
3676 sample_oligos_nolimit(this,indexdb_fwd,indexdb_rev,this->querylength,/*oligobase*/index1part);
3677 collapse_diagonals(this->plus_positions,this->plus_npositions,
3678 /*oligobase*/index1part,this->querylength);
3679 plus_segments = find_segments(&plus_nsegments,this->plus_positions,this->plus_npositions,
3680 /*oligobase*/index1part,this->querylength,
3681 /*threshold_score*/18);
3682
3683 collapse_diagonals(this->minus_positions,this->minus_npositions,
3684 /*oligobase*/index1part,this->querylength);
3685 minus_segments = find_segments(&minus_nsegments,this->minus_positions,this->minus_npositions,
3686 /*oligobase*/index1part,this->querylength,
3687 /*threshold_score*/18);
3688 #endif
3689
3690 maxexons = 3;
3691 compute_paths(plus_prev,plus_scores,plus_segments,plus_nsegments,/*maxexons*/MAXEXONS,/*plusp*/true);
3692 compute_paths(minus_prev,minus_scores,minus_segments,minus_nsegments,/*maxexons*/MAXEXONS,/*plusp*/false);
3693 find_best_scores(nthbest,bestscores,plus_scores,/*plus_nscores*/plus_nsegments,
3694 minus_scores,/*minus_nscores*/minus_nsegments,maxexons,/*n*/nbest);
3695
3696 for (k = 1; k <= maxexons; k++) {
3697 if (nthbest[k] < bestscores[k] - SUBOPTIMAL) {
3698 nthbest[k] = bestscores[k] - SUBOPTIMAL;
3699 }
3700 if (plus_nsegments > 0) {
3701 gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev[k],plus_scores[k],
3702 plus_segments,plus_nsegments,/*threshold_score*/nthbest[k],
3703 chromosome_iit,this->querylength,
3704 trimstart,trimend,/*plusp*/true,/*genestrand*/0);
3705 }
3706 if (minus_nsegments > 0) {
3707 gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev[k],minus_scores[k],
3708 minus_segments,minus_nsegments,/*threshold_score*/nthbest[k],
3709 chromosome_iit,this->querylength,
3710 trimstart,trimend,/*plusp*/false,/*genestrand*/0);
3711 }
3712 }
3713
3714 if (minus_nsegments > 0) {
3715 for (k = 1; k <= maxexons; k++) {
3716 FREE(minus_scores[k]);
3717 FREE(minus_prev[k]);
3718 }
3719 }
3720 if (plus_nsegments > 0) {
3721 for (k = 1; k <= maxexons; k++) {
3722 FREE(plus_scores[k]);
3723 FREE(plus_prev[k]);
3724 }
3725 }
3726 FREE(minus_segments);
3727 FREE(plus_segments);
3728 }
3729
3730 /* Clean up gregionlist */
3731 debug(printf("Starting extensions on %d gregions\n",List_length(gregionlist)));
3732 for (p = gregionlist; p != NULL; p = List_next(p)) {
3733 gregion = (Gregion_T) List_head(p);
3734 if (Gregion_extendedp(gregion) == false) {
3735 /* Need to extend, otherwise we won't align NM_003360 */
3736 find_extensions(&extension5,&extension3,this,gregion,this->maxtotallen,/*continuousp*/false);
3737 Gregion_extend(gregion,extension5,extension3,this->querylength);
3738 }
3739 }
3740 debug(printf("Finished with extensions\n"));
3741
3742 debug(printf("Before filtering, %d regions\n",List_length(gregionlist)));
3743 debug(
3744 for (p = gregionlist; p != NULL; p = List_next(p)) {
3745 gregion = (Gregion_T) List_head(p);
3746 Gregion_print(gregion);
3747 }
3748 );
3749
3750 #if 0
3751 /* Don't filter for support anymore. Causes loss of good regions. */
3752 gregionlist = Gregion_filter_support(gregionlist,BOUNDARY_SUPPORT,PCT_MAX_SUPPORT,DIFF_MAX_SUPPORT);
3753 debug(printf("After filtering for support, %d regions\n",List_length(gregionlist)));
3754 #endif
3755
3756 #if 0
3757 /* Don't filter for max_regions anymore */
3758 if (List_length(gregionlist) > MAX_GREGIONS_PRE_UNIQUE) {
3759 debug("Too many gregions, so erasing them\n");
3760 for (p = gregionlist; p != NULL; p = List_next(p)) {
3761 gregion = (Gregion_T) List_head(p);
3762 Gregion_free(&gregion);
3763 }
3764 List_free(&gregionlist);
3765 gregionlist = NULL;
3766
3767 } else {
3768 #endif
3769
3770
3771 #ifdef USE_CLEAN
3772 Gregion_filter_clean(gregionlist,nchrs);
3773 #endif
3774
3775 #if 0
3776 for (p = gregionlist; p != NULL; p = p->rest) {
3777 gregion = (Gregion_T) List_head(p);
3778 Gregion_print(gregion);
3779 }
3780 #endif
3781
3782 debug0(printf("Before filtering for unique, %d regions\n",List_length(gregionlist)));
3783 debug0(
3784 for (p = gregionlist; p != NULL; p = List_next(p)) {
3785 gregion = (Gregion_T) List_head(p);
3786 Gregion_print(gregion);
3787 }
3788 );
3789
3790 gregionlist = Gregion_filter_unique(gregionlist);
3791 debug0(printf("After filtering for unique, %d regions\n",List_length(gregionlist)));
3792 debug0(
3793 for (p = gregionlist; p != NULL; p = List_next(p)) {
3794 gregion = (Gregion_T) List_head(p);
3795 Gregion_print(gregion);
3796 }
3797 );
3798
3799 debug0(
3800 if (List_length(gregionlist) > MAX_GREGIONS_POST_UNIQUE) {
3801 printf("Too many gregions %d, so taking the top %d\n",List_length(gregionlist),MAX_GREGIONS_POST_UNIQUE);
3802 });
3803
3804 for (p = gregionlist, i = 1; p != NULL && i < MAX_GREGIONS_POST_UNIQUE; p = List_next(p)) {
3805 /* Advance */
3806 i++;
3807 }
3808 if (p != NULL) {
3809 q = List_next(p);
3810 p->rest = (List_T) NULL;
3811 for (p = q; p != NULL; p = List_next(p)) {
3812 gregion = (Gregion_T) List_head(p);
3813 Gregion_free(&gregion);
3814 }
3815 List_free(&q);
3816 }
3817 #if 0
3818 }
3819 #endif
3820
3821 diagnostic->stage1_runtime = Stopwatch_stop(stopwatch);
3822 diagnostic->ngregions = List_length(gregionlist);
3823
3824 Stage1_free(&this);
3825
3826 debug0(printf("Stage 1 returning %d new regions\n",List_length(gregionlist)));
3827 debug0(
3828 for (p = gregionlist; p != NULL; p = List_next(p)) {
3829 gregion = (Gregion_T) List_head(p);
3830 Gregion_print(gregion);
3831 }
3832 );
3833
3834 return gregionlist;
3835 }
3836
3837
3838
3839 List_T
Stage1_compute_nonstranded(bool * lowidentityp,Sequence_T queryuc,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,Stopwatch_T stopwatch,int nbest)3840 Stage1_compute_nonstranded (bool *lowidentityp, Sequence_T queryuc,
3841 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
3842 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3843 Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, Stopwatch_T stopwatch,
3844 int nbest) {
3845 List_T gregionlist = NULL, p, q;
3846 T this_fwd = NULL, this_rev = NULL;
3847 Sequence_T queryrc;
3848 int trimlength, trimstart, trimend, maxentries, i;
3849 Chrpos_T maxtotallen;
3850 Univcoord_T extension5, extension3;
3851 Gregion_T gregion;
3852 bool shortseqp;
3853
3854 struct Segment_T *plus_segments_fwd = NULL, *minus_segments_fwd = NULL, *plus_segments_rev = NULL, *minus_segments_rev = NULL;
3855 int plus_nsegments_fwd = 0, minus_nsegments_fwd = 0, plus_nsegments_rev = 0, minus_nsegments_rev = 0, maxexons, k;
3856 int nthbest[MAXEXONS+1], bestscores[MAXEXONS+1];
3857
3858 int *plus_prev_fwd[MAXEXONS+1], *plus_scores_fwd[MAXEXONS+1], *minus_prev_fwd[MAXEXONS+1], *minus_scores_fwd[MAXEXONS+1];
3859 int *plus_prev_rev[MAXEXONS+1], *plus_scores_rev[MAXEXONS+1], *minus_prev_rev[MAXEXONS+1], *minus_scores_rev[MAXEXONS+1];
3860
3861 *lowidentityp = false;
3862 #ifdef DEBUG
3863 global_chromosome_iit = chromosome_iit;
3864 #endif
3865
3866 #ifdef PMAP
3867 if (Sequence_fulllength_given(queryuc) < index1part_aa) {
3868 return (List_T) NULL;
3869 }
3870 #else
3871 if (Sequence_fulllength(queryuc) < index1part) {
3872 return (List_T) NULL;
3873 }
3874 #endif
3875
3876 queryrc = Sequence_revcomp(queryuc);
3877
3878 Stopwatch_start(stopwatch);
3879 diagnostic->sampling_rounds = 0;
3880 diagnostic->sampling_nskip = 0;
3881
3882 debug(queryuc_ptr = Sequence_fullpointer(queryuc));
3883
3884 /* FWD */
3885
3886 #ifdef PMAP
3887 /* matchsize = index1part_aa + index1part_aa; */
3888 #else
3889 /* matchsize = 24 */ /* Was index1part + index1part. matchsize now set in scan_ends */;
3890 #endif
3891 maxentries = MAXENTRIES;
3892
3893
3894 /* Don't multiply trimlength by 3 in PMAP */
3895 trimlength = Sequence_trimlength(queryuc);
3896 trimstart = Sequence_trim_start(queryuc);
3897 trimend = Sequence_trim_end(queryuc);
3898 debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3899
3900 if (trimlength <= SINGLEEXONLENGTH) {
3901 maxtotallen = 40 + trimlength;
3902 } else {
3903 maxtotallen = trimlength*SLOPE;
3904 if (maxtotallen < 10000) {
3905 maxtotallen = 10000;
3906 } else if (maxtotallen > maxtotallen_bound) {
3907 maxtotallen = maxtotallen_bound;
3908 }
3909 }
3910
3911 debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3912
3913 /* Scan ends (find first pair and stutter) */
3914 debug(printf("Finding first pair, fwd, with maxentries = %d\n",maxentries));
3915
3916 this_fwd = Stage1_new(queryuc,maxtotallen,maxentries);
3917 read_oligos(this_fwd,queryuc,/*genestrand*/+1);
3918 #ifdef PMAP
3919 identify_repeated_oligos(this_fwd,/*oligobase*/index1part_aa,this_fwd->querylength);
3920 #else
3921 identify_repeated_oligos(this_fwd,/*oligobase*/index1part,this_fwd->querylength);
3922 #endif
3923
3924 #ifdef SCAN_ENDS
3925 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this_fwd,
3926 indexdb_fwd,indexdb_fwd,/*genestrand*/+1,chromosome_iit,chrsubset_start,chrsubset_end,
3927 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3928 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_fwd->matches5),dangling_pct(this_fwd->matches3)));
3929 if (shortseqp == true) {
3930 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this_fwd,
3931 indexdb_fwd,indexdb_fwd,/*genestrand*/+1,chromosome_iit,chrsubset_start,chrsubset_end,
3932 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3933 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_fwd->matches5),dangling_pct(this_fwd->matches3)));
3934 }
3935 #endif
3936
3937 debug(printf("\nAfter scan_ends, fwd:\n"));
3938 debug(
3939 for (p = gregionlist; p != NULL; p = List_next(p)) {
3940 gregion = (Gregion_T) List_head(p);
3941 Gregion_print(gregion);
3942 }
3943 );
3944 debug(printf("End of scan_ends, fwd.\n\n"));
3945
3946
3947 /* REV */
3948
3949 /* Don't multiply trimlength by 3 in PMAP */
3950 trimlength = Sequence_trimlength(queryrc);
3951 trimstart = Sequence_trim_start(queryrc);
3952 trimend = Sequence_trim_end(queryrc);
3953 debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3954
3955 if (trimlength <= SINGLEEXONLENGTH) {
3956 maxtotallen = 40 + trimlength;
3957 } else {
3958 maxtotallen = trimlength*SLOPE;
3959 if (maxtotallen < 10000) {
3960 maxtotallen = 10000;
3961 } else if (maxtotallen > maxtotallen_bound) {
3962 maxtotallen = maxtotallen_bound;
3963 }
3964 }
3965
3966 debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3967
3968 /* Scan ends (find first pair and stutter) */
3969 debug(printf("Finding first pair, rev, with maxentries = %d\n",maxentries));
3970
3971 this_rev = Stage1_new(queryrc,maxtotallen,maxentries);
3972 read_oligos(this_rev,queryrc,/*genestrand*/+2);
3973 #ifdef PMAP
3974 identify_repeated_oligos(this_rev,/*oligobase*/index1part_aa,this_rev->querylength);
3975 #else
3976 identify_repeated_oligos(this_rev,/*oligobase*/index1part,this_rev->querylength);
3977 #endif
3978
3979 #ifdef SCAN_ENDS
3980 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this_rev,
3981 indexdb_rev,indexdb_rev,/*genestrand*/+2,chromosome_iit,chrsubset_start,chrsubset_end,
3982 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3983 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_rev->matches5),dangling_pct(this_rev->matches3)));
3984 if (shortseqp == true) {
3985 gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this_rev,
3986 indexdb_rev,indexdb_rev,/*genestrand*/+2,chromosome_iit,chrsubset_start,chrsubset_end,
3987 matchpool,stutterhits,diagnostic,/*iteratep*/false);
3988 debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_rev->matches5),dangling_pct(this_rev->matches3)));
3989 }
3990 #endif
3991
3992 debug(printf("\nAfter scan_ends, rev:\n"));
3993 debug(
3994 for (p = gregionlist; p != NULL; p = List_next(p)) {
3995 gregion = (Gregion_T) List_head(p);
3996 Gregion_print(gregion);
3997 }
3998 );
3999 debug(printf("End of scan_ends, rev.\n\n"));
4000
4001
4002 if (gregionlist == NULL) {
4003 /* Don't use dangling to determine lowidentityp */
4004 *lowidentityp = true;
4005 }
4006
4007 #if 0
4008 /* Perform sampling, if necessary */
4009 if (gregionlist == NULL || Gregion_best_weight(gregionlist) < SUFFICIENT_FIRST_WEIGHT) {
4010 if (dangling_pct(this_fwd->matches5) > MAX_DANGLING_PCT &&
4011 dangling_pct(this_fwd->matches3) > MAX_DANGLING_PCT) {
4012 #endif
4013
4014 #ifdef PMAP
4015 debug(printf("Starting sample_oligos, fwd\n"));
4016 sample_oligos_nolimit(this_fwd,indexdb_fwd,indexdb_fwd,this_fwd->querylength,/*oligobase*/index1part_aa);
4017 collapse_diagonals(this_fwd->plus_positions,this_fwd->plus_npositions,
4018 /*oligobase*/index1part_aa,this_fwd->querylength);
4019 plus_segments_fwd = find_segments(&plus_nsegments_fwd,this_fwd->plus_positions,this_fwd->plus_npositions,
4020 /*oligobase*/index1part_aa,this_fwd->querylength,
4021 /*threshold_score*/18);
4022
4023 collapse_diagonals(this_fwd->minus_positions,this_fwd->minus_npositions,
4024 /*oligobase*/index1part_aa,this_fwd->querylength);
4025 minus_segments_fwd = find_segments(&minus_nsegments_fwd,this_fwd->minus_positions,this_fwd->minus_npositions,
4026 /*oligobase*/index1part_aa,this_fwd->querylength,
4027 /*threshold_score*/18);
4028 #else
4029 debug(printf("Starting sample_oligos, fwd\n"));
4030 sample_oligos_nolimit(this_fwd,indexdb_fwd,indexdb_fwd,this_fwd->querylength,/*oligobase*/index1part);
4031 collapse_diagonals(this_fwd->plus_positions,this_fwd->plus_npositions,
4032 /*oligobase*/index1part,this_fwd->querylength);
4033 plus_segments_fwd = find_segments(&plus_nsegments_fwd,this_fwd->plus_positions,this_fwd->plus_npositions,
4034 /*oligobase*/index1part,this_fwd->querylength,
4035 /*threshold_score*/18);
4036
4037 collapse_diagonals(this_fwd->minus_positions,this_fwd->minus_npositions,
4038 /*oligobase*/index1part,this_fwd->querylength);
4039 minus_segments_fwd = find_segments(&minus_nsegments_fwd,this_fwd->minus_positions,this_fwd->minus_npositions,
4040 /*oligobase*/index1part,this_fwd->querylength,
4041 /*threshold_score*/18);
4042 #endif
4043
4044 maxexons = 3;
4045 compute_paths(plus_prev_fwd,plus_scores_fwd,plus_segments_fwd,plus_nsegments_fwd,/*maxexons*/MAXEXONS,/*plusp*/true);
4046 compute_paths(minus_prev_fwd,minus_scores_fwd,minus_segments_fwd,minus_nsegments_fwd,/*maxexons*/MAXEXONS,/*plusp*/false);
4047
4048 #if 0
4049 }
4050 #endif
4051
4052 #if 0
4053 if (dangling_pct(this_rev->matches5) > MAX_DANGLING_PCT &&
4054 dangling_pct(this_rev->matches3) > MAX_DANGLING_PCT) {
4055 #endif
4056
4057 #ifdef PMAP
4058 debug(printf("Starting sample_oligos, rev\n"));
4059 sample_oligos_nolimit(this_rev,indexdb_rev,indexdb_rev,this_rev->querylength,/*oligobase*/index1part_aa);
4060 collapse_diagonals(this_rev->plus_positions,this_rev->plus_npositions,
4061 /*oligobase*/index1part_aa,this_rev->querylength);
4062 plus_segments_rev = find_segments(&plus_nsegments_rev,this_rev->plus_positions,this_rev->plus_npositions,
4063 /*oligobase*/index1part_aa,this_rev->querylength,
4064 /*threshold_score*/18);
4065
4066 collapse_diagonals(this_rev->minus_positions,this_rev->minus_npositions,
4067 /*oligobase*/index1part_aa,this_rev->querylength);
4068 minus_segments_rev = find_segments(&minus_nsegments_rev,this_rev->minus_positions,this_rev->minus_npositions,
4069 /*oligobase*/index1part_aa,this_rev->querylength,
4070 /*threshold_score*/18);
4071 #else
4072 debug(printf("Starting sample_oligos, rev\n"));
4073 sample_oligos_nolimit(this_rev,indexdb_rev,indexdb_rev,this_rev->querylength,/*oligobase*/index1part);
4074 collapse_diagonals(this_rev->plus_positions,this_rev->plus_npositions,
4075 /*oligobase*/index1part,this_rev->querylength);
4076 plus_segments_rev = find_segments(&plus_nsegments_rev,this_rev->plus_positions,this_rev->plus_npositions,
4077 /*oligobase*/index1part,this_rev->querylength,
4078 /*threshold_score*/18);
4079
4080 collapse_diagonals(this_rev->minus_positions,this_rev->minus_npositions,
4081 /*oligobase*/index1part,this_rev->querylength);
4082 minus_segments_rev = find_segments(&minus_nsegments_rev,this_rev->minus_positions,this_rev->minus_npositions,
4083 /*oligobase*/index1part,this_rev->querylength,
4084 /*threshold_score*/18);
4085 #endif
4086
4087 maxexons = 3;
4088 compute_paths(plus_prev_rev,plus_scores_rev,plus_segments_rev,plus_nsegments_rev,/*maxexons*/MAXEXONS,/*plusp*/true);
4089 compute_paths(minus_prev_rev,minus_scores_rev,minus_segments_rev,minus_nsegments_rev,/*maxexons*/MAXEXONS,/*plusp*/false);
4090
4091 #if 0
4092 }
4093 #endif
4094
4095 find_best_scores_nonstranded(nthbest,bestscores,plus_scores_fwd,/*plus_nscores*/plus_nsegments_fwd,
4096 minus_scores_fwd,/*minus_nscores*/minus_nsegments_fwd,
4097 plus_scores_rev,/*plus_nscores*/plus_nsegments_rev,
4098 minus_scores_rev,/*minus_nscores*/minus_nsegments_rev,
4099 /*maxexons*/3,/*n*/nbest);
4100
4101 for (k = 1; k <= maxexons; k++) {
4102 if (nthbest[k] < bestscores[k] - SUBOPTIMAL) {
4103 nthbest[k] = bestscores[k] - SUBOPTIMAL;
4104 }
4105 if (plus_nsegments_fwd > 0) {
4106 gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev_fwd[k],plus_scores_fwd[k],
4107 plus_segments_fwd,plus_nsegments_fwd,/*threshold_score*/nthbest[k],
4108 chromosome_iit,this_fwd->querylength,
4109 trimstart,trimend,/*plusp*/true,/*genestrand*/+1);
4110 }
4111 if (minus_nsegments_fwd > 0) {
4112 gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev_fwd[k],minus_scores_fwd[k],
4113 minus_segments_fwd,minus_nsegments_fwd,/*threshold_score*/nthbest[k],
4114 chromosome_iit,this_fwd->querylength,
4115 trimstart,trimend,/*plusp*/false,/*genestrand*/+1);
4116 }
4117 if (plus_nsegments_rev > 0) {
4118 gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev_rev[k],plus_scores_rev[k],
4119 plus_segments_rev,plus_nsegments_rev,/*threshold_score*/nthbest[k],
4120 chromosome_iit,this_rev->querylength,
4121 trimstart,trimend,/*plusp*/true,/*genestrand*/+2);
4122 }
4123 if (minus_nsegments_rev > 0) {
4124 gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev_rev[k],minus_scores_rev[k],
4125 minus_segments_rev,minus_nsegments_rev,/*threshold_score*/nthbest[k],
4126 chromosome_iit,this_rev->querylength,
4127 trimstart,trimend,/*plusp*/false,/*genestrand*/+2);
4128 }
4129 }
4130
4131 if (minus_nsegments_rev > 0) {
4132 for (k = 1; k <= maxexons; k++) {
4133 FREE(minus_scores_rev[k]);
4134 FREE(minus_prev_rev[k]);
4135 }
4136 }
4137 if (plus_nsegments_rev > 0) {
4138 for (k = 1; k <= maxexons; k++) {
4139 FREE(plus_scores_rev[k]);
4140 FREE(plus_prev_rev[k]);
4141 }
4142 }
4143 if (minus_nsegments_fwd > 0) {
4144 for (k = 1; k <= maxexons; k++) {
4145 FREE(minus_scores_fwd[k]);
4146 FREE(minus_prev_fwd[k]);
4147 }
4148 }
4149 if (plus_nsegments_fwd > 0) {
4150 for (k = 1; k <= maxexons; k++) {
4151 FREE(plus_scores_fwd[k]);
4152 FREE(plus_prev_fwd[k]);
4153 }
4154 }
4155 FREE(minus_segments_rev);
4156 FREE(plus_segments_rev);
4157 FREE(minus_segments_fwd);
4158 FREE(plus_segments_fwd);
4159
4160 #if 0
4161 }
4162 #endif
4163
4164 /* Clean up gregionlist */
4165 debug(printf("Starting extensions\n"));
4166 for (p = gregionlist; p != NULL; p = List_next(p)) {
4167 gregion = (Gregion_T) List_head(p);
4168 if (Gregion_extendedp(gregion) == false) {
4169 /* Need to extend, otherwise we won't align NM_003360 */
4170 if (Gregion_genestrand(gregion) > 0) {
4171 find_extensions(&extension5,&extension3,this_fwd,gregion,this_fwd->maxtotallen,/*continuousp*/false);
4172 Gregion_extend(gregion,extension5,extension3,this_fwd->querylength);
4173 } else {
4174 find_extensions(&extension5,&extension3,this_rev,gregion,this_rev->maxtotallen,/*continuousp*/false);
4175 Gregion_extend(gregion,extension5,extension3,this_rev->querylength);
4176 }
4177 }
4178 }
4179 debug(printf("Finished with extensions\n"));
4180
4181 debug(printf("Before filtering, %d regions\n",List_length(gregionlist)));
4182 debug(
4183 for (p = gregionlist; p != NULL; p = List_next(p)) {
4184 gregion = (Gregion_T) List_head(p);
4185 Gregion_print(gregion);
4186 }
4187 );
4188
4189 #if 0
4190 /* For nonstranded, want to filter for support */
4191 gregionlist = Gregion_filter_support(gregionlist,BOUNDARY_SUPPORT,PCT_MAX_SUPPORT,DIFF_MAX_SUPPORT);
4192 debug(printf("After filtering for support, %d regions\n",List_length(gregionlist)));
4193 #endif
4194
4195 #if 0
4196 /* Don't filter for max_regions anymore */
4197 if (List_length(gregionlist) > MAX_GREGIONS_PRE_UNIQUE) {
4198 debug("Too many gregions, so erasing them\n");
4199 for (p = gregionlist; p != NULL; p = List_next(p)) {
4200 gregion = (Gregion_T) List_head(p);
4201 Gregion_free(&gregion);
4202 }
4203 List_free(&gregionlist);
4204 gregionlist = NULL;
4205
4206 } else {
4207 #endif
4208
4209
4210 #ifdef USE_CLEAN
4211 Gregion_filter_clean(gregionlist,nchrs);
4212 #endif
4213
4214 #if 0
4215 for (p = gregionlist; p != NULL; p = p->rest) {
4216 gregion = (Gregion_T) List_head(p);
4217 Gregion_print(gregion);
4218 }
4219 #endif
4220
4221 gregionlist = Gregion_filter_unique(gregionlist);
4222 debug0(printf("After filtering for unique, %d regions\n",List_length(gregionlist)));
4223 debug0(
4224 for (p = gregionlist; p != NULL; p = List_next(p)) {
4225 gregion = (Gregion_T) List_head(p);
4226 Gregion_print(gregion);
4227 }
4228 );
4229
4230 debug0(
4231 if (List_length(gregionlist) > MAX_GREGIONS_POST_UNIQUE) {
4232 printf("Too many gregions %d, so taking the top %d\n",List_length(gregionlist),MAX_GREGIONS_POST_UNIQUE);
4233 });
4234
4235 for (p = gregionlist, i = 1; p != NULL && i < MAX_GREGIONS_POST_UNIQUE; p = List_next(p)) {
4236 /* Advance */
4237 i++;
4238 }
4239 if (p != NULL) {
4240 q = List_next(p);
4241 p->rest = (List_T) NULL;
4242 for (p = q; p != NULL; p = List_next(p)) {
4243 gregion = (Gregion_T) List_head(p);
4244 Gregion_free(&gregion);
4245 }
4246 List_free(&q);
4247 }
4248 #if 0
4249 }
4250 #endif
4251
4252 diagnostic->stage1_runtime = Stopwatch_stop(stopwatch);
4253 diagnostic->ngregions = List_length(gregionlist);
4254
4255 Stage1_free(&this_rev);
4256 Stage1_free(&this_fwd);
4257
4258 Sequence_free(&queryrc);
4259
4260 debug0(printf("Stage 1 returning %d new regions\n",List_length(gregionlist)));
4261 debug0(
4262 for (p = gregionlist; p != NULL; p = List_next(p)) {
4263 gregion = (Gregion_T) List_head(p);
4264 Gregion_print(gregion);
4265 }
4266 );
4267
4268 return gregionlist;
4269 }
4270
4271