1 static char rcsid[] = "$Id: stage2.c 218187 2019-01-17 13:15:10Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "stage2.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h>
11
12 #include "assert.h"
13 #include "mem.h"
14 #include "comp.h"
15 #include "pair.h"
16 #include "pairdef.h"
17 #include "intlist.h"
18 #include "diag.h"
19 #include "genome_sites.h"
20 #include "complement.h"
21 #include "maxent_hr.h"
22
23
24 /* Tests whether genomicseg == query in convert_to_nucleotides, and
25 whether oligoindex_hr gives same results as oligoindex */
26 /* #define EXTRACT_GENOMICSEG 1 */
27
28 /* #define USE_DIAGPOOL 1 -- Defined in diagpool.h */
29
30 /* #define SQUARE 1 */
31
32 /* #define SLOW 1 */
33
34 #define SUFF_PCTCOVERAGE_OLIGOINDEX 0.90
35
36 /* #define SUFF_PCTCOVERAGE_STAGE2 0.10 */
37 #define SUFF_NCOVERED 200
38 #define SUFF_MAXNCONSECUTIVE 20
39 #define GREEDY_NCONSECUTIVE 100
40
41 #define MAX_NACTIVE 100 /* 100 previously considered too low, but may
42 be okay in conjunction with
43 diagonalization */
44 #define MAX_GRAND_LOOKBACK 200
45
46 /* Penalty for genomic distances */
47
48 #define INTRON_PENALTY_UNKNOWN 8
49 #define INTRON_PENALTY_INCONSISTENT 16
50
51 /* Needs to be high to avoid short exons, but needs to be low to identify short exons. */
52 /* On an example by Nathan Weeks, needed to set this value to be 1 or
53 0 to find a short exon. Setting to 0 gives too many short exons.
54 Also found querydist_credit to be a bad idea. */
55 #define NINTRON_PENALTY_MISMATCH 1
56 /* #define USE_QUERYDIST_CREDIT 1 */
57
58 #define NON_CANONICAL_PENALTY_ENDS 4
59 #define NON_CANONICAL_PENALTY_MIDDLE 4
60 #define MISS_BEHIND 16
61 #define GREEDY_ADVANCE 6
62 #define FINAL_SCORE_TOLERANCE 20 /* Was 8, but missed some paths on Y chromosome */
63 #define NONOVERLAPPING_SCORE_TOLERANCE 0.5
64
65 #define ENOUGH_CONSECUTIVE 32
66
67 #define INFINITE 1000000
68
69 /* EQUAL_DISTANCE used to be 3 for PMAP and 6 for GMAP, but that
70 allowed indels in repetitive regions. Now have separate
71 variables. */
72 #ifdef PMAP
73 #define EQUAL_DISTANCE_FOR_CONSECUTIVE 0
74 #define EQUAL_DISTANCE_NOT_SPLICING 3
75 #else
76 #define EQUAL_DISTANCE_FOR_CONSECUTIVE 0
77 #define EQUAL_DISTANCE_NOT_SPLICING 9
78 #endif
79
80
81 #define INTRON_DEFN 9 /* Cannot exceed 9 */
82 #define NEAR_END_LENGTH 20 /* Determines whether to ignore EXON_DEFN at ends */
83 #define EXON_DEFN 30
84 #define MAX_SKIPPED 3
85
86 #define SCORE_FOR_RESTRICT 10
87 /* #define SUFFICIENT_ROOTNLINKS 10 */ /* Setting this too low can slow down program considerably */
88
89
90 #ifdef PMAP
91 #define SAMPLE_INTERVAL 1
92 #define NT_PER_MATCH 3
93 #define CONSEC_POINTS_PER_MATCH 3 /* Possible increase to reward consecutiveness */
94 #define NONCODON_INDEL_PENALTY 15
95 #else
96 #define SAMPLE_INTERVAL 2 /* For cases where adjacentp == false.
97 Means that we can find islands of
98 9-mers */
99 #define NT_PER_MATCH 1
100 #define NT_PER_CODON 3
101 #define CONSEC_POINTS_PER_MATCH 1 /* Possible increase to reward consecutiveness */
102 #define CONSEC_POINTS_PER_CODON 3 /* Possible increase to reward consecutiveness */
103 #endif
104
105 #define SHIFT_EXTRA 15
106
107 #define ONE 1
108 #define TEN_THOUSAND 8192 /* Power of 2 */
109 #define HUNDRED_THOUSAND 100000.0
110 #define ONE_MILLION 1000000.0
111
112
113
114 static bool splicingp;
115 static bool use_canonical_middle_p;
116 static bool use_canonical_ends_p;
117 static int suboptimal_score_end;
118 static int suboptimal_score_start;
119 static Mode_T mode;
120 static bool snps_p;
121 static int sufflookback;
122 static int nsufflookback;
123 static int maxintronlen;
124
125
126 void
Stage2_setup(bool splicingp_in,bool cross_species_p,int suboptimal_score_start_in,int suboptimal_score_end_in,int sufflookback_in,int nsufflookback_in,int maxintronlen_in,Mode_T mode_in,bool snps_p_in)127 Stage2_setup (bool splicingp_in, bool cross_species_p,
128 int suboptimal_score_start_in, int suboptimal_score_end_in,
129 int sufflookback_in, int nsufflookback_in, int maxintronlen_in,
130 Mode_T mode_in, bool snps_p_in) {
131 splicingp = splicingp_in;
132 if (splicingp == true) {
133 use_canonical_ends_p = true;
134 } else {
135 use_canonical_ends_p = false;
136 }
137 if (cross_species_p == true) {
138 use_canonical_middle_p = true;
139 } else {
140 use_canonical_middle_p = false;
141 }
142 suboptimal_score_start = suboptimal_score_start_in;
143 suboptimal_score_end = suboptimal_score_end_in;
144
145 sufflookback = sufflookback_in;
146 nsufflookback = nsufflookback_in;
147 maxintronlen = maxintronlen_in;
148
149 mode = mode_in;
150 snps_p = snps_p_in;
151 return;
152 }
153
154
155 /* General */
156 #ifdef DEBUG
157 #define debug(x) x
158 #else
159 #define debug(x)
160 #endif
161
162 /* Final results of stage 2 */
163 #ifdef DEBUG0
164 #define debug0(x) x
165 #else
166 #define debug0(x)
167 #endif
168
169 /* Print all links */
170 #ifdef DEBUG1
171 #define debug1(x) x
172 #else
173 #define debug1(x)
174 #endif
175
176 /* For generating a graph */
177 #ifdef DEBUG3
178 #define debug3(x) x
179 #else
180 #define debug3(x)
181 #endif
182
183 /* Converting to nucleotides */
184 #ifdef DEBUG5
185 #define debug5(x) x
186 #else
187 #define debug5(x)
188 #endif
189
190 /* revise_active */
191 #ifdef DEBUG6
192 #define debug6(x) x
193 #else
194 #define debug6(x)
195 #endif
196
197 /* Shifted canonical */
198 #ifdef DEBUG7
199 #define debug7(x) x
200 #else
201 #define debug7(x)
202 #endif
203
204 /* find_canonical_dinucleotides */
205 #ifdef DEBUG8
206 #define debug8(x) x
207 #else
208 #define debug8(x)
209 #endif
210
211 /* Dynamic programming */
212 /* Can also define debug9(x) as: if (curr_querypos == XX) {x;} */
213 #ifdef DEBUG9
214 #define debug9(x) x
215 #else
216 #define debug9(x)
217 #endif
218
219 /* binary search */
220 #ifdef DEBUG10
221 #define debug10(x) x
222 #else
223 #define debug10(x)
224 #endif
225
226 /* Multiple alignments */
227 #ifdef DEBUG11
228 #define debug11(x) x
229 #else
230 #define debug11(x)
231 #endif
232
233 /* Grand winner */
234 #ifdef DEBUG12
235 #define debug12(x) x
236 #else
237 #define debug12(x)
238 #endif
239
240
241 /* Filter unique */
242 #ifdef DEBUG13
243 #define debug13(x) x
244 #else
245 #define debug13(x)
246 #endif
247
248 /* Filter unique, details of overlap */
249 #ifdef DEBUG13A
250 #define debug13a(x) x
251 #else
252 #define debug13a(x)
253 #endif
254
255
256 struct Stage2_alloc_T {
257 int max_querylength_alloc;
258
259 bool *coveredp;
260 Chrpos_T **mappings;
261 int *npositions;
262 unsigned int *minactive;
263 unsigned int *maxactive;
264 int *firstactive;
265 int *nactive;
266 };
267
268 void
Stage2_alloc_free(Stage2_alloc_T * old)269 Stage2_alloc_free (Stage2_alloc_T *old) {
270 FREE((*old)->firstactive);
271 FREE((*old)->nactive);
272 FREE((*old)->maxactive);
273 FREE((*old)->minactive);
274 FREE((*old)->npositions);
275 FREE((*old)->mappings);
276 FREE((*old)->coveredp);
277 FREE(*old);
278 return;
279 }
280
281 Stage2_alloc_T
Stage2_alloc_new(int max_querylength_alloc)282 Stage2_alloc_new (int max_querylength_alloc) {
283 Stage2_alloc_T new = (Stage2_alloc_T) MALLOC(sizeof(*new));
284
285 new->max_querylength_alloc = max_querylength_alloc;
286
287 new->coveredp = (bool *) MALLOC(max_querylength_alloc * sizeof(bool));
288 new->mappings = (Chrpos_T **) MALLOC(max_querylength_alloc * sizeof(Chrpos_T *));
289 new->npositions = (int *) MALLOC(max_querylength_alloc * sizeof(int));
290 new->minactive = (unsigned int *) MALLOC(max_querylength_alloc * sizeof(unsigned int));
291 new->maxactive = (unsigned int *) MALLOC(max_querylength_alloc * sizeof(unsigned int));
292 new->firstactive = (int *) MALLOC(max_querylength_alloc * sizeof(int));
293 new->nactive = (int *) MALLOC(max_querylength_alloc * sizeof(int));
294
295 return new;
296 }
297
298
299 #define T Stage2_T
300 struct T {
301 List_T middle;
302 List_T all_starts;
303 List_T all_ends;
304 };
305
306
307 void
Stage2_free(T * old)308 Stage2_free (T *old) {
309 /* List_free(&(*old)->middle); -- Not necessary because of pairpool */
310 /* List_free(&(*old)->all_starts); -- Handled by Stage3middle_free */
311 /* List_free(&(*old)->all_ends); -- Handled by Stage3middle_free */
312 FREE(*old);
313 return;
314 }
315
316 static T
Stage2_new(List_T middle,List_T all_starts,List_T all_ends)317 Stage2_new (List_T middle, List_T all_starts, List_T all_ends) {
318 T new = (T) MALLOC(sizeof(*new));
319 #ifdef DEBUG0
320 List_T p;
321 #endif
322
323 new->middle = middle;
324 new->all_starts = all_starts;
325 new->all_ends = all_ends;
326
327 #ifdef DEBUG0
328 printf("Starts:\n");
329 for (p = all_starts; p != NULL; p = List_next(p)) {
330 Pair_dump_list(List_head(p),true);
331 }
332
333 printf("Ends:\n");
334 for (p = all_ends; p != NULL; p = List_next(p)) {
335 Pair_dump_list(List_head(p),true);
336 }
337 #endif
338
339 return new;
340 }
341
342 List_T
Stage2_middle(T this)343 Stage2_middle (T this) {
344 return this->middle;
345 }
346
347 List_T
Stage2_all_starts(T this)348 Stage2_all_starts (T this) {
349 return this->all_starts;
350 }
351
352 List_T
Stage2_all_ends(T this)353 Stage2_all_ends (T this) {
354 return this->all_ends;
355 }
356
357
358 /************************************************************************/
359
/* One cell of the link matrix, indexed as links[querypos][hit].  The
   forward score is not stored here; it is kept as a separate
   structure.  The rev_* members exist only when SEPARATE_FWD_REV is
   defined (separate fwd/rev directions are no longer checked), and the
   intron counters only when DEBUG9 is defined. */
typedef struct Link_T *Link_T;
struct Link_T {
  int fwd_consecutive,
      fwd_rootposition;
  /*int fwd_rootnlinks;*/       /* Number of links in last branch */
  /* int fwd_score; */          /* Kept as a separate structure */

  int fwd_pos,                  /* presumably querypos of the predecessor link -- TODO confirm */
      fwd_hit;                  /* presumably hit index of the predecessor link -- TODO confirm */
  int fwd_tracei;               /* Corresponds to a distinct set of branches */

#ifdef DEBUG9
  int fwd_intronnfwd,
      fwd_intronnrev,
      fwd_intronnunk;
#endif

#ifdef SEPARATE_FWD_REV
  /* No longer checking separate fwd/rev directions */
  int rev_consecutive,
      rev_rootposition;
  /*int rev_rootnlinks;*/       /* Number of links in last branch */
  int rev_score;

  int rev_pos,
      rev_hit;

#ifdef DEBUG9
  int rev_tracei,               /* Corresponds to a distinct set of branches */
      rev_intronnfwd,
      rev_intronnrev,
      rev_intronnunk;
#endif  /* rev */

#endif
};
396
397
/* lengths2 has length1 entries.  Note that lengths2 may have
   negative entries */
400 static struct Link_T **
Linkmatrix_1d_new(int length1,int * lengths2,int totallength)401 Linkmatrix_1d_new (int length1, int *lengths2, int totallength) {
402 struct Link_T **links;
403 int i;
404
405 /* Outer dimension can be MALLOC, but inner one must be CALLOC */
406 links = (struct Link_T **) MALLOC(length1 * sizeof(struct Link_T *));
407 links[0] = (struct Link_T *) CALLOC(totallength,sizeof(struct Link_T));
408 for (i = 1; i < length1; i++) {
409 if (lengths2[i-1] < 0) {
410 links[i] = links[i-1];
411 } else {
412 links[i] = &(links[i-1][lengths2[i-1]]);
413 }
414 }
415 return links;
416 }
417
/* Frees a matrix built by Linkmatrix_1d_new: first the slab of cells
   that row 0 points to, then the array of row pointers. */
static void
Linkmatrix_1d_free (struct Link_T ***links) {
  struct Link_T **rows = *links;

  FREE(rows[0]);
  FREE(*links);
}
424
425
426 static struct Link_T **
Linkmatrix_2d_new(int length1,int * lengths2)427 Linkmatrix_2d_new (int length1, int *lengths2) {
428 struct Link_T **links;
429 int i;
430
431 links = (struct Link_T **) CALLOC(length1,sizeof(struct Link_T *));
432 for (i = 0; i < length1; i++) {
433 if (lengths2[i] <= 0) {
434 links[i] = (struct Link_T *) NULL;
435 } else {
436 links[i] = (struct Link_T *) CALLOC(lengths2[i],sizeof(struct Link_T));
437 }
438 }
439 return links;
440 }
441
442 static void
Linkmatrix_2d_free(struct Link_T *** links,int length1)443 Linkmatrix_2d_free (struct Link_T ***links, int length1) {
444 int i;
445
446 for (i = 0; i < length1; i++) {
447 if ((*links)[i]) {
448 FREE((*links)[i]);
449 }
450 }
451 FREE(*links);
452 return;
453 }
454
455
456
457 #ifdef DEBUG1
458 #ifdef SEPARATE_FWD_REV
459 static void
Linkmatrix_print_both(struct Link_T ** links,Chrpos_T ** mappings,int length1,int * npositions,char * queryseq_ptr,int indexsize)460 Linkmatrix_print_both (struct Link_T **links, Chrpos_T **mappings, int length1,
461 int *npositions, char *queryseq_ptr, int indexsize) {
462 int i, j;
463 char *oligo;
464
465 oligo = (char *) MALLOCA((indexsize+1) * sizeof(char));
466 for (i = 0; i <= length1-indexsize; i++) {
467 strncpy(oligo,&(queryseq_ptr[i]),indexsize);
468 oligo[indexsize] = '\0';
469
470 printf("Querypos %d (%s, %d positions):",i,oligo,npositions[i]);
471 for (j = 0; j < npositions[i]; j++) {
472 printf(" %d.%u:%d(%d,%d)[%u]-%d(%d,%d)[%u]",
473 j,mappings[i][j],links[i][j].fwd_score,
474 links[i][j].fwd_pos,links[i][j].fwd_hit,links[i][j].fwd_tracei,
475 links[i][j].rev_score,
476 links[i][j].rev_pos,links[i][j].rev_hit,links[i][j].rev_tracei);
477 }
478 printf("\n");
479 }
480 printf("\n");
481
482 FREEA(oligo);
483
484 return;
485 }
486
487 #else
488
489 /* For PMAP, indexsize is in aa */
490 static void
print_fwd(struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int length1,int * npositions,char * queryseq_ptr,int indexsize)491 print_fwd (struct Link_T **links, int **fwd_scores,
492 Chrpos_T **mappings, int length1,
493 int *npositions, char *queryseq_ptr, int indexsize) {
494 int i, j, lastpos;
495 char *oligo;
496
497 oligo = (char *) MALLOCA((indexsize+1) * sizeof(char));
498 lastpos = length1 - indexsize;
499
500 for (i = 0; i <= lastpos; i++) {
501 strncpy(oligo,&(queryseq_ptr[i]),indexsize);
502 oligo[indexsize] = '\0';
503
504 printf("Querypos %d (%s, %d positions):",i,oligo,npositions[i]);
505 for (j = 0; j < npositions[i]; j++) {
506 printf(" %d.%u:%d(%d,%d)[%u]",
507 j,mappings[i][j],fwd_scores[i][j],
508 links[i][j].fwd_pos,links[i][j].fwd_hit,links[i][j].fwd_tracei);
509 }
510 printf("\n");
511 }
512 printf("\n");
513
514 FREEA(oligo);
515
516 return;
517 }
518
519 #endif
520 #endif
521
522 static void
mappings_dump_R(Chrpos_T ** mappings,int * npositions,int length1,int ** active,int * firstactive,int indexsize,char * varname)523 mappings_dump_R (Chrpos_T **mappings, int *npositions, int length1,
524 int **active, int *firstactive, int indexsize, char *varname) {
525 int querypos;
526 int lastpos, hit;
527 bool printp = false;
528
529 lastpos = length1 - indexsize;
530 printf("%s <- matrix(c(\n",varname);
531 for (querypos = 0; querypos < lastpos; querypos++) {
532 if (firstactive) {
533 if (mappings[querypos] != NULL) {
534 hit = firstactive[querypos];
535 while (hit != -1) {
536 /* Last elt is for score */
537 if (printp == false) {
538 printp = true;
539 } else {
540 printf(",\n");
541 }
542 printf("%d,%d,%d,%d",querypos,mappings[querypos][hit],
543 hit,active[querypos][hit]);
544 hit = active[querypos][hit];
545 }
546 }
547 } else {
548 for (hit = 0; hit < npositions[querypos]; hit++) {
549 if (printp == false) {
550 printp = true;
551 } else {
552 printf(",\n");
553 }
554 printf("%d,%d,%d",querypos,mappings[querypos][hit],hit);
555 }
556 }
557 }
558 printf("),ncol=2,byrow=T)\n");
559
560 return;
561 }
562
563
564 #if 0
565 static void
566 best_path_dump_R (struct Link_T **links, Chrpos_T **mappings,
567 int querypos, int hit, bool fwdp, char *varname) {
568 Chrpos_T position;
569 int prev_querypos, prevhit, save_querypos, savehit;
570 bool printp = false;
571
572 save_querypos = querypos;
573 savehit = hit;
574
575 printf("%s <- matrix(c(\n",varname);
576 prev_querypos = querypos+1;
577 while (querypos >= 0) {
578 position = mappings[querypos][hit];
579
580 if (printp == false) {
581 printp = true;
582 } else {
583 printf(",\n");
584 }
585 printf("%d,%d",querypos,position);
586
587 prev_querypos = querypos;
588 prevhit = hit;
589 if (fwdp) {
590 querypos = links[prev_querypos][prevhit].fwd_pos;
591 hit = links[prev_querypos][prevhit].fwd_hit;
592 #ifdef SEPARATE_FWD_REV
593 } else {
594 querypos = links[prev_querypos][prevhit].rev_pos;
595 hit = links[prev_querypos][prevhit].rev_hit;
596 #endif
597 }
598 }
599 printf("),ncol=2,byrow=T)\n");
600
601 querypos = save_querypos;
602 hit = savehit;
603
604 printp = false;
605 printf("%s <- matrix(c(\n","scores");
606 prev_querypos = querypos+1;
607 while (querypos >= 0) {
608 position = mappings[querypos][hit];
609
610 if (printp == false) {
611 printp = true;
612 } else {
613 printf(",\n");
614 }
615 if (fwdp == true) {
616 printf("%d,%d",querypos,links[querypos][hit].fwd_score);
617 #ifdef SEPARATE_FWD_REV
618 } else {
619 printf("%d,%d",querypos,links[querypos][hit].rev_score);
620 #endif
621 }
622
623 prev_querypos = querypos;
624 prevhit = hit;
625 if (fwdp) {
626 querypos = links[prev_querypos][prevhit].fwd_pos;
627 hit = links[prev_querypos][prevhit].fwd_hit;
628 #ifdef SEPARATE_FWD_REV
629 } else {
630 querypos = links[prev_querypos][prevhit].rev_pos;
631 hit = links[prev_querypos][prevhit].rev_hit;
632 #endif
633 }
634 }
635 printf("),ncol=2,byrow=T)\n");
636
637 return;
638 }
639 #endif
640
641 static void
active_bounds_dump_R(Chrpos_T * minactive,Chrpos_T * maxactive,int querylength)642 active_bounds_dump_R (Chrpos_T *minactive, Chrpos_T *maxactive,
643 int querylength) {
644 int querypos;
645 bool printp = false;
646
647 printf("querypos <- 0:%d\n",querylength-1);
648 printf("%s <- c(\n","minactive");
649 for (querypos = 0; querypos < querylength; querypos++) {
650 if (printp == false) {
651 printp = true;
652 } else {
653 printf(",\n");
654 }
655 printf("%d",minactive[querypos]);
656 }
657 printf(")\n");
658
659 printp = false;
660 printf("%s <- c(\n","maxactive");
661 for (querypos = 0; querypos < querylength; querypos++) {
662 if (printp == false) {
663 printp = true;
664 } else {
665 printf(",\n");
666 }
667 printf("%d",maxactive[querypos]);
668 }
669 printf(")\n");
670
671 return;
672 }
673
674
675 #ifdef PMAP
676 #define QUERYDIST_PENALTY_FACTOR 2
677 #else
678 #define QUERYDIST_PENALTY_FACTOR 8
679 #endif
680
681
682 /************************************************************************
683 * Procedures for finding canonical introns quickly
684 ************************************************************************/
685
686 #ifdef DEBUG8
687
688 static void
print_last_dinucl(int * last_dinucl,int genomiclength)689 print_last_dinucl (int *last_dinucl, int genomiclength) {
690 int pos;
691
692 for (pos = 0; pos < genomiclength - 3 + SHIFT_EXTRA; pos++) {
693 printf("%d %d\n",pos,last_dinucl[pos]);
694 }
695 printf("\n");
696
697 return;
698 }
699
700 #endif
701
702
703 #if 0
704 /* Need this procedure because we are skipping some oligomers */
705 static bool
706 find_shifted_canonical (Chrpos_T leftpos, Chrpos_T rightpos, int querydistance,
707 Chrpos_T (*genome_left_position)(Chrpos_T, Chrpos_T, Univcoord_T, Univcoord_T, bool),
708 Chrpos_T (*genome_right_position)(Chrpos_T, Chrpos_T, Univcoord_T, Univcoord_T, bool),
709 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp, bool skip_repetitive_p) {
710 Chrpos_T leftdi, rightdi;
711 Chrpos_T last_leftpos, last_rightpos;
712 int shift, leftmiss, rightmiss;
713 Chrpos_T left_chrbound, right_chrbound;
714
715 /* leftpos = prevposition + querydistance + indexsize_nt - 1; */
716 /* rightpos = position; */
717
718 debug7(printf("Looking for shifted canonical at leftpos %u to rightpos %u, chroffset %u, chrhigh %u\n",
719 leftpos,rightpos,chroffset,chrhigh));
720
721 #if 0
722 /* previously checked against genomiclength */
723 if (leftpos > genomiclength || rightpos > genomiclength) {
724 return false;
725 }
726 #else
727 /* Checking just before call to genome_right_position */
728 #endif
729
730 if (leftpos >= rightpos) {
731 debug7(printf("leftpos %u >= rightpos %u, so returning false\n",leftpos,rightpos));
732 return false;
733 }
734
735 if (leftpos < 103) {
736 left_chrbound = 3; /* Previously 0, but then can find splice site at beginning of segment */
737 } else {
738 left_chrbound = leftpos - 100;
739 }
740
741 if (rightpos < 103) {
742 right_chrbound = 3; /* Previously 0, but then can find splice site at beginning of segment */
743 } else {
744 right_chrbound = rightpos - 100;
745 }
746
747 #if 0
748 if (skip_repetitive_p == false) {
749
750 last_leftpos = (*genome_left_position)(leftpos,left_chrbound,chroffset,chrhigh,plusp);
751 last_rightpos = (*genome_right_position)(rightpos,right_chrbound,chroffset,chrhigh,plusp);
752 debug7(printf("last_leftpos %u, last_rightpos %u\n",last_leftpos,last_rightpos));
753
754 debug7(printf("skip_repetitive_p == false, so returning %u == %u && %u == %u\n",
755 leftpos,last_leftpos,rightpos,last_rightpos));
756 return (leftpos == last_leftpos && rightpos == last_rightpos);
757 }
758 #endif
759
760 /* Allow canonical to be to right of match */
761 leftpos += SHIFT_EXTRA;
762 if (leftpos > chrhigh - 3) {
763 leftpos = chrhigh - 3;
764 }
765 rightpos += SHIFT_EXTRA;
766 if (rightpos > chrhigh - 3) {
767 rightpos = chrhigh - 3;
768 }
769 debug7(printf("after shift, leftpos = %u, rightpos = %u\n",leftpos,rightpos));
770
771 shift = 0;
772 while (shift <= querydistance + SHIFT_EXTRA + SHIFT_EXTRA) {
773
774 #if 0
775 if (leftpos < 0) {
776 return false;
777 } else if (rightpos < 0) {
778 /* Shouldn't need to check if leftpos >= 0 and rightpos >= leftpos, in the other two conditions) */
779 return false;
780 } else if (rightpos >= chrlength) {
781 return false;
782 }
783 #endif
784 if (leftpos < 3) {
785 return false;
786 } else if (leftpos > rightpos) {
787 return false;
788 }
789
790 last_leftpos = (*genome_left_position)(leftpos,left_chrbound,chroffset,chrhigh,plusp);
791 debug7(printf("last_leftpos %u\n",last_leftpos));
792 assert(last_leftpos != 0U);
793 if ((leftdi = last_leftpos) == -1) {
794 debug7(printf("\n"));
795 return false;
796 } else {
797 leftmiss = (int) (leftpos - leftdi);
798 }
799
800 last_rightpos = (*genome_right_position)(rightpos,right_chrbound,chroffset,chrhigh,plusp);
801 debug7(printf("last_rightpos %u\n",last_rightpos));
802 assert(last_rightpos != 0U);
803 if ((rightdi = last_rightpos) == -1) {
804 debug7(printf("\n"));
805 return false;
806 } else {
807 rightmiss = (int) (rightpos - rightdi);
808 }
809
810 debug7(printf("shift %d/left %d (miss %d)/right %d (miss %d)\n",shift,leftpos,leftmiss,rightpos,rightmiss));
811 if (leftmiss == rightmiss) { /* was leftmiss == 0 && rightmiss == 0, which doesn't allow for a shift */
812 debug7(printf(" => Success at %u..%u (fwd) or %u..%u (rev)\n\n",
813 leftpos-leftmiss+/*onebasedp*/1U,rightpos-rightmiss+/*onebasedp*/1U,
814 chrhigh-chroffset-(leftpos-leftmiss),chrhigh-chroffset-(rightpos-rightmiss)));
815 return true;
816 } else if (leftmiss >= rightmiss) {
817 shift += leftmiss;
818 leftpos -= leftmiss;
819 rightpos -= leftmiss;
820 } else {
821 shift += rightmiss;
822 leftpos -= rightmiss;
823 rightpos -= rightmiss;
824 }
825 }
826
827 debug7(printf("\n"));
828 return false;
829 }
830 #endif
831
832
833
834
835 #if 0
836 /* General case for ranges in score_querypos */
837 while (prevhit != -1 && (prevposition = mappings[prev_querypos][prevhit]) + indexsize_nt <= position) {
838 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
839 prevlink = &(links[prev_querypos][prevhit]);
840
841 gendistance = position - prevposition - indexsize_nt;
842 /* diffdistance = abs(gendistance - querydistance); */
843 if (gendistance > querydistance) {
844 diffdistance = gendistance - querydistance;
845 } else {
846 diffdistance = querydistance - gendistance;
847 }
848
849 if (diffdistance < maxintronlen) {
850 if (diffdistance <= EQUAL_DISTANCE_NOT_SPLICING) {
851 debug9(canonicalsgn = 9);
852 fwd_score = prevlink->fwd_score + CONSEC_POINTS_PER_MATCH;
853 #ifdef PMAP
854 if (diffdistance % 3 != 0) {
855 fwd_score -= NONCODON_INDEL_PENALTY;
856 }
857 #endif
858 } else if (near_end_p == false && prevlink->fwd_consecutive < EXON_DEFN) {
859 debug9(canonicalsgn = 0);
860 if (splicingp == true) {
861 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
862 } else {
863 fwd_score = prevlink->fwd_score - (diffdistance/ONE + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
864 }
865
866 } else if (splicingp == false) {
867 debug9(canonicalsgn = 0);
868 fwd_score = prevlink->fwd_score - (diffdistance/ONE + 1) - querydist_penalty;
869
870 } else if (use_shifted_canonical_p == true) {
871 leftpos = prevposition + querydistance - 1;
872 /* printf("leftpos %d, last_leftpos %d, rightpos %d\n",leftpos,last_leftpos,rightpos); */
873 if (leftpos == last_leftpos) {
874 canonicalp = last_canonicalp;
875 } else {
876 debug7(printf("Calling find_shift_canonical fwd\n"));
877 canonicalp = find_shifted_canonical(leftpos,rightpos,querydistance-indexsize_nt,
878 /* &lastGT,&lastAG, */
879 Genome_prev_donor_position,Genome_prev_acceptor_position,
880 chroffset,chrhigh,plusp,skip_repetitive_p);
881 /* And need to check for shift_canonical_rev */
882
883 last_leftpos = leftpos;
884 last_canonicalp = canonicalp;
885 }
886 if (canonicalp == true) {
887 debug9(canonicalsgn = +1);
888 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty;
889 } else {
890 debug9(canonicalsgn = 0);
891 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
892 }
893
894 } else {
895 debug9(canonicalsgn = +1);
896 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty;
897 }
898
899 debug9(printf("\tD. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
900 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
901 prevlink->fwd_score,fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
902 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
903 gendistance,querydistance,canonicalsgn));
904
905 /* Allow ties, which should favor shorter intron */
906 if (fwd_score >= best_fwd_score) {
907 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
908 best_fwd_consecutive = prevlink->fwd_consecutive + (querydistance + indexsize_nt);
909 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
910 } else {
911 best_fwd_consecutive = 0;
912 /* best_fwd_rootnlinks = 1; */
913 }
914 best_fwd_score = fwd_score;
915 best_fwd_prevpos = prev_querypos;
916 best_fwd_prevhit = prevhit;
917 #ifdef DEBUG9
918 best_fwd_tracei = ++*fwd_tracei;
919 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
920 best_fwd_intronnrev = prevlink->fwd_intronnrev;
921 best_fwd_intronnunk = prevlink->fwd_intronnunk;
922 switch (canonicalsgn) {
923 case 1: best_fwd_intronnfwd++; break;
924 case 0: best_fwd_intronnunk++; break;
925 }
926 #endif
927 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
928 } else {
929 debug9(printf(" => Loses to %d\n",best_fwd_score));
930 }
931 }
932
933 prevhit = active[prev_querypos][prevhit];
934 }
935 #endif
936
937
938 #if 0
939 /* SIMD version */
940 _positions = _mm_set1_epi32(position - indexsize_nt);
941 _querydistance = _mm_set1_epi32(querydistance);
942 _splicing_querydist_penalty = _mm_set1_epi32(querydist_penalty+1+NINTRON_PENALTY_MISMATCH);
943 _max_scores = _mm_set1_epi32(-1000);
944
945 prevhit = low_hit;
946 while (prevhit + 4 < high_hit) {
947 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
948 _prevpositions = _mm_loadu_epi32(&(mappings[prev_querypos][prevhit]));
949 _gendistance = _mm_sub_epi32(_positions,_prevpositions);
950 if (_mm_cmpgt_epi32(_gendistance,_zeroes) == 0) {
951 break;
952 } else {
953 _diffdistance = _mm_abs_epi32(_mm_sub_epi32(_gendistance,_querydistance));
954
955 _prev_scores = _mm_loadu_epi32(&(fwd_scores[prev_querypos][prevhit]));
956
957 _scores_close = _mm_add_epi32(_prev_scores,_mm_set1_epi32(CONSEC_POINTS_PER_MATCH));
958 /* Right shift of 13 bits gives division by 8192 */
959 _scores_splice = _mm_add_epi32(_prev_scores,_mm_sub_epi32(_mm_srli_epi32(_diffdistance,13),_splicing_querydist_penalty));
960
961 _scores = _mm_blendv_ps(_scores_close,_scores_splice,_mm_cmpgt_epi32(_diffdistance,_mm_set1_epi32(EQUAL_DISTANCE_NOT_SPLICING)));
962
963 _mm_storeu_epi32(_scores);
964
965 _max_scores = _mm_max_epi32(_max_scores,_scores);
966 prevhit += 4;
967 }
968 }
969
970 /* Take care of serial cases */
971
972
973
974
975 /* Compute overall max and return. Caller can find prev_querypos with
976 largest max and store in fwd_pos[curr_querypos][currhit] and max in
977 fwd_max[curr_querypos][currhit]. During traceback, recompute at
978 prev_querypos and find prevhit that gives the max. */
979
980 if (diffdistance < maxintronlen) {
981 if (diffdistance <= EQUAL_DISTANCE_NOT_SPLICING) {
982 debug9(canonicalsgn = 9);
983 fwd_score = prevlink->fwd_score + CONSEC_POINTS_PER_MATCH;
984 #ifdef PMAP
985 if (diffdistance % 3 != 0) {
986 fwd_score -= NONCODON_INDEL_PENALTY;
987 }
988 #endif
989 } else if (near_end_p == false && prevlink->fwd_consecutive < EXON_DEFN) {
990 debug9(canonicalsgn = 0);
991 if (splicingp == true) {
992 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
993 } else {
994 fwd_score = prevlink->fwd_score - (diffdistance/ONE + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
995 }
996
997 } else if (splicingp == false) {
998 debug9(canonicalsgn = 0);
999 fwd_score = prevlink->fwd_score - (diffdistance/ONE + 1) - querydist_penalty;
1000
1001 } else if (use_shifted_canonical_p == true) {
1002 leftpos = prevposition + querydistance - 1;
1003 /* printf("leftpos %d, last_leftpos %d, rightpos %d\n",leftpos,last_leftpos,rightpos); */
1004 if (leftpos == last_leftpos) {
1005 canonicalp = last_canonicalp;
1006 } else {
1007 debug7(printf("Calling find_shift_canonical fwd\n"));
1008 canonicalp = find_shifted_canonical(leftpos,rightpos,querydistance-indexsize_nt,
1009 /* &lastGT,&lastAG, */
1010 Genome_prev_donor_position,Genome_prev_acceptor_position,
1011 chroffset,chrhigh,plusp,skip_repetitive_p);
1012 /* And need to check for shift_canonical_rev */
1013
1014 last_leftpos = leftpos;
1015 last_canonicalp = canonicalp;
1016 }
1017 if (canonicalp == true) {
1018 debug9(canonicalsgn = +1);
1019 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty;
1020 } else {
1021 debug9(canonicalsgn = 0);
1022 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty - NINTRON_PENALTY_MISMATCH;
1023 }
1024
1025 } else {
1026 debug9(canonicalsgn = +1);
1027 fwd_score = prevlink->fwd_score - (diffdistance/TEN_THOUSAND + 1) - querydist_penalty;
1028 }
1029
1030 debug9(printf("\tD. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d, intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
1031 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
1032 prevlink->fwd_score,fwd_score,prevlink->fwd_consecutive,
1033 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
1034 gendistance,querydistance,canonicalsgn));
1035
1036 /* Allow ties, which should favor shorter intron */
1037 if (fwd_score >= best_fwd_score) {
1038 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
1039 best_fwd_consecutive = prevlink->fwd_consecutive + (querydistance + indexsize_nt);
1040 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
1041 } else {
1042 best_fwd_consecutive = 0;
1043 /* best_fwd_rootnlinks = 1; */
1044 }
1045 best_fwd_score = fwd_score;
1046 best_fwd_prevpos = prev_querypos;
1047 best_fwd_prevhit = prevhit;
1048 #ifdef DEBUG9
1049 best_fwd_tracei = ++*fwd_tracei;
1050 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1051 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1052 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1053 switch (canonicalsgn) {
1054 case 1: best_fwd_intronnfwd++; break;
1055 case 0: best_fwd_intronnunk++; break;
1056 }
1057 #endif
1058 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
1059 } else {
1060 debug9(printf(" => Loses to %d\n",best_fwd_score));
1061 }
1062 }
1063
1064 prevhit = active[prev_querypos][prevhit];
1065 }
1066 #endif
1067
1068
1069 static void
score_querypos_lookback_one(int * fwd_tracei,Link_T currlink,int curr_querypos,int currhit,unsigned int position,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int ** active,int * firstactive,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Intlist_T processed,bool anchoredp,bool localp,bool splicingp,bool use_canonical_p,int non_canonical_penalty)1070 score_querypos_lookback_one (int *fwd_tracei, Link_T currlink, int curr_querypos, int currhit,
1071 unsigned int position,
1072 struct Link_T **links, int **fwd_scores, Chrpos_T **mappings,
1073 int **active, int *firstactive,
1074 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
1075 int indexsize, Intlist_T processed,
1076 #ifdef MOVE_TO_STAGE3
1077 bool anchoredp,
1078 #endif
1079 bool localp, bool splicingp,
1080 bool use_canonical_p, int non_canonical_penalty) {
1081 Link_T prevlink;
1082 struct Link_T *prev_links;
1083 Chrpos_T *prev_mappings;
1084 int *prev_active;
1085
1086 int best_fwd_consecutive = indexsize*NT_PER_MATCH;
1087 int best_fwd_rootposition = position;
1088 /* int best_fwd_rootnlinks = 1; */
1089 int best_fwd_score = 0, fwd_score;
1090 int best_fwd_prevpos = -1, best_fwd_prevhit = -1;
1091 int best_fwd_tracei, last_tracei;
1092 #ifdef DEBUG9
1093 int best_fwd_intronnfwd = 0, best_fwd_intronnrev = 0, best_fwd_intronnunk = 0;
1094 int canonicalsgn = 0;
1095 #endif
1096 bool donep;
1097 int prev_querypos, prevhit;
1098 Chrpos_T prevposition;
1099 int gendistance;
1100 Univcoord_T prevpos, currpos;
1101 int querydistance, diffdistance, lookback, nlookback, nseen, indexsize_nt;
1102 /* int querydist_penalty; */
1103 int querydist_credit;
1104 /* bool near_end_p; */
1105 bool canonicalp;
1106
1107 #ifdef PMAP
1108 indexsize_nt = indexsize*3; /* Use when evaluating across genomic positions */
1109 #else
1110 indexsize_nt = indexsize;
1111 #endif
1112 #if 0
1113 indexsize_query = indexsize; /* Use when evaluating across query positions */
1114 #endif
1115
1116
1117 /* Parameters for section D, assuming adjacent is false */
1118 /* adjacentp = false; */
1119 nlookback = nsufflookback;
1120 lookback = sufflookback;
1121
1122 /* A. Evaluate adjacent position (at last one processed) */
1123 if (processed != NULL) {
1124 prev_querypos = Intlist_head(processed);
1125 prev_links = links[prev_querypos];
1126 prev_mappings = mappings[prev_querypos];
1127 prev_active = active[prev_querypos];
1128
1129 #ifdef PMAP
1130 querydistance = (curr_querypos - prev_querypos)*3;
1131 #else
1132 querydistance = curr_querypos - prev_querypos;
1133 #endif
1134 prevhit = firstactive[prev_querypos];
1135 prevposition = position; /* Prevents prevposition + querydistance == position */
1136 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + querydistance < position) {
1137 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1138 }
1139 if (prevposition + querydistance == position) {
1140 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
1141 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
1142 best_fwd_rootposition = prevlink->fwd_rootposition;
1143 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
1144 best_fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*querydistance;
1145
1146 best_fwd_prevpos = prev_querypos;
1147 best_fwd_prevhit = prevhit;
1148 best_fwd_tracei = prevlink->fwd_tracei;
1149 #ifdef DEBUG9
1150 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1151 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1152 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1153 #endif
1154 /* adjacentp = true; */
1155
1156 /* Parameters for section D when adjacent is true, so we don't look so far back */
1157 nlookback = 1;
1158 lookback = sufflookback/2;
1159
1160
1161 debug9(printf("\tA. Adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
1162 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],fwd_scores[prev_querypos][prevhit],
1163 best_fwd_score,best_fwd_consecutive,best_fwd_tracei,
1164 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk));
1165 }
1166 }
1167
1168
1169 #ifdef MOVE_TO_STAGE3
1170 /* Check work list */
1171 if (anchoredp && curr_querypos - indexsize_query <= querystart) {
1172 /* Allow close prevpositions that overlap with anchor */
1173 /* Can give rise to false positives, and increases amount of dynamic programming work */
1174 } else if (0 && anchoredp && curr_querypos == queryend) {
1175 /* Test first position */
1176 } else if (0) {
1177 while (processed != NULL && (prev_querypos = Intlist_head(processed)) > curr_querypos - indexsize_query) {
1178 debug9(printf("Skipping prev_querypos %d, because too close\n",prev_querypos));
1179 processed = Intlist_next(processed);
1180 }
1181 }
1182 #endif
1183
1184 /* D. Evaluate for mismatches (all other previous querypos) */
1185 donep = false;
1186 nseen = 0;
1187 last_tracei = -1;
1188 for ( ; processed != NULL && best_fwd_consecutive < ENOUGH_CONSECUTIVE && donep == false;
1189 processed = Intlist_next(processed), nseen++) {
1190 prev_querypos = Intlist_head(processed);
1191
1192 #ifdef PMAP
1193 querydistance = (curr_querypos - prev_querypos)*3;
1194 #else
1195 querydistance = curr_querypos - prev_querypos;
1196 #endif
1197
1198 if (nseen > nlookback && querydistance - indexsize_nt > lookback) {
1199 donep = true;
1200 }
1201
1202 if ((prevhit = firstactive[prev_querypos]) != -1) {
1203 /* querydist_penalty = (querydistance - indexsize_nt)/QUERYDIST_PENALTY_FACTOR; */
1204 /* Actually a querydist_penalty */
1205 querydist_credit = -querydistance/indexsize_nt;
1206
1207 prev_mappings = mappings[prev_querypos];
1208 prev_links = links[prev_querypos];
1209 prev_active = active[prev_querypos];
1210
1211 /* Range 0 */
1212 while (prevhit != -1 && prev_links[prevhit].fwd_tracei == last_tracei) {
1213 debug9(printf("Skipping querypos %d with tracei #%d\n",prev_querypos,prev_links[prevhit].fwd_tracei));
1214 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1215 }
1216 if (prevhit != -1) {
1217 last_tracei = prev_links[prevhit].fwd_tracei;
1218 }
1219
1220 /* Range 1: From Infinity to maxintronlen */
1221 if (splicingp == true) {
1222 /* This is equivalent to diffdistance >= maxintronlen, where
1223 diffdistance = abs(gendistance - querydistance) and
1224 gendistance = (position - prevposition - indexsize_nt) */
1225 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + maxintronlen + querydistance <= position) {
1226 /* Skip */
1227 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
1228 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1229 }
1230 }
1231
1232 /* Range 2: From maxintronlen to (prev_querypos + EQUAL_DISTANCE_NOT_SPLICING) */
1233 /* This is equivalent to +diffdistance > EQUAL_DISTANCE_NOT_SPLICING */
1234 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + EQUAL_DISTANCE_NOT_SPLICING + querydistance < position) {
1235 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
1236 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
1237
1238 gendistance = position - prevposition;
1239 assert(gendistance > querydistance); /* True because gendistance > EQUAL_DISTANCE_NOT_SPLICING + querydistance */
1240 diffdistance = gendistance - querydistance; /* No need for abs() */
1241
1242 fwd_score = fwd_scores[prev_querypos][prevhit] + querydist_credit /*- querydist_penalty*/;
1243 if (splicingp == true) {
1244 fwd_score -= (diffdistance/TEN_THOUSAND + 1);
1245 } else {
1246 fwd_score -= (diffdistance/ONE + 1);
1247 }
1248
1249 if (use_canonical_p == true) {
1250
1251 /* prevpos is lower genomic coordinate than currpos */
1252 /* need to subtract from position and prevposition to compensate for greedy matches */
1253 /* need to add to position and prevposition to compensate for missed matches */
1254 if (plusp == true) {
1255 prevpos = chroffset + prevposition + indexsize_nt;
1256 currpos = chroffset + position - querydistance + indexsize_nt;
1257 assert(prevpos < currpos);
1258
1259 if (prevpos < GREEDY_ADVANCE || currpos < GREEDY_ADVANCE) {
1260 canonicalp = false;
1261 } else if (Genome_sense_canonicalp(/*donor_rightbound*/prevpos + MISS_BEHIND,
1262 /*donor_leftbound*/prevpos - GREEDY_ADVANCE,
1263 /*acceptor_rightbound*/currpos + MISS_BEHIND,
1264 /*acceptor_leftbound*/currpos - GREEDY_ADVANCE,
1265 chroffset) == true) {
1266 debug9(printf("lookback plus: sense canonical\n"));
1267 canonicalp = true;
1268 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/currpos + MISS_BEHIND,
1269 /*donor_leftbound*/currpos - GREEDY_ADVANCE,
1270 /*acceptor_rightbound*/prevpos + MISS_BEHIND,
1271 /*acceptor_leftbound*/prevpos - GREEDY_ADVANCE,
1272 chroffset) == true) {
1273 debug9(printf("lookback plus: antisense canonical\n"));
1274 canonicalp = true;
1275 } else {
1276 debug9(printf("lookback plus: not canonical\n"));
1277 canonicalp = false;
1278 }
1279
1280 } else {
1281 prevpos = chrhigh + 1 - prevposition - indexsize_nt;
1282 currpos = chrhigh + 1 - position + querydistance - indexsize_nt;
1283 assert(currpos < prevpos);
1284
1285 if (currpos < MISS_BEHIND || prevpos < MISS_BEHIND) {
1286 canonicalp = false;
1287 } else if (Genome_sense_canonicalp(/*donor_rightbound*/currpos + GREEDY_ADVANCE,
1288 /*donor_leftbound*/currpos - MISS_BEHIND,
1289 /*acceptor_rightbound*/prevpos + GREEDY_ADVANCE,
1290 /*acceptor_leftbound*/prevpos - MISS_BEHIND,
1291 chroffset) == true) {
1292 debug9(printf("lookback minus: sense canonical\n"));
1293 canonicalp = true;
1294 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/prevpos + GREEDY_ADVANCE,
1295 /*donor_leftbound*/prevpos - MISS_BEHIND,
1296 /*acceptor_rightbound*/currpos + GREEDY_ADVANCE,
1297 /*acceptor_leftbound*/currpos - MISS_BEHIND,
1298 chroffset) == true) {
1299 debug9(printf("lookback minus: antisense canonical\n"));
1300 canonicalp = true;
1301 } else {
1302 debug9(printf("lookback minus: not canonical\n"));
1303 canonicalp = false;
1304 }
1305 }
1306
1307 if (canonicalp == true) {
1308 debug9(canonicalsgn = +1);
1309 } else {
1310 debug9(canonicalsgn = 0);
1311 fwd_score -= non_canonical_penalty;
1312 }
1313
1314 }
1315
1316 debug9(printf("\tD2. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
1317 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
1318 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
1319 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
1320 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
1321
1322 /* Disallow ties, which should favor adjacent */
1323 if (fwd_score > best_fwd_score) {
1324 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
1325 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
1326 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
1327 } else {
1328 best_fwd_consecutive = 0;
1329 /* best_fwd_rootnlinks = 1; */
1330 }
1331 best_fwd_rootposition = prevlink->fwd_rootposition;
1332 best_fwd_score = fwd_score;
1333 best_fwd_prevpos = prev_querypos;
1334 best_fwd_prevhit = prevhit;
1335 best_fwd_tracei = ++*fwd_tracei;
1336 #ifdef DEBUG9
1337 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1338 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1339 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1340 switch (canonicalsgn) {
1341 case 1: best_fwd_intronnfwd++; break;
1342 case 0: best_fwd_intronnunk++; break;
1343 }
1344 #endif
1345 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
1346 } else {
1347 debug9(printf(" => Loses to %d\n",best_fwd_score));
1348 }
1349
1350 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1351 }
1352
1353 /* Scoring appears to be the same as for range 4, which is rarely called, so including in range 4 */
1354 /* Range 3: From (querypos + EQUAL_DISTANCE_NOT_SPLICING) to (querypos - EQUAL_DISTANCE_NOT_SPLICING) */
1355 /* This is equivalent to -diffdistance > EQUAL_DISTANCE_NOT_SPLICING && prevposition + indexsize_nt <= position */
1356
1357 /* Range 4: From (prev_querypos - EQUAL_DISTANCE_NOT_SPLICING) to indexsize_nt */
1358 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + indexsize_nt <= position) {
1359 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
1360 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
1361
1362 gendistance = position - prevposition;
1363 /* was abs(gendistance - querydistance) */
1364 diffdistance = gendistance > querydistance ? (gendistance - querydistance) : (querydistance - gendistance);
1365
1366 #ifdef BAD_GMAX
1367 fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
1368 #else
1369 /* diffdistance <= EQUAL_DISTANCE_NOT_SPLICING */
1370 /* This is how version 2013-08-14 did it */
1371 fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH;
1372 #endif
1373
1374 #if 0
1375 /* Used in range 4 but not in range 3 */
1376 if (/*near_end_p == false &&*/ prevlink->fwd_consecutive < EXON_DEFN) {
1377 fwd_score -= NINTRON_PENALTY_MISMATCH;
1378 }
1379 #endif
1380
1381 debug9(printf("\tD4. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
1382 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
1383 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
1384 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
1385 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
1386
1387 /* Disallow ties, which should favor adjacent */
1388 if (fwd_score > best_fwd_score) {
1389 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
1390 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
1391 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
1392 } else {
1393 best_fwd_consecutive = 0;
1394 /* best_fwd_rootnlinks = 1; */
1395 }
1396 best_fwd_rootposition = prevlink->fwd_rootposition;
1397 best_fwd_score = fwd_score;
1398 best_fwd_prevpos = prev_querypos;
1399 best_fwd_prevhit = prevhit;
1400 /* best_fwd_tracei = ++*fwd_tracei; */
1401 best_fwd_tracei = prevlink->fwd_tracei; /* Keep previous trace, as in range 3 */
1402 #ifdef DEBUG9
1403 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1404 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1405 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1406 switch (canonicalsgn) {
1407 case 1: best_fwd_intronnfwd++; break;
1408 case 0: best_fwd_intronnunk++; break;
1409 }
1410 #endif
1411 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
1412 } else {
1413 debug9(printf(" => Loses to %d\n",best_fwd_score));
1414 }
1415
1416 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1417 }
1418 }
1419 }
1420
1421 /* Best_score needs to beat something positive to prevent a
1422 small local extension from beating a good canonical intron.
1423 If querypos is too small, don't insert an intron. */
1424 /* linksconsecutive already assigned above */
1425 currlink->fwd_consecutive = best_fwd_consecutive;
1426 currlink->fwd_rootposition = best_fwd_rootposition;
1427 /* currlink->fwd_rootnlinks = best_fwd_rootnlinks; */
1428 currlink->fwd_pos = best_fwd_prevpos;
1429 currlink->fwd_hit = best_fwd_prevhit;
1430 if (currlink->fwd_pos >= 0) {
1431 currlink->fwd_tracei = best_fwd_tracei;
1432 fwd_scores[curr_querypos][currhit] = best_fwd_score;
1433 #ifdef MOVE_TO_STAGE3
1434 } else if (anchoredp == true) {
1435 currlink->fwd_tracei = -1;
1436 fwd_scores[curr_querypos][currhit] = -100000;
1437 #endif
1438 } else if (localp == true) {
1439 currlink->fwd_tracei = ++*fwd_tracei;
1440 fwd_scores[curr_querypos][currhit] = indexsize_nt;
1441 } else {
1442 currlink->fwd_tracei = ++*fwd_tracei;
1443 fwd_scores[curr_querypos][currhit] = best_fwd_score;
1444 }
1445
1446 #ifdef DEBUG9
1447 currlink->fwd_intronnfwd = best_fwd_intronnfwd;
1448 currlink->fwd_intronnrev = best_fwd_intronnrev;
1449 currlink->fwd_intronnunk = best_fwd_intronnunk;
1450 #endif
1451
1452 debug9(printf("\tChose %d,%d with score %d (fwd) => trace #%d\n",
1453 currlink->fwd_pos,currlink->fwd_hit,fwd_scores[curr_querypos][currhit],currlink->fwd_tracei));
1454 debug3(printf("%d %d %d %d 1\n",querypos,hit,best_prevpos,best_prevhit));
1455
1456 return;
1457 }
1458
1459
1460
1461
1462 static void
score_querypos_lookback_mult(int * fwd_tracei,int low_hit,int high_hit,int curr_querypos,unsigned int * positions,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int ** active,int * firstactive,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Intlist_T processed,bool anchoredp,bool localp,bool splicingp,bool use_canonical_p,int non_canonical_penalty)1463 score_querypos_lookback_mult (int *fwd_tracei, int low_hit, int high_hit, int curr_querypos,
1464 unsigned int *positions,
1465 struct Link_T **links, int **fwd_scores,
1466 Chrpos_T **mappings, int **active, int *firstactive,
1467 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
1468 int indexsize, Intlist_T processed,
1469 #ifdef MOVE_TO_STAGE3
1470 bool anchoredp,
1471 #endif
1472 bool localp, bool splicingp,
1473 bool use_canonical_p, int non_canonical_penalty) {
1474 Link_T prevlink, currlink;
1475 Intlist_T last_item, p;
1476 int nhits = high_hit - low_hit, nprocessed, hiti;
1477
1478 struct Link_T *prev_links, *adj_links;
1479 Chrpos_T *prev_mappings, *adj_mappings;
1480 int *prev_active, *adj_active;
1481
1482 int overall_fwd_consecutive, best_fwd_consecutive;
1483 int best_fwd_rootposition;
1484 int best_fwd_score, fwd_score;
1485 int best_fwd_prevpos, best_fwd_prevhit;
1486 int best_fwd_tracei, last_tracei;
1487 #ifdef DEBUG9
1488 int best_fwd_intronnfwd, best_fwd_intronnrev, best_fwd_intronnunk;
1489 int canonicalsgn = 0;
1490 #endif
1491 int adj_querypos, adj_querydistance, prev_querypos, prevhit, adj_frontier, *frontier;
1492 Chrpos_T prevposition, position;
1493 int gendistance;
1494 Univcoord_T prevpos, currpos;
1495 int querydistance, diffdistance, indexsize_nt;
1496 int max_nseen, max_adjacent_nseen, max_nonadjacent_nseen, nseen;
1497 int querydist_credit;
1498 bool canonicalp;
1499
1500 #ifdef PMAP
1501 indexsize_nt = indexsize*3; /* Use when evaluating across genomic positions */
1502 #else
1503 indexsize_nt = indexsize;
1504 #endif
1505 #if 0
1506 indexsize_query = indexsize; /* Use when evaluating across query positions */
1507 #endif
1508
1509
1510 /* Determine work load */
1511 /* printf("Work load (lookback): %s\n",Intlist_to_string(processed)); */
1512 last_item = processed;
1513 #ifdef MOVE_TO_STAGE3
1514 if (anchoredp && curr_querypos - indexsize_query <= querystart) {
1515 /* Allow close prevpositions that overlap with anchor */
1516 /* Can give rise to false positives, and increases amount of dynamic programming work */
1517 /* debug9(printf("No skipping because close to anchor\n")); */
1518 } else if (0 && anchoredp && curr_querypos == queryend) {
1519 /* Test first position */
1520 } else if (0) {
1521 while (processed != NULL && (/*prev_querypos =*/ Intlist_head(processed)) > curr_querypos - indexsize_query) {
1522 debug9(printf("Skipping prev_querypos %d, because too close\n",Intlist_head(processed)));
1523 processed = Intlist_next(processed);
1524 }
1525 }
1526 #endif
1527
1528 if (last_item == NULL) {
1529 for (hiti = 0; hiti < nhits; hiti++) {
1530 currlink = &(links[curr_querypos][hiti + low_hit]);
1531
1532 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
1533 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
1534 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
1535 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
1536
1537 #ifdef MOVE_TO_STAGE3
1538 if (anchoredp == true) {
1539 currlink->fwd_tracei = -1;
1540 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
1541 } else
1542 #endif
1543 if (localp == true) {
1544 currlink->fwd_tracei = ++*fwd_tracei;
1545 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
1546 } else {
1547 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
1548 }
1549 }
1550
1551 } else if (processed == NULL) {
1552 debug9(printf("processed is NULL\n"));
1553 /* A. Evaluate adjacent position (at last one processed, if available). Don't evaluate for mismatches (D). */
1554 adj_querypos = Intlist_head(last_item);
1555 adj_links = links[adj_querypos];
1556 adj_mappings = mappings[adj_querypos];
1557 adj_active = active[adj_querypos];
1558
1559 #ifdef PMAP
1560 adj_querydistance = (curr_querypos - adj_querypos)*3;
1561 #else
1562 adj_querydistance = curr_querypos - adj_querypos;
1563 #endif
1564
1565 /* Process prevhit and hiti in parallel. Values are asscending along prevhit chain and from 0 to nhits-1. */
1566 prevhit = firstactive[adj_querypos];
1567 hiti = 0;
1568 while (prevhit != -1 && hiti < nhits) {
1569 if ((prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) + adj_querydistance < (position = positions[hiti])) {
1570 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
1571
1572 } else if (prevposition + adj_querydistance > position) {
1573 currlink = &(links[curr_querypos][hiti + low_hit]);
1574
1575 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
1576 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
1577 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
1578 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
1579
1580 #ifdef MOVE_TO_STAGE3
1581 if (anchoredp == true) {
1582 currlink->fwd_tracei = -1;
1583 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
1584 } else
1585 #endif
1586 if (localp == true) {
1587 currlink->fwd_tracei = ++*fwd_tracei;
1588 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
1589 } else {
1590 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
1591 }
1592
1593 hiti++;
1594
1595 } else {
1596 /* Adjacent position found for hiti */
1597 currlink = &(links[curr_querypos][hiti + low_hit]);
1598 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
1599
1600 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ prevlink->fwd_consecutive + adj_querydistance;
1601 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ prevlink->fwd_rootposition;
1602 currlink->fwd_pos = /*best_fwd_prevpos =*/ adj_querypos;
1603 currlink->fwd_hit = /*best_fwd_prevhit =*/ prevhit;
1604 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ fwd_scores[adj_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*adj_querydistance;
1605
1606 #ifdef DEBUG9
1607 printf("\tA(1). For hit %d, adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
1608 hiti,adj_querypos,prevhit,prevposition,active[adj_querypos][prevhit],fwd_scores[adj_querypos][prevhit],
1609 fwd_scores[curr_querypos][hiti + low_hit],currlink->fwd_consecutive,/*best_fwd_tracei*/prevlink->fwd_tracei,
1610 /*best_fwd_intronnfwd*/prevlink->fwd_intronnfwd,
1611 /*best_fwd_intronnrev*/prevlink->fwd_intronnrev,
1612 /*best_fwd_intronnunk*/prevlink->fwd_intronnunk);
1613 #endif
1614
1615 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
1616 hiti++;
1617 }
1618 }
1619
1620 while (hiti < nhits) {
1621 currlink = &(links[curr_querypos][hiti + low_hit]);
1622
1623 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
1624 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
1625 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
1626 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
1627
1628 #ifdef MOVE_TO_STAGE3
1629 if (anchoredp == true) {
1630 currlink->fwd_tracei = -1;
1631 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
1632 } else
1633 #endif
1634 if (localp == true) {
1635 currlink->fwd_tracei = ++*fwd_tracei;
1636 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
1637 } else {
1638 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
1639 }
1640
1641 hiti++;
1642 }
1643
1644 } else {
1645 adj_querypos = Intlist_head(last_item);
1646 adj_links = links[adj_querypos];
1647 adj_mappings = mappings[adj_querypos];
1648 adj_active = active[adj_querypos];
1649
1650 #ifdef PMAP
1651 adj_querydistance = (curr_querypos - adj_querypos)*3;
1652 #else
1653 adj_querydistance = curr_querypos - adj_querypos;
1654 #endif
1655 nprocessed = Intlist_length(processed);
1656 frontier = (int *) MALLOCA(nprocessed * sizeof(int));
1657
1658 nseen = 0;
1659 for (p = processed; p != NULL; p = Intlist_next(p)) {
1660 prev_querypos = Intlist_head(p);
1661
1662 querydistance = curr_querypos - prev_querypos;
1663 if (nseen <= /*nlookback*/1 || querydistance - indexsize_nt <= /*lookback*/sufflookback/2) {
1664 max_adjacent_nseen = nseen;
1665 }
1666 if (nseen <= /*nlookback*/nsufflookback || querydistance - indexsize_nt <= /*lookback*/sufflookback) {
1667 max_nonadjacent_nseen = nseen;
1668 }
1669
1670 frontier[nseen++] = firstactive[prev_querypos];
1671 }
1672
1673
1674 /* Look for overall_fwd_consecutive to see whether we can be greedy */
1675 overall_fwd_consecutive = 0;
1676 adj_frontier = firstactive[adj_querypos];
1677 for (hiti = 0; hiti < nhits; hiti++) {
1678 position = positions[hiti];
1679
1680 /* A. Evaluate adjacent positions (at last one processed) */
1681 prevhit = adj_frontier; /* Get information from last hiti */
1682 prevposition = position; /* Prevents prevposition + adj_querydistance == position */
1683 while (prevhit != -1 && (prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) + adj_querydistance < position) {
1684 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
1685 }
1686 adj_frontier = prevhit; /* Save information for next hiti */
1687
1688 if (prevposition + adj_querydistance == position) {
1689 /* Adjacent found */
1690 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
1691 if (prevlink->fwd_consecutive + adj_querydistance > overall_fwd_consecutive) {
1692 overall_fwd_consecutive = prevlink->fwd_consecutive + adj_querydistance;
1693 }
1694 }
1695 }
1696 debug(printf("Overall fwd consecutive is %d\n",overall_fwd_consecutive));
1697
1698
1699 /* Now process */
1700 adj_frontier = firstactive[adj_querypos];
1701 for (hiti = 0; hiti < nhits; hiti++) {
1702 position = positions[hiti];
1703
1704 /* A. Evaluate adjacent positions (at last one processed) */
1705 prevhit = adj_frontier; /* Get information from last hiti */
1706 prevposition = position; /* Prevents prevposition + adj_querydistance == position */
1707 while (prevhit != -1 && (prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) + adj_querydistance < position) {
1708 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
1709 }
1710 adj_frontier = prevhit; /* Save information for next hiti */
1711
1712 if (prevposition + adj_querydistance == position) {
1713 /* Adjacent found */
1714 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
1715
1716 best_fwd_consecutive = prevlink->fwd_consecutive + adj_querydistance;
1717 best_fwd_rootposition = prevlink->fwd_rootposition;
1718 best_fwd_prevpos = adj_querypos;
1719 best_fwd_prevhit = prevhit;
1720 best_fwd_score = fwd_scores[adj_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*adj_querydistance;
1721 max_nseen = max_adjacent_nseen; /* Look not so far back */
1722 best_fwd_tracei = prevlink->fwd_tracei;
1723
1724 #ifdef DEBUG9
1725 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1726 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1727 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1728 #endif
1729 debug9(printf("\tA(2). For hit %d, adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
1730 hiti,adj_querypos,prevhit,prevposition,active[adj_querypos][prevhit],fwd_scores[adj_querypos][prevhit],
1731 best_fwd_score,best_fwd_consecutive,/*best_fwd_tracei*/prevlink->fwd_tracei,
1732 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk));
1733
1734 } else {
1735 /* Adjacent not found */
1736 best_fwd_consecutive = indexsize*NT_PER_MATCH;
1737 best_fwd_rootposition = position;
1738 best_fwd_prevpos = -1;
1739 best_fwd_prevhit = -1;
1740 best_fwd_score = 0;
1741 max_nseen = max_nonadjacent_nseen; /* Look farther back */
1742 best_fwd_tracei = -1;
1743
1744 #ifdef DEBUG9
1745 best_fwd_intronnfwd = 0;
1746 best_fwd_intronnrev = 0;
1747 best_fwd_intronnunk = 0;
1748 #endif
1749 }
1750
1751 if (overall_fwd_consecutive < GREEDY_NCONSECUTIVE) {
1752 /* D. Evaluate for mismatches (all other previous querypos) */
1753 nseen = 0;
1754 last_tracei = -1;
1755 for (p = processed; p != NULL && best_fwd_consecutive < ENOUGH_CONSECUTIVE && nseen <= max_nseen;
1756 p = Intlist_next(p), nseen++) {
1757
1758 /* Making this check helps with efficiency */
1759 if ((prevhit = frontier[nseen]) != -1) { /* Retrieve starting point from last hiti */
1760 prev_querypos = Intlist_head(p);
1761 #ifdef PMAP
1762 querydistance = (curr_querypos - prev_querypos)*3;
1763 #else
1764 querydistance = curr_querypos - prev_querypos;
1765 #endif
1766 /* Actually a querydist_penalty */
1767 querydist_credit = -querydistance/indexsize_nt;
1768
1769 prev_mappings = mappings[prev_querypos];
1770 prev_links = links[prev_querypos];
1771 prev_active = active[prev_querypos];
1772
1773 /* Range 0 */
1774 while (prevhit != -1 && prev_links[prevhit].fwd_tracei == last_tracei) {
1775 debug9(printf("Skipping querypos %d with tracei #%d\n",prev_querypos,prev_links[prevhit].fwd_tracei));
1776 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1777 }
1778 if (prevhit != -1) {
1779 last_tracei = prev_links[prevhit].fwd_tracei;
1780 }
1781
1782 /* Range 1: From Infinity to maxintronlen. To be skipped.
1783 This is equivalent to diffdistance >= maxintronlen, where
1784 diffdistance = abs(gendistance - querydistance) and
1785 gendistance = (position - prevposition - indexsize_nt) */
1786 while (prevhit != -1 && (/*prevposition =*/ /*mappings[prev_querypos]*/prev_mappings[prevhit]) + maxintronlen + querydistance <= position) {
1787 /* Accept within range 1 (ignore) */
1788 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1789 }
1790 frontier[nseen] = prevhit; /* Store as starting point for next hiti */
1791
1792 /* Range 2: From maxintronlen to (prev_querypos + EQUAL_DISTANCE_NOT_SPLICING) */
1793 /* This is equivalent to +diffdistance > EQUAL_DISTANCE_NOT_SPLICING */
1794 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + EQUAL_DISTANCE_NOT_SPLICING + querydistance < position) {
1795 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
1796
1797 gendistance = position - prevposition;
1798 assert(gendistance > querydistance); /* True because gendistance > EQUAL_DISTANCE_NOT_SPLICING + querydistance */
1799 diffdistance = gendistance - querydistance; /* No need for abs() */
1800
1801 fwd_score = fwd_scores[prev_querypos][prevhit] + querydist_credit /*- querydist_penalty*/;
1802 if (splicingp == true) {
1803 fwd_score -= (diffdistance/TEN_THOUSAND + 1);
1804 } else {
1805 fwd_score -= (diffdistance/ONE + 1);
1806 }
1807
1808 if (use_canonical_p == true) {
1809 /* prevpos is lower genomic coordinate than currpos */
1810 /* need to subtract from position and prevposition to compensate for greedy matches */
1811 /* need to add to position and prevposition to compensate for missed matches */
1812 if (plusp == true) {
1813 prevpos = chroffset + prevposition + indexsize_nt;
1814 currpos = chroffset + position - querydistance + indexsize_nt;
1815 if (prevpos < GREEDY_ADVANCE || currpos < GREEDY_ADVANCE) {
1816 canonicalp = false;
1817 } else if (Genome_sense_canonicalp(/*donor_rightbound*/prevpos + MISS_BEHIND,
1818 /*donor_leftbound*/prevpos - GREEDY_ADVANCE,
1819 /*acceptor_rightbound*/currpos + MISS_BEHIND,
1820 /*acceptor_leftbound*/currpos - GREEDY_ADVANCE,
1821 chroffset) == true) {
1822 debug9(printf("lookback plus: sense canonical\n"));
1823 canonicalp = true;
1824 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/currpos + MISS_BEHIND,
1825 /*donor_leftbound*/currpos - GREEDY_ADVANCE,
1826 /*acceptor_rightbound*/prevpos + MISS_BEHIND,
1827 /*acceptor_leftbound*/prevpos - GREEDY_ADVANCE,
1828 chroffset) == true) {
1829 debug9(printf("lookback plus: antisense canonical\n"));
1830 canonicalp = true;
1831 } else {
1832 debug9(printf("lookback plus: not canonical\n"));
1833 canonicalp = false;
1834 }
1835
1836 } else {
1837 prevpos = chrhigh + 1 - prevposition - indexsize_nt;
1838 currpos = chrhigh + 1 - position + querydistance - indexsize_nt;
1839 if (currpos < MISS_BEHIND || prevpos < MISS_BEHIND) {
1840 canonicalp = false;
1841 } else if (Genome_sense_canonicalp(/*donor_rightbound*/currpos + GREEDY_ADVANCE,
1842 /*donor_leftbound*/currpos - MISS_BEHIND,
1843 /*acceptor_rightbound*/prevpos + GREEDY_ADVANCE,
1844 /*acceptor_leftbound*/prevpos - MISS_BEHIND,
1845 chroffset) == true) {
1846 debug9(printf("lookback minus: sense canonical\n"));
1847 canonicalp = true;
1848 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/prevpos + GREEDY_ADVANCE,
1849 /*donor_leftbound*/prevpos - MISS_BEHIND,
1850 /*acceptor_rightbound*/currpos + GREEDY_ADVANCE,
1851 /*acceptor_leftbound*/currpos - MISS_BEHIND,
1852 chroffset) == true) {
1853 debug9(printf("lookback minus: antisense canonical\n"));
1854 canonicalp = true;
1855 } else {
1856 debug9(printf("lookback minus: not canonical\n"));
1857 canonicalp = false;
1858 }
1859 }
1860
1861 if (canonicalp == true) {
1862 debug9(canonicalsgn = +1);
1863 } else {
1864 debug9(canonicalsgn = 0);
1865 fwd_score -= non_canonical_penalty;
1866 }
1867 }
1868
1869 debug9(printf("\tD2, hit %d. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
1870 hiti,prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
1871 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
1872 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
1873 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
1874
1875 /* Disallow ties, which should favor adjacent */
1876 if (fwd_score > best_fwd_score) {
1877 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
1878 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
1879 } else {
1880 best_fwd_consecutive = 0;
1881 }
1882 best_fwd_rootposition = prevlink->fwd_rootposition;
1883 best_fwd_score = fwd_score;
1884 best_fwd_prevpos = prev_querypos;
1885 best_fwd_prevhit = prevhit;
1886 best_fwd_tracei = ++*fwd_tracei;
1887 #ifdef DEBUG9
1888 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1889 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1890 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1891 switch (canonicalsgn) {
1892 case 1: best_fwd_intronnfwd++; break;
1893 case 0: best_fwd_intronnunk++; break;
1894 }
1895 #endif
1896 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
1897 } else {
1898 debug9(printf(" => Loses to %d\n",best_fwd_score));
1899 }
1900 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1901 }
1902
1903
1904 /* Scoring appears to be the same as for range 4, which is rarely called, so including in range 4 */
1905 /* Range 3: From (querypos + EQUAL_DISTANCE_NOT_SPLICING) to (querypos - EQUAL_DISTANCE_NOT_SPLICING) */
1906 /* This is equivalent to -diffdistance > EQUAL_DISTANCE_NOT_SPLICING && prevposition + indexsize_nt <= position */
1907
1908
1909 /* Range 4: From (prev_querypos - EQUAL_DISTANCE_NOT_SPLICING) to indexsize_nt */
1910 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) + indexsize_nt <= position) {
1911 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
1912
1913 gendistance = position - prevposition;
1914 /* was abs(gendistance - querydistance) */
1915 diffdistance = gendistance > querydistance ? (gendistance - querydistance) : (querydistance - gendistance);
1916
1917 #ifdef BAD_GMAX
1918 fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
1919 #else
1920 /* diffdistance <= EQUAL_DISTANCE_NOT_SPLICING */
1921 /* This is how version 2013-08-14 did it */
1922 fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH;
1923 #endif
1924
1925 debug9(printf("\tD4, hit %d. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
1926 hiti,prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
1927 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
1928 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
1929 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
1930
1931 /* Disallow ties, which should favor adjacent */
1932 if (fwd_score > best_fwd_score) {
1933 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
1934 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
1935 } else {
1936 best_fwd_consecutive = 0;
1937 }
1938 best_fwd_rootposition = prevlink->fwd_rootposition;
1939 best_fwd_score = fwd_score;
1940 best_fwd_prevpos = prev_querypos;
1941 best_fwd_prevhit = prevhit;
1942 /* best_fwd_tracei = ++*fwd_tracei; */
1943 best_fwd_tracei = prevlink->fwd_tracei; /* Keep previous trace, as in range 3 */
1944 #ifdef DEBUG9
1945 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
1946 best_fwd_intronnrev = prevlink->fwd_intronnrev;
1947 best_fwd_intronnunk = prevlink->fwd_intronnunk;
1948 switch (canonicalsgn) {
1949 case 1: best_fwd_intronnfwd++; break;
1950 case 0: best_fwd_intronnunk++; break;
1951 }
1952 #endif
1953 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
1954 } else {
1955 debug9(printf(" => Loses to %d\n",best_fwd_score));
1956 }
1957
1958 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
1959 }
1960 }
1961 }
1962 }
1963
1964 /* Best_score needs to beat something positive to prevent a
1965 small local extension from beating a good canonical intron.
1966 If querypos is too small, don't insert an intron. */
1967 /* linksconsecutive already assigned above */
1968 currlink = &(links[curr_querypos][hiti + low_hit]);
1969 currlink->fwd_consecutive = best_fwd_consecutive;
1970 currlink->fwd_rootposition = best_fwd_rootposition;
1971 currlink->fwd_pos = best_fwd_prevpos;
1972 currlink->fwd_hit = best_fwd_prevhit;
1973 if (currlink->fwd_pos >= 0) {
1974 currlink->fwd_tracei = best_fwd_tracei;
1975 fwd_scores[curr_querypos][hiti + low_hit] = best_fwd_score;
1976 #ifdef MOVE_TO_STAGE3
1977 } else if (anchoredp == true) {
1978 currlink->fwd_tracei = -1;
1979 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
1980 #endif
1981 } else if (localp == true) {
1982 currlink->fwd_tracei = ++*fwd_tracei;
1983 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
1984 } else {
1985 currlink->fwd_tracei = ++*fwd_tracei;
1986 fwd_scores[curr_querypos][hiti + low_hit] = best_fwd_score;
1987 }
1988
1989 #ifdef DEBUG9
1990 currlink->fwd_intronnfwd = best_fwd_intronnfwd;
1991 currlink->fwd_intronnrev = best_fwd_intronnrev;
1992 currlink->fwd_intronnunk = best_fwd_intronnunk;
1993 #endif
1994
1995 debug9(printf("\tChose %d,%d with score %d (fwd) => trace #%d\n",
1996 currlink->fwd_pos,currlink->fwd_hit,fwd_scores[curr_querypos][hiti + low_hit],currlink->fwd_tracei));
1997 debug3(printf("%d %d %d %d 1\n",querypos,hit,best_prevpos,best_prevhit));
1998 }
1999
2000 FREEA(frontier);
2001 }
2002
2003 return;
2004 }
2005
2006
/* score_querypos_lookforward_one

   Chooses the best predecessor for a single hit and records the result.

   The current hit is identified by (curr_querypos, currhit) and lies at
   `position`.  Candidate predecessors are the hits found on the chains
   described by firstactive[] / active[][] for the query positions listed
   in `processed`.  Two passes are made:

     A. The head of `processed` is searched for a hit whose mapping equals
        position + querydistance exactly (an "adjacent" hit).  If one is
        found it becomes the initial best, and the lookback limits used by
        pass D are tightened (nlookback = 1, lookback = sufflookback/2).

     D. Entries of `processed` are scanned from the head until the list
        ends, best_fwd_consecutive reaches ENOUGH_CONSECUTIVE, or the
        lookback limits are exceeded.  The hits on each chain are consumed
        in order by ranges 0, 1, 2 and 4 (described at each loop below).
        A candidate replaces the current best only if its score is
        strictly greater.

   Results written:
     currlink->fwd_consecutive, ->fwd_rootposition, ->fwd_pos, ->fwd_hit,
     ->fwd_tracei (plus the fwd_intronn* counters under DEBUG9), and
     fwd_scores[curr_querypos][currhit].

   Parameters:
     fwd_tracei             in/out counter; pre-incremented each time a new
                            trace number is handed out
     currlink               link record of the current hit, filled in here
     curr_querypos, currhit indices of the fwd_scores slot that receives
                            the score
     position               mapping of the current hit; compared against
                            values taken from mappings[][]
     links, fwd_scores,
     mappings               arrays indexed [querypos][hit], read for the
                            candidate predecessors
     active, firstactive    firstactive[q] is the first hit on the chain for
                            query position q and active[q][hit] is the next
                            one; -1 terminates the chain
     chroffset, chrhigh     with plusp, used to convert positions into the
                            coordinates handed to Genome_sense_canonicalp /
                            Genome_antisense_canonicalp
     plusp                  true: coordinate is chroffset + pos;
                            false: coordinate is chrhigh + 1 - pos
     indexsize              index size; multiplied by 3 under PMAP to
                            obtain indexsize_nt
     processed              list of candidate query positions, traversed
                            with Intlist_head / Intlist_next
     anchoredp              (MOVE_TO_STAGE3 only) if no predecessor is
                            found, trace is set to -1 and score to -100000
     localp                 if no predecessor is found, the score is set to
                            indexsize_nt
     splicingp              enables the range 1 skip and selects the
                            TEN_THOUSAND divisor (instead of ONE) for the
                            range 2 distance penalty
     use_canonical_p        enables the canonical splice-site test in
                            range 2
     non_canonical_penalty  subtracted from a range 2 score when that test
                            fails

   Also reads nsufflookback, sufflookback and maxintronlen, which are not
   declared in this function (presumably file-level settings -- confirm).

   NOTE(review): the range logic below appears to assume that each chain
   yields mappings in descending order -- confirm against the code that
   builds active[] / firstactive[]. */
2007 static void
score_querypos_lookforward_one(int * fwd_tracei,Link_T currlink,int curr_querypos,int currhit,unsigned int position,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int ** active,int * firstactive,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Intlist_T processed,bool anchoredp,bool localp,bool splicingp,bool use_canonical_p,int non_canonical_penalty)2008 score_querypos_lookforward_one (int *fwd_tracei, Link_T currlink, int curr_querypos, int currhit,
2009 unsigned int position,
2010 struct Link_T **links, int **fwd_scores,
2011 Chrpos_T **mappings, int **active, int *firstactive,
2012 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
2013 int indexsize, Intlist_T processed,
2014 #ifdef MOVE_TO_STAGE3
2015 bool anchoredp,
2016 #endif
2017 bool localp, bool splicingp,
2018 bool use_canonical_p, int non_canonical_penalty) {
2019 Link_T prevlink;
  /* Per-querypos rows of links/mappings/active, cached to avoid repeated indexing */
2020 struct Link_T *prev_links;
2021 Chrpos_T *prev_mappings;
2022 int *prev_active;
2023
  /* The best_fwd_* defaults describe a hit with no predecessor
     (best_fwd_prevpos == -1, score 0) */
2024 int best_fwd_consecutive = indexsize*NT_PER_MATCH;
2025 int best_fwd_rootposition = position;
2026 int best_fwd_score = 0, fwd_score;
2027 int best_fwd_prevpos = -1, best_fwd_prevhit = -1;
  /* best_fwd_tracei is assigned wherever best_fwd_prevpos is assigned, and
     at the end is copied out only when a predecessor was chosen */
2028 int best_fwd_tracei, last_tracei;
2029 #ifdef DEBUG9
2030 int best_fwd_intronnfwd = 0, best_fwd_intronnrev = 0, best_fwd_intronnunk = 0;
2031 int canonicalsgn = 0;
2032 #endif
2033 bool donep;
2034 int prev_querypos, prevhit;
2035 Chrpos_T prevposition;
2036 int gendistance;
2037 Univcoord_T prevpos, currpos;
2038 int querydistance, diffdistance, lookback, nlookback, nseen, indexsize_nt;
2039 /* int querydist_penalty; */
2040 int querydist_credit;
2041 /* bool near_end_p; */
2042 bool canonicalp;
2043
2044 #ifdef PMAP
2045 indexsize_nt = indexsize*3;
2046 #else
2047 indexsize_nt = indexsize;
2048 #endif
2049 /* indexsize_query = indexsize; */ /* Use when evaluating across query positions */
2050
2051
2052 /* Parameters for section D, assuming adjacent is false */
2053 /* adjacentp = false; */
2054 nlookback = nsufflookback;
2055 lookback = sufflookback;
2056
2057 /* A. Evaluate adjacent position (at last one processed) */
2058 if (processed != NULL) {
2059 prev_querypos = Intlist_head(processed);
2060 prev_mappings = mappings[prev_querypos];
2061 prev_links = links[prev_querypos];
2062 prev_active = active[prev_querypos];
2063
2064 #ifdef PMAP
2065 querydistance = (prev_querypos - curr_querypos)*3;
2066 #else
2067 querydistance = prev_querypos - curr_querypos;
2068 #endif
2069 prevhit = firstactive[prev_querypos];
2070 prevposition = position; /* Prevents prevposition == position + querydistance */
    /* The initialization above only prevents a false match on an empty
       chain if querydistance is nonzero */
    /* Advance along the chain past every hit whose mapping is greater than
       position + querydistance */
2071 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) > position + querydistance) {
2072 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2073 }
    /* prevposition holds the last mapping examined.  Equality can only
       occur when the loop stopped on a valid prevhit, so prevhit is a
       usable index inside this branch. */
2074 if (prevposition == position + querydistance) {
2075 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
2076 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
2077 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
2078 best_fwd_rootposition = prevlink->fwd_rootposition;
      /* Adjacent extension earns CONSEC_POINTS_PER_MATCH for each unit of querydistance */
2079 best_fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*querydistance;
2080
2081 best_fwd_prevpos = prev_querypos;
2082 best_fwd_prevhit = prevhit;
      /* Adjacent hit continues the predecessor's trace rather than starting a new one */
2083 best_fwd_tracei = prevlink->fwd_tracei;
2084 #ifdef DEBUG9
2085 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2086 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2087 best_fwd_intronnunk = prevlink->fwd_intronnunk;
2088 #endif
2089 /* adjacentp = true; */
2090 /* Parameters for section D when adjacent is true */
2091 nlookback = 1;
2092 lookback = sufflookback/2;
2093
2094 debug9(printf("\tA. Adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
2095 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],fwd_scores[prev_querypos][prevhit],
2096 best_fwd_score,best_fwd_consecutive,best_fwd_tracei,
2097 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk));
2098 }
2099 }
2100
2101 /* Check work list */
  /* NOTE(review): the MOVE_TO_STAGE3 block below uses indexsize_query,
     queryend and querystart, none of which are declared in this function
     (the indexsize_query assignment above is commented out).  Its only
     branch that alters `processed` is guarded by `else if (0)`.  Confirm
     these names exist before enabling MOVE_TO_STAGE3. */
2102 #ifdef MOVE_TO_STAGE3
2103 if (anchoredp && curr_querypos + indexsize_query >= queryend) {
2104 /* Allow close prevpositions that overlap with anchor */
2105 /* Can give rise to false positives, and increases amount of dynamic programming work */
2106 debug9(printf("No skipping because close to anchor\n"));
2107 } else if (0 && anchoredp && curr_querypos == querystart) {
2108 /* Test end position */
2109 } else if (0) {
2110 while (processed != NULL && (prev_querypos = Intlist_head(processed)) < curr_querypos + indexsize_query) {
2111 debug9(printf("Skipping prev_querypos %d, because too close\n",prev_querypos));
2112 processed = Intlist_next(processed);
2113 }
2114 }
2115 #endif
2116
2117 /* D. Evaluate for mismatches (all other previous querypos) */
  /* The scan restarts at the head of `processed`, so the query position
     examined in section A is examined again here.  It stops once
     best_fwd_consecutive reaches ENOUGH_CONSECUTIVE. */
2118 donep = false;
2119 nseen = 0;
2120 last_tracei = -1;
2121 for ( ; processed != NULL && best_fwd_consecutive < ENOUGH_CONSECUTIVE && donep == false;
2122 processed = Intlist_next(processed), nseen++) {
2123 prev_querypos = Intlist_head(processed);
2124
2125 #ifdef PMAP
2126 querydistance = (prev_querypos - curr_querypos)*3;
2127 #else
2128 querydistance = prev_querypos - curr_querypos;
2129 #endif
2130
    /* Both lookback limits exceeded: make this the final iteration.  The
       current entry is still evaluated below. */
2131 if (nseen > nlookback && querydistance - indexsize_nt > lookback) {
2132 donep = true;
2133 }
2134
    /* Query positions with an empty chain (firstactive == -1) are skipped */
2135 if ((prevhit = firstactive[prev_querypos]) != -1) {
2136 /* querydist_penalty = (querydistance - indexsize_nt)/QUERYDIST_PENALTY_FACTOR; */
2137 /* Actually a querydist_penalty */
2138 querydist_credit = -querydistance/indexsize_nt;
2139
2140 prev_mappings = mappings[prev_querypos];
2141 prev_links = links[prev_querypos];
2142 prev_active = active[prev_querypos];
2143
2144 /* Range 0 */
      /* Skip leading hits whose trace number equals last_tracei, then
         remember the trace number of the first hit that survives.
         last_tracei starts at -1 and is kept across query positions. */
2145 while (prevhit != -1 && prev_links[prevhit].fwd_tracei == last_tracei) {
2146 debug9(printf("Skipping querypos %d with tracei #%d\n",prev_querypos,prev_links[prevhit].fwd_tracei));
2147 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2148 }
2149 if (prevhit != -1) {
2150 last_tracei = prev_links[prevhit].fwd_tracei;
2151 }
2152
2153 /* Range 1: From Infinity to maxintronlen */
      /* Only applied when splicingp is true; these hits are discarded without scoring */
2154 if (splicingp == true) {
2155 /* This is equivalent to diffdistance >= maxintronlen, where
2156 diffdistance = abs(gendistance - querydistance) and
2157 gendistance = (position - prevposition - indexsize_nt) */
2158 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) >= position + maxintronlen + querydistance) {
2159 /* Skip */
2160 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
2161 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2162 }
2163 }
2164
2165 /* Range 2: From maxintronlen to (prev_querypos + EQUAL_DISTANCE_NOT_SPLICING) */
2166 /* This is equivalent to +diffdistance > EQUAL_DISTANCE_NOT_SPLICING */
      /* Hits in this range are scored as a jump: they receive
         querydist_credit, a distance penalty, an optional non-canonical
         penalty, and (if they win) a brand-new trace number */
2167 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) > position + EQUAL_DISTANCE_NOT_SPLICING + querydistance) {
2168 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
2169 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
2170
2171 gendistance = prevposition - position;
2172 assert(gendistance > querydistance); /* True because gendistance > EQUAL_DISTANCE_NOT_SPLICING + querydistance */
2173 diffdistance = gendistance - querydistance; /* No need for abs() */
2174
2175 fwd_score = fwd_scores[prev_querypos][prevhit] + querydist_credit /*- querydist_penalty*/;
        /* Distance penalty is always at least 1; the divisor depends on splicingp */
2176 if (splicingp == true) {
2177 fwd_score -= (diffdistance/TEN_THOUSAND + 1);
2178 } else {
2179 fwd_score -= (diffdistance/ONE + 1);
2180 }
2181
2182 if (use_canonical_p == true) {
2183
2184 /* prevpos is higher genomic coordinate than currpos */
2185 /* need to add to position and prevposition to compensate for greedy matches */
2186 /* need to subtract from position and prevposition to compensate for missed matches */
          /* In both strand cases the first test rejects coordinates smaller
             than the amount subtracted below, so the left bounds passed to
             the Genome_*_canonicalp calls cannot wrap below zero */
2187 if (plusp == true) {
2188 prevpos = chroffset + prevposition;
2189 currpos = chroffset + position + querydistance;
2190 if (currpos < MISS_BEHIND || prevpos < MISS_BEHIND) {
2191 canonicalp = false;
2192 } else if (Genome_sense_canonicalp(/*donor_rightbound*/currpos + GREEDY_ADVANCE,
2193 /*donor_leftbound*/currpos - MISS_BEHIND,
2194 /*acceptor_rightbound*/prevpos + GREEDY_ADVANCE,
2195 /*acceptor_leftbound*/prevpos - MISS_BEHIND,
2196 chroffset) == true) {
2197 debug9(printf("lookforward plus: sense canonical\n"));
2198 canonicalp = true;
2199 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/prevpos + GREEDY_ADVANCE,
2200 /*donor_leftbound*/prevpos - MISS_BEHIND,
2201 /*acceptor_rightbound*/currpos + GREEDY_ADVANCE,
2202 /*acceptor_leftbound*/currpos - MISS_BEHIND,
2203 chroffset) == true) {
2204 debug9(printf("lookforward plus: antisense canonical\n"));
2205 canonicalp = true;
2206 } else {
2207 debug9(printf("lookforward plus: not canonical\n"));
2208 canonicalp = false;
2209 }
2210
2211 } else {
            /* Minus strand: coordinates are mirrored through chrhigh + 1, and
               the roles of MISS_BEHIND and GREEDY_ADVANCE are exchanged
               relative to the plus-strand case */
2212 prevpos = chrhigh + 1 - prevposition;
2213 currpos = chrhigh + 1 - position - querydistance;
2214 if (prevpos < GREEDY_ADVANCE || currpos < GREEDY_ADVANCE) {
2215 canonicalp = false;
2216 } else if (Genome_sense_canonicalp(/*donor_rightbound*/prevpos + MISS_BEHIND,
2217 /*donor_leftbound*/prevpos - GREEDY_ADVANCE,
2218 /*acceptor_rightbound*/currpos + MISS_BEHIND,
2219 /*acceptor_leftbound*/currpos - GREEDY_ADVANCE,
2220 chroffset) == true) {
2221 debug9(printf("lookforward minus: sense canonical\n"));
2222 canonicalp = true;
2223 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/currpos + MISS_BEHIND,
2224 /*donor_leftbound*/currpos - GREEDY_ADVANCE,
2225 /*acceptor_rightbound*/prevpos + MISS_BEHIND,
2226 /*acceptor_leftbound*/prevpos - GREEDY_ADVANCE,
2227 chroffset) == true) {
2228 debug9(printf("lookforward minus: antisense canonical\n"));
2229 canonicalp = true;
2230 } else {
2231 debug9(printf("lookforward minus: not canonical\n"));
2232 canonicalp = false;
2233 }
2234 }
2235
          /* canonicalsgn is assigned only inside debug9(); the score penalty applies regardless */
2236 if (canonicalp == true) {
2237 debug9(canonicalsgn = +1);
2238 } else {
2239 debug9(canonicalsgn = 0);
2240 fwd_score -= non_canonical_penalty;
2241 }
2242 }
2243
2244 debug9(printf("\tD2. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
2245 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
2246 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
2247 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
2248 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
2249
2250 /* Disallow ties, which should favor adjacent */
2251 if (fwd_score > best_fwd_score) {
          /* The consecutive count carries over only for small diffdistance; otherwise it restarts at 0 */
2252 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
2253 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
2254 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
2255 } else {
2256 best_fwd_consecutive = 0;
2257 /* best_fwd_rootnlinks = 1; */
2258 }
2259 best_fwd_rootposition = prevlink->fwd_rootposition;
2260 best_fwd_score = fwd_score;
2261 best_fwd_prevpos = prev_querypos;
2262 best_fwd_prevhit = prevhit;
          /* New trace number: the shared counter advances every time a range 2 candidate takes the lead */
2263 best_fwd_tracei = ++*fwd_tracei;
2264 #ifdef DEBUG9
2265 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2266 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2267 best_fwd_intronnunk = prevlink->fwd_intronnunk;
2268 switch (canonicalsgn) {
2269 case 1: best_fwd_intronnfwd++; break;
2270 case 0: best_fwd_intronnunk++; break;
2271 }
2272 #endif
2273 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
2274 } else {
2275 debug9(printf(" => Loses to %d\n",best_fwd_score));
2276 }
2277
2278 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2279 }
2280
2281 /* Scoring appears to be the same as for range 4, which is rarely called, so including in range 4 */
2282 /* Range 3: From (querypos + EQUAL_DISTANCE_NOT_SPLICING) to (querypos - EQUAL_DISTANCE_NOT_SPLICING) */
2283 /* This is equivalent to -diffdistance > EQUAL_DISTANCE_NOT_SPLICING && prevposition + indexsize_nt <= position */
2284
2285 /* Range 4: From (prev_querypos - EQUAL_DISTANCE_NOT_SPLICING) to indexsize_nt */
      /* Handles every remaining hit whose mapping is at least
         position + indexsize_nt.  Each is scored with a flat
         CONSEC_POINTS_PER_MATCH (no distance or canonical penalty) and
         keeps the predecessor's trace number.
         NOTE(review): the loop condition itself bounds only the lower end;
         the upper bound relies on range 2 having already consumed the
         farther hits. */
2286 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) >= position + indexsize_nt) {
2287 /* printf("fwd: prevposition %u, prevhit %d\n",prevposition,prevhit); */
2288 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
2289
2290 gendistance = prevposition - position;
2291 /* was abs(gendistance - querydistance) */
2292 diffdistance = gendistance > querydistance ? (gendistance - querydistance) : (querydistance - gendistance);
2293
2294 #ifdef BAD_GMAX
2295 fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
2296 #else
2297 /* diffdistance <= EQUAL_DISTANCE_NOT_SPLICING */
2298 /* This is how version 2013-08-14 did it */
2299 fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH;
2300 #endif
2301 #if 0
2302 if (/*near_end_p == false &&*/ prevlink->fwd_consecutive < EXON_DEFN) {
2303 fwd_score -= NINTRON_PENALTY_MISMATCH;
2304 }
2305 #endif
2306
2307 debug9(printf("\tD4. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
2308 prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
2309 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
2310 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
2311 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
2312
2313 /* Disallow ties, which should favor adjacent */
2314 if (fwd_score > best_fwd_score) {
2315 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
2316 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
2317 /* best_fwd_rootnlinks = prevlink->fwd_rootnlinks + 1; */
2318 } else {
2319 best_fwd_consecutive = 0;
2320 /* best_fwd_rootnlinks = 1; */
2321 }
2322 best_fwd_rootposition = prevlink->fwd_rootposition;
2323 best_fwd_score = fwd_score;
2324 best_fwd_prevpos = prev_querypos;
2325 best_fwd_prevhit = prevhit;
2326 /* best_fwd_tracei = ++*fwd_tracei; */
2327 best_fwd_tracei = prevlink->fwd_tracei; /* Keep previous trace, as in range 3 */
2328 #ifdef DEBUG9
2329 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2330 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2331 best_fwd_intronnunk = prevlink->fwd_intronnunk;
          /* NOTE(review): canonicalsgn is not assigned in range 4, so this
             switch uses whatever value range 2 (or the initializer) left
             behind -- affects DEBUG9 counters only */
2332 switch (canonicalsgn) {
2333 case 1: best_fwd_intronnfwd++; break;
2334 case 0: best_fwd_intronnunk++; break;
2335 }
2336 #endif
2337 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
2338 } else {
2339 debug9(printf(" => Loses to %d\n",best_fwd_score));
2340 }
2341
2342 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2343 }
2344 }
2345 }
2346
2347 /* Best_score needs to beat something positive to prevent a
2348 small local extension from beating a good canonical intron.
2349 If querypos is too small, don't insert an intron. */
2350 /* linksconsecutive already assigned above */
  /* Publish the winner into the caller's link record and score table */
2351 currlink->fwd_consecutive = best_fwd_consecutive;
2352 currlink->fwd_rootposition = best_fwd_rootposition;
2353 /* currlink->fwd_rootnlinks = best_fwd_rootnlinks; */
2354 currlink->fwd_pos = best_fwd_prevpos;
2355 currlink->fwd_hit = best_fwd_prevhit;
  /* fwd_pos >= 0 means a predecessor was chosen; otherwise the hit starts
     its own trace.  In the final else branch best_fwd_score still holds
     its initial value of 0, because it is only changed together with
     best_fwd_prevpos. */
2356 if (currlink->fwd_pos >= 0) {
2357 currlink->fwd_tracei = best_fwd_tracei;
2358 fwd_scores[curr_querypos][currhit] = best_fwd_score;
2359 #ifdef MOVE_TO_STAGE3
2360 } else if (anchoredp == true) {
2361 currlink->fwd_tracei = -1;
2362 fwd_scores[curr_querypos][currhit] = -100000;
2363 #endif
2364 } else if (localp == true) {
2365 currlink->fwd_tracei = ++*fwd_tracei;
2366 fwd_scores[curr_querypos][currhit] = indexsize_nt;
2367 } else {
2368 currlink->fwd_tracei = ++*fwd_tracei;
2369 fwd_scores[curr_querypos][currhit] = best_fwd_score;
2370 }
2371
2372 #ifdef DEBUG9
2373 currlink->fwd_intronnfwd = best_fwd_intronnfwd;
2374 currlink->fwd_intronnrev = best_fwd_intronnrev;
2375 currlink->fwd_intronnunk = best_fwd_intronnunk;
2376 #endif
2377
2378 debug9(printf("\tChose %d,%d with score %d (fwd) => trace #%d\n",
2379 currlink->fwd_pos,currlink->fwd_hit,fwd_scores[curr_querypos][currhit],currlink->fwd_tracei));
  /* NOTE(review): the debug3 arguments below (querypos, hit, best_prevpos,
     best_prevhit) are not declared in this function; this presumably only
     compiles while debug3() expands to nothing -- confirm before enabling */
2380 debug3(printf("%d %d %d %d 1\n",querypos,hit,best_prevpos,best_prevhit));
2381
2382 return;
2383 }
2384
2385
2386 static void
score_querypos_lookforward_mult(int * fwd_tracei,int low_hit,int high_hit,int curr_querypos,unsigned int * positions,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int ** active,int * firstactive,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Intlist_T processed,bool anchoredp,bool localp,bool splicingp,bool use_canonical_p,int non_canonical_penalty)2387 score_querypos_lookforward_mult (int *fwd_tracei, int low_hit, int high_hit, int curr_querypos,
2388 unsigned int *positions,
2389 struct Link_T **links, int **fwd_scores,
2390 Chrpos_T **mappings, int **active, int *firstactive,
2391 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
2392 int indexsize, Intlist_T processed,
2393 #ifdef MOVE_TO_STAGE3
2394 bool anchoredp,
2395 #endif
2396 bool localp, bool splicingp,
2397 bool use_canonical_p, int non_canonical_penalty) {
2398 Link_T prevlink, currlink;
2399 Intlist_T last_item, p;
2400 int nhits = high_hit - low_hit, nprocessed, hiti;
2401
2402 struct Link_T *prev_links, *adj_links;
2403 Chrpos_T *prev_mappings, *adj_mappings;
2404 int *prev_active, *adj_active;
2405
2406 int overall_fwd_consecutive, best_fwd_consecutive;
2407 int best_fwd_rootposition;
2408 int best_fwd_score, fwd_score;
2409 int best_fwd_prevpos, best_fwd_prevhit;
2410 int best_fwd_tracei, last_tracei;
2411 #ifdef DEBUG9
2412 int best_fwd_intronnfwd, best_fwd_intronnrev, best_fwd_intronnunk;
2413 int canonicalsgn = 0;
2414 #endif
2415 int adj_querypos, adj_querydistance, prev_querypos, prevhit, adj_frontier, *frontier;
2416 Chrpos_T prevposition, position;
2417 int gendistance;
2418 Univcoord_T prevpos, currpos;
2419 int querydistance, diffdistance, indexsize_nt;
2420 int max_nseen, max_adjacent_nseen, max_nonadjacent_nseen, nseen;
2421 int querydist_credit;
2422 bool canonicalp;
2423
2424 #ifdef PMAP
2425 indexsize_nt = indexsize*3;
2426 #else
2427 indexsize_nt = indexsize;
2428 #endif
2429 /* indexsize_query = indexsize; */ /* Use when evaluating across query positions */
2430
2431
2432 /* Determine work load */
2433 /* printf("Work load (lookforward): %s\n",Intlist_to_string(processed)); */
2434 last_item = processed;
2435 #ifdef MOVE_TO_STAGE3
2436 if (anchoredp && curr_querypos + indexsize_query >= queryend) {
2437 /* Allow close prevpositions that overlap with anchor */
2438 /* Can give rise to false positives, and increases amount of dynamic programming work */
2439 /* debug9(printf("No skipping because close to anchor\n")); */
2440 } else if (0 && anchoredp && curr_querypos == querystart) {
2441 /* Test end position */
2442 } else if (0) {
2443 while (processed != NULL && (prev_querypos = Intlist_head(processed)) < curr_querypos + indexsize_query) {
2444 debug9(printf("Skipping prev_querypos %d, because too close\n",prev_querypos));
2445 processed = Intlist_next(processed);
2446 }
2447 }
2448 #endif
2449
2450 if (last_item == NULL) {
2451 for (hiti = nhits - 1; hiti >= 0; hiti--) {
2452 currlink = &(links[curr_querypos][hiti + low_hit]);
2453
2454 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
2455 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
2456 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
2457 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
2458
2459 #ifdef MOVE_TO_STAGE3
2460 if (anchoredp == true) {
2461 currlink->fwd_tracei = -1;
2462 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
2463 } else
2464 #endif
2465 if (localp == true) {
2466 currlink->fwd_tracei = ++*fwd_tracei;
2467 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
2468 } else {
2469 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
2470 }
2471 }
2472
2473 } else if (processed == NULL) {
2474 /* A. Evaluate adjacent position (at last one processed, if available). Don't evaluate for mismatches (D). */
2475 adj_querypos = Intlist_head(last_item);
2476 adj_links = links[adj_querypos];
2477 adj_mappings = mappings[adj_querypos];
2478 adj_active = active[adj_querypos];
2479
2480 #ifdef PMAP
2481 adj_querydistance = (adj_querypos - curr_querypos)*3;
2482 #else
2483 adj_querydistance = adj_querypos - curr_querypos;
2484 #endif
2485
2486 /* Process prevhit and hiti in parallel. Values are descending along prevhit chain and from nhits-1 to 0. */
2487 prevhit = firstactive[adj_querypos];
2488 hiti = nhits - 1;
2489 while (prevhit != -1 && hiti >= 0) {
2490 if ((prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) > (position = positions[hiti]) + adj_querydistance) {
2491 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
2492
2493 } else if (prevposition < position + adj_querydistance) {
2494 /* Adjacent position not found for hiti */
2495 currlink = &(links[curr_querypos][hiti + low_hit]);
2496
2497 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
2498 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
2499 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
2500 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
2501
2502 #ifdef MOVE_TO_STAGE3
2503 if (anchoredp == true) {
2504 currlink->fwd_tracei = -1;
2505 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
2506 } else
2507 #endif
2508 if (localp == true) {
2509 currlink->fwd_tracei = ++*fwd_tracei;
2510 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
2511 } else {
2512 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
2513 }
2514
2515 hiti--;
2516
2517 } else {
2518 /* Adjacent position found for hiti */
2519 currlink = &(links[curr_querypos][hiti + low_hit]);
2520 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
2521
2522 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ prevlink->fwd_consecutive + adj_querydistance;
2523 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ prevlink->fwd_rootposition;
2524 currlink->fwd_pos = /*best_fwd_prevpos =*/ adj_querypos;
2525 currlink->fwd_hit = /*best_fwd_prevhit =*/ prevhit;
2526 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ fwd_scores[adj_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*adj_querydistance;
2527
2528 #ifdef DEBUG9
2529 printf("\tA(3). For hit %d, adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
2530 hiti,adj_querypos,prevhit,prevposition,active[adj_querypos][prevhit],fwd_scores[adj_querypos][prevhit],
2531 fwd_scores[curr_querypos][hiti + low_hit],currlink->fwd_consecutive,/*best_fwd_tracei*/prevlink->fwd_tracei,
2532 /*best_fwd_intronnfwd*/prevlink->fwd_intronnfwd,
2533 /*best_fwd_intronnrev*/prevlink->fwd_intronnrev,
2534 /*best_fwd_intronnunk*/prevlink->fwd_intronnunk);
2535 #endif
2536
2537 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
2538 hiti--;
2539 }
2540 }
2541
2542 while (hiti >= 0) {
2543 /* Adjacent position not found for hiti */
2544 currlink = &(links[curr_querypos][hiti + low_hit]);
2545
2546 currlink->fwd_consecutive = /*best_fwd_consecutive =*/ indexsize*NT_PER_MATCH;
2547 currlink->fwd_rootposition = /*best_fwd_rootposition =*/ positions[hiti];
2548 currlink->fwd_pos = /*best_fwd_prevpos =*/ -1;
2549 currlink->fwd_hit = /*best_fwd_prevhit =*/ -1;
2550
2551 #ifdef MOVE_TO_STAGE3
2552 if (anchoredp == true) {
2553 currlink->fwd_tracei = -1;
2554 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
2555 } else
2556 #endif
2557 if (localp == true) {
2558 currlink->fwd_tracei = ++*fwd_tracei;
2559 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
2560 } else {
2561 fwd_scores[curr_querypos][hiti + low_hit] = /*best_fwd_score =*/ 0;
2562 }
2563
2564 hiti--;
2565 }
2566
2567 } else {
2568 adj_querypos = Intlist_head(last_item);
2569 adj_links = links[adj_querypos];
2570 adj_mappings = mappings[adj_querypos];
2571 adj_active = active[adj_querypos];
2572
2573 #ifdef PMAP
2574 adj_querydistance = (adj_querypos - curr_querypos)*3;
2575 #else
2576 adj_querydistance = adj_querypos - curr_querypos;
2577 #endif
2578
2579 nprocessed = Intlist_length(processed);
2580 frontier = (int *) MALLOCA(nprocessed * sizeof(int));
2581
2582 nseen = 0;
2583 for (p = processed; p != NULL; p = Intlist_next(p)) {
2584 prev_querypos = Intlist_head(p);
2585
2586 querydistance = prev_querypos - curr_querypos;
2587 if (nseen <= /*nlookback*/1 || querydistance - indexsize_nt <= /*lookback*/sufflookback/2) {
2588 max_adjacent_nseen = nseen;
2589 }
2590 if (nseen <= /*nlookback*/nsufflookback || querydistance - indexsize_nt <= /*lookback*/sufflookback) {
2591 max_nonadjacent_nseen = nseen;
2592 }
2593
2594 frontier[nseen++] = firstactive[prev_querypos];
2595 }
2596
2597
2598 /* Look for overall_fwd_consecutive to see whether we can be greedy */
2599 overall_fwd_consecutive = 0;
2600 adj_frontier = firstactive[adj_querypos];
2601 for (hiti = nhits - 1; hiti >= 0; hiti--) {
2602 position = positions[hiti];
2603
2604 /* A. Evaluate adjacent position (at last one processed) */
2605 prevhit = adj_frontier; /* Get information from last hiti */
2606 prevposition = position; /* Prevents prevposition == position + adj_querydistance */
2607 while (prevhit != -1 && (prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) > position + adj_querydistance) {
2608 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
2609 }
2610 adj_frontier = prevhit; /* Save information for next hiti */
2611
2612 if (prevposition == position + adj_querydistance) {
2613 /* Adjacent found */
2614 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
2615 if (prevlink->fwd_consecutive + adj_querydistance > overall_fwd_consecutive) {
2616 overall_fwd_consecutive = prevlink->fwd_consecutive + adj_querydistance;
2617 }
2618 }
2619 }
2620 debug(printf("Overall fwd consecutive is %d\n",overall_fwd_consecutive));
2621
2622
2623 /* Now process */
2624 adj_frontier = firstactive[adj_querypos];
2625 for (hiti = nhits - 1; hiti >= 0; hiti--) {
2626 position = positions[hiti];
2627
2628 /* A. Evaluate adjacent position (at last one processed) */
2629 prevhit = adj_frontier; /* Get information from last hiti */
2630 prevposition = position; /* Prevents prevposition == position + adj_querydistance */
2631 while (prevhit != -1 && (prevposition = /*mappings[adj_querypos]*/adj_mappings[prevhit]) > position + adj_querydistance) {
2632 prevhit = /*active[adj_querypos]*/adj_active[prevhit];
2633 }
2634 adj_frontier = prevhit; /* Save information for next hiti */
2635
2636 if (prevposition == position + adj_querydistance) {
2637 /* Adjacent found */
2638 prevlink = &(/*links[adj_querypos]*/adj_links[prevhit]);
2639
2640 best_fwd_consecutive = prevlink->fwd_consecutive + adj_querydistance;
2641 best_fwd_rootposition = prevlink->fwd_rootposition;
2642 best_fwd_prevpos = adj_querypos;
2643 best_fwd_prevhit = prevhit;
2644 best_fwd_score = fwd_scores[adj_querypos][prevhit] + CONSEC_POINTS_PER_MATCH*adj_querydistance;
2645 max_nseen = max_adjacent_nseen; /* Look not so far back */
2646 best_fwd_tracei = prevlink->fwd_tracei;
2647
2648 #ifdef DEBUG9
2649 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2650 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2651 best_fwd_intronnunk = prevlink->fwd_intronnunk;
2652 #endif
2653 debug9(printf("\tA(4). For hit %d, adjacent qpos %d,%d at %ux%d (scores = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d)\n",
2654 hiti,adj_querypos,prevhit,prevposition,active[adj_querypos][prevhit],fwd_scores[adj_querypos][prevhit],
2655 best_fwd_score,best_fwd_consecutive,/*best_fwd_tracei*/prevlink->fwd_tracei,
2656 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk));
2657 } else {
2658 /* Adjacent not found */
2659 best_fwd_consecutive = indexsize*NT_PER_MATCH;
2660 best_fwd_rootposition = position;
2661 best_fwd_prevpos = -1;
2662 best_fwd_prevhit = -1;
2663 best_fwd_score = 0;
2664 max_nseen = max_nonadjacent_nseen; /* Look farther back */
2665 best_fwd_tracei = -1;
2666
2667 #ifdef DEBUG9
2668 best_fwd_intronnfwd = 0;
2669 best_fwd_intronnrev = 0;
2670 best_fwd_intronnunk = 0;
2671 #endif
2672 }
2673
2674 if (overall_fwd_consecutive < GREEDY_NCONSECUTIVE) {
2675 /* D. Evaluate for mismatches (all other previous querypos) */
2676 nseen = 0;
2677 last_tracei = -1;
2678 for (p = processed; p != NULL && best_fwd_consecutive < ENOUGH_CONSECUTIVE && nseen <= max_nseen;
2679 p = Intlist_next(p), nseen++) {
2680
2681 /* Making this check helps with efficiency */
2682 if ((prevhit = frontier[nseen]) != -1) { /* Retrieve starting point from last hiti */
2683 prev_querypos = Intlist_head(p);
2684 #ifdef PMAP
2685 querydistance = (prev_querypos - curr_querypos)*3;
2686 #else
2687 querydistance = prev_querypos - curr_querypos;
2688 #endif
2689 /* Actually a querydist_penalty */
2690 querydist_credit = -querydistance/indexsize_nt;
2691
2692 prev_mappings = mappings[prev_querypos];
2693 prev_links = links[prev_querypos];
2694 prev_active = active[prev_querypos];
2695
2696 /* Range 0 */
2697 while (prevhit != -1 && prev_links[prevhit].fwd_tracei == last_tracei) {
2698 debug9(printf("Skipping querypos %d with tracei #%d\n",prev_querypos,prev_links[prevhit].fwd_tracei));
2699 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2700 }
2701 if (prevhit != -1) {
2702 last_tracei = prev_links[prevhit].fwd_tracei;
2703 }
2704
2705 /* Range 1: From Infinity to maxintronlen. To be skipped.
2706 This is equivalent to diffdistance >= maxintronlen, where
2707 diffdistance = abs(gendistance - querydistance) and
2708 gendistance = (position - prevposition - indexsize_nt) */
2709 while (prevhit != -1 && (/*prevposition =*/ /*mappings[prev_querypos]*/prev_mappings[prevhit]) >= position + maxintronlen + querydistance) {
2710 /* Accept within range 1 (ignore) */
2711 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2712 }
2713 frontier[nseen] = prevhit; /* Store as starting point for next hiti */
2714
2715
2716 /* Range 2: From maxintronlen to (prev_querypos + EQUAL_DISTANCE_NOT_SPLICING) */
2717 /* This is equivalent to +diffdistance > EQUAL_DISTANCE_NOT_SPLICING */
2718 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) > position + EQUAL_DISTANCE_NOT_SPLICING + querydistance) {
2719 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
2720
2721 gendistance = prevposition - position;
2722 assert(gendistance > querydistance); /* True because gendistance > EQUAL_DISTANCE_NOT_SPLICING + querydistance */
2723 diffdistance = gendistance - querydistance; /* No need for abs() */
2724
2725 fwd_score = fwd_scores[prev_querypos][prevhit] + querydist_credit /*- querydist_penalty*/;
2726 if (splicingp == true) {
2727 fwd_score -= (diffdistance/TEN_THOUSAND + 1);
2728 } else {
2729 fwd_score -= (diffdistance/ONE + 1);
2730 }
2731
2732 if (use_canonical_p == true) {
2733 /* prevpos is higher genomic coordinate than currpos */
2734 /* need to add to position and prevposition to compensate for greedy matches */
2735 /* need to subtract from position and prevposition to compensate for missed matches */
2736 if (plusp == true) {
2737 prevpos = chroffset + prevposition;
2738 currpos = chroffset + position + querydistance;
2739 if (currpos < MISS_BEHIND || prevpos < MISS_BEHIND) {
2740 canonicalp = false;
2741 } else if (Genome_sense_canonicalp(/*donor_rightbound*/currpos + GREEDY_ADVANCE,
2742 /*donor_leftbound*/currpos - MISS_BEHIND,
2743 /*acceptor_rightbound*/prevpos + GREEDY_ADVANCE,
2744 /*acceptor_leftbound*/prevpos - MISS_BEHIND,
2745 chroffset) == true) {
2746 debug9(printf("lookforward plus: sense canonical\n"));
2747 canonicalp = true;
2748 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/prevpos + GREEDY_ADVANCE,
2749 /*donor_leftbound*/prevpos - MISS_BEHIND,
2750 /*acceptor_rightbound*/currpos + GREEDY_ADVANCE,
2751 /*acceptor_leftbound*/currpos - MISS_BEHIND,
2752 chroffset) == true) {
2753 debug9(printf("lookforward plus: antisense canonical\n"));
2754 canonicalp = true;
2755 } else {
2756 debug9(printf("lookforward plus: not canonical\n"));
2757 canonicalp = false;
2758 }
2759
2760 } else {
2761 prevpos = chrhigh + 1 - prevposition;
2762 currpos = chrhigh + 1 - position - querydistance;
2763 if (prevpos < GREEDY_ADVANCE || currpos < GREEDY_ADVANCE) {
2764 canonicalp = false;
2765 } else if (Genome_sense_canonicalp(/*donor_rightbound*/prevpos + MISS_BEHIND,
2766 /*donor_leftbound*/prevpos - GREEDY_ADVANCE,
2767 /*acceptor_rightbound*/currpos + MISS_BEHIND,
2768 /*acceptor_leftbound*/currpos - GREEDY_ADVANCE,
2769 chroffset) == true) {
2770 debug9(printf("lookforward minus: sense canonical\n"));
2771 canonicalp = true;
2772 } else if (Genome_antisense_canonicalp(/*donor_rightbound*/currpos + MISS_BEHIND,
2773 /*donor_leftbound*/currpos - GREEDY_ADVANCE,
2774 /*acceptor_rightbound*/prevpos + MISS_BEHIND,
2775 /*acceptor_leftbound*/prevpos - GREEDY_ADVANCE,
2776 chroffset) == true) {
2777 debug9(printf("lookforward minus: antisense canonical\n"));
2778 canonicalp = true;
2779 } else {
2780 debug9(printf("lookforward minus: not canonical\n"));
2781 canonicalp = false;
2782 }
2783 }
2784
2785 if (canonicalp == true) {
2786 debug9(canonicalsgn = +1);
2787 } else {
2788 debug9(canonicalsgn = 0);
2789 fwd_score -= non_canonical_penalty;
2790 }
2791 }
2792
2793 debug9(printf("\tD2, hit %d. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
2794 hiti,prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
2795 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
2796 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
2797 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
2798
2799 /* Disallow ties, which should favor adjacent */
2800 if (fwd_score > best_fwd_score) {
2801 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
2802 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
2803 } else {
2804 best_fwd_consecutive = 0;
2805 }
2806 best_fwd_rootposition = prevlink->fwd_rootposition;
2807 best_fwd_score = fwd_score;
2808 best_fwd_prevpos = prev_querypos;
2809 best_fwd_prevhit = prevhit;
2810 best_fwd_tracei = ++*fwd_tracei;
2811 #ifdef DEBUG9
2812 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2813 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2814 best_fwd_intronnunk = prevlink->fwd_intronnunk;
2815 switch (canonicalsgn) {
2816 case 1: best_fwd_intronnfwd++; break;
2817 case 0: best_fwd_intronnunk++; break;
2818 }
2819 #endif
2820 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
2821 } else {
2822 debug9(printf(" => Loses to %d\n",best_fwd_score));
2823 }
2824
2825 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2826 }
2827
2828
2829 /* Scoring appears to be the same as for range 4, which is rarely called, so including in range 4 */
2830 /* Range 3: From (querypos + EQUAL_DISTANCE_NOT_SPLICING) to (querypos - EQUAL_DISTANCE_NOT_SPLICING) */
2831 /* This is equivalent to -diffdistance > EQUAL_DISTANCE_NOT_SPLICING && prevposition + indexsize_nt <= position */
2832
2833
2834 /* Range 4: From (prev_querypos - EQUAL_DISTANCE_NOT_SPLICING) to indexsize_nt */
2835 while (prevhit != -1 && (prevposition = /*mappings[prev_querypos]*/prev_mappings[prevhit]) >= position + indexsize_nt) {
2836 prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]);
2837
2838 gendistance = prevposition - position;
2839 /* was abs(gendistance - querydistance) */
2840 diffdistance = gendistance > querydistance ? (gendistance - querydistance) : (querydistance - gendistance);
2841
2842 #ifdef BAD_GMAX
2843 fwd_score = fwd_scores[prev_querypos][prevhit] + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/;
2844 #else
2845 /* diffdistance <= EQUAL_DISTANCE_NOT_SPLICING */
2846 /* This is how version 2013-08-14 did it */
2847 fwd_score = fwd_scores[prev_querypos][prevhit] + CONSEC_POINTS_PER_MATCH;
2848 #endif
2849
2850 debug9(printf("\tD4, hit %d. Fwd mismatch qpos %d,%d at %ux%d (score = %d -> %d, consec = %d (from #%d), intr = %d-%d-%d, gendist %u, querydist %d, canonicalsgn %d)",
2851 hiti,prev_querypos,prevhit,prevposition,active[prev_querypos][prevhit],
2852 fwd_scores[prev_querypos][prevhit],fwd_score,prevlink->fwd_consecutive,prevlink->fwd_tracei,
2853 best_fwd_intronnfwd,best_fwd_intronnrev,best_fwd_intronnunk,
2854 gendistance-indexsize_nt,querydistance-indexsize_nt,canonicalsgn));
2855
2856 /* Disallow ties, which should favor adjacent */
2857 if (fwd_score > best_fwd_score) {
2858 if (diffdistance <= EQUAL_DISTANCE_FOR_CONSECUTIVE) {
2859 best_fwd_consecutive = prevlink->fwd_consecutive + querydistance;
2860 } else {
2861 best_fwd_consecutive = 0;
2862 }
2863 best_fwd_rootposition = prevlink->fwd_rootposition;
2864 best_fwd_score = fwd_score;
2865 best_fwd_prevpos = prev_querypos;
2866 best_fwd_prevhit = prevhit;
2867 /* best_fwd_tracei = ++*fwd_tracei; */
2868 best_fwd_tracei = prevlink->fwd_tracei; /* Keep previous trace, as in range 3 */
2869
2870 #ifdef DEBUG9
2871 best_fwd_intronnfwd = prevlink->fwd_intronnfwd;
2872 best_fwd_intronnrev = prevlink->fwd_intronnrev;
2873 best_fwd_intronnunk = prevlink->fwd_intronnunk;
2874 switch (canonicalsgn) {
2875 case 1: best_fwd_intronnfwd++; break;
2876 case 0: best_fwd_intronnunk++; break;
2877 }
2878 #endif
2879 debug9(printf(" => Best fwd at %d (consec = %d)\n",fwd_score,best_fwd_consecutive));
2880 } else {
2881 debug9(printf(" => Loses to %d\n",best_fwd_score));
2882 }
2883
2884 prevhit = /*active[prev_querypos]*/prev_active[prevhit];
2885 }
2886 }
2887 }
2888 }
2889
2890 /* Best_score needs to beat something positive to prevent a
2891 small local extension from beating a good canonical intron.
2892 If querypos is too small, don't insert an intron. */
2893 /* linksconsecutive already assigned above */
2894 currlink = &(links[curr_querypos][hiti + low_hit]);
2895 currlink->fwd_consecutive = best_fwd_consecutive;
2896 currlink->fwd_rootposition = best_fwd_rootposition;
2897 currlink->fwd_pos = best_fwd_prevpos;
2898 currlink->fwd_hit = best_fwd_prevhit;
2899 if (currlink->fwd_pos >= 0) {
2900 currlink->fwd_tracei = best_fwd_tracei;
2901 fwd_scores[curr_querypos][hiti + low_hit] = best_fwd_score;
2902 #ifdef MOVE_TO_STAGE3
2903 } else if (anchoredp == true) {
2904 currlink->fwd_tracei = -1;
2905 fwd_scores[curr_querypos][hiti + low_hit] = -100000;
2906 #endif
2907 } else if (localp == true) {
2908 currlink->fwd_tracei = ++*fwd_tracei;
2909 fwd_scores[curr_querypos][hiti + low_hit] = indexsize_nt;
2910 } else {
2911 currlink->fwd_tracei = ++*fwd_tracei;
2912 fwd_scores[curr_querypos][hiti + low_hit] = best_fwd_score;
2913 }
2914
2915 #ifdef DEBUG9
2916 currlink->fwd_intronnfwd = best_fwd_intronnfwd;
2917 currlink->fwd_intronnrev = best_fwd_intronnrev;
2918 currlink->fwd_intronnunk = best_fwd_intronnunk;
2919 #endif
2920
2921 debug9(printf("\tChose %d,%d with score %d (fwd) => trace #%d\n",
2922 currlink->fwd_pos,currlink->fwd_hit,fwd_scores[curr_querypos][hiti + low_hit],currlink->fwd_tracei));
2923 debug3(printf("%d %d %d %d 1\n",querypos,hit,best_prevpos,best_prevhit));
2924 }
2925
2926 FREEA(frontier);
2927 }
2928
2929 return;
2930 }
2931
2932
2933 static void
revise_active_lookback(int ** active,int * firstactive,int * nactive,int low_hit,int high_hit,int ** fwd_scores,int querypos)2934 revise_active_lookback (int **active, int *firstactive, int *nactive,
2935 int low_hit, int high_hit, int **fwd_scores, int querypos) {
2936 int best_score, threshold, score;
2937 int hit, *ptr;
2938
2939 debug6(printf("Revising querypos %d from low_hit %d to high_hit %d. Scores:\n",querypos,low_hit,high_hit));
2940 if ((hit = low_hit) >= high_hit) {
2941 debug6(printf("1. Initializing firstactive for querypos %d to be -1\n",querypos));
2942 firstactive[querypos] = -1;
2943 nactive[querypos] = 0;
2944
2945 } else {
2946 debug6(printf("At hit %d, fwd_score is %d",hit,fwd_scores[querypos][hit]));
2947 best_score = fwd_scores[querypos][hit];
2948 #ifdef SEPARATE_FWD_REV
2949 debug6(printf(" and rev_score is %d",rev_scores[querypos][hit]));
2950 if ((score = rev_scores[querypos][hit]) > best_score) {
2951 best_score = score;
2952 }
2953 #endif
2954 debug6(printf("\n"));
2955
2956 for (hit++; hit < high_hit; hit++) {
2957 debug6(printf("At hit %d, fwd_score is %d",hit,fwd_scores[querypos][hit]));
2958 if ((score = fwd_scores[querypos][hit]) > best_score) {
2959 best_score = score;
2960 }
2961 #ifdef SEPARATE_FWD_REV
2962 debug6(printf(" and rev_score is %d",rev_scores[querypos][hit]));
2963 if ((score = rev_scores[querypos][hit]) > best_score) {
2964 best_score = score;
2965 }
2966 #endif
2967 debug6(printf("\n"));
2968 }
2969
2970 threshold = best_score - SCORE_FOR_RESTRICT;
2971 if (threshold < 0) {
2972 threshold = 0;
2973 }
2974
2975 nactive[querypos] = 0;
2976 firstactive[querypos] = -1;
2977 ptr = &(firstactive[querypos]);
2978 hit = low_hit;
2979 while (hit < high_hit) {
2980 while (hit < high_hit && fwd_scores[querypos][hit] <= threshold
2981 #ifdef SEPARATE_FWD_REV
2982 && rev_scores[querypos][hit] <= threshold
2983 #endif
2984 ) {
2985 hit++;
2986 }
2987 *ptr = hit;
2988 if (hit < high_hit) {
2989 nactive[querypos] += 1;
2990 ptr = &(active[querypos][hit]);
2991 hit++;
2992 }
2993 }
2994 *ptr = -1;
2995 }
2996
2997 debug6(
2998 printf("Valid hits (%d) at querypos %d (firstactive %d):",nactive[querypos],querypos,firstactive[querypos]);
2999 hit = firstactive[querypos];
3000 while (hit != -1) {
3001 printf(" %d",hit);
3002 hit = active[querypos][hit];
3003 }
3004 printf("\n");
3005 );
3006
3007 return;
3008 }
3009
3010
3011 static void
revise_active_lookforward(int ** active,int * firstactive,int * nactive,int low_hit,int high_hit,int ** fwd_scores,int querypos)3012 revise_active_lookforward (int **active, int *firstactive, int *nactive,
3013 int low_hit, int high_hit, int **fwd_scores, int querypos) {
3014 int best_score, threshold, score;
3015 int hit, *ptr;
3016
3017 debug6(printf("Revising querypos %d from high_hit %d to low_hit %d. Scores:\n",querypos,high_hit,low_hit));
3018 if ((hit = high_hit - 1) < low_hit) {
3019 debug6(printf("2. Initializing firstactive for querypos %d to be -1\n",querypos));
3020 firstactive[querypos] = -1;
3021 nactive[querypos] = 0;
3022 } else {
3023 debug6(printf("At hit %d, fwd_score is %d",hit,fwd_scores[querypos][hit]));
3024 best_score = fwd_scores[querypos][hit];
3025 #ifdef SEPARATE_FWD_REV
3026 debug6(printf(" and rev_score is %d",rev_scores[querypos][hit]));
3027 if ((score = rev_scores[querypos][hit]) > best_score) {
3028 best_score = score;
3029 }
3030 #endif
3031 debug6(printf("\n"));
3032
3033 for (--hit; hit >= low_hit; --hit) {
3034 debug6(printf("At hit %d, fwd_score is %d",hit,fwd_scores[querypos][hit]));
3035 if ((score = fwd_scores[querypos][hit]) > best_score) {
3036 best_score = score;
3037 }
3038 #ifdef SEPARATE_FWD_REV
3039 debug6(printf(" and rev_score is %d",rev_scores[querypos][hit]));
3040 if ((score = rev_scores[querypos][hit]) > best_score) {
3041 best_score = score;
3042 }
3043 #endif
3044 debug6(printf("\n"));
3045 }
3046
3047 threshold = best_score - SCORE_FOR_RESTRICT;
3048 if (threshold < 0) {
3049 threshold = 0;
3050 }
3051
3052 nactive[querypos] = 0;
3053 firstactive[querypos] = -1;
3054 ptr = &(firstactive[querypos]);
3055 hit = high_hit - 1;
3056 while (hit >= low_hit) {
3057 while (hit >= low_hit && fwd_scores[querypos][hit] <= threshold
3058 #ifdef SEPARATE_FWD_REV
3059 && rev_scores[querypos][hit] <= threshold
3060 #endif
3061 ) {
3062 --hit;
3063 }
3064 *ptr = hit;
3065 if (hit >= low_hit) {
3066 nactive[querypos] += 1;
3067 ptr = &(active[querypos][hit]);
3068 --hit;
3069 }
3070 }
3071 *ptr = -1;
3072 }
3073
3074 debug6(
3075 printf("Valid hits (%d) at querypos %d (firstactive %d):",nactive[querypos],querypos,firstactive[querypos]);
3076 hit = firstactive[querypos];
3077 while (hit != -1) {
3078 printf(" %d",hit);
3079 hit = active[querypos][hit];
3080 }
3081 printf("\n");
3082 );
3083
3084 return;
3085 }
3086
3087
3088
/* Builds a ragged int matrix with length1 rows backed by a single
   zero-filled allocation of totallength ints.  Row 0 starts at the
   beginning of that allocation; each later row starts where the
   previous row ends, with a previous row of non-positive length
   contributing no space (so it shares its start with the next row).
   Release with intmatrix_1d_free. */
static int **
intmatrix_1d_new (int length1, int *lengths2, int totallength) {
  int **rows;
  int *cursor;
  int r;

  rows = (int **) CALLOC(length1,sizeof(int *));
  cursor = (int *) CALLOC(totallength,sizeof(int));

  rows[0] = cursor;
  for (r = 1; r < length1; r++) {
    /* Advance past the previous row only if it actually has entries */
    if (lengths2[r-1] > 0) {
      cursor = &(cursor[lengths2[r-1]]);
    }
    rows[r] = cursor;
  }

  return rows;
}
3105
/* Releases a matrix built by intmatrix_1d_new: first the shared data
   block (which row 0 points to), then the array of row pointers. */
static void
intmatrix_1d_free (int ***matrix) {
  int **rows = *matrix;

  FREE(rows[0]);                /* The single block holding all entries */
  FREE(*matrix);                /* The row-pointer array itself */
  return;
}
3112
3113
/* Builds a ragged int matrix with length1 rows, where row i is its own
   zero-filled allocation of lengths2[i] ints.  Rows whose length is
   non-positive are left as NULL.  Release with intmatrix_2d_free. */
static int **
intmatrix_2d_new (int length1, int *lengths2) {
  int **rows;
  int r;

  rows = (int **) CALLOC(length1,sizeof(int *));
  for (r = 0; r < length1; r++) {
    rows[r] = (lengths2[r] > 0) ? (int *) CALLOC(lengths2[r],sizeof(int)) : (int *) NULL;
  }

  return rows;
}
3129
/* Releases a matrix built by intmatrix_2d_new: every non-NULL row of
   the first length1 rows, and then the array of row pointers. */
static void
intmatrix_2d_free (int ***matrix, int length1) {
  int **rows = *matrix;
  int r;

  for (r = 0; r < length1; r++) {
    if (rows[r] != NULL) {
      FREE(rows[r]);
    }
  }

  FREE(*matrix);
  return;
}
3142
3143
3144 /************************************************************************
3145 * Cells used for ranking hits
3146 ************************************************************************/
3147
#if 0
/* Disabled.  Replaced by Cellpool_T routines; kept only for reference. */

typedef struct Cell_T *Cell_T;
struct Cell_T {
  int rootposition;
  int endposition;
  int querypos;
  int hit;
  bool fwdp;
  int score;
};


/* Frees the cell that old points to, using FREE */
static void
Cell_free (Cell_T *old) {
  FREE(*old);
  return;
}


/* Allocates one cell with MALLOC and copies each argument into the
   field of the same name */
static Cell_T
Cell_new (int rootposition, int endposition, int querypos, int hit, bool fwdp, int score) {
  Cell_T cell;

  cell = (Cell_T) MALLOC(sizeof(*cell));
  cell->rootposition = rootposition;
  cell->endposition = endposition;
  cell->querypos = querypos;
  cell->hit = hit;
  cell->fwdp = fwdp;
  cell->score = score;

  return cell;
}
#endif
3180
3181
3182 #ifdef SLOW
3183 /* Used for the final set of cells, to see if we have non-overlapping paths */
3184 static int
Cell_interval_cmp(const void * a,const void * b)3185 Cell_interval_cmp (const void *a, const void *b) {
3186 Cell_T x = * (Cell_T *) a;
3187 Cell_T y = * (Cell_T *) b;
3188
3189 if (x->rootposition < y->rootposition) {
3190 return -1;
3191 } else if (y->rootposition < x->rootposition) {
3192 return +1;
3193
3194 } else if (x->endposition > y->endposition) {
3195 return -1;
3196 } else if (y->endposition > x->endposition) {
3197 return +1;
3198
3199 } else {
3200 return 0;
3201 }
3202 }
3203 #endif
3204
3205
3206 /* Used for the initial set of cells, to get the end cell for each rootposition */
3207 static int
Cell_rootposition_left_cmp(const void * a,const void * b)3208 Cell_rootposition_left_cmp (const void *a, const void *b) {
3209 Cell_T x = * (Cell_T *) a;
3210 Cell_T y = * (Cell_T *) b;
3211
3212 if (x->rootposition < y->rootposition) {
3213 return -1;
3214 } else if (y->rootposition < x->rootposition) {
3215 return +1;
3216
3217 #if 0
3218 /* Want score ranking, rather than interval ranking here. Otherwise, we don't get the final endposition */
3219 } else if (x->endposition < y->endposition) {
3220 return -1;
3221 } else if (y->endposition < x->endposition) {
3222 return +1;
3223 #endif
3224
3225 #if 0
3226 } else if (x->tracei < y->tracei) {
3227 return -1;
3228 } else if (y->tracei < x->tracei) {
3229 return +1;
3230 #endif
3231 } else if (x->score > y->score) {
3232 return -1;
3233 } else if (y->score > x->score) {
3234 return +1;
3235 } else if (x->querypos > y->querypos) {
3236 return -1;
3237 } else if (y->querypos > x->querypos) {
3238 return +1;
3239 } else if (x->hit < y->hit) {
3240 return -1;
3241 } else if (y->hit < x->hit) {
3242 return +1;
3243 } else if (x->fwdp == true && y->fwdp == false) {
3244 return -1;
3245 } else if (y->fwdp == true && x->fwdp == false) {
3246 return +1;
3247 } else {
3248 return 0;
3249 }
3250 }
3251
3252
3253 /* Used for the initial set of cells, to get the end cell for each rootposition */
3254 static int
Cell_rootposition_right_cmp(const void * a,const void * b)3255 Cell_rootposition_right_cmp (const void *a, const void *b) {
3256 Cell_T x = * (Cell_T *) a;
3257 Cell_T y = * (Cell_T *) b;
3258
3259 if (x->rootposition < y->rootposition) {
3260 return -1;
3261 } else if (y->rootposition < x->rootposition) {
3262 return +1;
3263
3264 #if 0
3265 /* Want score ranking, rather than interval ranking here. Otherwise, we don't get the final endposition */
3266 } else if (x->endposition < y->endposition) {
3267 return -1;
3268 } else if (y->endposition < x->endposition) {
3269 return +1;
3270 #endif
3271
3272 #if 0
3273 } else if (x->tracei < y->tracei) {
3274 return -1;
3275 } else if (y->tracei < x->tracei) {
3276 return +1;
3277 #endif
3278 } else if (x->score > y->score) {
3279 return -1;
3280 } else if (y->score > x->score) {
3281 return +1;
3282 } else if (x->querypos > y->querypos) {
3283 return -1;
3284 } else if (y->querypos > x->querypos) {
3285 return +1;
3286 } else if (x->hit > y->hit) {
3287 return -1;
3288 } else if (y->hit > x->hit) {
3289 return +1;
3290 } else if (x->fwdp == true && y->fwdp == false) {
3291 return -1;
3292 } else if (y->fwdp == true && x->fwdp == false) {
3293 return +1;
3294 } else {
3295 return 0;
3296 }
3297 }
3298
3299
3300 static int
Cell_score_cmp(const void * a,const void * b)3301 Cell_score_cmp (const void *a, const void *b) {
3302 Cell_T x = * (Cell_T *) a;
3303 Cell_T y = * (Cell_T *) b;
3304
3305 if (x->score > y->score) {
3306 return -1;
3307 } else if (y->score > x->score) {
3308 return +1;
3309 } else {
3310 return 0;
3311 }
3312 }
3313
3314
3315 #ifdef USE_THRESHOLD_SCORE
3316 /* Doesn't work well for short dynamic programming at the ends of a read */
3317 static Cell_T *
Linkmatrix_get_cells_fwd(int * nunique,struct Link_T ** links,int querystart,int queryend,int * npositions,int indexsize,int bestscore,bool favor_right_p,Cellpool_T cellpool)3318 Linkmatrix_get_cells_fwd (int *nunique, struct Link_T **links, int querystart, int queryend, int *npositions,
3319 int indexsize, int bestscore, bool favor_right_p, Cellpool_T cellpool) {
3320 Cell_T *sorted, *cells;
3321 List_T celllist = NULL;
3322 int querypos, hit;
3323 int rootposition, last_rootposition;
3324 int threshold_score, best_score_for_root;
3325 int ngood, ncells, i, k;
3326
3327 if (bestscore > 2*suboptimal_score_end) {
3328 threshold_score = bestscore - suboptimal_score_end;
3329 } else {
3330 threshold_score = bestscore/2;
3331 }
3332 if (threshold_score <= indexsize) {
3333 threshold_score = indexsize + 1;
3334 }
3335
3336 ncells = 0;
3337 for (querypos = querystart; querypos <= queryend; querypos++) {
3338 ngood = 0;
3339 for (hit = 0; hit < npositions[querypos]; hit++) {
3340 if (links[querypos][hit].fwd_score >= threshold_score) {
3341 ngood++;
3342 }
3343 }
3344 if (ngood > 0 && ngood <= 10) {
3345 for (hit = 0; hit < npositions[querypos]; hit++) {
3346 debug11(printf(" At %d,%d, comparing score %d with threshold_score %d\n",
3347 querypos,hit,links[querypos][hit].fwd_score,threshold_score));
3348 if (links[querypos][hit].fwd_score >= threshold_score) {
3349 rootposition = links[querypos][hit].fwd_rootposition;
3350 /* tracei = links[querypos][hit].fwd_tracei; */
3351 celllist = Cellpool_push(celllist,cellpool,rootposition,
3352 /*endposition*/(int) mappings[querypos][hit],
3353 querypos,hit,/*fwdp*/true,links[querypos][hit].fwd_score);
3354 ncells++;
3355 }
3356 }
3357 }
3358 }
3359
3360 if (ncells == 0) {
3361 *nunique = 0;
3362 return (Cell_T *) NULL;
3363
3364 } else {
3365 /* Take best result for each tracei */
3366 /* Using alloca can give a stack overflow */
3367 cells = (Cell_T *) List_to_array(celllist,NULL);
3368 /* List_free(&celllist); -- No need with cellpool */
3369
3370 if (favor_right_p == true) {
3371 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
3372 } else {
3373 /* favor_right_p is always false for GMAP */
3374 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
3375 }
3376
3377 sorted = (Cell_T *) MALLOC(ncells * sizeof(Cell_T)); /* Return value */
3378 k = 0;
3379
3380 last_rootposition = -1;
3381 best_score_for_root = -1;
3382 for (i = 0; i < ncells; i++) {
3383 if (cells[i]->rootposition != last_rootposition) {
3384 debug11(printf("Pushing rootposition %d, trace #%d, score %d, pos %d, hit %d\n",
3385 cells[i]->rootposition,cells[i]->tracei,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3386 sorted[k++] = cells[i];
3387 last_rootposition = cells[i]->rootposition;
3388 best_score_for_root = cells[i]->score;
3389
3390 } else if (cells[i]->querypos == best_score_for_root) {
3391 debug11(printf("Equivalent cell for rootposition %d, trace #%d, score %d, pos %d, hit %d\n",
3392 cells[i]->rootposition,cells[i]->tracei,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3393 sorted[k++] = cells[i];
3394 /* last_rootposition = cells[i]->rootposition;*/
3395 /* best_score_for_root = cells[i]->score; */
3396
3397 } else {
3398 /* Cell_free(&(cells[i])); -- no need with cellpool */
3399
3400 }
3401 }
3402 debug11(printf("\n"));
3403 FREE(cells);
3404
3405 *nunique = k;
3406 qsort(sorted,*nunique,sizeof(Cell_T),Cell_score_cmp);
3407
3408 return sorted;
3409 }
3410 }
3411
3412 #else
3413
3414 static Cell_T *
get_cells_fwd(int * nunique,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int querystart,int queryend,int * npositions,bool favor_right_p,Cellpool_T cellpool)3415 get_cells_fwd (int *nunique, struct Link_T **links, int **fwd_scores, Chrpos_T **mappings,
3416 int querystart, int queryend, int *npositions,
3417 bool favor_right_p, Cellpool_T cellpool) {
3418 Cell_T *sorted, *cells;
3419 List_T celllist = NULL;
3420 int querypos, hit;
3421 int rootposition, last_rootposition;
3422 int best_score_for_root;
3423 int ncells, i, k;
3424
3425 ncells = 0;
3426 for (querypos = querystart; querypos <= queryend; querypos++) {
3427 for (hit = 0; hit < npositions[querypos]; hit++) {
3428 if (fwd_scores[querypos][hit] > 0) {
3429 rootposition = links[querypos][hit].fwd_rootposition;
3430 /* tracei = links[querypos][hit].fwd_tracei; */
3431 celllist = Cellpool_push(celllist,cellpool,rootposition,
3432 /*endposition*/(int) mappings[querypos][hit],
3433 querypos,hit,/*fwdp*/true,fwd_scores[querypos][hit]);
3434 ncells++;
3435 }
3436 }
3437 }
3438
3439 debug12(printf("Have %d cells\n",ncells));
3440 if (ncells == 0) {
3441 *nunique = 0;
3442 return (Cell_T *) NULL;
3443
3444 } else {
3445 /* Take best result for each tracei */
3446 /* Using alloca can give a stack overflow */
3447 cells = (Cell_T *) List_to_array(celllist,NULL);
3448 /* List_free(&celllist); -- No need with cellpool */
3449
3450 if (favor_right_p == true) {
3451 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
3452 } else {
3453 /* favor_right_p is always false for GMAP */
3454 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
3455 }
3456
3457 sorted = (Cell_T *) MALLOC(ncells * sizeof(Cell_T)); /* Return value */
3458 k = 0;
3459
3460 last_rootposition = -1;
3461 best_score_for_root = -1;
3462 for (i = 0; i < ncells; i++) {
3463 if (cells[i]->rootposition != last_rootposition) {
3464 /* Take best cell at this rootposition */
3465 debug11(printf("Pushing rootposition %d, score %d, pos %d, hit %d\n",
3466 cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3467 sorted[k++] = cells[i];
3468 last_rootposition = cells[i]->rootposition;
3469 best_score_for_root = cells[i]->score;
3470
3471 } else if (cells[i]->score == best_score_for_root) {
3472 /* Take equivalent cell for this rootposition */
3473 debug11(printf("Pushing equivalent end for rootposition %d, score %d, pos %d, hit %d\n",
3474 cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3475 sorted[k++] = cells[i];
3476 /* last_rootposition = cells[i]->rootposition; */
3477 /* best_score_for_root = cells[i]->score; */
3478
3479 } else {
3480 /* Cell_free(&(cells[i])); -- no need with cellpool */
3481 }
3482 }
3483 debug11(printf("\n"));
3484 FREE(cells);
3485
3486 *nunique = k;
3487 qsort(sorted,*nunique,sizeof(Cell_T),Cell_score_cmp);
3488
3489 return sorted;
3490 }
3491 }
3492
3493 #endif
3494
3495 #if 0
/* Disabled variant (the enclosing #if 0 compiles it out) that collects end
   cells whose fwd (and, with SEPARATE_FWD_REV, rev) score reaches a threshold
   derived from bestscore.

   nunique:      output; number of entries in the returned array.
   links:        per-querypos, per-hit link table read for scores and root positions.
   querystart, queryend: inclusive range of query positions scanned.
   npositions:   number of hits at each query position.
   indexsize:    the threshold is forced to be at least indexsize + 1.
   bestscore:    best overall score, used to derive the threshold.
   favor_right_p: selects which rootposition comparator orders the cells.
   cellpool:     pool from which cells are pushed.

   Returns a MALLOC'ed array of cells ordered by Cell_score_cmp, or NULL (with
   *nunique set to 0) when no cell qualifies.

   NOTE(review): the body reads `mappings` and `suboptimal_score_end`, neither
   of which is a parameter or local of this function, so the signature would
   need updating before this code could be re-enabled. */
3496 static Cell_T *
3497 Linkmatrix_get_cells_both (int *nunique, struct Link_T **links, int querystart, int queryend, int *npositions,
3498 int indexsize, int bestscore, bool favor_right_p, Cellpool_T cellpool) {
3499 Cell_T *sorted, *cells;
3500 List_T celllist = NULL;
3501 int querypos, hit;
3502 int rootposition, last_rootposition;
3503 int threshold_score, best_score_for_root;
3504 int ngood, ncells, i, k;
3505
/* Threshold: bestscore minus the suboptimal allowance when bestscore is large
   enough, otherwise half of bestscore; never below indexsize + 1 */
3506 if (bestscore > 2*suboptimal_score_end) {
3507 threshold_score = bestscore - suboptimal_score_end;
3508 } else {
3509 threshold_score = bestscore/2;
3510 }
3511 if (threshold_score <= indexsize) {
3512 threshold_score = indexsize + 1;
3513 }
3514
3515 debug11(printf("Entered Linkmatrix_get_cells_both with querystart %d, queryend %d, threshold score %d\n",
3516 querystart,queryend,threshold_score));
3517
3518 ncells = 0;
3519 for (querypos = querystart; querypos <= queryend; querypos++) {
/* First pass: count the hits at this querypos that reach the threshold */
3520 ngood = 0;
3521 for (hit = 0; hit < npositions[querypos]; hit++) {
3522 if (links[querypos][hit].fwd_score >= threshold_score) {
3523 ngood++;
3524 }
3525 #ifdef SEPARATE_FWD_REV
3526 if (links[querypos][hit].rev_score >= threshold_score) {
3527 ngood++;
3528 }
3529 #endif
3530 }
/* Second pass: only query positions with 1..10 qualifying hits contribute cells */
3531 if (ngood > 0 && ngood <= 10) {
3532 for (hit = 0; hit < npositions[querypos]; hit++) {
3533 if (links[querypos][hit].fwd_score >= threshold_score) {
3534 rootposition = links[querypos][hit].fwd_rootposition;
3535 /* tracei = links[querypos][hit].fwd_tracei; */
3536 celllist = Cellpool_push(celllist,cellpool,rootposition,
3537 /*endposition*/(int) mappings[querypos][hit],
3538 querypos,hit,/*fwdp*/true,links[querypos][hit].fwd_score);
3539 ncells++;
3540 }
3541 #ifdef SEPARATE_FWD_REV
3542 if (links[querypos][hit].rev_score >= threshold_score) {
3543 rootposition = links[querypos][hit].rev_rootposition;
3544 /* tracei = links[querypos][hit].rev_tracei; */
3545 celllist = Cellpool_push(celllist,cellpool,rootposition,
3546 /*endposition*/(int) mappings[querypos][hit],
3547 querypos,hit,/*fwdp*/false,links[querypos][hit].rev_score);
3548 ncells++;
3549 }
3550 #endif
3551 }
3552 }
3553 }
3554
3555 debug12(printf("Have %d cells\n",ncells));
3556 if (ncells == 0) {
3557 *nunique = 0;
3558 return (Cell_T *) NULL;
3559
3560 } else {
3561 /* Take best result for each tracei */
3562 /* Using alloca can give a stack overflow */
3563 cells = (Cell_T *) List_to_array(celllist,NULL);
3564 /* List_free(&celllist); -- no need with cellpool */
3565
3566 if (favor_right_p == true) {
3567 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_right_cmp);
3568 } else {
3569 /* favor_right_p is always false for GMAP */
3570 qsort(cells,ncells,sizeof(Cell_T),Cell_rootposition_left_cmp);
3571 }
3572
3573 sorted = (Cell_T *) MALLOC(ncells * sizeof(Cell_T)); /* Return value */
3574 k = 0;
3575
/* Keep the first cell seen for each rootposition plus any later cell of the
   same rootposition with an equal score.  Presumably the rootposition
   comparators place the best-scoring cell of each rootposition first --
   TODO confirm against Cell_rootposition_*_cmp. */
3576 last_rootposition = -1;
3577 best_score_for_root = -1;
3578 for (i = 0; i < ncells; i++) {
3579 if (cells[i]->rootposition != last_rootposition) {
3580 /* Take best cell at this rootposition */
3581 debug11(printf("rootposition %d, score %d, pos %d, hit %d\n",
3582 cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3583 sorted[k++] = cells[i];
3584 last_rootposition = cells[i]->rootposition;
3585 best_score_for_root = cells[i]->score;
3586
3587 } else if (cells[i]->score == best_score_for_root) {
3588 /* Take equivalent end cell for this rootposition */
3589 debug11(printf("equivalent end for rootposition %d, score %d, pos %d, hit %d\n",
3590 cells[i]->rootposition,cells[i]->score,cells[i]->querypos,cells[i]->hit));
3591 sorted[k++] = cells[i];
3592 /* last_rootposition = cells[i]->rootposition; */
3593 /* best_score_for_root = cells[i]->score; */
3594
3595 } else {
3596 /* Cell_free(&(cells[i])); -- no need with cellpool */
3597 }
3598 }
3599 debug11(printf("\n"));
/* Only the temporary pointer array is freed; the cells themselves are not */
3600 FREE(cells);
3601
3602 *nunique = k;
3603 qsort(sorted,*nunique,sizeof(Cell_T),Cell_score_cmp);
3604
3605 return sorted;
3606 }
3607 }
3608 #endif
3609
3610
3611 #ifdef MOVE_TO_STAGE3
3612 static int
binary_search(int lowi,int highi,Chrpos_T * mappings,Chrpos_T goal)3613 binary_search (int lowi, int highi, Chrpos_T *mappings, Chrpos_T goal) {
3614 int middlei;
3615
3616 debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,goal));
3617 if (mappings == NULL) {
3618 return -1;
3619 } else {
3620 while (lowi < highi) {
3621 middlei = lowi + ((highi - lowi) / 2);
3622 debug10(printf(" binary: %d:%u %d:%u %d:%u vs. %u\n",
3623 lowi,mappings[lowi],middlei,mappings[middlei],
3624 highi,mappings[highi],goal));
3625 if (goal < mappings[middlei]) {
3626 highi = middlei;
3627 } else if (goal > mappings[middlei]) {
3628 lowi = middlei + 1;
3629 } else {
3630 debug10(printf("binary search returns %d\n",middlei));
3631 return middlei;
3632 }
3633 }
3634
3635 debug10(printf("binary search returns %d\n",highi));
3636 return highi;
3637 }
3638 }
3639 #endif
3640
3641
3642 /* Returns an array of cells; the number of cells is stored in *ncells */
3643 /* For PMAP, indexsize is in aa. */
/* Lookback scoring pass.

   Walks the query positions from the first one in querystart..queryend that
   has mappings up to queryend.  At each position, the hits whose mapped
   positions lie within [minactive, maxactive] for that position are handed
   to score_querypos_lookback_one (one hit) or score_querypos_lookback_mult
   (several hits), together with links, fwd_scores and the list of already
   processed query positions.  The resulting fwd_scores are then read back
   here to track the best hit per position and the "grand" best cell so far.
   Finally the result cells are collected with get_cells_fwd (or the
   Linkmatrix_get_cells_* functions when SEPARATE_FWD_REV is defined).

   ncells:         output; receives the number of cells in the returned array.
   links, fwd_scores: per-querypos, per-hit tables written by this pass.
   mappings, npositions: per-querypos arrays of mapped positions and counts.
   totalpositions, oned_matrix_p: passed to the allocator of the temporary
                   "active" matrix; the assert requires oned_matrix_p == true.
   minactive, maxactive: per-querypos bounds on the mapped positions considered.
   firstactive, nactive: caller-supplied per-querypos arrays; written here and
                   also passed to revise_active_lookback.
   indexsize:      oligomer size (tripled into indexsize_nt under PMAP).
   skip_repetitive_p: allows skipping positions with >= MAX_NACTIVE hits in range.
   middlep:        when false, a position that yields no best hit has all of
                   its hits re-initialized as fresh starts.
   debug_graphic_p: when true, calls mappings_dump_R on the final active set.
   favor_right_p, cellpool: forwarded to the get_cells call.

   splicingp and maxintronlen are used below but are neither parameters nor
   locals; they are defined outside this function (not visible in this chunk).

   Returns whatever the get_cells call returns: for get_cells_fwd that is a
   MALLOC'ed array, or NULL with *ncells == 0 when no cell has a positive score. */
3644 static Cell_T *
align_compute_scores_lookback(int * ncells,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int * npositions,int totalpositions,bool oned_matrix_p,Chrpos_T * minactive,Chrpos_T * maxactive,int * firstactive,int * nactive,Cellpool_T cellpool,int querystart,int queryend,int querylength,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,char * queryseq_ptr,bool anchoredp,int anchor_querypos,Chrpos_T anchor_position,bool localp,bool skip_repetitive_p,bool use_canonical_p,int non_canonical_penalty,bool debug_graphic_p,bool favor_right_p,bool middlep)3645 align_compute_scores_lookback (int *ncells, struct Link_T **links, int **fwd_scores,
3646 Chrpos_T **mappings, int *npositions, int totalpositions,
3647 bool oned_matrix_p, Chrpos_T *minactive, Chrpos_T *maxactive,
3648 int *firstactive, int *nactive, Cellpool_T cellpool,
3649 int querystart, int queryend, int querylength,
3650
3651 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
3652
3653 int indexsize,
3654 #ifdef DEBUG9
3655 char *queryseq_ptr,
3656 #endif
3657 #ifdef MOVE_TO_STAGE3
3658 bool anchoredp, int anchor_querypos, Chrpos_T anchor_position,
3659 #endif
3660 bool localp, bool skip_repetitive_p,
3661 bool use_canonical_p, int non_canonical_penalty, bool debug_graphic_p,
3662 bool favor_right_p, bool middlep) {
3663 #if 0
3664 bool anchoredp = false;
3665 int anchor_querypos = 0;
3666 Chrpos_T anchor_position = 0;
3667 #endif
3668
3669 Cell_T *cells;
3670 Link_T currlink;
3671 int curr_querypos, indexsize_nt, indexsize_query, hit, nhits, low_hit, high_hit;
3672 int nskipped, min_hits, specific_querypos, specific_low_hit, specific_high_hit, next_querypos;
3673 Intlist_T processed = NULL;
3674 int best_overall_score = 0;
3675 int grand_fwd_score, grand_fwd_querypos, grand_fwd_hit, best_fwd_hit, best_fwd_score;
3676 #ifdef SEPARATE_FWD_REV
3677 int grand_rev_score, grand_rev_querypos, grand_rev_hit, best_rev_hit, best_rev_score;
3678 #ifdef DEBUG9
3679 int rev_tracei = 0;
3680 #endif
3681 #endif
3682 int **active;
3683 Chrpos_T position, prevposition;
3684 int fwd_tracei = 0;
3685 #if 0
3686 int *lastGT, *lastAG;
3687 #ifndef PMAP
3688 int *lastCT, *lastAC;
3689 #endif
3690 #endif
3691 #ifdef DEBUG9
3692 Link_T prevlink;
3693 char *oligo;
3694 #endif
3695 #ifdef DEBUG12
3696 Link_T termlink = NULL;
3697 #endif
3698
3699 #ifdef PMAP
3700 indexsize_nt = indexsize*3;
3701 #else
3702 indexsize_nt = indexsize;
3703 #endif
3704 indexsize_query = indexsize; /* Use when evaluating across query positions */
3705
3706
3707 #ifdef DEBUG9
3708 oligo = (char *) CALLOC(indexsize+1,sizeof(char));
3709 #endif
3710 debug0(printf("Lookback: querystart = %d, queryend = %d, indexsize = %d\n",querystart,queryend,indexsize));
3711
/* Temporary per-querypos, per-hit matrix; released near the end of this function */
3712 assert(oned_matrix_p == true);
3713 if (oned_matrix_p == true) {
3714 active = intmatrix_1d_new(querylength,npositions,totalpositions);
3715 } else {
3716 active = intmatrix_2d_new(querylength,npositions);
3717 }
3718
3719 #if 0
3720 firstactive = (int *) MALLOC(querylength * sizeof(int));
3721 nactive = (int *) MALLOC(querylength * sizeof(int));
3722 #endif
3723
3724 /* Initialize */
/* Query positions before querystart, and then any leading positions without
   mappings, are marked inactive (firstactive = -1, nactive = 0) */
3725 for (curr_querypos = 0; curr_querypos < querystart; curr_querypos++) {
3726 debug6(printf("3. Initializing firstactive for querypos %d to be -1\n",curr_querypos));
3727 firstactive[curr_querypos] = -1;
3728 nactive[curr_querypos] = 0;
3729 }
3730 while (curr_querypos <= queryend && npositions[curr_querypos] <= 0) {
3731 debug6(printf("4. Initializing firstactive for querypos %d to be -1\n",curr_querypos));
3732 debug9(printf("Skipping querypos %d which has no positions\n",curr_querypos));
3733 firstactive[curr_querypos] = -1;
3734 nactive[curr_querypos] = 0;
3735 curr_querypos++;
3736 }
3737
3738 #ifdef MOVE_TO_STAGE3
3739 if (anchoredp == true) {
3740 /* Guaranteed to find a hit */
3741 hit = binary_search(0,npositions[anchor_querypos],mappings[anchor_querypos],/*goal*/anchor_position);
3742 if (mappings[anchor_querypos] == NULL) {
3743 printf("mappings at anchor_querypos %d is NULL. mappings = %p\n",anchor_querypos,mappings);
3744 abort();
3745 }
3746
3747 currlink = &(links[anchor_querypos][hit]);
3748 #ifndef SEPARATE_FWD_REV
3749 currlink->fwd_pos = currlink->fwd_hit = -1;
3750 currlink->fwd_consecutive = EXON_DEFN;
3751 currlink->fwd_tracei = 0;
3752 fwd_scores[anchor_querypos][hit] = indexsize_nt;
3753 #else
3754 fprintf(stderr,"Not implemented yet\n");
3755 abort();
3756 #endif
3757
3758 debug6(printf("Setting firstactive for anchorpos %d to be %d\n",anchor_querypos,hit));
3759 firstactive[anchor_querypos] = hit;
3760 nactive[anchor_querypos] = 1;
3761 active[anchor_querypos][hit] = -1;
3762
3763 debug6(printf("Pushing anchorpos %d as processed\n",anchor_querypos));
3764 processed = Intlist_push(processed,anchor_querypos);
3765
3766 } else
3767 #endif
3768
/* Unanchored start: every hit at the first query position with mappings
   begins a path with no predecessor and a score of indexsize_nt */
3769 if (curr_querypos <= queryend) {
3770 for (hit = 0; hit < npositions[curr_querypos]; hit++) {
3771 currlink = &(links[curr_querypos][hit]);
3772 #ifndef SEPARATE_FWD_REV
3773 currlink->fwd_pos = currlink->fwd_hit = -1;
3774 currlink->fwd_consecutive = indexsize_nt;
3775 currlink->fwd_tracei = -1;
3776 /* currlink->fwd_rootnlinks = 1; */
3777 fwd_scores[curr_querypos][hit] = indexsize_nt;
3778 #else
3779 currlink->fwd_pos = currlink->fwd_hit = -1;
3780 currlink->fwd_consecutive = indexsize_nt;
3781 currlink->fwd_tracei = -1;
3782 /* currlink->fwd_rootnlinks = 1; */
3783 fwd_scores[curr_querypos][hit] = indexsize_nt;
3784 if (splicingp == true) {
3785 currlink->rev_pos = currlink->rev_hit = -1;
3786 currlink->rev_consecutive = indexsize_nt;
3787 currlink->rev_tracei = -1;
3788 /* currlink->rev_rootnlinks = 1; */
3789 rev_scores[curr_querypos][hit] = indexsize_nt;
3790 }
3791 #endif
3792 }
3793 revise_active_lookback(active,firstactive,nactive,0,npositions[curr_querypos],fwd_scores,curr_querypos);
3794 }
3795
3796 grand_fwd_score = 0;
3797 grand_fwd_querypos = -1;
3798 grand_fwd_hit = -1;
3799 #ifdef SEPARATE_FWD_REV
3800 if (splicingp == true) {
3801 grand_rev_score = 0;
3802 grand_rev_querypos = -1;
3803 grand_rev_hit = -1;
3804 }
3805 #endif
3806
/* Bookkeeping for skipped (over-abundant) query positions: how many have been
   skipped since the last scored position, and which skipped position had the
   fewest hits in range */
3807 nskipped = 0;
3808 min_hits = 1000000;
3809 specific_querypos = -1;
3810
3811 /* curr_querypos += 1; -- this causes curr_querypos at querystart to be ignored */
3812 while (curr_querypos <= queryend) {
3813 best_fwd_score = 0;
3814 best_fwd_hit = -1;
3815 #ifdef SEPARATE_FWD_REV
3816 best_rev_score = 0;
3817 best_rev_hit = -1;
3818 #endif
3819
3820 debug9(printf("Positions at querypos %d (forward order):",curr_querypos);
3821 for (hit = 0; hit < npositions[curr_querypos]; hit++) {
3822 printf(" %u",mappings[curr_querypos][hit]);
3823 }
3824 printf("\n");
3825 );
3826
/* low_hit is the first hit whose position is >= minactive; high_hit is the
   first hit after that whose position is > maxactive, so the hits in range
   are low_hit..high_hit-1 */
3827 hit = 0;
3828 while (hit < npositions[curr_querypos] && mappings[curr_querypos][hit] < minactive[curr_querypos]) {
3829 hit++;
3830 }
3831 low_hit = hit;
3832 while (hit < npositions[curr_querypos] && mappings[curr_querypos][hit] <= maxactive[curr_querypos]) {
3833 hit++;
3834 }
3835 high_hit = hit;
3836 debug9(printf("Querypos %d has hit %d..%d out of %d (minactive = %u, maxactive = %u)\n",
3837 curr_querypos,low_hit,high_hit-1,npositions[curr_querypos],minactive[curr_querypos],maxactive[curr_querypos]));
3838
3839 /* Can't use nactive yet, so use high_hit - low_hit */
3840 if (skip_repetitive_p && high_hit - low_hit >= MAX_NACTIVE && nskipped <= MAX_SKIPPED) { /* Previously turned off */
3841 debug6(printf("Too many active (%d - %d) at querypos %d. Setting firstactive to be -1\n",high_hit,low_hit,curr_querypos));
3842 firstactive[curr_querypos] = -1;
3843 nactive[curr_querypos] = 0;
3844 nskipped++;
/* NOTE(review): this debug message prints high_hit - low_hit + 1, one more
   than the count compared against MAX_NACTIVE above */
3845 debug9(printf(" %d skipped because of %d hits\n",nskipped,high_hit - low_hit + 1));
3846
3847 /* Store most specific querypos in section of skipped */
3848 if (high_hit - low_hit < min_hits) {
3849 min_hits = high_hit - low_hit;
3850 specific_querypos = curr_querypos;
3851 specific_low_hit = low_hit;
3852 specific_high_hit = high_hit;
3853 }
3854 curr_querypos++;
3855
3856 } else {
/* Once nskipped exceeds MAX_SKIPPED, process the remembered most specific
   skipped position now and resume afterwards at the current position */
3857 if (nskipped > MAX_SKIPPED) {
3858 debug9(printf("Too many skipped. Going back to specific querypos %d\n",specific_querypos));
3859 next_querypos = curr_querypos;
3860 curr_querypos = specific_querypos;
3861 low_hit = specific_low_hit;
3862 high_hit = specific_high_hit;
3863 } else {
3864 next_querypos = curr_querypos + 1;
3865 }
3866
3867 if ((nhits = high_hit - low_hit) > 0) {
3868 if (nhits == 1) {
3869 currlink = &(links[curr_querypos][low_hit]);
3870 position = mappings[curr_querypos][low_hit];
3871
3872 debug9(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
3873 debug9(oligo[indexsize] = '\0');
3874 debug9(printf("Finding link looking back from querypos %d,%d at %ux%d (%s). prev_querypos was %d\n",
3875 curr_querypos,low_hit,position,active[curr_querypos][low_hit],oligo,processed ? Intlist_head(processed) : -1));
3876
3877 score_querypos_lookback_one(&fwd_tracei,currlink,curr_querypos,low_hit,position,
3878 links,fwd_scores,mappings,active,firstactive,chroffset,chrhigh,plusp,
3879 indexsize,processed,localp,splicingp,use_canonical_p,
3880 non_canonical_penalty);
3881
3882 if (fwd_scores[curr_querypos][low_hit] > 0) {
3883 debug9(printf("Single hit at low_hit %d has score %d\n",low_hit,fwd_scores[curr_querypos][low_hit]));
3884 best_fwd_score = fwd_scores[curr_querypos][low_hit];
3885 best_fwd_hit = low_hit;
3886 }
3887
3888 } else {
3889 debug9(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
3890 debug9(oligo[indexsize] = '\0');
3891 debug9(printf("Finding links looking back from querypos %d,%d..%d at (%u..%u) (%s). prev_querypos was %d\n",
3892 curr_querypos,low_hit,high_hit-1,mappings[curr_querypos][low_hit],mappings[curr_querypos][high_hit-1],
3893 oligo,processed ? Intlist_head(processed) : -1));
3894
3895 score_querypos_lookback_mult(&fwd_tracei,low_hit,high_hit,curr_querypos,
3896 /*positions*/&(mappings[curr_querypos][low_hit]),
3897 links,fwd_scores,mappings,active,firstactive,chroffset,chrhigh,plusp,
3898 indexsize,processed,localp,splicingp,use_canonical_p,
3899 non_canonical_penalty);
3900
3901 debug9(printf("Checking hits from low_hit %d to high_hit %d\n",low_hit,high_hit));
3902 for (hit = low_hit; hit < high_hit; hit++) {
3903 debug9(printf("Hit %d has score %d\n",hit,fwd_scores[curr_querypos][hit]));
3904 if (fwd_scores[curr_querypos][hit] > best_fwd_score) {
3905 best_fwd_score = fwd_scores[curr_querypos][hit];
3906 best_fwd_hit = hit;
3907 }
3908 }
3909 }
3910
3911 if (best_fwd_score > best_overall_score) {
3912 best_overall_score = best_fwd_score;
3913 }
3914
/* A position with hits was scored, so the skip bookkeeping starts over */
3915 nskipped = 0;
3916 min_hits = 1000000;
3917 specific_querypos = -1;
3918
3919 #ifndef SEPARATE_FWD_REV
3920 debug9(printf("Overall result at querypos %d yields best_fwd_hit %d\n",
3921 curr_querypos,best_fwd_hit));
3922 #else
3923 debug9(printf("Overall result at querypos %d yields best_fwd_hit %d and best_rev_hit %d\n",
3924 curr_querypos,best_fwd_hit,best_rev_hit));
3925 #endif
3926
3927 #if 1
3928 /* Previously, thought that using this code causes misses in
3929 some alignments, but not using it causes missing end
3930 exons */
3931 if (middlep == false && best_fwd_hit < 0) {
3932 /* Allow for a new start, to test different ends */
3933 for (hit = 0; hit < npositions[curr_querypos]; hit++) {
3934 currlink = &(links[curr_querypos][hit]);
3935 #ifndef SEPARATE_FWD_REV
3936 currlink->fwd_pos = currlink->fwd_hit = -1;
3937 currlink->fwd_consecutive = indexsize_nt;
3938 currlink->fwd_tracei = -1;
3939 /* currlink->fwd_rootnlinks = 1; */
3940 fwd_scores[curr_querypos][hit] = indexsize_nt;
3941 #else
3942 currlink->fwd_pos = currlink->fwd_hit = -1;
3943 currlink->fwd_consecutive = indexsize_nt;
3944 currlink->fwd_tracei = -1;
3945 /* currlink->fwd_rootnlinks = 1; */
3946 fwd_scores[curr_querypos][hit] = indexsize_nt;
3947 if (splicingp == true) {
3948 currlink->rev_pos = currlink->rev_hit = -1;
3949 currlink->rev_consecutive = indexsize_nt;
3950 currlink->rev_tracei = -1;
3951 /* currlink->rev_rootnlinks = 1; */
3952 rev_scores[curr_querypos][hit] = indexsize_nt;
3953 }
3954 #endif
3955 }
3956 }
3957 #endif
3958
/* Grand lookback: when the best hit here has no predecessor (fwd_hit < 0) and
   a grand-best cell exists at least indexsize_query query positions back, the
   in-range hits located between indexsize_nt and maxintronlen past that cell
   are linked to it.  The score is the grand cell's score minus the number of
   query positions in between; note that best_fwd_score is overwritten by the
   assignment inside the condition. */
3959 if (splicingp == true && best_fwd_hit >= 0 && links[curr_querypos][best_fwd_hit].fwd_hit < 0 &&
3960 grand_fwd_querypos >= 0 && curr_querypos >= grand_fwd_querypos + indexsize_query) {
3961 if ((best_fwd_score = fwd_scores[grand_fwd_querypos][grand_fwd_hit] - (curr_querypos - grand_fwd_querypos)) > 0) {
3962 prevposition = mappings[grand_fwd_querypos][grand_fwd_hit];
3963 debug12(printf("Considering prevposition %u to position %u as a grand fwd lookback\n",prevposition,position));
3964 for (hit = low_hit; hit < high_hit; hit++) {
3965 if ((position = mappings[curr_querypos][hit]) > prevposition + maxintronlen) {
3966 debug12(printf(" => Too long\n"));
3967 } else if (position >= prevposition + indexsize_nt) {
3968 currlink = &(links[curr_querypos][hit]);
3969 currlink->fwd_consecutive = indexsize_nt;
3970 currlink->fwd_pos = grand_fwd_querypos;
3971 currlink->fwd_hit = grand_fwd_hit;
3972 currlink->fwd_tracei = ++fwd_tracei;
3973 fwd_scores[curr_querypos][hit] = best_fwd_score;
3974 #ifdef DEBUG9
3975 prevlink = &(links[grand_fwd_querypos][grand_fwd_hit]);
3976 currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd;
3977 currlink->fwd_intronnrev = prevlink->fwd_intronnrev;
3978 currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1;
3979 #endif
3980 }
3981 }
3982 debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n",
3983 curr_querypos,grand_fwd_querypos,grand_fwd_hit,fwd_scores[grand_fwd_querypos][grand_fwd_hit]));
3984 }
3985 }
3986
3987 /* Use >= to favor longer path in case of ties */
3988 if (best_fwd_hit >= 0 && best_fwd_score >= grand_fwd_score &&
3989 links[curr_querypos][best_fwd_hit].fwd_consecutive > EXON_DEFN) {
3990 grand_fwd_score = best_fwd_score;
3991 grand_fwd_querypos = curr_querypos;
3992 grand_fwd_hit = best_fwd_hit;
3993 debug12(termlink = &(links[curr_querypos][best_fwd_hit]));
3994 debug12(printf("At querypos %d, revising grand fwd to be hit %d with score of %d (pointing back to %d,%d)\n",
3995 curr_querypos,best_fwd_hit,best_fwd_score,termlink->fwd_pos,termlink->fwd_hit));
3996 }
3997
3998 #ifdef SEPARATE_FWD_REV
3999 if (best_rev_score > best_overall_score) {
4000 best_overall_score = best_rev_score;
4001 }
4002
4003 if (splicingp == false || use_canonical_p == false) {
4004 /* rev scores should be the same as the fwd scores */
4005 } else {
4006 if (best_rev_hit >= 0 && links[curr_querypos][best_rev_hit].rev_hit < 0 &&
4007 grand_rev_querypos >= 0 && curr_querypos >= grand_rev_querypos + indexsize_query) {
4008 prevlink = &(links[grand_rev_querypos][grand_rev_hit]);
4009 if ((best_rev_score = prevlink->rev_score - (curr_querypos - grand_rev_querypos)) > 0) {
4010 prevposition = mappings[grand_rev_querypos][grand_rev_hit];
4011 debug12(printf("Considering prevposition %u to position %u as a grand rev lookback\n",prevposition,position));
4012 for (hit = low_hit; hit < high_hit; hit++) {
4013 if ((position = mappings[curr_querypos][hit]) > prevposition + maxintronlen) {
4014 debug12(printf(" => Too long\n"));
4015 } else if (position >= prevposition + indexsize_nt) {
4016 currlink = &(links[curr_querypos][hit]);
4017 currlink->rev_consecutive = indexsize_nt;
4018 /* currlink->rev_rootnlinks = 1; */
4019 currlink->rev_pos = grand_rev_querypos;
4020 currlink->rev_hit = grand_rev_hit;
4021 currlink->rev_score = best_rev_score;
4022 #ifdef DEBUG9
4023 currlink->rev_tracei = ++rev_tracei;
/* NOTE(review): rev_intronnrev is assigned on both of the next two lines and
   rev_intronnfwd is never set; the first assignment presumably was meant to
   target rev_intronnfwd (compare the fwd_* assignments in the grand fwd
   lookback) -- confirm before enabling SEPARATE_FWD_REV with DEBUG9 */
4024 currlink->rev_intronnrev = prevlink->rev_intronnfwd;
4025 currlink->rev_intronnrev = prevlink->rev_intronnrev;
4026 currlink->rev_intronnunk = prevlink->rev_intronnunk + 1;
4027 #endif
4028 }
4029 }
4030 debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n",
4031 curr_querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score));
4032 }
4033 }
4034
4035 /* Use >= to favor longer path in case of ties */
4036 if (best_rev_hit >= 0 && best_rev_score >= grand_rev_score &&
4037 links[curr_querypos][best_rev_hit].rev_consecutive > EXON_DEFN) {
4038 grand_rev_score = best_rev_score;
4039 grand_rev_querypos = curr_querypos;
4040 grand_rev_hit = best_rev_hit;
4041 }
4042 }
4043 #endif
4044 }
4045
4046 revise_active_lookback(active,firstactive,nactive,low_hit,high_hit,fwd_scores,curr_querypos);
4047
4048 /* Need to push querypos, even if firstactive[curr_querypos] == -1 */
4049 /* Want to skip npositions[curr_querypos] == 0, so we can find adjacent despite mismatch or overabundance */
4050 if (npositions[curr_querypos] > 0) {
4051 debug6(printf("Pushing querypos %d onto processed\n",curr_querypos));
4052 processed = Intlist_push(processed,curr_querypos);
4053 }
4054 curr_querypos = next_querypos;
4055 }
4056 }
4057 debug9(printf("End of loop lookback\n"));
4058
4059 Intlist_free(&processed);
4060
4061 /* These are the final active oligomers, after pruning by score */
4062 if (debug_graphic_p == true) {
4063 mappings_dump_R(mappings,npositions,querylength,active,firstactive,indexsize,"active.mers");
4064 }
4065
4066 #if 0
4067 FREE(nactive);
4068 FREE(firstactive);
4069 #endif
4070
/* Release the temporary active matrix with the free routine matching its allocator */
4071 if (oned_matrix_p == true) {
4072 intmatrix_1d_free(&active);
4073 } else {
4074 intmatrix_2d_free(&active,querylength);
4075 }
4076
4077
4078 /* Grand winners */
4079 debug12(printf("Finding grand winners, using root position method\n"));
4080 #ifdef SEPARATE_FWD_REV
4081 if (splicingp == false || use_canonical_p == false) {
4082 cells = Linkmatrix_get_cells_fwd(&(*ncells),links,querystart,queryend,npositions,
4083 favor_right_p,cellpool);
4084 } else {
4085 cells = Linkmatrix_get_cells_both(&(*ncells),links,querystart,queryend,npositions,
4086 indexsize,best_overall_score,favor_right_p,cellpool);
4087 }
4088 #else
4089 cells = get_cells_fwd(&(*ncells),links,fwd_scores,mappings,querystart,queryend,npositions,
4090 favor_right_p,cellpool);
4091 #endif
4092
4093 debug9(FREE(oligo));
4094
4095 return cells;
4096 }
4097
4098
4099 static char complCode[128] = COMPLEMENT_LC;
4100
4101 /* genomicstart == chroffset + chrpos */
4102 /* arguments were genomicpos, genomicstart, genomiclength */
4103
4104 static char
get_genomic_nt(char * g_alt,Chrpos_T chrpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)4105 get_genomic_nt (char *g_alt, Chrpos_T chrpos, Univcoord_T chroffset,
4106 Univcoord_T chrhigh, bool watsonp) {
4107 char c2, c2_alt;
4108
4109 if (watsonp) {
4110 return Genome_get_char_blocks(&(*g_alt),chroffset + chrpos);
4111
4112 } else {
4113 c2 = Genome_get_char_blocks(&c2_alt,chrhigh - chrpos);
4114 *g_alt = complCode[(int) c2_alt];
4115 return complCode[(int) c2];
4116 }
4117 }
4118
4119
4120 static List_T
traceback_one(int curr_querypos,int hit,struct Link_T ** links,Chrpos_T ** mappings,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,bool lookbackp,int ** fwd_scores,int indexsize,Pairpool_T pairpool,bool fwdp)4121 traceback_one (int curr_querypos, int hit, struct Link_T **links, Chrpos_T **mappings,
4122 char *queryseq_ptr, char *queryuc_ptr,
4123 #ifdef PMAP
4124 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, bool lookbackp,
4125 #endif
4126 #ifdef DEBUG0
4127 int **fwd_scores, int indexsize,
4128 #endif
4129 Pairpool_T pairpool, bool fwdp) {
4130 List_T path = NULL;
4131 Chrpos_T position;
4132 int prev_querypos, prevhit;
4133 char c2;
4134 #ifdef PMAP
4135 char c2_alt;
4136 #endif
4137
4138 #ifdef DEBUG0
4139 char *oligo;
4140 #endif
4141
4142
4143 while (curr_querypos >= 0) {
4144 position = mappings[curr_querypos][hit];
4145
4146 #ifdef PMAP
4147 /* Change querypos positions from protein to nucleotide */
4148 if (lookbackp == true) {
4149 c2 = get_genomic_nt(&c2_alt,position+2,chroffset,chrhigh,watsonp);
4150 path = Pairpool_push(path,pairpool,curr_querypos*3+2,position+2,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4151 /*dynprogindex*/0);
4152 c2 = get_genomic_nt(&c2_alt,position+1,chroffset,chrhigh,watsonp);
4153 path = Pairpool_push(path,pairpool,curr_querypos*3+1,position+1,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4154 /*dynprogindex*/0);
4155 c2 = get_genomic_nt(&c2_alt,position,chroffset,chrhigh,watsonp);
4156 path = Pairpool_push(path,pairpool,curr_querypos*3,position,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4157 /*dynprogindex*/0);
4158 } else {
4159 c2 = get_genomic_nt(&c2_alt,position,chroffset,chrhigh,watsonp);
4160 path = Pairpool_push(path,pairpool,curr_querypos*3,position,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4161 /*dynprogindex*/0);
4162 c2 = get_genomic_nt(&c2_alt,position+1,chroffset,chrhigh,watsonp);
4163 path = Pairpool_push(path,pairpool,curr_querypos*3+1,position+1,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4164 /*dynprogindex*/0);
4165 c2 = get_genomic_nt(&c2_alt,position+2,chroffset,chrhigh,watsonp);
4166 path = Pairpool_push(path,pairpool,curr_querypos*3+2,position+2,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4167 /*dynprogindex*/0);
4168 }
4169 #else
4170 /* genomic nucleotide same as queryseq */
4171 c2 = queryuc_ptr[curr_querypos];
4172 path = Pairpool_push(path,pairpool,curr_querypos,position,queryseq_ptr[curr_querypos],MATCH_COMP,
4173 c2,/*genomealt*/c2,/*dynprogindex*/0);
4174 #endif
4175
4176
4177 #ifdef DEBUG0
4178 debug0(oligo = (char *) MALLOC((indexsize+1)*sizeof(char)));
4179 debug0(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
4180 debug0(oligo[indexsize] = '\0');
4181 if (fwdp == true) {
4182 debug0(printf("Pushing %d,%d (%s) at %u, score = %d, consec = %d",
4183 curr_querypos,hit,oligo,position,
4184 fwd_scores[curr_querypos][hit],links[curr_querypos][hit].fwd_consecutive));
4185 debug9(printf(" (from #%d), intr = %d(+)/%d(-)/%d(?)",
4186 links[curr_querypos][hit].fwd_tracei,links[curr_querypos][hit].fwd_intronnfwd,links[curr_querypos][hit].fwd_intronnrev,
4187 links[curr_querypos][hit].fwd_intronnunk));
4188 debug0(printf("\n"));
4189
4190 #ifdef SEPARATE_FWD_REV
4191 } else {
4192 debug0(printf("Pushing %d,%d (%s) at %u, score = %d, consec = %d",
4193 curr_querypos,hit,oligo,position,
4194 links[curr_querypos][hit].rev_score,links[curr_querypos][hit].rev_consecutive));
4195 debug9(printf(" (from #%d), intr = %d(+)/%d(-)/%d(?)",
4196 links[curr_querypos][hit].rev_tracei,links[curr_querypos][hit].rev_intronnfwd,links[curr_querypos][hit].rev_intronnrev,
4197 links[curr_querypos][hit].rev_intronnunk));
4198 debug0(printf("\n"));
4199
4200 #endif
4201 }
4202 #endif
4203 debug0(FREE(oligo));
4204
4205 /* prevposition = position; */
4206 prev_querypos = curr_querypos;
4207 prevhit = hit;
4208 if (fwdp == true) {
4209 curr_querypos = links[prev_querypos][prevhit].fwd_pos;
4210 hit = links[prev_querypos][prevhit].fwd_hit;
4211 #ifdef SEPARATE_FWD_REV
4212 } else {
4213 curr_querypos = links[prev_querypos][prevhit].rev_pos;
4214 hit = links[prev_querypos][prevhit].rev_hit;
4215 #endif
4216 }
4217 debug3(printf("%d %d %d %d 3\n",prev_querypos,prevhit,curr_querypos,hit));
4218 }
4219 debug0(printf("Done\n\n"));
4220
4221 return path;
4222 }
4223
4224
4225 static List_T
traceback_one_snps(int curr_querypos,int hit,struct Link_T ** links,Chrpos_T ** mappings,char * queryseq_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int ** fwd_scores,int indexsize,Pairpool_T pairpool,bool fwdp)4226 traceback_one_snps (int curr_querypos, int hit, struct Link_T **links, Chrpos_T **mappings,
4227 char *queryseq_ptr,
4228
4229 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
4230 #ifdef DEBUG0
4231 int **fwd_scores, int indexsize,
4232 #endif
4233 Pairpool_T pairpool, bool fwdp) {
4234 List_T path = NULL;
4235 Chrpos_T position;
4236 int prev_querypos, prevhit;
4237 char c2, c2_alt;
4238
4239 #ifdef DEBUG0
4240 char *oligo;
4241 #endif
4242
4243
4244 while (curr_querypos >= 0) {
4245 position = mappings[curr_querypos][hit];
4246
4247 #ifdef PMAP
4248 /* Change querypos positions from protein to nucleotide */
4249 c2 = get_genomic_nt(&c2_alt,position+2,chroffset,chrhigh,watsonp);
4250 path = Pairpool_push(path,pairpool,curr_querypos*3+2,position+2,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4251 /*dynprogindex*/0);
4252 c2 = get_genomic_nt(&c2_alt,position+1,chroffset,chrhigh,watsonp);
4253 path = Pairpool_push(path,pairpool,curr_querypos*3+1,position+1,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4254 /*dynprogindex*/0);
4255 c2 = get_genomic_nt(&c2_alt,position,chroffset,chrhigh,watsonp);
4256 path = Pairpool_push(path,pairpool,curr_querypos*3,position,/*cdna*/c2,MATCH_COMP,c2,c2_alt,
4257 /*dynprogindex*/0);
4258 #else
4259 /* genomic nucleotide or SNP same as queryseq */
4260 c2 = get_genomic_nt(&c2_alt,position,chroffset,chrhigh,watsonp);
4261 path = Pairpool_push(path,pairpool,curr_querypos,position,queryseq_ptr[curr_querypos],MATCH_COMP,c2,c2_alt,
4262 /*dynprogindex*/0);
4263 #endif
4264
4265
4266 #ifdef DEBUG0
4267 debug0(oligo = (char *) MALLOC((indexsize+1)*sizeof(char)));
4268 debug0(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
4269 debug0(oligo[indexsize] = '\0');
4270 if (fwdp == true) {
4271 debug0(printf("Pushing %d,%d (%s) at %u, score = %d, consec = %d",
4272 curr_querypos,hit,oligo,position,
4273 fwd_scores[curr_querypos][hit],links[curr_querypos][hit].fwd_consecutive));
4274 debug9(printf(" (from #%d), intr = %d(+)/%d(-)/%d(?)",
4275 links[curr_querypos][hit].fwd_tracei,links[curr_querypos][hit].fwd_intronnfwd,links[curr_querypos][hit].fwd_intronnrev,
4276 links[curr_querypos][hit].fwd_intronnunk));
4277 debug0(printf("\n"));
4278
4279 #ifdef SEPARATE_FWD_REV
4280 } else {
4281 debug0(printf("Pushing %d,%d (%s) at %u, score = %d, consec = %d",
4282 curr_querypos,hit,oligo,position,
4283 links[curr_querypos][hit].rev_score,links[curr_querypos][hit].rev_consecutive));
4284 debug9(printf(" (from #%d), intr = %d(+)/%d(-)/%d(?)",
4285 links[curr_querypos][hit].rev_tracei,links[curr_querypos][hit].rev_intronnfwd,links[curr_querypos][hit].rev_intronnrev,
4286 links[curr_querypos][hit].rev_intronnunk));
4287 debug0(printf("\n"));
4288 #endif
4289 }
4290 #endif
4291 debug0(FREE(oligo));
4292
4293 /* prevposition = position; */
4294 prev_querypos = curr_querypos;
4295 prevhit = hit;
4296 if (fwdp == true) {
4297 curr_querypos = links[prev_querypos][prevhit].fwd_pos;
4298 hit = links[prev_querypos][prevhit].fwd_hit;
4299 #ifdef SEPARATE_FWD_REV
4300 } else {
4301 curr_querypos = links[prev_querypos][prevhit].rev_pos;
4302 hit = links[prev_querypos][prevhit].rev_hit;
4303 #endif
4304 }
4305 debug3(printf("%d %d %d %d 3\n",prev_querypos,prevhit,curr_querypos,hit));
4306 }
4307 debug0(printf("Done\n\n"));
4308
4309 return path;
4310 }
4311
4312
4313 /* Performs dynamic programming. For PMAP, indexsize is in aa. */
4314 static List_T
align_compute_lookback(Chrpos_T ** mappings,int * npositions,int totalpositions,bool oned_matrix_p,Chrpos_T * minactive,Chrpos_T * maxactive,int * firstactive,int * nactive,Cellpool_T cellpool,char * queryseq_ptr,char * queryuc_ptr,int querylength,int querystart,int queryend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Pairpool_T pairpool,bool anchoredp,int anchor_querypos,Chrpos_T anchor_position,bool localp,bool skip_repetitive_p,bool use_canonical_p,int non_canonical_penalty,bool favor_right_p,bool middlep,int max_nalignments,bool debug_graphic_p)4315 align_compute_lookback (Chrpos_T **mappings, int *npositions, int totalpositions,
4316 bool oned_matrix_p, Chrpos_T *minactive, Chrpos_T *maxactive,
4317 int *firstactive, int *nactive, Cellpool_T cellpool,
4318 char *queryseq_ptr, char *queryuc_ptr, int querylength, int querystart, int queryend,
4319 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
4320 int indexsize, Pairpool_T pairpool,
4321 #ifdef MOVE_TO_STAGE3
4322 bool anchoredp, int anchor_querypos, Chrpos_T anchor_position,
4323 #endif
4324 bool localp, bool skip_repetitive_p, bool use_canonical_p, int non_canonical_penalty,
4325 bool favor_right_p, bool middlep, int max_nalignments, bool debug_graphic_p) {
4326 List_T all_paths = NULL;
4327 int npaths = 0;
4328 struct Link_T **links;
4329 int **fwd_scores;
4330
4331 #if 0
4332 bool anchoredp = false;
4333 int anchor_querypos = 0;
4334 Chrpos_T anchor_position = 0;
4335 #endif
4336
4337 Cell_T *cells, cell;
4338 int ncells, i;
4339
4340 bool fwdp;
4341 int querypos, hit;
4342 int bestscore;
4343 #ifdef SLOW
4344 int last_endposition;
4345 #endif
4346
4347
4348 if (oned_matrix_p == true) {
4349 links = Linkmatrix_1d_new(querylength,npositions,totalpositions);
4350 fwd_scores = intmatrix_1d_new(querylength,npositions,totalpositions);
4351 } else {
4352 links = Linkmatrix_2d_new(querylength,npositions);
4353 fwd_scores = intmatrix_2d_new(querylength,npositions);
4354 }
4355
4356 /* These are all oligomers */
4357 if (debug_graphic_p == true) {
4358 mappings_dump_R(mappings,npositions,querylength,/*active*/NULL,/*firstactive*/NULL,indexsize,"all.mers");
4359 }
4360
4361 cells = align_compute_scores_lookback(&ncells,links,fwd_scores,
4362 mappings,npositions,totalpositions,
4363 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
4364 querystart,queryend,querylength,
4365
4366 chroffset,chrhigh,plusp,
4367
4368 indexsize,
4369 #ifdef DEBUG9
4370 queryseq_ptr,
4371 #endif
4372 localp,skip_repetitive_p,use_canonical_p,non_canonical_penalty,
4373 debug_graphic_p,favor_right_p,middlep);
4374 /* cells are currently sorted by Cell_score_cmp in get_cells_fwd */
4375
4376
4377 #ifdef SEPARATE_FWD_REV
4378 debug1(Linkmatrix_print_both(links,mappings,querylength,npositions,queryseq_ptr,indexsize));
4379 #else
4380 debug1(print_fwd(links,fwd_scores,mappings,querylength,npositions,queryseq_ptr,indexsize));
4381 #endif
4382
4383 if (ncells == 0) {
4384 all_paths = (List_T) NULL;
4385
4386 } else {
4387 /* High-scoring paths */
4388 bestscore = cells[0]->score;
4389 debug11(printf("** Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
4390 ncells,max_nalignments,bestscore));
4391
4392 if (snps_p == true) {
4393 for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
4394 && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
4395 cell = cells[i];
4396 querypos = cell->querypos;
4397 hit = cell->hit;
4398 fwdp = cell->fwdp;
4399 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
4400 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
4401
4402 all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
4403 chroffset,chrhigh,/*watsonp*/plusp,
4404 #ifdef DEBUG0
4405 fwd_scores,indexsize,
4406 #endif
4407 pairpool,fwdp));
4408 npaths++;
4409 cell->pushedp = true;
4410 }
4411
4412 } else {
4413 for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
4414 && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
4415 cell = cells[i];
4416 querypos = cell->querypos;
4417 hit = cell->hit;
4418 fwdp = cell->fwdp;
4419 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
4420 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
4421
4422 #if 0
4423 if (debug_graphic_p == true) {
4424 best_path_dump_R(links,mappings,querypos,hit,fwdp,"best.path");
4425 printf("plot(all.mers,col=\"black\",pch=\".\",xlab=\"Query\",ylab=\"Genomic\")\n");
4426 printf("points(active.mers,col=\"red\",pch=\".\")\n");
4427 printf("points(best.path,col=\"green\",pch=\".\")\n");
4428 printf("lines(querypos,minactive,col=\"blue\")\n");
4429 printf("lines(querypos,maxactive,col=\"blue\")\n");
4430 }
4431 #endif
4432
4433 all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
4434 #ifdef PMAP
4435 chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/true,
4436 #endif
4437 #ifdef DEBUG0
4438 fwd_scores,indexsize,
4439 #endif
4440 pairpool,fwdp));
4441 npaths++;
4442 cell->pushedp = true;
4443 }
4444 }
4445
4446
4447 #ifdef SLOW
4448 if (npaths < max_nalignments) {
4449 /* Non-overlapping paths */
4450 debug11(printf("** Looping on %d cells, looking for non-overlapping paths. Total paths so far: %d\n",
4451 ncells,npaths));
4452 qsort(cells,ncells,sizeof(Cell_T),Cell_interval_cmp);
4453 last_endposition = 0;
4454 if (snps_p == true) {
4455 for (i = 0; i < ncells && npaths < max_nalignments; i++) {
4456 cell = cells[i];
4457 if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
4458 cell->rootposition > last_endposition && cell->pushedp == false) {
4459 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
4460 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
4461 querypos = cell->querypos;
4462 hit = cell->hit;
4463 fwdp = cell->fwdp;
4464 all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
4465 chroffset,chrhigh,/*watsonp*/plusp,
4466 #ifdef DEBUG0
4467 fwd_scores,indexsize,
4468 #endif
4469 pairpool,fwdp));
4470 npaths++;
4471 cell->pushedp = true;
4472 last_endposition = cell->endposition;
4473 }
4474 }
4475
4476 } else {
4477 for (i = 0; i < ncells && npaths < max_nalignments; i++) {
4478 cell = cells[i];
4479 if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
4480 cell->rootposition > last_endposition && cell->pushedp == false) {
4481 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
4482 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
4483 querypos = cell->querypos;
4484 hit = cell->hit;
4485 fwdp = cell->fwdp;
4486 all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
4487 #ifdef PMAP
4488 chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/true,
4489 #endif
4490 #ifdef DEBUG0
4491 fwd_scores,indexsize,
4492 #endif
4493 pairpool,fwdp));
4494 npaths++;
4495 cell->pushedp = true;
4496 last_endposition = cell->endposition;
4497 }
4498 }
4499 }
4500 }
4501 #endif
4502
4503 debug11(printf("\n"));
4504
4505 #if 0
4506 /* No need with cellpool */
4507 for (i = 0; i < ncells; i++) {
4508 cell = cells[i];
4509 Cell_free(&cell);
4510 }
4511 #endif
4512 FREE(cells);
4513 }
4514
4515
4516 if (oned_matrix_p == true) {
4517 Linkmatrix_1d_free(&links);
4518 intmatrix_1d_free(&fwd_scores);
4519 } else {
4520 Linkmatrix_2d_free(&links,querylength);
4521 intmatrix_2d_free(&fwd_scores,querylength);
4522 }
4523
4524 #if 0
4525 for (p = all_paths; p != NULL; p = List_next(p)) {
4526 Pair_dump_list(List_head(p),/*zerobasedp*/true);
4527 printf("\n");
4528 }
4529 #endif
4530
4531 return all_paths;
4532 }
4533
4534
4535
4536 /* Returns celllist */
4537 /* For PMAP, indexsize is in aa. */
4538 static Cell_T *
align_compute_scores_lookforward(int * ncells,struct Link_T ** links,int ** fwd_scores,Chrpos_T ** mappings,int * npositions,int totalpositions,bool oned_matrix_p,Chrpos_T * minactive,Chrpos_T * maxactive,int * firstactive,int * nactive,Cellpool_T cellpool,int querystart,int queryend,int querylength,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,char * queryseq_ptr,bool anchoredp,int anchor_querypos,Chrpos_T anchor_position,bool localp,bool skip_repetitive_p,bool use_canonical_p,int non_canonical_penalty,bool debug_graphic_p,bool favor_right_p,bool middlep)4539 align_compute_scores_lookforward (int *ncells, struct Link_T **links, int **fwd_scores,
4540 Chrpos_T **mappings, int *npositions, int totalpositions,
4541 bool oned_matrix_p, Chrpos_T *minactive, Chrpos_T *maxactive,
4542 int *firstactive, int *nactive, Cellpool_T cellpool,
4543 int querystart, int queryend, int querylength,
4544 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
4545 int indexsize,
4546 #ifdef DEBUG9
4547 char *queryseq_ptr,
4548 #endif
4549 #ifdef MOVE_TO_STAGE3
4550 bool anchoredp, int anchor_querypos, Chrpos_T anchor_position,
4551 #endif
4552 bool localp, bool skip_repetitive_p,
4553 bool use_canonical_p, int non_canonical_penalty,
4554 bool debug_graphic_p, bool favor_right_p, bool middlep) {
4555 #if 0
4556 bool anchoredp = false;
4557 int anchor_querypos = 0;
4558 Chrpos_T anchor_position = 0;
4559 #endif
4560
4561 Cell_T *cells;
4562 Link_T currlink;
4563 int curr_querypos, indexsize_nt, indexsize_query, hit, nhits, low_hit, high_hit;
4564 int nskipped, min_hits, specific_querypos, specific_low_hit, specific_high_hit, next_querypos;
4565 Intlist_T processed = NULL;
4566 int best_overall_score = 0;
4567 int grand_fwd_score, grand_fwd_querypos, grand_fwd_hit, best_fwd_hit, best_fwd_score;
4568 #ifdef SEPARATE_FWD_REV
4569 int grand_rev_score, grand_rev_querypos, grand_rev_hit, best_rev_hit, best_rev_score;
4570 #ifdef DEBUG9
4571 int rev_tracei = 0;
4572 #endif
4573 #endif
4574 int **active;
4575 Chrpos_T position, prevposition;
4576 int fwd_tracei = 0;
4577 #if 0
4578 int *lastGT, *lastAG;
4579 #ifndef PMAP
4580 int *lastCT, *lastAC;
4581 #endif
4582 #endif
4583 #ifdef DEBUG9
4584 Link_T prevlink;
4585 char *oligo;
4586 #endif
4587 #ifdef DEBUG12
4588 Link_T termlink = NULL;
4589 #endif
4590
4591 #ifdef PMAP
4592 indexsize_nt = indexsize*3;
4593 #else
4594 indexsize_nt = indexsize;
4595 #endif
4596 indexsize_query = indexsize; /* Use when evaluating across query positions */
4597
4598
4599 #ifdef DEBUG9
4600 oligo = (char *) CALLOC(indexsize+1,sizeof(char));
4601 #endif
4602 debug0(printf("Lookforward: querystart = %d, queryend = %d, indexsize = %d\n",querystart,queryend,indexsize));
4603
4604 if (oned_matrix_p == true) {
4605 active = intmatrix_1d_new(querylength,npositions,totalpositions);
4606 } else {
4607 active = intmatrix_2d_new(querylength,npositions);
4608 }
4609
4610 #if 0
4611 firstactive = (int *) MALLOC(querylength * sizeof(int));
4612 nactive = (int *) MALLOC(querylength * sizeof(int));
4613 #endif
4614
4615 /* Initialize */
4616 for (curr_querypos = querylength - 1; curr_querypos > queryend; curr_querypos--) {
4617 debug6(printf("5. Initializing firstactive for querypos %d to be -1\n",curr_querypos));
4618 firstactive[curr_querypos] = -1;
4619 nactive[curr_querypos] = 0;
4620 }
4621 while (curr_querypos >= querystart && npositions[curr_querypos] <= 0) {
4622 debug6(printf("6. Initializing firstactive for querypos %d to be -1\n",curr_querypos));
4623 debug9(printf("Skipping querypos %d which has no positions\n",curr_querypos));
4624 firstactive[curr_querypos] = -1;
4625 nactive[curr_querypos] = 0;
4626 curr_querypos--;
4627 }
4628
4629 #ifdef MOVE_TO_STAGE3
4630 if (anchoredp == true) {
4631 /* Guaranteed to find a hit */
4632 hit = binary_search(0,npositions[anchor_querypos],mappings[anchor_querypos],/*goal*/anchor_position);
4633 if (mappings[anchor_querypos] == NULL) {
4634 printf("mappings at anchor_querypos %d is NULL. mappings = %p\n",anchor_querypos,mappings);
4635 abort();
4636 }
4637
4638 currlink = &(links[anchor_querypos][hit]);
4639 #ifndef SEPARATE_FWD_REV
4640 currlink->fwd_pos = currlink->fwd_hit = -1;
4641 currlink->fwd_consecutive = EXON_DEFN;
4642 currlink->fwd_tracei = 0;
4643 fwd_scores[anchor_querypos][hit] = indexsize_nt;
4644 #else
4645 fprintf(stderr,"Not implemented yet\n");
4646 abort();
4647 #endif
4648
4649 debug6(printf("Setting firstactive for anchorpos %d to be %d\n",anchor_querypos,hit));
4650 firstactive[anchor_querypos] = hit;
4651 nactive[anchor_querypos] = 1;
4652 active[anchor_querypos][hit] = -1;
4653
4654 debug6(printf("Pushing anchorpos %d as processed\n",anchor_querypos));
4655 processed = Intlist_push(processed,anchor_querypos);
4656
4657 } else
4658 #endif
4659
4660 if (curr_querypos >= querystart) {
4661 for (hit = npositions[curr_querypos] - 1; hit >= 0; --hit) {
4662 currlink = &(links[curr_querypos][hit]);
4663 #ifndef SEPARATE_FWD_REV
4664 currlink->fwd_pos = currlink->fwd_hit = -1;
4665 currlink->fwd_consecutive = indexsize_nt;
4666 currlink->fwd_tracei = -1;
4667 /* currlink->fwd_rootnlinks = 1; */
4668 fwd_scores[curr_querypos][hit] = indexsize_nt;
4669 #else
4670 currlink->fwd_pos = currlink->fwd_hit = -1;
4671 currlink->fwd_score = indexsize_nt;
4672 currlink->fwd_consecutive = indexsize_nt;
4673 currlink->fwd_tracei = -1;
4674 /* currlink->fwd_rootnlinks = 1; */
4675 if (splicingp == true) {
4676 currlink->rev_pos = currlink->rev_hit = -1;
4677 currlink->rev_consecutive = indexsize_nt;
4678 currlink->rev_tracei = -1;
4679 /* currlink->rev_rootnlinks = 1; */
4680 rev_scores[curr_querypos][hit] = indexsize_nt;
4681 }
4682 #endif
4683 }
4684 revise_active_lookforward(active,firstactive,nactive,0,npositions[curr_querypos],fwd_scores,curr_querypos);
4685 }
4686
4687
4688 grand_fwd_score = 0;
4689 grand_fwd_querypos = -1;
4690 grand_fwd_hit = -1;
4691 #ifdef SEPARATE_FWD_REV
4692 if (splicingp == true) {
4693 grand_rev_score = 0;
4694 grand_rev_querypos = -1;
4695 grand_rev_hit = -1;
4696 }
4697 #endif
4698
4699 nskipped = 0;
4700 min_hits = 1000000;
4701 specific_querypos = -1;
4702
4703 /* curr_querypos -= 1; -- this causes curr_querypos at queryend to be ignored */
4704 while (curr_querypos >= querystart) {
4705 best_fwd_score = 0;
4706 best_fwd_hit = -1;
4707 #ifdef SEPARATE_FWD_REV
4708 best_rev_score = 0;
4709 best_rev_hit = -1;
4710 #endif
4711
4712 debug9(printf("Positions at querypos %d (reverse order):",curr_querypos);
4713 for (hit = npositions[curr_querypos] - 1; hit >= 0; --hit) {
4714 printf(" %u",mappings[curr_querypos][hit]);
4715 }
4716 printf("\n");
4717 );
4718
4719 hit = npositions[curr_querypos] - 1;
4720 while (hit >= 0 && mappings[curr_querypos][hit] > maxactive[curr_querypos]) {
4721 --hit;
4722 }
4723 high_hit = hit + 1;
4724 while (hit >= 0 && mappings[curr_querypos][hit] >= minactive[curr_querypos]) {
4725 --hit;
4726 }
4727 low_hit = hit + 1;
4728 debug9(printf("Querypos %d has hit %d..%d out of %d (minactive = %u, maxactive = %u)\n",
4729 curr_querypos,high_hit-1,low_hit,npositions[curr_querypos],minactive[curr_querypos],maxactive[curr_querypos]));
4730
4731 /* Can't use nactive yet, so use high_hit - low_hit */
4732 if (skip_repetitive_p && high_hit - low_hit >= MAX_NACTIVE && nskipped <= MAX_SKIPPED) { /* Previously turned off */
4733 debug6(printf("Too many active (%d - %d) at querypos %d. Setting firstactive to be -1\n",high_hit,low_hit,curr_querypos));
4734 firstactive[curr_querypos] = -1;
4735 nactive[curr_querypos] = 0;
4736 nskipped++;
4737 debug9(printf(" %d skipped because of %d hits\n",nskipped,high_hit - low_hit + 1));
4738
4739 /* Store most specific querypos in section of skipped */
4740 if (high_hit - low_hit < min_hits) {
4741 min_hits = high_hit - low_hit;
4742 specific_querypos = curr_querypos;
4743 specific_low_hit = low_hit;
4744 specific_high_hit = high_hit;
4745 }
4746 curr_querypos--;
4747
4748 } else {
4749 if (nskipped > MAX_SKIPPED) {
4750 debug9(printf("Too many skipped. Going back to specific querypos %d\n",specific_querypos));
4751 next_querypos = curr_querypos;
4752 curr_querypos = specific_querypos;
4753 low_hit = specific_low_hit;
4754 high_hit = specific_high_hit;
4755 } else {
4756 next_querypos = curr_querypos - 1;
4757 }
4758
4759 if ((nhits = high_hit - low_hit) > 0) {
4760 if (nhits == 1) {
4761 currlink = &(links[curr_querypos][low_hit]);
4762 position = mappings[curr_querypos][low_hit];
4763
4764 debug9(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
4765 debug9(oligo[indexsize] = '\0');
4766 debug9(printf("Finding link looking forward from querypos %d,%d at %ux%d (%s). prev_querypos was %d\n",
4767 curr_querypos,low_hit,position,active[curr_querypos][low_hit],oligo,processed ? Intlist_head(processed) : -1));
4768 score_querypos_lookforward_one(&fwd_tracei,currlink,curr_querypos,low_hit,position,
4769 links,fwd_scores,mappings,active,firstactive,
4770 chroffset,chrhigh,plusp,
4771 indexsize,processed,localp,splicingp,use_canonical_p,
4772 non_canonical_penalty);
4773 if (fwd_scores[curr_querypos][low_hit] > 0) {
4774 debug9(printf("Single hit at low_hit %d has score %d\n",low_hit,fwd_scores[curr_querypos][low_hit]));
4775 best_fwd_score = fwd_scores[curr_querypos][low_hit];
4776 best_fwd_hit = low_hit;
4777 }
4778
4779 } else {
4780 debug9(strncpy(oligo,&(queryseq_ptr[curr_querypos]),indexsize));
4781 debug9(oligo[indexsize] = '\0');
4782 debug9(printf("Finding links looking forward from querypos %d,%d..%d at (%u..%u) (%s). prev_querypos was %d\n",
4783 curr_querypos,high_hit-1,low_hit,mappings[curr_querypos][high_hit-1],mappings[curr_querypos][low_hit],
4784 oligo,processed ? Intlist_head(processed) : -1));
4785
4786 score_querypos_lookforward_mult(&fwd_tracei,low_hit,high_hit,curr_querypos,
4787 /*positions*/&(mappings[curr_querypos][low_hit]),
4788 links,fwd_scores,mappings,active,firstactive,chroffset,chrhigh,plusp,
4789 indexsize,processed,localp,splicingp,use_canonical_p,
4790 non_canonical_penalty);
4791
4792 debug9(printf("Checking hits from high_hit %d to low_hit %d\n",high_hit,low_hit));
4793 for (hit = high_hit - 1; hit >= low_hit; hit--) {
4794 debug9(printf("Hit %d has score %d\n",hit,fwd_scores[curr_querypos][hit]));
4795 if (fwd_scores[curr_querypos][hit] > best_fwd_score) {
4796 best_fwd_score = fwd_scores[curr_querypos][hit];
4797 best_fwd_hit = hit;
4798 }
4799 }
4800 }
4801
4802 if (best_fwd_score > best_overall_score) {
4803 best_overall_score = best_fwd_score;
4804 }
4805
4806 nskipped = 0;
4807 min_hits = 1000000;
4808 specific_querypos = -1;
4809
4810 #ifndef SEPARATE_FWD_REV
4811 debug9(printf("Overall result at querypos %d yields best_fwd_hit %d\n",
4812 curr_querypos,best_fwd_hit));
4813 #else
4814 debug9(printf("Overall result at querypos %d yields best_fwd_hit %d and best_rev_hit %d\n",
4815 curr_querypos,best_fwd_hit,best_rev_hit));
4816 #endif
4817
4818 #if 1
4819 /* Previously, thought that using this code causes misses in
4820 some alignments, but not using it causes missing end
4821 exons */
4822 if (middlep == false && best_fwd_hit < 0) {
4823 /* Allow for a new start */
4824 for (hit = 0; hit < npositions[curr_querypos]; hit++) {
4825 currlink = &(links[curr_querypos][hit]);
4826 #ifndef SEPARATE_FWD_REV
4827 currlink->fwd_pos = currlink->fwd_hit = -1;
4828 currlink->fwd_consecutive = indexsize_nt;
4829 currlink->fwd_tracei = -1;
4830 /* currlink->fwd_rootnlinks = 1; */
4831 fwd_scores[curr_querypos][hit] = indexsize_nt;
4832 #else
4833 currlink->fwd_pos = currlink->fwd_hit = -1;
4834 currlink->fwd_consecutive = indexsize_nt;
4835 currlink->fwd_tracei = -1;
4836 /* currlink->fwd_rootnlinks = 1; */
4837 fwd_scores[curr_querypos][hit] = indexsize_nt;
4838 if (splicingp == true) {
4839 currlink->rev_pos = currlink->rev_hit = -1;
4840 currlink->rev_consecutive = indexsize_nt;
4841 currlink->rev_tracei = -1;
4842 /* currlink->rev_rootnlinks = 1; */
4843 rev_scores[curr_querypos][hit] = indexsize_nt;
4844 }
4845 #endif
4846 }
4847 }
4848 #endif
4849
4850 if (splicingp == true && best_fwd_hit >= 0 && links[curr_querypos][best_fwd_hit].fwd_hit < 0 &&
4851 grand_fwd_querypos <= querylength - indexsize_query && curr_querypos + indexsize_query <= grand_fwd_querypos) {
4852 if ((best_fwd_score = fwd_scores[grand_fwd_querypos][grand_fwd_hit] - (grand_fwd_querypos - curr_querypos)) > 0) {
4853 prevposition = mappings[grand_fwd_querypos][grand_fwd_hit];
4854 debug12(printf("Considering prevposition %u to position %u as a grand fwd lookforward\n",prevposition,position));
4855 for (hit = high_hit - 1; hit >= low_hit; --hit) {
4856 if ((position = mappings[curr_querypos][hit]) + maxintronlen < prevposition) {
4857 debug12(printf(" => Too long\n"));
4858 } else if (position + indexsize_nt <= prevposition) {
4859 currlink = &(links[curr_querypos][hit]);
4860 currlink->fwd_consecutive = indexsize_nt;
4861 currlink->fwd_pos = grand_fwd_querypos;
4862 currlink->fwd_hit = grand_fwd_hit;
4863 currlink->fwd_tracei = ++fwd_tracei;
4864 /* currlink->fwd_rootnlinks = 1; */
4865 fwd_scores[curr_querypos][hit] = best_fwd_score;
4866 #ifdef DEBUG9
4867 prevlink = &(links[grand_fwd_querypos][grand_fwd_hit]);
4868 currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd;
4869 currlink->fwd_intronnrev = prevlink->fwd_intronnrev;
4870 currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1;
4871 #endif
4872 }
4873 }
4874 debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n",
4875 curr_querypos,grand_fwd_querypos,grand_fwd_hit,fwd_scores[grand_fwd_querypos][grand_fwd_hit]));
4876 }
4877 }
4878
4879 /* Use >= to favor longer path in case of ties */
4880 if (best_fwd_hit >= 0 && best_fwd_score >= grand_fwd_score &&
4881 links[curr_querypos][best_fwd_hit].fwd_consecutive > EXON_DEFN) {
4882 grand_fwd_score = best_fwd_score;
4883 grand_fwd_querypos = curr_querypos;
4884 grand_fwd_hit = best_fwd_hit;
4885 debug12(termlink = &(links[curr_querypos][best_fwd_hit]));
4886 debug12(printf("At querypos %d, revising grand fwd to be hit %d with score of %d (pointing back to %d,%d)\n",
4887 curr_querypos,best_fwd_hit,best_fwd_score,termlink->fwd_pos,termlink->fwd_hit));
4888 }
4889
4890 #ifdef SEPARATE_FWD_REV
4891 if (best_rev_score > best_overall_score) {
4892 best_overall_score = best_rev_score;
4893 }
4894
4895 if (splicingp == false || use_canonical_p == false) {
4896 /* rev scores should be the same as the fwd scores */
4897 } else {
4898 if (best_rev_hit >= 0 && links[curr_querypos][best_rev_hit].rev_hit < 0 &&
4899 grand_rev_querypos <= querylength - indexsize_query && curr_querypos + indexsize_query <= grand_rev_querypos) {
4900 prevlink = &(links[grand_rev_querypos][grand_rev_hit]);
4901 if ((best_rev_score = prevlink->rev_score - (grand_rev_querypos - curr_querypos)) > 0) {
4902 prevposition = mappings[grand_rev_querypos][grand_rev_hit];
4903 debug12(printf("Considering prevposition %u to position %u as a grand rev lookforward\n",prevposition,position));
4904 for (hit = high_hit - 1; hit >= low_hit; --hit) {
4905 if ((position = mappings[curr_querypos][hit]) + maxintronlen < prevposition) {
4906 debug12(printf(" => Too long\n"));
4907 } else if (position + indexsize_nt <= prevposition) {
4908 currlink = &(links[curr_querypos][hit]);
4909 currlink->rev_consecutive = indexsize_nt;
4910 /* currlink->rev_rootnlinks = 1; */
4911 currlink->rev_pos = grand_rev_querypos;
4912 currlink->rev_hit = grand_rev_hit;
4913 currlink->rev_score = best_rev_score;
4914 #ifdef DEBUG9
4915 currlink->rev_tracei = ++rev_tracei;
4916 currlink->rev_intronnrev = prevlink->rev_intronnfwd;
4917 currlink->rev_intronnrev = prevlink->rev_intronnrev;
4918 currlink->rev_intronnunk = prevlink->rev_intronnunk + 1;
4919 #endif
4920 }
4921 }
4922 debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n",
4923 curr_querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score));
4924 }
4925 }
4926
4927 /* Use >= to favor longer path in case of ties */
4928 if (best_rev_hit >= 0 && best_rev_score >= grand_rev_score &&
4929 links[curr_querypos][best_rev_hit].rev_consecutive > EXON_DEFN) {
4930 grand_rev_score = best_rev_score;
4931 grand_rev_querypos = curr_querypos;
4932 grand_rev_hit = best_rev_hit;
4933 }
4934 }
4935 #endif
4936 }
4937
4938 revise_active_lookforward(active,firstactive,nactive,low_hit,high_hit,fwd_scores,curr_querypos);
4939
4940 /* Need to push curr_querypos, even if firstactive[curr_querypos] == -1 */
4941 /* Want to skip npositions[curr_querypos] == 0, so we can find adjacent despite mismatch or overabundance */
4942 if (npositions[curr_querypos] > 0) {
4943 debug6(printf("Pushing querypos %d onto processed\n",curr_querypos));
4944 processed = Intlist_push(processed,curr_querypos);
4945 }
4946 curr_querypos = next_querypos;
4947 }
4948 }
4949 debug9(printf("End of loop lookforward\n"));
4950
4951
4952 Intlist_free(&processed);
4953
4954 /* These are the final active oligomers, after pruning by score */
4955 if (debug_graphic_p == true) {
4956 mappings_dump_R(mappings,npositions,querylength,active,firstactive,indexsize,"active.mers");
4957 }
4958
4959 #if 0
4960 FREE(nactive);
4961 FREE(firstactive);
4962 #endif
4963
4964 if (oned_matrix_p == true) {
4965 intmatrix_1d_free(&active);
4966 } else {
4967 intmatrix_2d_free(&active,querylength);
4968 }
4969
4970
4971 /* Grand winners */
4972 debug12(printf("Finding grand winners, using root position method\n"));
4973 #ifdef SEPARATE_FWD_REV
4974 if (splicingp == false || use_canonical_p == false) {
4975 cells = Linkmatrix_get_cells_fwd(&(*ncells),links,querystart,queryend,npositions,
4976 favor_right_p,cellpool);
4977 } else {
4978 cells = Linkmatrix_get_cells_both(&(*ncells),links,querystart,queryend,npositions,
4979 indexsize,best_overall_score,favor_right_p,cellpool);
4980 }
4981 #else
4982 cells = get_cells_fwd(&(*ncells),links,fwd_scores,mappings,querystart,queryend,npositions,
4983 favor_right_p,cellpool);
4984 #endif
4985
4986 debug9(FREE(oligo));
4987
4988 return cells;
4989 }
4990
4991
4992 /* Performs dynamic programming. For PMAP, indexsize is in aa. */
4993 static List_T
align_compute_lookforward(Chrpos_T ** mappings,int * npositions,int totalpositions,bool oned_matrix_p,Chrpos_T * minactive,Chrpos_T * maxactive,int * firstactive,int * nactive,Cellpool_T cellpool,char * queryseq_ptr,char * queryuc_ptr,int querylength,int querystart,int queryend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int indexsize,Pairpool_T pairpool,bool anchoredp,int anchor_querypos,Chrpos_T anchor_position,bool localp,bool skip_repetitive_p,bool use_canonical_p,int non_canonical_penalty,bool favor_right_p,bool middlep,int max_nalignments,bool debug_graphic_p)4994 align_compute_lookforward (Chrpos_T **mappings, int *npositions, int totalpositions,
4995 bool oned_matrix_p, Chrpos_T *minactive, Chrpos_T *maxactive,
4996 int *firstactive, int *nactive, Cellpool_T cellpool,
4997 char *queryseq_ptr, char *queryuc_ptr, int querylength, int querystart, int queryend,
4998
4999 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
5000 int indexsize, Pairpool_T pairpool,
5001 #ifdef MOVE_TO_STAGE3
5002 bool anchoredp, int anchor_querypos, Chrpos_T anchor_position,
5003 #endif
5004 bool localp, bool skip_repetitive_p, bool use_canonical_p, int non_canonical_penalty,
5005 bool favor_right_p, bool middlep, int max_nalignments, bool debug_graphic_p) {
5006 List_T all_paths = NULL;
5007 int npaths = 0;
5008 struct Link_T **links;
5009 int **fwd_scores;
5010
5011 #if 0
5012 bool anchoredp = false;
5013 int anchor_querypos = 0;
5014 Chrpos_T anchor_position = 0;
5015 #endif
5016
5017 Cell_T *cells, cell;
5018 int ncells, i;
5019
5020 bool fwdp;
5021 int querypos, hit;
5022 int bestscore;
5023 #ifdef SLOW
5024 int last_endposition;
5025 #endif
5026
5027
5028 if (oned_matrix_p == true) {
5029 links = Linkmatrix_1d_new(querylength,npositions,totalpositions);
5030 fwd_scores = intmatrix_1d_new(querylength,npositions,totalpositions);
5031 } else {
5032 links = Linkmatrix_2d_new(querylength,npositions);
5033 fwd_scores = intmatrix_2d_new(querylength,npositions);
5034 }
5035
5036 /* These are all oligomers */
5037 if (debug_graphic_p == true) {
5038 mappings_dump_R(mappings,npositions,querylength,/*active*/NULL,/*firstactive*/NULL,indexsize,"all.mers");
5039 }
5040
5041 cells = align_compute_scores_lookforward(&ncells,links,fwd_scores,
5042 mappings,npositions,totalpositions,
5043 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
5044 querystart,queryend,querylength,
5045
5046 chroffset,chrhigh,plusp,
5047
5048 indexsize,
5049 #ifdef DEBUG9
5050 queryseq_ptr,
5051 #endif
5052 localp,skip_repetitive_p,use_canonical_p,non_canonical_penalty,
5053 debug_graphic_p,favor_right_p,middlep);
5054 /* cells are currently sorted by Cell_score_cmp in get_cells_fwd */
5055
5056 #ifdef SEPARATE_FWD_REV
5057 debug1(Linkmatrix_print_both(links,mappings,querylength,npositions,queryseq_ptr,indexsize));
5058 #else
5059 debug1(print_fwd(links,fwd_scores,mappings,querylength,npositions,queryseq_ptr,indexsize));
5060 #endif
5061
5062 if (ncells == 0) {
5063 all_paths = (List_T) NULL;
5064
5065 } else {
5066 /* High-scoring paths */
5067 bestscore = cells[0]->score;
5068 debug11(printf("** Looping on %d cells, allowing up to %d alignments, plus any with best score %d\n",
5069 ncells,max_nalignments,bestscore));
5070
5071 if (snps_p == true) {
5072 for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
5073 && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
5074 cell = cells[i];
5075 if (cell->pushedp == false) {
5076 querypos = cell->querypos;
5077 hit = cell->hit;
5078 fwdp = cell->fwdp;
5079 debug11(printf("Starting subpath %d at rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
5080 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
5081 all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
5082 chroffset,chrhigh,/*watsonp*/plusp,
5083 #ifdef DEBUG0
5084 fwd_scores,indexsize,
5085 #endif
5086 pairpool,fwdp));
5087 npaths++;
5088 cell->pushedp = true;
5089 }
5090 }
5091
5092 } else {
5093 for (i = 0; i < ncells && (i < max_nalignments || cells[i]->score == bestscore)
5094 && cells[i]->score > bestscore - FINAL_SCORE_TOLERANCE; i++) {
5095 cell = cells[i];
5096 if (cell->pushedp == false) {
5097 querypos = cell->querypos;
5098 hit = cell->hit;
5099 fwdp = cell->fwdp;
5100 debug11(printf("Starting subpath %d at rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
5101 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
5102
5103 #if 0
5104 if (debug_graphic_p == true) {
5105 /* best_path_dump_R(links,mappings,querypos,hit,fwdp,"best.path"); */
5106 printf("plot(all.mers,col=\"black\",pch=\".\",xlab=\"Query\",ylab=\"Genomic\")\n");
5107 printf("points(active.mers,col=\"red\",pch=\".\")\n");
5108 printf("points(best.path,col=\"green\",pch=\".\")\n");
5109 printf("lines(querypos,minactive,col=\"blue\")\n");
5110 printf("lines(querypos,maxactive,col=\"blue\")\n");
5111 }
5112 #endif
5113
5114 all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
5115 #ifdef PMAP
5116 chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/false,
5117 #endif
5118 #ifdef DEBUG0
5119 fwd_scores,indexsize,
5120 #endif
5121 pairpool,fwdp));
5122 npaths++;
5123 cell->pushedp = true;
5124 }
5125 }
5126
5127 }
5128
5129 #ifdef SLOW
5130 if (npaths < max_nalignments) {
5131 /* Non-overlapping paths */
5132 debug11(printf("** Looping on %d cells, looking for non-overlapping paths. Total paths so far: %d\n",
5133 ncells,npaths));
5134 qsort(cells,ncells,sizeof(Cell_T),Cell_interval_cmp);
5135 last_endposition = 0;
5136 if (snps_p == true) {
5137 for (i = 0; i < ncells && npaths < max_nalignments; i++) {
5138 cell = cells[i];
5139 if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
5140 cell->rootposition > last_endposition && cell->pushedp == false) {
5141 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
5142 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
5143 querypos = cell->querypos;
5144 hit = cell->hit;
5145 fwdp = cell->fwdp;
5146 all_paths = List_push(all_paths,(void *) traceback_one_snps(querypos,hit,links,mappings,queryseq_ptr,
5147 chroffset,chrhigh,/*watsonp*/plusp,
5148 #ifdef DEBUG0
5149 fwd_scores,indexsize,
5150 #endif
5151 pairpool,fwdp));
5152 npaths++;
5153 cell->pushedp = true;
5154 last_endposition = cell->endposition;
5155 }
5156 }
5157
5158 } else {
5159 for (i = 0; i < ncells && npaths < max_nalignments; i++) {
5160 cell = cells[i];
5161 if (cell->score > bestscore * NONOVERLAPPING_SCORE_TOLERANCE &&
5162 cell->rootposition > last_endposition && cell->pushedp == false) {
5163 debug11(printf("Starting subpath %d for rootposition %d with score %d, querypos %d, hit %d, endposition %d\n",
5164 i,cell->rootposition,cell->score,querypos,hit,cell->endposition));
5165 querypos = cell->querypos;
5166 hit = cell->hit;
5167 fwdp = cell->fwdp;
5168 all_paths = List_push(all_paths,(void *) traceback_one(querypos,hit,links,mappings,queryseq_ptr,queryuc_ptr,
5169 #ifdef PMAP
5170 chroffset,chrhigh,/*watsonp*/plusp,/*lookbackp*/false,
5171 #endif
5172 #ifdef DEBUG0
5173 fwd_scores,indexsize,
5174 #endif
5175 pairpool,fwdp));
5176 npaths++;
5177 cell->pushedp = true;
5178 last_endposition = cell->endposition;
5179 }
5180 }
5181 }
5182 }
5183 #endif
5184
5185 debug11(printf("\n"));
5186
5187 #if 0
5188 /* No need with cellpool */
5189 for (i = 0; i < ncells; i++) {
5190 cell = cells[i];
5191 Cell_free(&cell);
5192 }
5193 #endif
5194 FREE(cells);
5195 }
5196
5197
5198 if (oned_matrix_p == true) {
5199 Linkmatrix_1d_free(&links);
5200 intmatrix_1d_free(&fwd_scores);
5201 } else {
5202 Linkmatrix_2d_free(&links,querylength);
5203 intmatrix_2d_free(&fwd_scores,querylength);
5204 }
5205
5206 #if 0
5207 for (p = all_paths; p != NULL; p = List_next(p)) {
5208 Pair_dump_list(List_head(p),/*zerobasedp*/true);
5209 printf("\n");
5210 }
5211 #endif
5212
5213 return all_paths;
5214 }
5215
5216
#if 0
/* Modified from stage3.c.  Currently compiled out.

   Scores the two splice sites that flank a genomic gap using the
   Maxent_hr models.  *left_prob receives the score of the site derived
   from left_genomepos, and *right_prob the score of the site derived
   from right_genomepos.

   When watsonp is true, universal coordinates are formed by adding the
   chromosomal position to chroffset; otherwise they are formed by
   subtracting it from chrhigh.  The sign of genestrand, together with
   watsonp, selects which of the donor / acceptor / antidonor /
   antiacceptor models is applied to each side. */
static void
get_splicesite_probs (double *left_prob, double *right_prob,
                      Chrpos_T left_genomepos, Chrpos_T right_genomepos,
                      Univcoord_T chroffset, Univcoord_T chrhigh, int genestrand, bool watsonp) {
  Univcoord_T leftsite, rightsite;

  if (watsonp == true) {
    leftsite = chroffset + left_genomepos;
    rightsite = chroffset + right_genomepos + 1;

    /* Left side of the gap */
    if (genestrand > 0) {
      *left_prob = Maxent_hr_donor_prob(leftsite + 1,chroffset);
      debug5(printf("1. donor splicesitepos is %u (%u), prob %f\n",
                    leftsite,leftsite-chroffset,*left_prob));
    } else {
      *left_prob = Maxent_hr_antiacceptor_prob(leftsite + 1,chroffset);
      debug5(printf("2. antiacceptor splicesitepos is %u (%u), prob %f\n",
                    leftsite,leftsite-chroffset,*left_prob));
    }

    /* Right side of the gap */
    if (genestrand > 0) {
      *right_prob = Maxent_hr_acceptor_prob(rightsite - 1,chroffset);
      debug5(printf("5. acceptor splicesitepos is %u (%u), prob %f\n",
                    rightsite,rightsite-chroffset,*right_prob));
    } else {
      *right_prob = Maxent_hr_antidonor_prob(rightsite - 1,chroffset);
      debug5(printf("6. antidonor splicesitepos is %u (%u), prob %f\n",
                    rightsite,rightsite-chroffset,*right_prob));
    }

  } else {
    leftsite = chrhigh - left_genomepos + 1;
    rightsite = chrhigh - right_genomepos;

    /* Left side of the gap */
    if (genestrand > 0) {
      *left_prob = Maxent_hr_acceptor_prob(leftsite - 1,chroffset);
      debug5(printf("4. acceptor splicesitepos is %u (%u), prob %f\n",
                    leftsite,leftsite-chroffset,*left_prob));
    } else {
      *left_prob = Maxent_hr_antidonor_prob(leftsite - 1,chroffset);
      debug5(printf("3. antidonor splicesitepos is %u (%u), prob %f\n",
                    leftsite,leftsite-chroffset,*left_prob));
    }

    /* Right side of the gap */
    if (genestrand > 0) {
      *right_prob = Maxent_hr_donor_prob(rightsite + 1,chroffset);
      debug5(printf("8. donor splicesitepos is %u (%u), prob %f\n",
                    rightsite,rightsite-chroffset,*right_prob));
    } else {
      *right_prob = Maxent_hr_antiacceptor_prob(rightsite + 1,chroffset);
      debug5(printf("7. antiacceptor splicesitepos is %u (%u), prob %f\n",
                    rightsite,rightsite-chroffset,*right_prob));
    }
  }

  return;
}
#endif
5279
5280
5281 /* queryseq_ptr is NULL for PMAP. querypos here is in nt. */
5282 static List_T
convert_to_nucleotides(List_T path,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int query_offset,Pairpool_T pairpool,int indexsize_nt,bool include_gapholders_p)5283 convert_to_nucleotides (List_T path,
5284 #ifndef PMAP
5285 char *queryseq_ptr, char *queryuc_ptr,
5286 #endif
5287 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5288 int query_offset, Pairpool_T pairpool, int indexsize_nt,
5289 bool include_gapholders_p) {
5290 List_T pairs = NULL;
5291 Pair_T pair;
5292 int querypos, lastquerypos, queryjump, genomejump, fill, default_fill;
5293 int genomepos, lastgenomepos;
5294 char c, c_alt;
5295
5296 debug5(printf("Beginning convert_to_nucleotides with %d pairs. query_offset = %d, indexsize_nt = %d\n",
5297 List_length(path),query_offset,indexsize_nt));
5298
5299 if (path == NULL) {
5300 return (List_T) NULL;
5301 } else {
5302 /* pairptr = path; */
5303 /* path = Pairpool_pop(path,&pair); */
5304 pair = (Pair_T) path->first;
5305 querypos = pair->querypos;
5306 genomepos = pair->genomepos;
5307 }
5308
5309 #ifdef PMAP
5310 default_fill = indexsize_nt - 3;
5311 #else
5312 default_fill = indexsize_nt - 1;
5313 #endif
5314
5315 lastquerypos = querypos + default_fill;
5316 lastgenomepos = genomepos + default_fill;
5317 while (lastquerypos > querypos) {
5318 debug5(printf("querypos %d vs lastquerypos %d, lastgenomepos %d\n",querypos,lastquerypos,lastgenomepos));
5319
5320 #ifdef PMAP
5321 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5322 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,/*cdna*/c,MATCH_COMP,c,c_alt,
5323 /*dynprogindex*/0);
5324 debug5(printf("Pushing %c | %c at %d,%d\n",c,c,lastquerypos,lastgenomepos));
5325 #elif defined(EXTRACT_GENOMICSEG)
5326 if (queryuc_ptr[lastquerypos] == genomicuc_ptr[lastgenomepos]) {
5327 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5328 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5329 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5330 /*dynprogindex*/0);
5331 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5332 lastquerypos+query_offset,lastgenomepos));
5333 } else {
5334 abort();
5335 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5336 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5337 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5338 /*dynprogindex*/0);
5339 debug5(printf("Pushing %c %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5340 lastquerypos+query_offset,lastgenomepos));
5341 }
5342 #else
5343 if (mode == STANDARD) {
5344 c = queryuc_ptr[lastquerypos];
5345 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5346 queryseq_ptr[lastquerypos],MATCH_COMP,c,/*genomealt*/c,
5347 /*dynprogindex*/0);
5348 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5349 lastquerypos+query_offset,lastgenomepos));
5350 } else {
5351 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5352 if (queryuc_ptr[lastquerypos] == c) {
5353 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5354 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,/*dynprogindex*/0);
5355 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5356 lastquerypos+query_offset,lastgenomepos));
5357 } else {
5358 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5359 queryseq_ptr[lastquerypos],AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5360 debug5(printf("Pushing %c : %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5361 lastquerypos+query_offset,lastgenomepos));
5362 }
5363 }
5364 #endif
5365 --lastquerypos;
5366 --lastgenomepos;
5367 }
5368
5369 /* Take care of observed first pair in oligomer */
5370 if (mode == STANDARD) {
5371 pair->querypos += query_offset; /* Revise coordinates */
5372 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5373 #ifdef WASTE
5374 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5375 #else
5376 pairs = List_transfer_one(pairs,&path);
5377 #endif
5378 debug5(printf("Transferring %c : %c at %d,%d (first pair)\n",pair->cdna,c,
5379 pair->querypos+query_offset,pair->genomepos));
5380 } else {
5381 c = get_genomic_nt(&c_alt,pair->genomepos,chroffset,chrhigh,watsonp);
5382 if (pair->cdna == c) {
5383 pair->querypos += query_offset; /* Revise coordinates */
5384 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5385 #ifdef WASTE
5386 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5387 #else
5388 pairs = List_transfer_one(pairs,&path);
5389 #endif
5390 debug5(printf("Transferring %c : %c at %d,%d (first pair)\n",pair->cdna,c,
5391 pair->querypos+query_offset,pair->genomepos));
5392 } else {
5393 path = Pairpool_pop(path,&pair);
5394 pairs = Pairpool_push(pairs,pairpool,pair->querypos+query_offset,pair->genomepos,
5395 pair->cdna,AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5396 debug5(printf("Pushing %c : %c at %d,%d (first pair)\n",pair->cdna,c,
5397 pair->querypos+query_offset,pair->genomepos));
5398 }
5399 }
5400
5401 lastquerypos = querypos;
5402 lastgenomepos = genomepos;
5403
5404 while (path != NULL) {
5405 /* pairptr = path; */
5406 /* path = Pairpool_pop(path,&pair); */
5407 pair = (Pair_T) path->first;
5408 querypos = pair->querypos;
5409 genomepos = pair->genomepos;
5410
5411 queryjump = lastquerypos - 1 - querypos;
5412 genomejump = lastgenomepos - 1 - genomepos;
5413
5414 if (queryjump == 0 && genomejump == 0) {
5415 /* Do nothing */
5416 } else {
5417 debug5(printf("At querypos %d, saw queryjump of %d and genomejump of %d\n",querypos,queryjump,genomejump));
5418
5419 if (querypos + default_fill >= lastquerypos || genomepos + default_fill >= lastgenomepos) {
5420 if (lastquerypos - querypos < (int) (lastgenomepos - genomepos)) {
5421 #if 0
5422 /* This can occur with wobble mask */
5423 fprintf(stderr,"Partial fill from querypos %d to %d (genomepos goes from %u to %u)\n",
5424 querypos,lastquerypos,genomepos,lastgenomepos);
5425 abort();
5426 #endif
5427 fill = lastquerypos - querypos - 1;
5428 } else {
5429 #if 0
5430 /* This can occur with wobble mask */
5431 fprintf(stderr,"Partial fill from genomepos %u to %u (querypos goes from %d to %d)\n",
5432 genomepos,lastgenomepos,querypos,lastquerypos);
5433 abort();
5434 #endif
5435 fill = lastgenomepos - genomepos - 1;
5436 }
5437 } else {
5438 fill = default_fill;
5439 }
5440
5441 /* Recompute queryjump and genomejump */
5442 queryjump -= fill;
5443 genomejump -= fill;
5444 debug5(printf(" Revised queryjump of %d and genomejump of %d\n",queryjump,genomejump));
5445 if (include_gapholders_p == true && (genomejump > 0 || queryjump > 0)) {
5446 debug5(printf(" Pushing gapholder\n"));
5447 pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
5448 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
5449 #if 0
5450 /* Need to run on both genestrands, and save both results in pair */
5451 if (queryjump == 0) {
5452 get_splicesite_probs(&left_prob,&right_prob,genomepos+fill,lastgenomepos,
5453 chroffset,chrhigh,/*genestrand*/+1,watsonp);
5454 }
5455 #endif
5456 }
5457
5458 /* Fill rest of oligomer */
5459 lastquerypos = querypos + fill;
5460 lastgenomepos = genomepos + fill;
5461 debug5(printf(" Fill from querypos %d down to %d\n",lastquerypos,querypos));
5462 while (lastquerypos > querypos) {
5463 #ifdef PMAP
5464 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5465 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,/*cdna*/c,MATCH_COMP,c,c_alt,
5466 /*dynprogindex*/0);
5467 debug5(printf("Pushing %c | %c at %d,%d\n",c,c,lastquerypos+query_offset,lastgenomepos));
5468 #elif defined(EXTRACT_GENOMICSEG)
5469 if (queryuc_ptr[lastquerypos] == genomicuc_ptr[lastgenomepos]) {
5470 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5471 queryseq_ptr[lastquerypos],MATCH_COMP,
5472 queryuc_ptr[lastquerypos],/*genomealt*/GENOMEALT_DEFERRED,
5473 /*dynprogindex*/0);
5474 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5475 lastquerypos+query_offset,lastgenomepos));
5476 } else {
5477 abort();
5478 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5479 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5480 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5481 /*dynprogindex*/0);
5482 debug5(printf("Pushing %c %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5483 lastquerypos+query_offset,lastgenomepos));
5484 }
5485 #else
5486 if (mode == STANDARD) {
5487 /* assert(queryuc_ptr[lastquerypos] == get_genomic_nt(&c_alt,lastgenomepos,genomicstart,genomiclength,watsonp)); */
5488 c = queryuc_ptr[lastquerypos];
5489 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5490 queryseq_ptr[lastquerypos],MATCH_COMP,c,/*genomealt*/c,
5491 /*dynprogindex*/0);
5492 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5493 lastquerypos+query_offset,lastgenomepos));
5494 } else {
5495 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5496 if (queryuc_ptr[lastquerypos] == c) {
5497 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5498 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,/*dynprogindex*/0);
5499 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5500 lastquerypos+query_offset,lastgenomepos));
5501 } else {
5502 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5503 queryseq_ptr[lastquerypos],AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5504 debug5(printf("Pushing %c : %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5505 lastquerypos+query_offset,lastgenomepos));
5506 }
5507 }
5508 #endif
5509 --lastquerypos;
5510 --lastgenomepos;
5511 }
5512 }
5513
5514 /* Take care of observed match */
5515 if (mode == STANDARD) {
5516 pair->querypos += query_offset; /* Revise coordinates */
5517 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5518 #ifdef WASTE
5519 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5520 #else
5521 pairs = List_transfer_one(pairs,&path);
5522 #endif
5523 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5524 pair->querypos+query_offset,pair->genomepos));
5525 } else {
5526 c = get_genomic_nt(&c_alt,pair->genomepos,chroffset,chrhigh,watsonp);
5527 if (pair->cdna == c) {
5528 pair->querypos += query_offset; /* Revise coordinates */
5529 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5530 #ifdef WASTE
5531 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5532 #else
5533 pairs = List_transfer_one(pairs,&path);
5534 #endif
5535 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5536 pair->querypos+query_offset,pair->genomepos));
5537 } else {
5538 path = Pairpool_pop(path,&pair);
5539 pairs = Pairpool_push(pairs,pairpool,pair->querypos+query_offset,pair->genomepos,
5540 pair->cdna,AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5541 debug5(printf("Pushing %c : %c at %d,%d (observed)\n",pair->cdna,c,
5542 pair->querypos+query_offset,pair->genomepos));
5543 }
5544 }
5545
5546 lastquerypos = querypos;
5547 lastgenomepos = genomepos;
5548 }
5549
5550 debug5(Pair_dump_list(pairs,true));
5551 /* pairs is in ascending querypos order */
5552 return pairs; /* Used to return List_reverse(pairs) */
5553 }
5554
5555
5556 /* queryseq_ptr is NULL for PMAP. querypos here is in nt. */
5557 static List_T
convert_to_nucleotides_snps(List_T path,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int query_offset,Pairpool_T pairpool,int indexsize_nt,bool include_gapholders_p)5558 convert_to_nucleotides_snps (List_T path,
5559 #ifndef PMAP
5560 char *queryseq_ptr, char *queryuc_ptr,
5561 #endif
5562 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
5563 int query_offset, Pairpool_T pairpool, int indexsize_nt,
5564 bool include_gapholders_p) {
5565 List_T pairs = NULL;
5566 Pair_T pair;
5567 int querypos, genomepos, lastquerypos, lastgenomepos, queryjump, genomejump, fill, default_fill;
5568 char c, c_alt;
5569
5570 debug5(printf("Beginning convert_to_nucleotides_snps with %d pairs\n",List_length(path)));
5571
5572 if (path == NULL) {
5573 return (List_T) NULL;
5574 } else {
5575 /* pairptr = path; */
5576 /* path = Pairpool_pop(path,&pair); */
5577 pair = (Pair_T) path->first;
5578 querypos = pair->querypos;
5579 genomepos = pair->genomepos;
5580 }
5581
5582 #ifdef PMAP
5583 default_fill = indexsize_nt - 3;
5584 #else
5585 default_fill = indexsize_nt - 1;
5586 #endif
5587
5588 lastquerypos = querypos + default_fill;
5589 lastgenomepos = genomepos + default_fill;
5590 while (lastquerypos > querypos) {
5591 debug5(printf("lastquerypos %d, lastgenomepos %d\n",
5592 lastquerypos,lastgenomepos));
5593
5594 #ifdef PMAP
5595 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5596 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,/*cdna*/c,MATCH_COMP,c,c_alt,
5597 /*dynprogindex*/0);
5598 debug5(printf("Pushing %c | %c at %d,%d\n",c,c,lastquerypos,lastgenomepos));
5599 #elif defined(EXTRACT_GENOMICSEG)
5600 if (queryuc_ptr[lastquerypos] == genomicuc_ptr[lastgenomepos]) {
5601 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5602 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5603 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5604 /*dynprogindex*/0);
5605 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5606 lastquerypos+query_offset,lastgenomepos));
5607 } else {
5608 abort();
5609 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5610 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5611 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5612 /*dynprogindex*/0);
5613 debug5(printf("Pushing %c %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5614 lastquerypos+query_offset,lastgenomepos));
5615 }
5616 #else
5617 if (mode == STANDARD) {
5618 /* assert(queryuc_ptr[lastquerypos] == get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp)); */
5619 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5620 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5621 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,
5622 /*dynprogindex*/0);
5623 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5624 lastquerypos+query_offset,lastgenomepos));
5625 } else {
5626 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5627 if (queryuc_ptr[lastquerypos] == c) {
5628 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5629 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,/*dynprogindex*/0);
5630 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5631 lastquerypos+query_offset,lastgenomepos));
5632 } else {
5633 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5634 queryseq_ptr[lastquerypos],AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5635 debug5(printf("Pushing %c : %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5636 lastquerypos+query_offset,lastgenomepos));
5637 }
5638 }
5639 #endif
5640 --lastquerypos;
5641 --lastgenomepos;
5642 }
5643
5644 /* Take care of observed first pair in oligomer */
5645 if (mode == STANDARD) {
5646 pair->querypos += query_offset; /* Revise coordinates */
5647 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5648 #ifdef WASTE
5649 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5650 #else
5651 pairs = List_transfer_one(pairs,&path);
5652 #endif
5653 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5654 pair->querypos+query_offset,pair->genomepos));
5655 } else {
5656 c = get_genomic_nt(&c_alt,pair->genomepos,chroffset,chrhigh,watsonp);
5657 if (pair->cdna == c) {
5658 pair->querypos += query_offset; /* Revise coordinates */
5659 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5660 #ifdef WASTE
5661 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5662 #else
5663 pairs = List_transfer_one(pairs,&path);
5664 #endif
5665 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5666 pair->querypos+query_offset,pair->genomepos));
5667 } else {
5668 path = Pairpool_pop(path,&pair);
5669 pairs = Pairpool_push(pairs,pairpool,pair->querypos+query_offset,pair->genomepos,
5670 pair->cdna,AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5671 debug5(printf("Pushing %c : %c at %d,%d (first pair)\n",pair->cdna,c,
5672 pair->querypos+query_offset,pair->genomepos));
5673 }
5674 }
5675
5676 lastquerypos = querypos;
5677 lastgenomepos = genomepos;
5678
5679 while (path != NULL) {
5680 /* pairptr = path; */
5681 /* path = Pairpool_pop(path,&pair); */
5682 pair = (Pair_T) path->first;
5683 querypos = pair->querypos;
5684 genomepos = pair->genomepos;
5685
5686 queryjump = lastquerypos - 1 - querypos;
5687 genomejump = lastgenomepos - 1 - genomepos;
5688
5689 if (queryjump == 0 && genomejump == 0) {
5690 /* Do nothing */
5691 } else {
5692 debug5(printf("At querypos %d, saw queryjump of %d and genomejump of %d\n",querypos,queryjump,genomejump));
5693
5694 if (querypos + default_fill >= lastquerypos || genomepos + default_fill >= lastgenomepos) {
5695 if (lastquerypos - querypos < lastgenomepos - genomepos) {
5696 #if 0
5697 /* This can occur with wobble mask */
5698 fprintf(stderr,"Partial fill from querypos %d to %d (genomepos goes from %u to %u)\n",
5699 querypos,lastquerypos,genomepos,lastgenomepos);
5700 abort();
5701 #endif
5702 fill = lastquerypos - querypos - 1;
5703 } else {
5704 #if 0
5705 /* This can occur with wobble mask */
5706 fprintf(stderr,"Partial fill from genomepos %u to %u (querypos goes from %d to %d)\n",
5707 genomepos,lastgenomepos,querypos,lastquerypos);
5708 abort();
5709 #endif
5710 fill = lastgenomepos - genomepos - 1;
5711 }
5712 } else {
5713 fill = default_fill;
5714 }
5715
5716 /* Recompute queryjump and genomejump */
5717 queryjump -= fill;
5718 genomejump -= fill;
5719 debug5(printf(" Revised queryjump of %d and genomejump of %d\n",queryjump,genomejump));
5720 if (include_gapholders_p == true && (genomejump > 0 || queryjump > 0)) {
5721 debug5(printf(" Pushing gapholder\n"));
5722 pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
5723 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
5724 }
5725
5726 /* Fill rest of oligomer */
5727 lastquerypos = querypos + fill;
5728 lastgenomepos = genomepos + fill;
5729 debug5(printf(" Fill from querypos %d down to %d\n",lastquerypos,querypos));
5730 while (lastquerypos > querypos) {
5731 #ifdef PMAP
5732 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5733 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,/*cdna*/c,MATCH_COMP,c,c_alt,
5734 /*dynprogindex*/0);
5735 debug5(printf("Pushing %c | %c at %d,%d\n",c,c,lastquerypos+query_offset,lastgenomepos));
5736 #elif defined(EXTRACT_GENOMICSEG)
5737 if (queryuc_ptr[lastquerypos] == genomicuc_ptr[lastgenomepos]) {
5738 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5739 queryseq_ptr[lastquerypos],MATCH_COMP,
5740 queryuc_ptr[lastquerypos],/*genomealt*/GENOMEALT_DEFERRED,
5741 /*dynprogindex*/0);
5742 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5743 lastquerypos+query_offset,lastgenomepos));
5744 } else {
5745 abort();
5746 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5747 queryseq_ptr[lastquerypos],MISMATCH_COMP,
5748 genomicseg_ptr[lastgenomepos],/*genomealt*/GENOMEALT_DEFERRED,
5749 /*dynprogindex*/0);
5750 debug5(printf("Pushing %c %c at %d,%d\n",queryseq_ptr[lastquerypos],genomicseg_ptr[lastgenomepos],
5751 lastquerypos+query_offset,lastgenomepos));
5752 }
5753 #else
5754 if (mode == STANDARD) {
5755 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5756 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5757 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,
5758 /*dynprogindex*/0);
5759 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],queryuc_ptr[lastquerypos],
5760 lastquerypos+query_offset,lastgenomepos));
5761 } else {
5762 c = get_genomic_nt(&c_alt,lastgenomepos,chroffset,chrhigh,watsonp);
5763 if (queryuc_ptr[lastquerypos] == c) {
5764 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5765 queryseq_ptr[lastquerypos],MATCH_COMP,c,c_alt,/*dynprogindex*/0);
5766 debug5(printf("Pushing %c | %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5767 lastquerypos+query_offset,lastgenomepos));
5768 } else {
5769 pairs = Pairpool_push(pairs,pairpool,lastquerypos+query_offset,lastgenomepos,
5770 queryseq_ptr[lastquerypos],AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5771 debug5(printf("Pushing %c : %c at %d,%d\n",queryseq_ptr[lastquerypos],c,
5772 lastquerypos+query_offset,lastgenomepos));
5773 }
5774 }
5775 #endif
5776 --lastquerypos;
5777 --lastgenomepos;
5778 }
5779 }
5780
5781 /* Take care of observed match */
5782 if (mode == STANDARD) {
5783 pair->querypos += query_offset; /* Revise coordinates */
5784 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5785 #ifdef WASTE
5786 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5787 #else
5788 pairs = List_transfer_one(pairs,&path);
5789 #endif
5790 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5791 pair->querypos+query_offset,pair->genomepos));
5792 } else {
5793 c = get_genomic_nt(&c_alt,pair->genomepos,chroffset,chrhigh,watsonp);
5794 if (pair->cdna == c) {
5795 pair->querypos += query_offset; /* Revise coordinates */
5796 /*pair->genomepos += genomic_offset;*/ /* Revise coordinates */
5797 #ifdef WASTE
5798 pairs = Pairpool_push_existing(pairs,pairpool,pair);
5799 #else
5800 pairs = List_transfer_one(pairs,&path);
5801 #endif
5802 debug5(printf("Transferring %c : %c at %d,%d\n",pair->cdna,c,
5803 pair->querypos+query_offset,pair->genomepos));
5804 } else {
5805 path = Pairpool_pop(path,&pair);
5806 pairs = Pairpool_push(pairs,pairpool,pair->querypos+query_offset,pair->genomepos,
5807 pair->cdna,AMBIGUOUS_COMP,c,c_alt,/*dynprogindex*/0);
5808 debug5(printf("Pushing %c : %c at %d,%d (observed)\n",pair->cdna,c,
5809 pair->querypos+query_offset,pair->genomepos));
5810 }
5811 }
5812
5813 lastquerypos = querypos;
5814 lastgenomepos = genomepos;
5815 }
5816
5817 debug5(Pair_dump_list(pairs,true));
5818 /* pairs is in ascending querypos order */
5819 return pairs; /* Used to return List_reverse(pairs) */
5820 }
5821
5822
5823
5824 /* Returns ncovered */
5825 int
Stage2_scan(int * stage2_source,char * queryuc_ptr,int querylength,Chrpos_T chrstart,Chrpos_T chrend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int genestrand,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices,Diagpool_T diagpool,bool debug_graphic_p)5826 Stage2_scan (int *stage2_source, char *queryuc_ptr, int querylength,
5827 Chrpos_T chrstart, Chrpos_T chrend,
5828 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp,
5829 int genestrand, Stage2_alloc_T stage2_alloc, Oligoindex_array_T oligoindices,
5830 Diagpool_T diagpool, bool debug_graphic_p) {
5831 int ncovered;
5832 int source;
5833 /* int indexsize; */
5834 Oligoindex_T oligoindex;
5835 Chrpos_T **mappings;
5836 bool *coveredp, oned_matrix_p;
5837 int *npositions, totalpositions;
5838 double pct_coverage;
5839 int maxnconsecutive;
5840 /* double diag_runtime; */
5841 List_T diagonals;
5842 #ifndef USE_DIAGPOOL
5843 List_p;
5844 Diag_T diag;
5845 #endif
5846 #ifdef EXTRACT_GENOMICSEG
5847 Count_T *counts;
5848 #endif
5849
5850 if (debug_graphic_p == true) {
5851 /* printf("par(mfrow=c(1,2),cex=0.2)\n"); */
5852 printf("par(cex=0.3)\n");
5853 printf("layout(matrix(c(1,2),1,2),widths=c(0.5,0.5),heights=c(1))\n");
5854 }
5855
5856 if (querylength > stage2_alloc->max_querylength_alloc) {
5857 coveredp = (bool *) CALLOC(querylength,sizeof(bool));
5858 mappings = (Chrpos_T **) MALLOC(querylength * sizeof(Chrpos_T *));
5859 npositions = (int *) CALLOC(querylength,sizeof(int));
5860 } else {
5861 coveredp = stage2_alloc->coveredp;
5862 mappings = stage2_alloc->mappings;
5863 npositions = stage2_alloc->npositions;
5864
5865 memset(coveredp,0,querylength * sizeof(bool));
5866 memset(npositions,0,querylength * sizeof(int));
5867 }
5868
5869 totalpositions = 0;
5870 maxnconsecutive = 0;
5871
5872 source = 0;
5873 pct_coverage = 0.0;
5874 Diagpool_reset(diagpool);
5875 diagonals = (List_T) NULL;
5876 while (source < Oligoindex_array_length(oligoindices) && pct_coverage < SUFF_PCTCOVERAGE_OLIGOINDEX) {
5877 oligoindex = Oligoindex_array_elt(oligoindices,source);
5878 /* indexsize = Oligoindex_indexsize(oligoindex); */ /* Different sources can have different indexsizes */
5879 #ifdef PMAP
5880 if (plusp == true) {
5881 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
5882 /*mappingend*/chroffset+chrend,/*plusp*/true,
5883 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
5884 /*chrpos*/chrstart);
5885 } else {
5886 /* Need to add 1 to mappingend to cover same range as plusp */
5887 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
5888 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
5889 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
5890 /*chrpos*/(chrhigh-chroffset)-chrend);
5891 }
5892
5893 #else
5894
5895 #ifdef EXTRACT_GENOMICSEG
5896 Oligoindex_hr_tally(oligoindex,genomicuc_ptr,/*genomiclength*/chrend-chrstart,queryuc_ptr,querylength,
5897 /*sequencepos*/0);
5898 counts = Oligoindex_counts_copy(oligoindex);
5899 #endif
5900
5901 if (plusp == true) {
5902 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
5903 /*mappingend*/chroffset+chrend,/*plusp*/true,
5904 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
5905 /*chrpos*/chrstart,genestrand);
5906 } else {
5907 /* Need to add 1 to mappingend to cover same range as plusp */
5908 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
5909 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
5910 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
5911 /*chrpos*/(chrhigh-chroffset)-chrend,genestrand);
5912 }
5913
5914 #ifdef EXTRACT_GENOMICSEG
5915 assert(Oligoindex_counts_equal(oligoindex,counts));
5916 /* Oligoindex_counts_dump(oligoindex,counts); */
5917 FREE(counts);
5918 #endif
5919
5920 #endif
5921
5922 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
5923 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
5924 /*querystart*/0,/*queryend*/querylength,querylength,
5925 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
5926 pct_coverage = Diag_update_coverage(coveredp,&ncovered,diagonals,querylength);
5927 debug(printf("Stage2_scan: source = %d, ncovered = %d, pct_coverage = %f\n",source,ncovered,pct_coverage));
5928
5929 source++;
5930 }
5931 *stage2_source = source;
5932
5933 #ifdef USE_DIAGPOOL
5934 /* No need to free diagonals */
5935 #else
5936 for (p = diagonals; p != NULL; p = List_next(p)) {
5937 diag = (Diag_T) List_head(p);
5938 Diag_free(&diag);
5939 }
5940 List_free(&diagonals);
5941 #endif
5942
5943 if (querylength > stage2_alloc->max_querylength_alloc) {
5944 FREE(npositions);
5945 FREE(coveredp);
5946 FREE(mappings); /* Don't need to free contents of mappings */
5947 }
5948
5949 #if 1
5950 for (source = 0; source < Oligoindex_array_length(oligoindices); source++) {
5951 oligoindex = Oligoindex_array_elt(oligoindices,source);
5952 Oligoindex_untally(oligoindex);
5953 }
5954 #endif
5955
5956 return ncovered;
5957 }
5958
5959
5960
5961 static int
stage2_cmp(const void * a,const void * b)5962 stage2_cmp (const void *a, const void *b) {
5963 Stage2_T xx = * (Stage2_T *) a;
5964 Stage2_T yy = * (Stage2_T *) b;
5965 List_T x = xx->middle, y = yy->middle;
5966 Chrpos_T x_chrstart, x_chrend, y_chrstart, y_chrend;
5967
5968 x_chrstart = ((Pair_T) x->first)->genomepos;
5969 x_chrend = ((Pair_T) List_last_value(x))->genomepos;
5970 assert(x_chrstart <= x_chrend); /* Equal if there is only one pair in the list */
5971
5972 y_chrstart = ((Pair_T) y->first)->genomepos;
5973 y_chrend = ((Pair_T) List_last_value(y))->genomepos;
5974 assert(y_chrstart <= y_chrend); /* Equal if there is only one pair in the list */
5975
5976 if (x_chrstart < y_chrstart) {
5977 return -1;
5978 } else if (y_chrstart < x_chrstart) {
5979 return +1;
5980
5981 /* Want most compact representation */
5982 } else if (x_chrend < y_chrend) {
5983 return -1;
5984 } else if (y_chrend < x_chrend) {
5985 return +1;
5986
5987 } else {
5988 return 0;
5989 }
5990 }
5991
5992
5993 /* paths, so chrend is first */
5994 static int
stage2pairs_start_cmp(const void * a,const void * b)5995 stage2pairs_start_cmp (const void *a, const void *b) {
5996 List_T x = * (List_T *) a;
5997 List_T y = * (List_T *) b;
5998 Chrpos_T x_chrstart, x_chrend, y_chrstart, y_chrend;
5999
6000 x_chrend = ((Pair_T) x->first)->genomepos;
6001 x_chrstart = ((Pair_T) List_last_value(x))->genomepos;
6002 assert(x_chrstart <= x_chrend); /* Equal if there is only one pair in the list */
6003
6004 y_chrend = ((Pair_T) y->first)->genomepos;
6005 y_chrstart = ((Pair_T) List_last_value(y))->genomepos;
6006 assert(y_chrstart <= y_chrend); /* Equal if there is only one pair in the list */
6007
6008 if (x_chrend > y_chrend) {
6009 return -1;
6010 } else if (y_chrend > x_chrend) {
6011 return +1;
6012
6013 /* Want most compact representation */
6014 } else if (x_chrstart > y_chrstart) {
6015 return -1;
6016 } else if (y_chrstart > x_chrstart) {
6017 return +1;
6018
6019 } else {
6020 return 0;
6021 }
6022 }
6023
6024
6025 static int
stage2pairs_end_cmp(const void * a,const void * b)6026 stage2pairs_end_cmp (const void *a, const void *b) {
6027 List_T x = * (List_T *) a;
6028 List_T y = * (List_T *) b;
6029
6030 Chrpos_T x_chrstart, x_chrend, y_chrstart, y_chrend;
6031
6032 x_chrstart = ((Pair_T) x->first)->genomepos;
6033 x_chrend = ((Pair_T) List_last_value(x))->genomepos;
6034 assert(x_chrstart <= x_chrend); /* Equal if there is only one pair in the list */
6035
6036 y_chrstart = ((Pair_T) y->first)->genomepos;
6037 y_chrend = ((Pair_T) List_last_value(y))->genomepos;
6038 assert(y_chrstart <= y_chrend); /* Equal if there is only one pair in the list */
6039
6040 if (x_chrstart < y_chrstart) {
6041 return -1;
6042 } else if (y_chrstart < x_chrstart) {
6043 return +1;
6044
6045 /* Want most compact representation */
6046 } else if (x_chrend < y_chrend) {
6047 return -1;
6048 } else if (y_chrend < x_chrend) {
6049 return +1;
6050
6051 } else {
6052 return 0;
6053 }
6054 }
6055
6056
6057 /* Modified from gregion_overlap_p */
6058 static bool
stage2path_overlap_p(List_T x,List_T y)6059 stage2path_overlap_p (List_T x, List_T y) {
6060 Chrpos_T x_chrstart, x_chrend, y_chrstart, y_chrend;
6061 Chrpos_T overlap;
6062 double fraction;
6063
6064 x_chrend = ((Pair_T) x->first)->genomepos;
6065 x_chrstart = ((Pair_T) List_last_value(x))->genomepos;
6066 assert(x_chrstart <= x_chrend); /* Equal if there is only one pair in the list */
6067
6068 y_chrend = ((Pair_T) y->first)->genomepos;
6069 y_chrstart = ((Pair_T) List_last_value(y))->genomepos;
6070 assert(y_chrstart <= y_chrend); /* Equal if there is only one pair in the list */
6071
6072 if (y_chrstart > x_chrend || x_chrstart > y_chrend) {
6073 debug13a(printf("x %u..%u, y %u..%u => no overlap\n",x_chrstart,x_chrend,y_chrstart,y_chrend));
6074 /*
6075 /-- x --/ /-- y --/ or /-- y --/ /-- x --/
6076 */
6077 return false; /* No overlap */
6078
6079 } else if (y_chrstart < x_chrstart) {
6080 debug13a(printf("x %u..%u, y %u..%u",x_chrstart,x_chrend,y_chrstart,y_chrend));
6081 if (y_chrend < x_chrend) {
6082 /*
6083 /-- x --/
6084 /-- y --/
6085 */
6086 overlap = y_chrend - x_chrstart;
6087 if (y_chrend - y_chrstart < x_chrend - x_chrstart) {
6088 fraction = (double) overlap/(double) (y_chrend - y_chrstart);
6089 } else {
6090 fraction = (double) overlap/(double) (x_chrend - x_chrstart);
6091 }
6092 debug13a(printf(" => fraction %f",fraction));
6093 if (fraction > 0.5) {
6094 debug13a(printf(" => overlap\n",fraction));
6095 return true;
6096 } else {
6097 debug13a(printf(" => no overlap\n",fraction));
6098 return false;
6099 }
6100
6101 } else {
6102 /*
6103 /-- x --/
6104 /----- y -----/
6105 */
6106 debug13a(printf(" => subsumption\n"));
6107 return true;
6108 }
6109
6110 } else {
6111 debug13a(printf("x %u..%u, y %u..%u\n",x_chrstart,x_chrend,y_chrstart,y_chrend));
6112 if (y_chrend < x_chrend) {
6113 /*
6114 /----- x -----/
6115 /-- y --/
6116 */
6117 debug13a(printf(" => subsumption\n"));
6118 return true;
6119
6120 } else {
6121 /*
6122 /-- x --/
6123 /-- y --/
6124 */
6125 overlap = x_chrend - y_chrstart;
6126 if (y_chrend - y_chrstart < x_chrend - x_chrstart) {
6127 fraction = (double) overlap/(double) (y_chrend - y_chrstart);
6128 } else {
6129 fraction = (double) overlap/(double) (x_chrend - x_chrstart);
6130 }
6131 debug13a(printf(" => fraction %f",fraction));
6132 if (fraction > 0.5) {
6133 debug13a(printf(" => overlap\n",fraction));
6134 return true;
6135 } else {
6136 debug13a(printf(" => no overlap\n",fraction));
6137 return false;
6138 }
6139
6140 }
6141 }
6142 }
6143
6144
6145 /* Modified from gregion_overlap_p */
6146 static bool
stage2pairs_overlap_p(List_T x,List_T y)6147 stage2pairs_overlap_p (List_T x, List_T y) {
6148 Chrpos_T x_chrstart, x_chrend, y_chrstart, y_chrend;
6149 Chrpos_T overlap;
6150 double fraction;
6151
6152 x_chrstart = ((Pair_T) x->first)->genomepos;
6153 x_chrend = ((Pair_T) List_last_value(x))->genomepos;
6154 assert(x_chrstart <= x_chrend); /* Equal if there is only one pair in the list */
6155
6156 y_chrstart = ((Pair_T) y->first)->genomepos;
6157 y_chrend = ((Pair_T) List_last_value(y))->genomepos;
6158 assert(y_chrstart <= y_chrend); /* Equal if there is only one pair in the list */
6159
6160 if (y_chrstart > x_chrend || x_chrstart > y_chrend) {
6161 debug13a(printf("x %u..%u, y %u..%u => no overlap\n",x_chrstart,x_chrend,y_chrstart,y_chrend));
6162 /*
6163 /-- x --/ /-- y --/ or /-- y --/ /-- x --/
6164 */
6165 return false; /* No overlap */
6166
6167 } else if (y_chrstart < x_chrstart) {
6168 debug13a(printf("x %u..%u, y %u..%u",x_chrstart,x_chrend,y_chrstart,y_chrend));
6169 if (y_chrend < x_chrend) {
6170 /*
6171 /-- x --/
6172 /-- y --/
6173 */
6174 overlap = y_chrend - x_chrstart;
6175 if (y_chrend - y_chrstart < x_chrend - x_chrstart) {
6176 fraction = (double) overlap/(double) (y_chrend - y_chrstart);
6177 } else {
6178 fraction = (double) overlap/(double) (x_chrend - x_chrstart);
6179 }
6180 debug13a(printf(" => fraction %f",fraction));
6181 if (fraction > 0.5) {
6182 debug13a(printf(" => overlap\n",fraction));
6183 return true;
6184 } else {
6185 debug13a(printf(" => no overlap\n",fraction));
6186 return false;
6187 }
6188
6189 } else {
6190 /*
6191 /-- x --/
6192 /----- y -----/
6193 */
6194 debug13a(printf(" => subsumption\n"));
6195 return true;
6196 }
6197
6198 } else {
6199 debug13a(printf("x %u..%u, y %u..%u\n",x_chrstart,x_chrend,y_chrstart,y_chrend));
6200 if (y_chrend < x_chrend) {
6201 /*
6202 /----- x -----/
6203 /-- y --/
6204 */
6205 debug13a(printf(" => subsumption\n"));
6206 return true;
6207
6208 } else {
6209 /*
6210 /-- x --/
6211 /-- y --/
6212 */
6213 overlap = x_chrend - y_chrstart;
6214 if (y_chrend - y_chrstart < x_chrend - x_chrstart) {
6215 fraction = (double) overlap/(double) (y_chrend - y_chrstart);
6216 } else {
6217 fraction = (double) overlap/(double) (x_chrend - x_chrstart);
6218 }
6219 debug13a(printf(" => fraction %f",fraction));
6220 if (fraction > 0.5) {
6221 debug13a(printf(" => overlap\n",fraction));
6222 return true;
6223 } else {
6224 debug13a(printf(" => no overlap\n",fraction));
6225 return false;
6226 }
6227
6228 }
6229 }
6230 }
6231
6232
6233
6234 static List_T
Stage2_filter_unique(List_T all_stage2results)6235 Stage2_filter_unique (List_T all_stage2results) {
6236 List_T unique = NULL;
6237 Stage2_T *array, stage2, xx, yy;
6238 int n, i, j;
6239 bool *eliminate;
6240 #ifdef DEBUG
6241 List_T p, q;
6242 #endif
6243
6244 n = List_length(all_stage2results);
6245 debug13(printf("Entering Stage2_filter_unique with %d results\n",n));
6246
6247 if (n == 0) {
6248 return NULL;
6249 }
6250
6251 #ifdef DEBUG13
6252 for (p = all_stage2results; p != NULL; p = List_next(p)) {
6253 stage2 = (Stage2_T) List_head(p);
6254 stage2pairs = stage2->middle;
6255 printf("Stage 2 list at chrstart %u, chrend %u)\n",
6256 ((Pair_T) stage2pairs->first)->genomepos,
6257 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6258 }
6259 #endif
6260
6261 eliminate = (bool *) CALLOC(n,sizeof(bool));
6262 array = (Stage2_T *) List_to_array(all_stage2results,NULL);
6263 List_free(&all_stage2results);
6264 qsort(array,n,sizeof(Stage2_T),stage2_cmp);
6265
6266 #ifdef DEBUG13
6267 for (i = 0; i < n; i++) {
6268 stage2 = array[i];
6269 stage2pairs = stage2->middle;
6270 printf("%d: Stage 2 list at chrstart %u, chrend %u)\n",
6271 i,((Pair_T) stage2pairs->first)->genomepos,
6272 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6273 }
6274 #endif
6275
6276
6277 for (i = 0; i < n; i++) {
6278 xx = array[i];
6279 for (j = i+1; j < n; j++) {
6280 yy = array[j];
6281 if (stage2pairs_overlap_p(xx->middle,yy->middle) == true) {
6282 #if 0
6283 printf("Found overlap between these regions:\n");
6284 printf(" ");
6285 printf("chrstart %u, chrend %u",
6286 ((Pair_T) xx->middle->first)->genomepos,
6287 ((Pair_T) List_last_value(xx->middle))->genomepos);
6288 printf(" ");
6289 printf("chrstart %u, chrend %u",
6290 ((Pair_T) yy->middle->first)->genomepos,
6291 ((Pair_T) List_last_value(yy->middle))->genomepos);
6292 printf("\n");
6293 #endif
6294 eliminate[j] = true;
6295 }
6296 }
6297 }
6298
6299 for (i = n-1; i >= 0; i--) {
6300 stage2 = array[i];
6301 if (eliminate[i] == false) {
6302 #if 0
6303 debug13(printf("Keeping chrstart %u, chrend %u",
6304 ((Pair_T) stage2pairs->first)->genomepos,
6305 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6306 #endif
6307 unique = List_push(unique,(void *) stage2);
6308 } else {
6309 #if 0
6310 debug13(printf("Eliminating chrstart %u, chrend %u",
6311 ((Pair_T) stage2pairs->first)->genomepos,
6312 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6313 #endif
6314 Stage2_free(&stage2);
6315 }
6316 }
6317
6318 FREE(eliminate);
6319 FREE(array);
6320
6321 #ifdef DEBUG13
6322 for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
6323 stage2 = (Stage2_T) p->first;
6324 stage2pairs = stage2->middle;
6325 printf("Final: chrstart %u, chrend %u\n",
6326 ((Pair_T) stage2pairs->first)->genomepos,
6327 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6328 }
6329 #endif
6330
6331 return unique;
6332 }
6333
6334
6335 static List_T
Stage2pairs_filter_unique_starts(List_T all_results)6336 Stage2pairs_filter_unique_starts (List_T all_results) {
6337 List_T unique = NULL;
6338 List_T *array, stage2pairs, x, y;
6339 int n, i, j;
6340 bool *eliminate, eliminatep = false;
6341 #ifdef DEBUG
6342 List_T p, q;
6343 #endif
6344
6345 n = List_length(all_results);
6346 debug13(printf("Entering Stage2_filter_unique_starts with %d results\n",n));
6347
6348 if (n == 0) {
6349 return NULL;
6350 }
6351
6352 #ifdef DEBUG13
6353 for (p = all_results; p != NULL; p = List_next(p)) {
6354 stage2pairs = (List_T) List_head(p);
6355 printf("Stage 2 list at chrstart %u, chrend %u)\n",
6356 ((Pair_T) stage2pairs->first)->genomepos,
6357 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6358 }
6359 #endif
6360
6361 eliminate = (bool *) CALLOC(n,sizeof(bool));
6362 array = (List_T *) List_to_array(all_results,NULL);
6363 List_free(&all_results);
6364 qsort(array,n,sizeof(List_T),stage2pairs_start_cmp);
6365
6366 #ifdef DEBUG13
6367 for (i = 0; i < n; i++) {
6368 stage2pairs = array[i];
6369 printf("%d: Stage 2 list at chrstart %u, chrend %u)\n",
6370 i,((Pair_T) stage2pairs->first)->genomepos,
6371 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6372 }
6373 #endif
6374
6375
6376 for (i = 0; i < n; i++) {
6377 x = array[i];
6378 for (j = i+1; j < n; j++) {
6379 y = array[j];
6380 if (stage2path_overlap_p(x,y) == true) {
6381 #if 0
6382 printf("Found overlap between these regions:\n");
6383 printf(" ");
6384 printf("chrstart %u, chrend %u",
6385 ((Pair_T) x->first)->genomepos,
6386 ((Pair_T) List_last_value(x))->genomepos);
6387 printf(" ");
6388 printf("chrstart %u, chrend %u",
6389 ((Pair_T) y->first)->genomepos,
6390 ((Pair_T) List_last_value(y))->genomepos);
6391 printf("\n");
6392 #endif
6393 eliminate[j] = true;
6394 eliminatep = true;
6395 }
6396 }
6397 }
6398
6399 if (eliminatep == false) {
6400 /* All are identical, so take the first one only */
6401 unique = List_push(unique,(void *) array[0]);
6402 } else {
6403 for (i = n-1; i >= 0; i--) {
6404 stage2pairs = array[i];
6405 if (eliminate[i] == false) {
6406 #if 0
6407 debug13(printf("Keeping chrstart %u, chrend %u",
6408 ((Pair_T) stage2pairs->first)->genomepos,
6409 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6410 #endif
6411 unique = List_push(unique,(void *) stage2pairs);
6412 } else {
6413 #if 0
6414 debug13(printf("Eliminating chrstart %u, chrend %u",
6415 ((Pair_T) stage2pairs->first)->genomepos,
6416 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6417 #endif
6418 /* List_free(&stage2pairs); */
6419 }
6420 }
6421 }
6422
6423 FREE(eliminate);
6424 FREE(array);
6425
6426 #ifdef DEBUG13
6427 for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
6428 stage2pairs = (List_T) p->first;
6429 printf("Final: chrstart %u, chrend %u\n",
6430 ((Pair_T) stage2pairs->first)->genomepos,
6431 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6432 }
6433 #endif
6434
6435 return unique;
6436 }
6437
6438
6439 static List_T
Stage2pairs_filter_unique_ends(List_T all_results)6440 Stage2pairs_filter_unique_ends (List_T all_results) {
6441 List_T unique = NULL;
6442 List_T *array, stage2pairs, x, y;
6443 int n, i, j;
6444 bool *eliminate, eliminatep = false;
6445 #ifdef DEBUG
6446 List_T p, q;
6447 #endif
6448
6449 n = List_length(all_results);
6450 debug13(printf("Entering Stage2_filter_unique_ends with %d results\n",n));
6451
6452 if (n == 0) {
6453 return NULL;
6454 }
6455
6456 #ifdef DEBUG13
6457 for (p = all_results; p != NULL; p = List_next(p)) {
6458 stage2pairs = (List_T) List_head(p);
6459 printf("Stage 2 list at chrstart %u, chrend %u)\n",
6460 ((Pair_T) stage2pairs->first)->genomepos,
6461 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6462 }
6463 #endif
6464
6465 eliminate = (bool *) CALLOC(n,sizeof(bool));
6466 array = (List_T *) List_to_array(all_results,NULL);
6467 List_free(&all_results);
6468 qsort(array,n,sizeof(List_T),stage2pairs_end_cmp);
6469
6470 #ifdef DEBUG13
6471 for (i = 0; i < n; i++) {
6472 stage2pairs = array[i];
6473 printf("%d: Stage 2 list at chrstart %u, chrend %u)\n",
6474 i,((Pair_T) stage2pairs->first)->genomepos,
6475 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6476 }
6477 #endif
6478
6479
6480 for (i = 0; i < n; i++) {
6481 x = array[i];
6482 for (j = i+1; j < n; j++) {
6483 y = array[j];
6484 if (stage2pairs_overlap_p(x,y) == true) {
6485 #if 0
6486 printf("Found overlap between these regions:\n");
6487 printf(" ");
6488 printf("chrstart %u, chrend %u",
6489 ((Pair_T) x->first)->genomepos,
6490 ((Pair_T) List_last_value(x))->genomepos);
6491 printf(" ");
6492 printf("chrstart %u, chrend %u",
6493 ((Pair_T) y->first)->genomepos,
6494 ((Pair_T) List_last_value(y))->genomepos);
6495 printf("\n");
6496 #endif
6497 eliminate[j] = true;
6498 eliminatep = true;
6499 }
6500 }
6501 }
6502
6503 if (eliminatep == false) {
6504 /* All are identical, so take the first one only */
6505 unique = List_push(unique,(void *) array[0]);
6506 } else {
6507 for (i = n-1; i >= 0; i--) {
6508 stage2pairs = array[i];
6509 if (eliminate[i] == false) {
6510 #if 0
6511 debug13(printf("Keeping chrstart %u, chrend %u",
6512 ((Pair_T) stage2pairs->first)->genomepos,
6513 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6514 #endif
6515 unique = List_push(unique,(void *) stage2pairs);
6516 } else {
6517 #if 0
6518 debug13(printf("Eliminating chrstart %u, chrend %u",
6519 ((Pair_T) stage2pairs->first)->genomepos,
6520 ((Pair_T) List_last_value(stage2pairs))->genomepos));
6521 #endif
6522 /* List_free(&stage2pairs); */
6523 }
6524 }
6525 }
6526
6527 FREE(eliminate);
6528 FREE(array);
6529
6530 #ifdef DEBUG13
6531 for (p = unique, i = 0; p != NULL; p = p->rest, i++) {
6532 stage2pairs = (List_T) p->first;
6533 printf("Final: chrstart %u, chrend %u\n",
6534 ((Pair_T) stage2pairs->first)->genomepos,
6535 ((Pair_T) List_last_value(stage2pairs))->genomepos);
6536 }
6537 #endif
6538
6539 return unique;
6540 }
6541
6542
6543
6544
6545
6546 List_T
Stage2_compute(char * queryseq_ptr,char * queryuc_ptr,int querylength,int query_offset,Chrpos_T chrstart,Chrpos_T chrend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int genestrand,Stage2_alloc_T stage2_alloc,double proceed_pctcoverage,Oligoindex_array_T oligoindices,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,bool localp,bool skip_repetitive_p,bool favor_right_p,int max_nalignments,bool debug_graphic_p,Stopwatch_T stopwatch,bool diag_debug)6547 Stage2_compute (char *queryseq_ptr, char *queryuc_ptr, int querylength, int query_offset,
6548 Chrpos_T chrstart, Chrpos_T chrend,
6549 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp, int genestrand,
6550 #ifndef GSNAP
6551 Stage2_alloc_T stage2_alloc, double proceed_pctcoverage,
6552 #endif
6553 Oligoindex_array_T oligoindices,
6554 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
6555 bool localp, bool skip_repetitive_p,
6556 bool favor_right_p, int max_nalignments, bool debug_graphic_p,
6557 Stopwatch_T stopwatch, bool diag_debug) {
6558 List_T all_stage2results = NULL, all_paths, all_ends, all_starts, path, pairs, p;
6559 List_T middle;
6560 /* Pair_T firstpair, lastpair; */
6561 int diag_querystart, diag_queryend;
6562 int indexsize, indexsize_nt;
6563 Oligoindex_T oligoindex;
6564 Chrpos_T **mappings;
6565 bool *coveredp, oned_matrix_p;
6566 int source;
6567 int *npositions, totalpositions;
6568 Chrpos_T *minactive, *maxactive;
6569 int *firstactive, *nactive;
6570 int maxnconsecutive;
6571 /* double diag_runtime; */
6572 List_T diagonals;
6573 /* int anchor_querypos, querystart, queryend; */
6574 /* Chrpos_T anchor_position; */
6575 #ifdef GSNAP
6576 Univcoord_T mappingstart, mappingend;
6577 Chrpos_T chrpos, mappinglength;
6578 #else
6579 double pct_coverage;
6580 int ncovered;
6581 #endif
6582
6583
6584 #ifndef USE_DIAGPOOL
6585 List_T p;
6586 Diag_T diag;
6587 #endif
6588 #ifdef DEBUG
6589 int nunique;
6590 #endif
6591 #ifdef DEBUG0
6592 int i;
6593 #endif
6594
6595 #ifdef EXTRACT_GENOMICSEG
6596 Count_T *counts;
6597 #endif
6598
6599 debug(printf("Entered Stage2_compute with chrstart %u and chrend %u\n",chrstart,chrend));
6600
6601 Stopwatch_start(stopwatch);
6602
6603 if (debug_graphic_p == true) {
6604 /* printf("par(mfrow=c(1,2),cex=0.2)\n"); */
6605 printf("par(cex=0.3)\n");
6606 printf("layout(matrix(c(1,2),1,2),widths=c(0.5,0.5),heights=c(1))\n");
6607 }
6608
6609 #ifdef GSNAP
6610 coveredp = (bool *) CALLOCA(querylength,sizeof(bool));
6611 mappings = (Chrpos_T **) MALLOCA(querylength * sizeof(Chrpos_T *));
6612 npositions = (int *) CALLOCA(querylength,sizeof(int));
6613 minactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
6614 maxactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
6615 firstactive = (int *) MALLOCA(querylength * sizeof(int));
6616 nactive = (int *) MALLOCA(querylength * sizeof(int));
6617 #else
6618 if (querylength > stage2_alloc->max_querylength_alloc) {
6619 coveredp = (bool *) CALLOC(querylength,sizeof(bool));
6620 mappings = (Chrpos_T **) MALLOC(querylength * sizeof(Chrpos_T *));
6621 npositions = (int *) CALLOC(querylength,sizeof(int));
6622 minactive = (unsigned int *) MALLOC(querylength * sizeof(unsigned int));
6623 maxactive = (unsigned int *) MALLOC(querylength * sizeof(unsigned int));
6624 firstactive = (int *) MALLOC(querylength * sizeof(int));
6625 nactive = (int *) MALLOC(querylength * sizeof(int));
6626 } else {
6627 coveredp = stage2_alloc->coveredp;
6628 mappings = stage2_alloc->mappings;
6629 npositions = stage2_alloc->npositions;
6630 minactive = stage2_alloc->minactive;
6631 maxactive = stage2_alloc->maxactive;
6632 firstactive = stage2_alloc->firstactive;
6633 nactive = stage2_alloc->nactive;
6634
6635 memset(coveredp,0,querylength * sizeof(bool));
6636 memset(npositions,0,querylength * sizeof(int));
6637 }
6638 #endif
6639
6640 totalpositions = 0;
6641 maxnconsecutive = 0;
6642
6643 source = 0;
6644 #ifdef USE_DIAGPOOL
6645 Diagpool_reset(diagpool);
6646 #endif
6647 Cellpool_reset(cellpool);
6648 diagonals = (List_T) NULL;
6649
6650
6651 #ifdef GSNAP
6652 mappingstart = chroffset + chrstart;
6653 if (plusp == true) {
6654 mappingend = chroffset + chrend;
6655 chrpos = chrstart;
6656 } else {
6657 mappingend = chroffset + chrend + 1;
6658 chrpos = (chrhigh - chroffset) - chrend;
6659 }
6660 mappinglength = (Chrpos_T) (mappingend - mappingstart);
6661
6662 if (mappinglength > 100000) {
6663 /* 9-mers */
6664 source = 0;
6665 } else if (mappinglength > 10000) {
6666 /* 8-mers */
6667 source = 1;
6668 } else {
6669 /* 7-mers */
6670 source = 2;
6671 }
6672
6673 oligoindex = Oligoindex_array_elt(oligoindices,source);
6674 indexsize = Oligoindex_indexsize(oligoindex); /* Different sources can have different indexsizes */
6675 /* printf("indexsize = %d\n",indexsize); */
6676
6677
6678 Oligoindex_hr_tally(oligoindex,mappingstart,mappingend,plusp,
6679 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
6680 chrpos,genestrand);
6681
6682 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
6683 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
6684 /*querystart*/0,/*queryend*/querylength,querylength,
6685 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
6686 #else
6687 /* GMAP */
6688 pct_coverage = 0.0;
6689 while (source < Oligoindex_array_length(oligoindices) && pct_coverage < SUFF_PCTCOVERAGE_OLIGOINDEX) {
6690 oligoindex = Oligoindex_array_elt(oligoindices,source);
6691 indexsize = Oligoindex_indexsize(oligoindex); /* Different sources can have different indexsizes */
6692
6693 #ifdef PMAP
6694 if (plusp == true) {
6695 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
6696 /*mappingend*/chroffset+chrend,/*plusp*/true,
6697 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
6698 /*chrpos*/chrstart);
6699 } else {
6700 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
6701 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
6702 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
6703 /*chrpos*/(chrhigh-chroffset)-chrend);
6704 }
6705 #else
6706
6707 #if 0
6708 /* Previously used this for user_genomicseg, but now creating genome_blocks on the fly */
6709 Oligoindex_hr_tally(oligoindex,genomicuc_ptr,/*genomiclength*/chrend-chrstart,queryuc_ptr,querylength,
6710 /*sequencepos*/0);
6711 #endif
6712
6713 #ifdef EXTRACT_GENOMICSEG
6714 /* printf("indexsize = %d\n",indexsize); */
6715 /* printf("Query: %.*s\n",querylength,queryuc_ptr); */
6716 /* printf("Genome: %s\n",genomicuc_ptr); */
6717 Oligoindex_hr_tally(oligoindex,genomicuc_ptr,/*genomiclength*/mappingend-mappingstart,
6718 queryuc_ptr,querylength,sequencepos);
6719 counts = Oligoindex_counts_copy(oligoindex);
6720
6721 /* printf("plusp %d\n",plusp); */
6722 /* printf("genomicstart %u, genomicend %u, genomiclength %d\n",genomicstart,genomicend,genomiclength); */
6723 /* printf("mappingstart %u, mappingend %u\n",mappingstart,mappingend); */
6724 #endif
6725
6726 if (plusp == true) {
6727 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
6728 /*mappingend*/chroffset+chrend,/*plusp*/true,
6729 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
6730 /*chrpos*/chrstart,genestrand);
6731 } else {
6732 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
6733 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
6734 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
6735 /*chrpos*/(chrhigh-chroffset)-chrend,genestrand);
6736 }
6737
6738 #ifdef EXTRACT_GENOMICSEG
6739 assert(Oligoindex_counts_equal(oligoindex,counts));
6740 /* Oligoindex_counts_dump(oligoindex,counts); */
6741
6742 FREE(counts);
6743 #endif
6744
6745 #endif
6746
6747 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
6748 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
6749 /*querystart*/0,/*queryend*/querylength,querylength,
6750 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
6751 pct_coverage = Diag_update_coverage(coveredp,&ncovered,diagonals,querylength);
6752 debug(printf("Stage2_compute: source = %d, ndiagonals = %d, ncovered = %d, pct_coverage = %f\n",
6753 source,List_length(diagonals),ncovered,pct_coverage));
6754
6755 source++;
6756 }
6757
6758 #endif
6759
6760 /* *stage2_source = source; */
6761 /* *stage2_indexsize = indexsize; */
6762 #ifdef PMAP
6763 indexsize_nt = 3*indexsize;
6764 #else
6765 indexsize_nt = indexsize;
6766 #endif
6767
6768 /* diag_runtime = */ Stopwatch_stop(stopwatch);
6769
6770 Stopwatch_start(stopwatch);
6771
6772 if (diag_debug == true) {
6773 /* Do nothing */
6774 middle = (List_T) NULL;
6775
6776 } else if (totalpositions == 0) {
6777 debug(printf("Quitting because totalpositions is zero\n"));
6778 middle = (List_T) NULL;
6779
6780 #ifndef GSNAP
6781 } else if (querylength > 150 && pct_coverage < proceed_pctcoverage && ncovered < SUFF_NCOVERED) {
6782 /* Filter only on long queries */
6783 debug(printf("Quitting because querylength %d > 150, and pct_coverage is only %f < %f, and ncovered is only %d < %d, maxnconsecutive = %d\n",
6784 querylength,pct_coverage,proceed_pctcoverage,ncovered,SUFF_NCOVERED,maxnconsecutive));
6785 middle = (List_T) NULL;
6786 #endif
6787
6788 } else {
6789 debug(printf("Proceeding because maxnconsecutive is %d and pct_coverage is %f > %f or ncovered = %d > %d\n",
6790 maxnconsecutive,pct_coverage,proceed_pctcoverage,ncovered,SUFF_NCOVERED));
6791
6792 debug(printf("Performing diag on genomiclength %u\n",chrend-chrstart));
6793 Diag_compute_bounds(&diag_querystart,&diag_queryend,minactive,maxactive,diagonals,querylength,
6794 debug_graphic_p,chrstart,chrend,chroffset,chrhigh,plusp);
6795
6796 debug(
6797 nunique = Diag_compute_bounds(&diag_querystart,&diag_queryend,minactive,maxactive,diagonals,querylength,
6798 debug_graphic_p,chrstart,chrend,chroffset,chrhigh,plusp);
6799 fprintf(stderr,"%d diagonals (%d not dominated), maxnconsecutive = %d\n",
6800 List_length(diagonals),nunique,maxnconsecutive);
6801 );
6802
6803 if (debug_graphic_p == true) {
6804 active_bounds_dump_R(minactive,maxactive,querylength);
6805 printf("lines(querypos,minactive,col=\"blue\")\n");
6806 printf("lines(querypos,maxactive,col=\"blue\")\n");
6807 }
6808
6809 all_paths = align_compute_lookback(mappings,npositions,totalpositions,
6810 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
6811 queryseq_ptr,queryuc_ptr,querylength,
6812 /*querystart*/diag_querystart,/*queryend*/diag_queryend,
6813 chroffset,chrhigh,plusp,indexsize,pairpool,
6814 localp,skip_repetitive_p,use_canonical_middle_p,NON_CANONICAL_PENALTY_MIDDLE,
6815 favor_right_p,/*middlep*/true,max_nalignments,debug_graphic_p);
6816 for (p = all_paths; p != NULL; p = List_next(p)) {
6817 path = (List_T) p->first;
6818 #ifdef MOVE_TO_STAGE3
6819 firstpair = path->first;
6820 #endif
6821 pairs = List_reverse(path);
6822 #ifdef MOVE_TO_STAGE3
6823 lastpair = pairs->first;
6824 #endif
6825
6826 debug5(printf("Converting middle\n"));
6827 if (snps_p == true) {
6828 middle = convert_to_nucleotides_snps(pairs,
6829 #ifndef PMAP
6830 queryseq_ptr,queryuc_ptr,
6831 #endif
6832 chroffset,chrhigh,/*watsonp*/plusp,
6833 query_offset,pairpool,indexsize_nt,
6834 /*include_gapholders_p*/true);
6835 } else {
6836 middle = convert_to_nucleotides(pairs,
6837 #ifndef PMAP
6838 queryseq_ptr,queryuc_ptr,
6839 #endif
6840 chroffset,chrhigh,/*watsonp*/plusp,
6841 query_offset,pairpool,indexsize_nt,
6842 /*include_gapholders_p*/true);
6843 }
6844
6845
6846 all_ends = (List_T) NULL;
6847
6848 #ifdef MOVE_TO_STAGE3
6849 #ifdef PMAP
6850 anchor_querypos = lastpair->querypos/3;
6851 /* anchor_position = lastpair->genomepos - 2; */
6852 #else
6853 anchor_querypos = lastpair->querypos;
6854 /* anchor_position = lastpair->genomepos; */
6855 #endif
6856 querystart = anchor_querypos + 1;
6857 queryend = querylength - 1;
6858 debug0(printf("For end, anchor querypos %d\n",anchor_querypos));
6859
6860 end_paths = align_compute_lookback(mappings,npositions,totalpositions,
6861 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
6862 queryseq_ptr,queryuc_ptr,querylength,querystart,queryend,
6863 chroffset,chrhigh,plusp,
6864 indexsize,pairpool,
6865 /*anchoredp*/true,anchor_querypos,anchor_position,
6866 localp,skip_repetitive_p,use_canonical_ends_p,NON_CANONICAL_PENALTY_ENDS,
6867 favor_right_p,/*middlep*/false,max_nalignments,debug_graphic_p);
6868
6869 /* fprintf(stderr,"%d ends\n",List_length(end_paths)); */
6870 if (List_length(end_paths) == 1) {
6871 pairs = (List_T) List_head(end_paths);
6872 path = List_reverse(pairs);
6873 debug5(printf("Converting single end\n"));
6874 if (snps_p == true) {
6875 pairs = convert_to_nucleotides_snps(path,
6876 #ifndef PMAP
6877 queryseq_ptr,queryuc_ptr,
6878 #endif
6879 chroffset,chrhigh,/*watsonp*/plusp,
6880 query_offset,pairpool,indexsize_nt,
6881 /*include_gapholders_p*/false);
6882 } else {
6883 pairs = convert_to_nucleotides(path,
6884 #ifndef PMAP
6885 queryseq_ptr,queryuc_ptr,
6886 #endif
6887 chroffset,chrhigh,/*watsonp*/plusp,
6888 query_offset,pairpool,indexsize_nt,
6889 /*include_gapholders_p*/false);
6890 }
6891 middle = Pairpool_remove_gapholders(middle);
6892 middle = List_reverse(Pairpool_join_end3(List_reverse(middle),pairs,pairpool,/*copy_end_p*/false));
6893 debug0(printf("ATTACHING SINGLE END TO MIDDLE\n"));
6894 debug0(Pair_dump_list(middle,true));
6895
6896 } else {
6897 debug0(i = 0);
6898 for (q = end_paths; q != NULL; q = List_next(q)) {
6899 pairs = (List_T) List_head(q);
6900 path = List_reverse(pairs);
6901 debug5(printf("Converting one end\n"));
6902 if (snps_p == true) {
6903 pairs = convert_to_nucleotides_snps(path,
6904 #ifndef PMAP
6905 queryseq_ptr,queryuc_ptr,
6906 #endif
6907 chroffset,chrhigh,/*watsonp*/plusp,
6908 query_offset,pairpool,indexsize_nt,
6909 /*include_gapholders_p*/false);
6910 } else {
6911 pairs = convert_to_nucleotides(path,
6912 #ifndef PMAP
6913 queryseq_ptr,queryuc_ptr,
6914 #endif
6915 chroffset,chrhigh,/*watsonp*/plusp,
6916 query_offset,pairpool,indexsize_nt,
6917 /*include_gapholders_p*/false);
6918 }
6919 debug0(printf("END %d/%d\n",i++,List_length(end_paths)));
6920 debug0(Pair_dump_list(pairs,true));
6921 all_ends = List_push(all_ends,(void *) pairs);
6922 }
6923 }
6924 List_free(&end_paths);
6925 #endif
6926
6927
6928 all_starts = (List_T) NULL;
6929
6930 #ifdef MOVE_TO_STAGE3
6931 #ifdef PMAP
6932 anchor_querypos = firstpair->querypos/3;
6933 anchor_position = firstpair->genomepos;
6934 #else
6935 anchor_querypos = firstpair->querypos;
6936 anchor_position = firstpair->genomepos;
6937 #endif
6938 debug0(printf("For start, anchor querypos %d\n",anchor_querypos));
6939
6940 querystart = 0;
6941 queryend = anchor_querypos - 1;
6942 start_paths = align_compute_lookforward(mappings,npositions,totalpositions,
6943 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
6944 queryseq_ptr,queryuc_ptr,querylength,querystart,queryend,
6945 chroffset,chrhigh,plusp,
6946 indexsize,pairpool,
6947 /*anchoredp*/true,anchor_querypos,anchor_position,
6948 localp,skip_repetitive_p,use_canonical_ends_p,NON_CANONICAL_PENALTY_ENDS,
6949 favor_right_p,/*middlep*/false,max_nalignments,debug_graphic_p);
6950
6951 /* fprintf(stderr,"%d starts\n",List_length(start_paths)); */
6952 if (List_length(start_paths) == 1) {
6953 path = (List_T) List_head(start_paths);
6954 debug5(printf("Converting single start\n"));
6955 if (snps_p == true) {
6956 pairs = convert_to_nucleotides_snps(path,
6957 #ifndef PMAP
6958 queryseq_ptr,queryuc_ptr,
6959 #endif
6960 chroffset,chrhigh,/*watsonp*/plusp,
6961 query_offset,pairpool,indexsize_nt,
6962 /*include_gapholders_p*/false);
6963 } else {
6964 pairs = convert_to_nucleotides(path,
6965 #ifndef PMAP
6966 queryseq_ptr,queryuc_ptr,
6967 #endif
6968 chroffset,chrhigh,/*watsonp*/plusp,
6969 query_offset,pairpool,indexsize_nt,
6970 /*include_gapholders_p*/false);
6971 }
6972 path = List_reverse(pairs);
6973 middle = Pairpool_remove_gapholders(middle);
6974 middle = Pairpool_join_end5(middle,path,pairpool,/*copy_end_p*/false);
6975 debug0(printf("ATTACHING SINGLE START TO MIDDLE\n"));
6976 debug0(Pair_dump_list(middle,true));
6977
6978 } else {
6979 debug0(i = 0);
6980 for (q = start_paths; q != NULL; q = List_next(q)) {
6981 path = (List_T) List_head(q);
6982 debug5(printf("Converting one start\n"));
6983 if (snps_p == true) {
6984 pairs = convert_to_nucleotides_snps(path,
6985 #ifndef PMAP
6986 queryseq_ptr,queryuc_ptr,
6987 #endif
6988 chroffset,chrhigh,/*watsonp*/plusp,
6989 query_offset,pairpool,indexsize_nt,
6990 /*include_gapholders_p*/false);
6991 } else {
6992 pairs = convert_to_nucleotides(path,
6993 #ifndef PMAP
6994 queryseq_ptr,queryuc_ptr,
6995 #endif
6996 chroffset,chrhigh,/*watsonp*/plusp,
6997 query_offset,pairpool,indexsize_nt,
6998 /*include_gapholders_p*/false);
6999 }
7000 path = List_reverse(pairs);
7001 debug0(printf("START %d/%d\n",i++,List_length(start_paths)));
7002 debug0(Pair_dump_list(path,true));
7003 all_starts = List_push(all_starts,(void *) path);
7004 }
7005 }
7006 List_free(&start_paths);
7007 #endif
7008
7009 all_stage2results = List_push(all_stage2results,(void *) Stage2_new(middle,all_starts,all_ends));
7010 }
7011
7012 List_free(&all_paths);
7013 }
7014
7015 #ifdef GSNAP
7016 FREEA(nactive);
7017 FREEA(firstactive);
7018 FREEA(maxactive);
7019 FREEA(minactive);
7020 FREEA(npositions);
7021 FREEA(coveredp);
7022 FREEA(mappings); /* Don't need to free contents of mappings */
7023 #else
7024 if (querylength > stage2_alloc->max_querylength_alloc) {
7025 FREE(nactive);
7026 FREE(firstactive);
7027 FREE(maxactive);
7028 FREE(minactive);
7029 FREE(npositions);
7030 FREE(coveredp);
7031 FREE(mappings); /* Don't need to free contents of mappings */
7032 }
7033 #endif
7034
7035 #if 1
7036 for (source = 0; source < Oligoindex_array_length(oligoindices); source++) {
7037 oligoindex = Oligoindex_array_elt(oligoindices,source);
7038 Oligoindex_untally(oligoindex);
7039 }
7040 #endif
7041
7042 Stopwatch_stop(stopwatch);
7043
7044 if (diag_debug == true) {
7045 return diagonals;
7046 } else {
7047
7048 #ifdef USE_DIAGPOOL
7049 /* No need to free diagonals */
7050 #else
7051 for (p = diagonals; p != NULL; p = List_next(p)) {
7052 diag = (Diag_T) List_head(p);
7053 Diag_free(&diag);
7054 }
7055 List_free(&diagonals);
7056 #endif
7057 }
7058
7059 all_stage2results = Stage2_filter_unique(all_stage2results);
7060 debug0(printf("Done with stage2. Returning %d results\n",List_length(all_stage2results)));
7061 return all_stage2results;
7062 }
7063
7064
7065
7066 /* Since this stage2 is called from stage3 with a small segment of the
7067 query, we can use alloca instead of stage2_alloc */
7068 List_T
Stage2_compute_one(char * queryseq_ptr,char * queryuc_ptr,int querylength,int query_offset,Chrpos_T chrstart,Chrpos_T chrend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int genestrand,Oligoindex_array_T oligoindices,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,bool localp,bool skip_repetitive_p,bool favor_right_p,bool debug_graphic_p)7069 Stage2_compute_one (char *queryseq_ptr, char *queryuc_ptr, int querylength, int query_offset,
7070 Chrpos_T chrstart, Chrpos_T chrend,
7071 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp, int genestrand,
7072 Oligoindex_array_T oligoindices,
7073 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
7074 bool localp, bool skip_repetitive_p,
7075 bool favor_right_p, bool debug_graphic_p) {
7076 List_T pairs, all_paths;
7077 List_T middle, path;
7078 int indexsize, indexsize_nt;
7079 Oligoindex_T oligoindex;
7080 Chrpos_T **mappings;
7081 bool *coveredp, oned_matrix_p;
7082 int source;
7083 int *npositions, totalpositions;
7084 Chrpos_T *minactive, *maxactive;
7085 int *firstactive, *nactive;
7086 int ncovered;
7087 double pct_coverage;
7088 int maxnconsecutive;
7089 /* double diag_runtime; */
7090 List_T diagonals;
7091
7092
7093 debug(printf("Entered Stage2_compute_one with chrstart %u and chrend %u\n",chrstart,chrend));
7094
7095 coveredp = (bool *) CALLOCA(querylength,sizeof(bool));
7096 mappings = (Chrpos_T **) MALLOCA(querylength * sizeof(Chrpos_T *));
7097 npositions = (int *) CALLOCA(querylength,sizeof(int));
7098 minactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7099 maxactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7100 firstactive = (int *) MALLOCA(querylength * sizeof(int));
7101 nactive = (int *) MALLOCA(querylength * sizeof(int));
7102
7103 totalpositions = 0;
7104 maxnconsecutive = 0;
7105
7106 source = 0;
7107 pct_coverage = 0.0;
7108 #ifdef USE_DIAGPOOL
7109 Diagpool_reset(diagpool);
7110 #endif
7111 Cellpool_reset(cellpool);
7112 diagonals = (List_T) NULL;
7113 while (source < Oligoindex_array_length(oligoindices) && pct_coverage < SUFF_PCTCOVERAGE_OLIGOINDEX) {
7114 oligoindex = Oligoindex_array_elt(oligoindices,source);
7115 indexsize = Oligoindex_indexsize(oligoindex); /* Different sources can have different indexsizes */
7116
7117 #ifdef PMAP
7118 if (plusp == true) {
7119 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7120 /*mappingend*/chroffset+chrend,/*plusp*/true,
7121 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7122 /*chrpos*/chrstart);
7123 } else {
7124 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7125 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7126 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7127 /*chrpos*/(chrhigh-chroffset)-chrend);
7128 }
7129
7130 #else
7131
7132 if (plusp == true) {
7133 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7134 /*mappingend*/chroffset+chrend,/*plusp*/true,
7135 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7136 /*chrpos*/chrstart,genestrand);
7137 } else {
7138 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7139 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7140 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7141 /*chrpos*/(chrhigh-chroffset)-chrend,genestrand);
7142 }
7143
7144 #endif
7145
7146 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
7147 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
7148 /*querystart*/0,/*queryend*/querylength,querylength,
7149 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
7150 pct_coverage = Diag_update_coverage(coveredp,&ncovered,diagonals,querylength);
7151 debug(printf("Stage2_compute: source = %d, ncovered = %d, pct_coverage = %f\n",source,ncovered,pct_coverage));
7152
7153 source++;
7154 }
7155 /* *stage2_source = source; */
7156 /* *stage2_indexsize = indexsize; */
7157 #ifdef PMAP
7158 indexsize_nt = 3*indexsize;
7159 #else
7160 indexsize_nt = indexsize;
7161 #endif
7162
7163
7164 if (totalpositions == 0) {
7165 debug(printf("Quitting because totalpositions is zero\n"));
7166 middle = (List_T) NULL;
7167
7168 } else {
7169 debug(printf("Proceeding because pct_coverage is %f > %f or ncovered = %d > %d\n",
7170 maxnconsecutive,pct_coverage,ncovered,SUFF_NCOVERED));
7171
7172 debug(printf("Performing diag on genomiclength %u\n",chrend-chrstart));
7173 Diag_max_bounds(minactive,maxactive,querylength,chrstart,chrend,chroffset,chrhigh,plusp);
7174
7175 if ((all_paths = align_compute_lookback(mappings,npositions,totalpositions,
7176 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
7177 queryseq_ptr,queryuc_ptr,querylength,
7178 /*querystart*/0,/*queryend*/querylength-1,
7179 chroffset,chrhigh,plusp,indexsize,pairpool,
7180 localp,skip_repetitive_p,use_canonical_middle_p,NON_CANONICAL_PENALTY_MIDDLE,
7181 favor_right_p,/*middlep*/true,/*max_nalignments*/1,debug_graphic_p)) == NULL) {
7182 middle = (List_T) NULL;
7183 } else if ((path = (List_T) List_head(all_paths)) == NULL) {
7184 middle = (List_T) NULL;
7185 } else if (snps_p == true) {
7186 pairs = List_reverse(path);
7187 middle = convert_to_nucleotides_snps(pairs,
7188 #ifndef PMAP
7189 queryseq_ptr,queryuc_ptr,
7190 #endif
7191 chroffset,chrhigh,/*watsonp*/plusp,
7192 query_offset,pairpool,indexsize_nt,
7193 /*include_gapholders_p*/true);
7194 } else {
7195 pairs = List_reverse(path);
7196 middle = convert_to_nucleotides(pairs,
7197 #ifndef PMAP
7198 queryseq_ptr,queryuc_ptr,
7199 #endif
7200 chroffset,chrhigh,/*watsonp*/plusp,
7201 query_offset,pairpool,indexsize_nt,
7202 /*include_gapholders_p*/true);
7203 }
7204
7205 List_free(&all_paths);
7206 }
7207
7208 FREEA(nactive);
7209 FREEA(firstactive);
7210 FREEA(maxactive);
7211 FREEA(minactive);
7212 FREEA(npositions);
7213 FREEA(coveredp);
7214 FREEA(mappings); /* Don't need to free contents of mappings */
7215
7216 #if 1
7217 for (source = 0; source < Oligoindex_array_length(oligoindices); source++) {
7218 oligoindex = Oligoindex_array_elt(oligoindices,source);
7219 Oligoindex_untally(oligoindex);
7220 }
7221 #endif
7222
7223 #ifdef USE_DIAGPOOL
7224 /* No need to free diagonals */
7225 #else
7226 for (p = diagonals; p != NULL; p = List_next(p)) {
7227 diag = (Diag_T) List_head(p);
7228 Diag_free(&diag);
7229 }
7230 List_free(&diagonals);
7231 #endif
7232
7233 return List_reverse(middle);
7234 }
7235
7236
7237
7238 /* Called by GSNAP for ends of substring alignments */
7239 List_T
Stage2_compute_starts(char * queryseq_ptr,char * queryuc_ptr,int querylength,int query_offset,Chrpos_T chrstart,Chrpos_T chrend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int genestrand,Oligoindex_array_T oligoindices,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,bool localp,bool skip_repetitive_p,bool favor_right_p,int max_nalignments,bool debug_graphic_p)7240 Stage2_compute_starts (char *queryseq_ptr, char *queryuc_ptr, int querylength, int query_offset,
7241 Chrpos_T chrstart, Chrpos_T chrend,
7242 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp, int genestrand,
7243 Oligoindex_array_T oligoindices,
7244 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
7245 bool localp, bool skip_repetitive_p,
7246 bool favor_right_p, int max_nalignments, bool debug_graphic_p) {
7247 List_T all_results;
7248 List_T pairs, all_paths, p;
7249 List_T path;
7250 int indexsize, indexsize_nt;
7251 Oligoindex_T oligoindex;
7252 Chrpos_T **mappings;
7253 bool *coveredp, oned_matrix_p;
7254 int source;
7255 int *npositions, totalpositions;
7256 Chrpos_T *minactive, *maxactive;
7257 int *firstactive, *nactive;
7258 int ncovered;
7259 double pct_coverage;
7260 int maxnconsecutive;
7261 /* double diag_runtime; */
7262 List_T diagonals;
7263
7264
7265 debug(printf("Entered Stage2_compute_starts with chrstart %u and chrend %u\n",chrstart,chrend));
7266
7267 coveredp = (bool *) CALLOCA(querylength,sizeof(bool));
7268 mappings = (Chrpos_T **) MALLOCA(querylength * sizeof(Chrpos_T *));
7269 npositions = (int *) CALLOCA(querylength,sizeof(int));
7270 minactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7271 maxactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7272 firstactive = (int *) MALLOCA(querylength * sizeof(int));
7273 nactive = (int *) MALLOCA(querylength * sizeof(int));
7274
7275 totalpositions = 0;
7276 maxnconsecutive = 0;
7277
7278 source = 0;
7279 pct_coverage = 0.0;
7280 #ifdef USE_DIAGPOOL
7281 Diagpool_reset(diagpool);
7282 #endif
7283 Cellpool_reset(cellpool);
7284 diagonals = (List_T) NULL;
7285 while (source < Oligoindex_array_length(oligoindices) && pct_coverage < SUFF_PCTCOVERAGE_OLIGOINDEX) {
7286 oligoindex = Oligoindex_array_elt(oligoindices,source);
7287 indexsize = Oligoindex_indexsize(oligoindex); /* Different sources can have different indexsizes */
7288
7289 #ifdef PMAP
7290 if (plusp == true) {
7291 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7292 /*mappingend*/chroffset+chrend,/*plusp*/true,
7293 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7294 /*chrpos*/chrstart);
7295 } else {
7296 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7297 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7298 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7299 /*chrpos*/(chrhigh-chroffset)-chrend);
7300 }
7301
7302 #else
7303
7304 if (plusp == true) {
7305 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7306 /*mappingend*/chroffset+chrend,/*plusp*/true,
7307 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7308 /*chrpos*/chrstart,genestrand);
7309 } else {
7310 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7311 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7312 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7313 /*chrpos*/(chrhigh-chroffset)-chrend,genestrand);
7314 }
7315
7316 #endif
7317
7318 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
7319 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
7320 /*querystart*/0,/*queryend*/querylength,querylength,
7321 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
7322 pct_coverage = Diag_update_coverage(coveredp,&ncovered,diagonals,querylength);
7323 debug(printf("Stage2_compute: source = %d, ncovered = %d, pct_coverage = %f\n",source,ncovered,pct_coverage));
7324
7325 source++;
7326 }
7327 /* *stage2_source = source; */
7328 /* *stage2_indexsize = indexsize; */
7329 #ifdef PMAP
7330 indexsize_nt = 3*indexsize;
7331 #else
7332 indexsize_nt = indexsize;
7333 #endif
7334
7335
7336 if (totalpositions == 0) {
7337 debug(printf("Quitting because totalpositions is zero\n"));
7338 all_results = (List_T) NULL;
7339
7340 } else {
7341 debug(printf("Proceeding because maxnconsecutive is %d and pct_coverage is %f or ncovered = %d > %d\n",
7342 maxnconsecutive,pct_coverage,ncovered,SUFF_NCOVERED));
7343
7344 debug(printf("Performing diag on genomiclength %u\n",chrend-chrstart));
7345 Diag_max_bounds(minactive,maxactive,querylength,chrstart,chrend,chroffset,chrhigh,plusp);
7346
7347 if ((all_paths = align_compute_lookforward(mappings,npositions,totalpositions,
7348 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
7349 queryseq_ptr,queryuc_ptr,querylength,
7350 /*querystart*/0,/*queryend*/querylength-1,
7351 chroffset,chrhigh,plusp,indexsize,pairpool,
7352 localp,skip_repetitive_p,use_canonical_middle_p,NON_CANONICAL_PENALTY_MIDDLE,
7353 favor_right_p,/*middlep*/false,max_nalignments,debug_graphic_p)) == NULL) {
7354 all_results = (List_T) NULL;
7355
7356 } else if (snps_p == true) {
7357 all_results = (List_T) NULL;
7358 for (p = all_paths; p != NULL; p = List_next(p)) {
7359 path = List_head(p);
7360 pairs = convert_to_nucleotides_snps(path,
7361 #ifndef PMAP
7362 queryseq_ptr,queryuc_ptr,
7363 #endif
7364 chroffset,chrhigh,/*watsonp*/plusp,
7365 query_offset,pairpool,indexsize_nt,
7366 /*include_gapholders_p*/false);
7367 path = List_reverse(pairs);
7368 debug0(printf("START\n"));
7369 debug0(Pair_dump_list(path,true));
7370 if (path != NULL) {
7371 all_results = List_push(all_results,(void *) path);
7372 }
7373 }
7374
7375 } else {
7376 all_results = (List_T) NULL;
7377 for (p = all_paths; p != NULL; p = List_next(p)) {
7378 path = List_head(p);
7379 pairs = convert_to_nucleotides(path,
7380 #ifndef PMAP
7381 queryseq_ptr,queryuc_ptr,
7382 #endif
7383 chroffset,chrhigh,/*watsonp*/plusp,
7384 query_offset,pairpool,indexsize_nt,
7385 /*include_gapholders_p*/false);
7386 path = List_reverse(pairs);
7387 debug0(printf("START\n"));
7388 debug0(Pair_dump_list(path,true));
7389 if (path != NULL) {
7390 all_results = List_push(all_results,(void *) path);
7391 }
7392 }
7393 }
7394
7395 List_free(&all_paths);
7396 }
7397
7398 FREEA(nactive);
7399 FREEA(firstactive);
7400 FREEA(maxactive);
7401 FREEA(minactive);
7402 FREEA(npositions);
7403 FREEA(coveredp);
7404 FREEA(mappings); /* Don't need to free contents of mappings */
7405
7406 #if 1
7407 for (source = 0; source < Oligoindex_array_length(oligoindices); source++) {
7408 oligoindex = Oligoindex_array_elt(oligoindices,source);
7409 Oligoindex_untally(oligoindex);
7410 }
7411 #endif
7412
7413 #ifdef USE_DIAGPOOL
7414 /* No need to free diagonals */
7415 #else
7416 for (p = diagonals; p != NULL; p = List_next(p)) {
7417 diag = (Diag_T) List_head(p);
7418 Diag_free(&diag);
7419 }
7420 List_free(&diagonals);
7421 #endif
7422
7423 debug0(printf("Before filtering starts, %d\n",List_length(all_results)));
7424 all_results = Stage2pairs_filter_unique_starts(all_results);
7425 debug0(printf("After filtering starts, %d\n",List_length(all_results)));
7426
7427 return all_results;
7428 }
7429
7430
7431 /* Called by GSNAP for ends of substring alignments */
7432 List_T
Stage2_compute_ends(char * queryseq_ptr,char * queryuc_ptr,int querylength,int query_offset,Chrpos_T chrstart,Chrpos_T chrend,Univcoord_T chroffset,Univcoord_T chrhigh,bool plusp,int genestrand,Oligoindex_array_T oligoindices,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,bool localp,bool skip_repetitive_p,bool favor_right_p,int max_nalignments,bool debug_graphic_p)7433 Stage2_compute_ends (char *queryseq_ptr, char *queryuc_ptr, int querylength, int query_offset,
7434 Chrpos_T chrstart, Chrpos_T chrend,
7435 Univcoord_T chroffset, Univcoord_T chrhigh, bool plusp, int genestrand,
7436 Oligoindex_array_T oligoindices,
7437 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
7438 bool localp,
7439 bool skip_repetitive_p,
7440 bool favor_right_p, int max_nalignments, bool debug_graphic_p) {
7441 List_T all_results;
7442 List_T pairs, all_paths, p;
7443 List_T path;
7444 int indexsize, indexsize_nt;
7445 Oligoindex_T oligoindex;
7446 Chrpos_T **mappings;
7447 bool *coveredp, oned_matrix_p;
7448 int source;
7449 int *npositions, totalpositions;
7450 Chrpos_T *minactive, *maxactive;
7451 int *firstactive, *nactive;
7452 int ncovered;
7453 double pct_coverage;
7454 int maxnconsecutive;
7455 /* double diag_runtime; */
7456 List_T diagonals;
7457
7458
7459 debug(printf("Entered Stage2_compute_ends with chrstart %u and chrend %u\n",chrstart,chrend));
7460
7461 coveredp = (bool *) CALLOCA(querylength,sizeof(bool));
7462 mappings = (Chrpos_T **) MALLOCA(querylength * sizeof(Chrpos_T *));
7463 npositions = (int *) CALLOCA(querylength,sizeof(int));
7464 minactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7465 maxactive = (unsigned int *) MALLOCA(querylength * sizeof(unsigned int));
7466 firstactive = (int *) MALLOCA(querylength * sizeof(int));
7467 nactive = (int *) MALLOCA(querylength * sizeof(int));
7468
7469 totalpositions = 0;
7470 maxnconsecutive = 0;
7471
7472 source = 0;
7473 pct_coverage = 0.0;
7474 #ifdef USE_DIAGPOOL
7475 Diagpool_reset(diagpool);
7476 #endif
7477 Cellpool_reset(cellpool);
7478 diagonals = (List_T) NULL;
7479 while (source < Oligoindex_array_length(oligoindices) && pct_coverage < SUFF_PCTCOVERAGE_OLIGOINDEX) {
7480 oligoindex = Oligoindex_array_elt(oligoindices,source);
7481 indexsize = Oligoindex_indexsize(oligoindex); /* Different sources can have different indexsizes */
7482
7483 #ifdef PMAP
7484 if (plusp == true) {
7485 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7486 /*mappingend*/chroffset+chrend,/*plusp*/true,
7487 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7488 /*chrpos*/chrstart);
7489 } else {
7490 Oligoindex_pmap_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7491 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7492 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7493 /*chrpos*/(chrhigh-chroffset)-chrend);
7494 }
7495
7496 #else
7497
7498 if (plusp == true) {
7499 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7500 /*mappingend*/chroffset+chrend,/*plusp*/true,
7501 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7502 /*chrpos*/chrstart,genestrand);
7503 } else {
7504 Oligoindex_hr_tally(oligoindex,/*mappingstart*/chroffset+chrstart,
7505 /*mappingend*/chroffset+chrend+1,/*plusp*/false,
7506 queryuc_ptr,/*querystart*/0,/*queryend*/querylength,
7507 /*chrpos*/(chrhigh-chroffset)-chrend,genestrand);
7508 }
7509
7510 #endif
7511
7512 diagonals = Oligoindex_get_mappings(diagonals,coveredp,mappings,npositions,&totalpositions,
7513 &oned_matrix_p,&maxnconsecutive,oligoindices,oligoindex,queryuc_ptr,
7514 /*querystart*/0,/*queryend*/querylength,querylength,
7515 chrstart,chrend,chroffset,chrhigh,plusp,diagpool);
7516 pct_coverage = Diag_update_coverage(coveredp,&ncovered,diagonals,querylength);
7517 debug(printf("Stage2_compute: source = %d, ncovered = %d, pct_coverage = %f\n",source,ncovered,pct_coverage));
7518
7519 source++;
7520 }
7521 /* *stage2_source = source; */
7522 /* *stage2_indexsize = indexsize; */
7523 #ifdef PMAP
7524 indexsize_nt = 3*indexsize;
7525 #else
7526 indexsize_nt = indexsize;
7527 #endif
7528
7529
7530 if (totalpositions <= 0) {
7531 debug(printf("Quitting because totalpositions is zero\n"));
7532 all_results = (List_T) NULL;
7533
7534 } else {
7535 debug(printf("Proceeding because maxnconsecutive is %d and pct_coverage is %f or ncovered = %d > %d\n",
7536 maxnconsecutive,pct_coverage,ncovered,SUFF_NCOVERED));
7537
7538 debug(printf("Performing diag on genomiclength %u\n",chrend-chrstart));
7539 Diag_max_bounds(minactive,maxactive,querylength,chrstart,chrend,chroffset,chrhigh,plusp);
7540
7541 if ((all_paths = align_compute_lookback(mappings,npositions,totalpositions,
7542 oned_matrix_p,minactive,maxactive,firstactive,nactive,cellpool,
7543 queryseq_ptr,queryuc_ptr,querylength,
7544 /*querystart*/0,/*queryend*/querylength-1,
7545 chroffset,chrhigh,plusp,indexsize,pairpool,
7546 localp,skip_repetitive_p,use_canonical_middle_p,NON_CANONICAL_PENALTY_MIDDLE,
7547 favor_right_p,/*middlep*/false,max_nalignments,debug_graphic_p)) == NULL) {
7548 all_results = (List_T) NULL;
7549
7550 } else if (snps_p == true) {
7551 all_results = (List_T) NULL;
7552 for (p = all_paths; p != NULL; p = List_next(p)) {
7553 pairs = List_head(p);
7554 path = List_reverse(pairs);
7555 pairs = convert_to_nucleotides_snps(path,
7556 #ifndef PMAP
7557 queryseq_ptr,queryuc_ptr,
7558 #endif
7559 chroffset,chrhigh,/*watsonp*/plusp,
7560 query_offset,pairpool,indexsize_nt,
7561 /*include_gapholders_p*/false);
7562 debug0(printf("END\n"));
7563 debug0(Pair_dump_list(pairs,true));
7564 if (pairs != NULL) {
7565 all_results = List_push(all_results,(void *) pairs);
7566 }
7567 }
7568
7569 } else {
7570 all_results = (List_T) NULL;
7571 for (p = all_paths; p != NULL; p = List_next(p)) {
7572 pairs = List_head(p);
7573 path = List_reverse(pairs);
7574 pairs = convert_to_nucleotides(path,
7575 #ifndef PMAP
7576 queryseq_ptr,queryuc_ptr,
7577 #endif
7578 chroffset,chrhigh,/*watsonp*/plusp,
7579 query_offset,pairpool,indexsize_nt,
7580 /*include_gapholders_p*/false);
7581 debug0(printf("END\n"));
7582 debug0(Pair_dump_list(pairs,true));
7583 if (pairs != NULL) {
7584 all_results = List_push(all_results,(void *) pairs);
7585 }
7586 }
7587 }
7588
7589 List_free(&all_paths);
7590 }
7591
7592 FREEA(nactive);
7593 FREEA(firstactive);
7594 FREEA(maxactive);
7595 FREEA(minactive);
7596 FREEA(npositions);
7597 FREEA(coveredp);
7598 FREEA(mappings); /* Don't need to free contents of mappings */
7599
7600 #if 1
7601 for (source = 0; source < Oligoindex_array_length(oligoindices); source++) {
7602 oligoindex = Oligoindex_array_elt(oligoindices,source);
7603 Oligoindex_untally(oligoindex);
7604 }
7605 #endif
7606
7607 #ifdef USE_DIAGPOOL
7608 /* No need to free diagonals */
7609 #else
7610 for (p = diagonals; p != NULL; p = List_next(p)) {
7611 diag = (Diag_T) List_head(p);
7612 Diag_free(&diag);
7613 }
7614 List_free(&diagonals);
7615 #endif
7616
7617 debug0(printf("Before filtering ends, %d\n",List_length(all_results)));
7618 all_results = Stage2pairs_filter_unique_ends(all_results);
7619 debug0(printf("After filtering ends, %d\n",List_length(all_results)));
7620
7621 return all_results;
7622 }
7623
7624