1 static char rcsid[] = "$Id: stage3.c 222884 2020-06-18 17:11:56Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "stage3.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>		/* For memcpy */
10 #include <math.h>		/* For pow() */
11 
12 #include "assert.h"
13 #include "mem.h"
14 #include "univcoord.h"
15 
16 #include "comp.h"
17 #include "pair.h"
18 #include "pairdef.h"
19 #include "comp.h"
20 #include "chrnum.h"
21 #include "genomicpos.h"
22 #include "smooth.h"
23 #include "scores.h"
24 #include "intron.h"
25 #include "pbinom.h"
26 #include "changepoint.h"
27 #ifndef GSNAP
28 #include "translation.h"
29 #endif
30 #ifdef PMAP
31 #include "backtranslation.h"
32 #endif
33 #include "complement.h"
34 #include "iit-read.h"
35 #include "stage2.h"
36 #include "dynprog.h"		/* For Dynprog_consistent_p */
37 #include "dynprog_single.h"
38 #include "dynprog_genome.h"
39 #include "dynprog_cdna.h"
40 #include "dynprog_end.h"
41 #include "boyer-moore.h"
42 #include "maxent.h"
43 #include "maxent_hr.h"
44 #include "fastlog.h"
45 #include "doublelist.h"
46 
47 
/* Integer scalings implemented as right shifts by 11 and 10 bits,
   which for non-negative x are the same as dividing by 2048 and 1024.
   The argument is parenthesized so that an expression built from
   lower-precedence operators (e.g., a | b) is shifted as a whole
   rather than having only its last operand shifted. */
#define goodness_intronlen(x) ((x) >> 11)
#define goodness_nonintronlen(x) ((x) >> 10)
51 
52 
53 #if 0
54 /* These iterations are very costly to compute */
55 #define MAXITER_CYCLES 5
56 #define MAXITER_SMOOTH_BY_SIZE 2
57 #define MAXITER_INTRONS 2
58 #define MAXITER_KNOWNSPLICE 1	/* Effectively turns off iteration */
59 
60 #else
61 #define MAXITER_CYCLES 1
62 #define MAXITER_SMOOTH_BY_SIZE 1
63 #define MAXITER_INTRONS 1
64 #define MAXITER_KNOWNSPLICE 1	/* Effectively turns off iteration */
65 #endif
66 
67 
68 #define INTRON_PENALTY_INCONSISTENT 16
69 #define NONCANONICAL_PENALTY 12
70 
#define MININTRONLEN 9		/* Determines when Dynprog_genome_gap gets called vs Dynprog_single_gap */
#define SINGLESLEN MININTRONLEN	/* Must equal MININTRONLEN, so it is defined in terms of it */
73 #define MININTRONLEN_FINAL 50	/* Determines when to perform final
74 				   pass to find canonical introns */
75 #define SUFF_MATCHES_KEEP 300
76 
77 /* Old parameters for extrapeel */
78 /* #define SUFFCONSECUTIVE 5 */
79 /* #define MAXINCURSION 5 */
80 
81 
82 #define MINCOVERAGE 0.10  /* Not used anymore */
83 
84 #define DYNPROGINDEX_MAJOR -1
85 #define DYNPROGINDEX_MINOR +1
86 
87 #define DUAL_BREAK_PROB_THRESHOLD 0.90
88 
89 /* If too small, e.g., 3, misses introns with a nearby mismatch.  If too large, e.g., 24, misses small exons */
90 #define MIN_STAGE2_FOR_DUALBREAK 6
91 
92 #define MIN_MICROEXON_LENGTH 3
93 #define MAX_MICROEXON_LOOPS 10
94 
95 #define THETA_SLACK 0.10
96 #define TRIM_END_PVALUE 1e-4
97 
98 #define NEARBY_INDEL 6
99 #define INDEL_SPLICE_ENDLENGTH 12
100 #define NONCANONICAL_ACCEPT 15
101 #define NONCANONICAL_PERFECT_MATCHES 12
102 
103 #define MAXPEELBACK_SCORE 5	/* For determining goodness of intron */
104 #define MAXPEELBACK_END 1000
105 
106 #define DUALBREAK_QUERYJUMP_FACTOR 10
107 
108 #define SCORE_SIGDIFF 5
109 #define PROB_SIGDIFF 0.5
110 
111 #define END_SPLICESITE_SEARCH 10
112 #define END_SPLICESITE_PROB_MATCH 0.90
113 #define END_SPLICESITE_PROB_MISMATCH 0.95
114 
115 #define MICROEXON_PROB_MATCH 0.50
116 #define MICROEXON_PROB_MISMATCH 0.80
117 
118 #define END_MIN_EXONLENGTH 12
119 #define END_SUFFICIENT_EXONLENGTH 40 /* Defines length (in bp) beyond which we can ignore maxintronlen_ends */
120 #define END_SUFFICIENT_EXONLENGTH_PCT 0.10 /* Defines length (in terms of percentage of query) beyond which we can ignore maxintronlen_ends */
121 
122 #if 0
123 /* No longer used.  Not sure why it was used before */
124 #define END_SPLICESITE_EXON_LENGTH 100  /* If shorter than this, then don't look for end splice site */
125 #endif
126 
127 
128 /* For Stage3_bad_stretch_p */
129 #define LOG_99 -0.01005033585
130 #define LOG_01 -4.605170186
131 
132 #define LOG_9999 -0.000100005
133 #define LOG_90 -0.1053605
134 #define LOG_75 -0.2876821
135 #define LOG_25 -1.386294
136 #define LOG_10 -2.302585
137 #define LOG_0001 -9.21034
138 
139 #if 0
140 /* Switches on 5 consecutive mismatches */
141 #define LOG_99_9999 -0.01015034085
142 #define LOG_99_0001 -9.220390708
143 #define LOG_25_0001 -10.59663473
144 #define LOG_25_9999 -1.386394366
145 #define LOG_01_9999 -4.605270191
146 #define LOG_01_0001 -13.81551056
147 #define LOG_75_0001 -9.498022444
148 #define LOG_75_9999 -0.2877820775
149 #endif
150 
151 #if 1
152 #define LOG_99_999 -0.01105083619
153 #define LOG_99_001 -6.917805615
154 #define LOG_25_001 -8.29404964
155 #define LOG_25_999 -1.387294861
156 #define LOG_01_999 -4.606170686
157 #define LOG_01_001 -11.51292546
158 #define LOG_75_001 -7.195437351
159 #define LOG_75_999 -0.2886825728
160 #endif
161 
162 #if 0
163 /* Switches on 4 consecutive mismatches */
164 #define LOG_99_99 -0.02010067171
165 #define LOG_99_01 -4.615220522
166 #define LOG_25_01 -5.991464547
167 #define LOG_25_99 -1.396344697
168 #define LOG_01_99 -4.615220522
169 #define LOG_01_01 -9.210340372
170 #define LOG_75_01 -4.892852258
171 #define LOG_75_99 -0.2977324083
172 #endif
173 
174 
175 /* #define EXTRACT_GENOMICSEG 1 */
176 
177 
178 static const Except_T gapcheck_error = {"Gap check failed"};
179 static const Except_T coordinate_error = {"Coordinate error"};
180 
181 /* #define SHORTCUT 1 */		/* Skips re-solving introns if already canonical */
182 #define EXCESS_GAPHOLDERS 1
183 #define MAXITER 100		/* For peelback */
184 
185 /* In debug mode, probably want to activate debug in pairpool.c and
186    dynprog.c also */
187 #ifdef DEBUG
188 #define debug(x) x
189 #else
190 #define debug(x)
191 #endif
192 
193 
194 #ifdef DEBUG0
195 #define debug0(x) x
196 #else
197 #define debug0(x)
198 #endif
199 
200 /* Pair dump */
201 #ifdef DEBUG1
202 #define debug1(x) x
203 #else
204 #define debug1(x)
205 #endif
206 
207 /* trim ends */
208 #ifdef DEBUG3
209 #define debug3(x) x
210 #else
211 #define debug3(x)
212 #endif
213 
214 /* Fix adjacent indels */
215 #ifdef DEBUG4
216 #define debug4(x) x
217 #else
218 #define debug4(x)
219 #endif
220 
221 /* HMM */
222 #ifdef DEBUG5
223 #define debug5(x) x
224 #else
225 #define debug5(x)
226 #endif
227 
228 /* assign_gap_types and fill_in_gaps */
229 #ifdef DEBUG7
230 #define debug7(x) x
231 #else
232 #define debug7(x)
233 #endif
234 
235 /* stage3debug */
236 #ifdef DEBUG8
237 #define debug8(x) x
238 #else
239 #define debug8(x)
240 #endif
241 
242 /* bad_stretch_p */
243 #ifdef DEBUG9
244 #define debug9(x) x
245 #else
246 #define debug9(x)
247 #endif
248 
249 /* chimera */
250 #ifdef DEBUG10
251 #define debug10(x) x
252 #else
253 #define debug10(x)
254 #endif
255 
256 /* pick_cdna_direction */
257 #ifdef DEBUG11
258 #define debug11(x) x
259 #else
260 #define debug11(x)
261 #endif
262 
263 /* splicesitepos */
264 #ifdef DEBUG12
265 #define debug12(x) x
266 #else
267 #define debug12(x)
268 #endif
269 
270 /* trimming at novel splice sites at ends */
271 #ifdef DEBUG13
272 #define debug13(x) x
273 #else
274 #define debug13(x)
275 #endif
276 
277 /* build_dual_breaks */
278 #ifdef DEBUG14
279 #define debug14(x) x
280 #else
281 #define debug14(x)
282 #endif
283 
284 /* changepoint */
285 #ifdef DEBUG18
286 #define debug18(x) x
287 #else
288 #define debug18(x)
289 #endif
290 
291 /* mergeable */
292 #ifdef DEBUG20
293 #define debug20(x) x
294 #else
295 #define debug20(x)
296 #endif
297 
298 /* end_compare */
299 #ifdef DEBUG21
300 #define debug21(x) x
301 #else
302 #define debug21(x)
303 #endif
304 
305 #ifdef DEBUG99
306 #define debug99(x) x
307 #else
308 #define debug99(x)
309 #endif
310 
311 
312 
313 
314 static bool splicingp;
315 static bool novelsplicingp;
316 static bool require_splicedir_p;
317 static Chrpos_T overall_end_distance_linear;
318 static Chrpos_T overall_end_distance_circular;
319 
320 static IIT_T splicesites_iit;
321 static int *splicesites_divint_crosstable;
322 
323 static int donor_typeint;
324 static int acceptor_typeint;
325 
326 static Univcoord_T *splicesites;
327 
328 static bool *circularp;
329 static bool *altlocp;
330 static Univcoord_T *alias_starts;
331 static Univcoord_T *alias_ends;
332 
333 static int min_intronlength;
334 static int max_deletionlength;
335 static int min_indel_end_matches;
336 
337 static int maxpeelback_distalmedial;
338 static int nullgap;
339 static int extramaterial_end;
340 static int extramaterial_paired;
341 static int extraband_single;
342 static int extraband_end;
343 static int extraband_paired;
344 static int ngap;
345 static int maxintronlen;	/* for middle */
346 static int maxintronlen_ends;
347 static int minendexon;
348 
349 static bool maximize_coverage_p = false;
350 static Stage3debug_T stage3debug;
351 static Univcoord_T genome_totallength;
352 
353 static bool homopolymerp;
354 
355 static int gff3_fasta_annotation_type;
356 
357 
358 void
Stage3_setup(bool splicingp_in,bool novelsplicingp_in,bool require_splicedir_p_in,Chrpos_T shortsplicedist_novelend,IIT_T splicesites_iit_in,int * splicesites_divint_crosstable_in,int donor_typeint_in,int acceptor_typeint_in,Univcoord_T * splicesites_in,bool * circularp_in,bool * altlocp_in,Univcoord_T * alias_starts_in,Univcoord_T * alias_ends_in,int min_intronlength_in,int max_deletionlength_in,int min_indel_end_matches_in,int maxpeelback_distalmedial_in,int nullgap_in,int extramaterial_end_in,int extramaterial_paired_in,int extraband_single_in,int extraband_end_in,int extraband_paired_in,int ngap_in,int maxintronlen_in,int maxintronlen_ends_in,int minendexon_in,bool homopolymerp_in,GFF3_fasta_annotation_T gff3_fasta_annotation_type_in,Stage3debug_T stage3debug_in,Univcoord_T genome_totallength_in)359 Stage3_setup (bool splicingp_in, bool novelsplicingp_in, bool require_splicedir_p_in,
360 	      Chrpos_T shortsplicedist_novelend,
361 	      IIT_T splicesites_iit_in, int *splicesites_divint_crosstable_in,
362 	      int donor_typeint_in, int acceptor_typeint_in,
363 	      Univcoord_T *splicesites_in, bool *circularp_in, bool *altlocp_in,
364 	      Univcoord_T *alias_starts_in, Univcoord_T *alias_ends_in,
365 	      int min_intronlength_in, int max_deletionlength_in, int min_indel_end_matches_in,
366 	      int maxpeelback_distalmedial_in, int nullgap_in,
367 	      int extramaterial_end_in, int extramaterial_paired_in,
368 	      int extraband_single_in, int extraband_end_in, int extraband_paired_in,
369 	      int ngap_in, int maxintronlen_in, int maxintronlen_ends_in, int minendexon_in,
370 	      bool homopolymerp_in, GFF3_fasta_annotation_T gff3_fasta_annotation_type_in,
371 	      Stage3debug_T stage3debug_in, Univcoord_T genome_totallength_in) {
372   splicingp = splicingp_in;
373   novelsplicingp = novelsplicingp_in;
374   require_splicedir_p = require_splicedir_p_in;
375   if (shortsplicedist_novelend > (Chrpos_T) max_deletionlength_in) {
376     overall_end_distance_linear = shortsplicedist_novelend;
377   } else {
378     overall_end_distance_linear = max_deletionlength_in;
379   }
380   overall_end_distance_circular = max_deletionlength_in;
381 
382   splicesites_iit = splicesites_iit_in;
383   splicesites_divint_crosstable = splicesites_divint_crosstable_in;
384   donor_typeint = donor_typeint_in;
385   acceptor_typeint = acceptor_typeint_in;
386 
387   splicesites = splicesites_in;
388 
389   circularp = circularp_in;
390   altlocp = altlocp_in;
391   alias_starts = alias_starts_in;
392   alias_ends = alias_ends_in;
393 
394   min_intronlength = min_intronlength_in;
395   max_deletionlength = max_deletionlength_in;
396   min_indel_end_matches = min_indel_end_matches_in;
397 
398   maxpeelback_distalmedial = maxpeelback_distalmedial_in;
399   nullgap = nullgap_in;
400   extramaterial_end = extramaterial_end_in;
401   extramaterial_paired = extramaterial_paired_in;
402   extraband_single = extraband_single_in;
403   extraband_end = extraband_end_in;
404   extraband_paired = extraband_paired_in;
405   ngap = ngap_in;
406   maxintronlen = maxintronlen_in;
407   maxintronlen_ends = maxintronlen_ends_in;
408   minendexon = minendexon_in;
409 
410   homopolymerp = homopolymerp_in;
411   gff3_fasta_annotation_type = gff3_fasta_annotation_type_in;
412 
413   stage3debug = stage3debug_in;
414   genome_totallength = genome_totallength_in;
415 
416   return;
417 }
418 
419 
420 /************************************************************************
421  *   Stage 3 merges cDNA-genomic pairs from stage 2 (called "path")
422  *   and from dynamic programming into a single list.  In this
423  *   process, stage 3 may also have to pop a pair off the path, insert
424  *   the dynamic programming results (called "gappairs") and then push
425  *   the stored pair onto the list.  The relevant pointers are
426  *   leftquerypos and leftgenomepos, which refer to the left (stored) pair;
427  *   rightquerypos and rightgenomepos, which refer to the right (top) pair on
428  *   the list (which may represent what the list should have for the
429  *   purposes of dynamic programming); and querydp5, genomedp5,
430  *   querydp3, and genomedp3, which refer to the dynamic programming
431  *   indices, inclusive.
432  *
433  *   path has the end of the query sequence as its car.
434  *   pairs has the beginning of the query sequence as its car.
435  *
436  *   Most procedures take the top of path and put it onto pairs:
437  *
438  *	 <- <- path  =====>  pairs -> ->
439  *	   leftpair	     rightpair
440  *
441  *   For stage 1 and stage 2, the poly-A/T tails, if any, were stripped
442  *   off, but for stage 3, we try to extend these tails if possible.
443  *   Therefore, we use the full length and full sequence here, and add
444  *   the offset to the path from stage 2.
445  ************************************************************************/
446 
447 
448 struct Stage3middle_T {
449   int goodness;
450   double defect_rate_fwd;
451   double defect_rate_rev;
452   List_T pairs_fwd;
453   List_T pairs_rev;
454 
455   Chrnum_T chrnum;
456   Univcoord_T chroffset;
457   Univcoord_T chrhigh;
458   Chrpos_T chrlength;
459   bool watsonp;
460   int genestrand;
461 
462   List_T all_stage2_starts;
463   List_T all_stage2_ends;
464 };
465 
466 Chrnum_T
Stage3middle_chrnum(Stage3middle_T this)467 Stage3middle_chrnum (Stage3middle_T this) {
468   return this->chrnum;
469 }
470 
471 Univcoord_T
Stage3middle_chroffset(Stage3middle_T this)472 Stage3middle_chroffset (Stage3middle_T this) {
473   return this->chroffset;
474 }
475 
476 Univcoord_T
Stage3middle_chrhigh(Stage3middle_T this)477 Stage3middle_chrhigh (Stage3middle_T this) {
478   return this->chrhigh;
479 }
480 
481 Chrpos_T
Stage3middle_chrlength(Stage3middle_T this)482 Stage3middle_chrlength (Stage3middle_T this) {
483   return this->chrlength;
484 }
485 
486 
487 bool
Stage3middle_watsonp(Stage3middle_T this)488 Stage3middle_watsonp (Stage3middle_T this) {
489   return this->watsonp;
490 }
491 
492 int
Stage3middle_genestrand(Stage3middle_T this)493 Stage3middle_genestrand (Stage3middle_T this) {
494   return this->genestrand;
495 }
496 
497 int
Stage3middle_goodness(Stage3middle_T this)498 Stage3middle_goodness (Stage3middle_T this) {
499   return this->goodness;
500 }
501 
502 static Stage3middle_T
Stage3middle_new(int goodness,double defect_rate_fwd,double defect_rate_rev,List_T pairs_fwd,List_T pairs_rev,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool watsonp,int genestrand,List_T all_stage2_starts,List_T all_stage2_ends)503 Stage3middle_new (int goodness, double defect_rate_fwd, double defect_rate_rev,
504 		  List_T pairs_fwd, List_T pairs_rev,
505 		  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
506 		  Chrpos_T chrlength, bool watsonp, int genestrand,
507 		  List_T all_stage2_starts, List_T all_stage2_ends) {
508   Stage3middle_T new = (Stage3middle_T) MALLOC(sizeof(*new));
509 
510 
511   new->goodness = goodness;
512   new->defect_rate_fwd = defect_rate_fwd;
513   new->defect_rate_rev = defect_rate_rev;
514   new->pairs_fwd = pairs_fwd;
515   new->pairs_rev = pairs_rev;
516 
517   new->chrnum = chrnum;
518   new->chroffset = chroffset;
519   new->chrhigh = chrhigh;
520   new->chrlength = chrlength;
521   new->watsonp = watsonp;
522   new->genestrand = genestrand;
523 
524   new->all_stage2_starts = all_stage2_starts;
525   new->all_stage2_ends = all_stage2_ends;
526 
527   return new;
528 }
529 
530 void
Stage3middle_free(Stage3middle_T * old)531 Stage3middle_free (Stage3middle_T *old) {
532   List_free(&(*old)->all_stage2_starts);
533   List_free(&(*old)->all_stage2_ends);
534   FREE(*old);
535   return;
536 }
537 
538 int
Stage3middle_cmp(const void * a,const void * b)539 Stage3middle_cmp (const void *a, const void *b) {
540   Stage3middle_T x = * (Stage3middle_T *) a;
541   Stage3middle_T y = * (Stage3middle_T *) b;
542 
543   if (x->goodness > y->goodness) {
544     return -1;
545   } else if (y->goodness > x->goodness) {
546     return +1;
547   } else {
548     return 0;
549   }
550 }
551 
552 
553 #define T Stage3_T
554 struct T {
555   struct Pair_T *pairarray;	/* The array version of pairs_fwd or pairs_rev, with the gaps substituted */
556   bool pairarray_freeable_p;
557   bool chimera_left_p;		/* Part of a chimera on its querystart end */
558   bool chimera_right_p;		/* Part of a chimera on its queryend end */
559   int npairs;
560 
561   List_T pairs;			/* Winning set of pairs */
562 
563   int straintype;
564   char *strain;
565 
566   Chrnum_T chrnum;
567   Univcoord_T chroffset;	/* Start of chromosome chrnum on genome */
568   Univcoord_T chrhigh;	        /* End of chromosome chrnum on genome */
569   Chrpos_T chrlength;
570   int circularpos;
571 
572   Chrpos_T chrstart; /* Position on chromosome of start of genomicseg */
573   Univcoord_T genomicstart;	/* Start of alignment */
574   Univcoord_T genomicend;	/* End of alignment */
575   int cdna_direction;
576   int sensedir;
577   bool watsonp;
578   int genestrand;
579 
580   double trimmed_coverage;
581   int matches;
582   int unknowns;
583   int mismatches;
584   int qopens;
585   int qindels;
586   int topens;
587   int tindels;
588   int noncanonical;
589   int goodness;
590   int absmq_score;
591   int mapq_score;
592 
593   int translation_start;
594   int translation_end;
595   int translation_length;
596 
597   int relaastart;
598   int relaaend;
599 
600 #if 0
601   int stage2_source;
602   int stage2_indexsize;
603   double stage2_diag_runtime;
604   double stage2_align_runtime;
605   double stage2_mapfraction;
606   int stage2_maxconsecutive;
607   double stage3_runtime;
608 #endif
609 
610   bool joinable_left_p;
611   bool joinable_right_p;
612 };
613 
614 
615 bool
Stage3_chimera_left_p(T this)616 Stage3_chimera_left_p (T this) {
617   return this->chimera_left_p;
618 }
619 
620 bool
Stage3_chimera_right_p(T this)621 Stage3_chimera_right_p (T this) {
622   return this->chimera_right_p;
623 }
624 
625 bool
Stage3_watsonp(T this)626 Stage3_watsonp (T this) {
627   return this->watsonp;
628 }
629 
630 int
Stage3_genestrand(T this)631 Stage3_genestrand (T this) {
632   return this->genestrand;
633 }
634 
635 int
Stage3_cdna_direction(T this)636 Stage3_cdna_direction (T this) {
637   return this->cdna_direction;
638 }
639 
640 int
Stage3_sensedir(T this)641 Stage3_sensedir (T this) {
642   return this->sensedir;
643 }
644 
645 
646 int
Stage3_straintype(T this)647 Stage3_straintype (T this) {
648   return this->straintype;
649 }
650 
651 int
Stage3_goodness(T this)652 Stage3_goodness (T this) {
653   debug(printf("Overall goodness:\n"));
654   debug(printf("  %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels => goodness %d\n",
655 	       this->matches,this->mismatches,this->qopens,this->qindels,this->topens,this->tindels,this->goodness));
656 
657   return this->goodness;
658 }
659 
660 int
Stage3_absmq_score(T this)661 Stage3_absmq_score (T this) {
662   return this->absmq_score;
663 }
664 
665 int
Stage3_mapq_score(T this)666 Stage3_mapq_score (T this) {
667   return this->mapq_score;
668 }
669 
670 List_T
Stage3_pairs(T this)671 Stage3_pairs (T this) {
672   return this->pairs;
673 }
674 
675 struct Pair_T *
Stage3_pairarray(T this)676 Stage3_pairarray (T this) {
677   return this->pairarray;
678 }
679 
680 int
Stage3_npairs(T this)681 Stage3_npairs (T this) {
682   return this->npairs;
683 }
684 
685 int
Stage3_matches(T this)686 Stage3_matches (T this) {
687   return this->matches;
688 }
689 
690 int
Stage3_mismatches(T this)691 Stage3_mismatches (T this) {
692   return this->mismatches;
693 }
694 
695 int
Stage3_indels(T this)696 Stage3_indels (T this) {
697   /* This should be consistent with the output from Pair_print_pathsummary */
698   return this->qindels + this->tindels;
699 }
700 
701 
702 int
Stage3_querystart(T this)703 Stage3_querystart (T this) {
704   return Pair_querypos(&(this->pairarray[0]));
705 }
706 
707 int
Stage3_queryend(T this)708 Stage3_queryend (T this) {
709   return Pair_querypos(&(this->pairarray[this->npairs-1]));
710 }
711 
712 bool
Stage3_joinable_left_p(T this)713 Stage3_joinable_left_p (T this) {
714   return this->joinable_left_p;
715 }
716 
717 bool
Stage3_joinable_right_p(T this)718 Stage3_joinable_right_p (T this) {
719   return this->joinable_right_p;
720 }
721 
722 void
Stage3_clear_joinable(T this)723 Stage3_clear_joinable (T this) {
724   this->joinable_left_p = false;
725   this->joinable_right_p = false;
726   return;
727 }
728 
729 void
Stage3_set_joinable_left(T this)730 Stage3_set_joinable_left (T this) {
731   this->joinable_left_p = true;
732   return;
733 }
734 
735 void
Stage3_set_joinable_right(T this)736 Stage3_set_joinable_right (T this) {
737   this->joinable_right_p = true;
738   return;
739 }
740 
741 
742 void
Stage3_print_ends(T this)743 Stage3_print_ends (T this) {
744   Pair_print_ends(this->pairs);
745   printf(" chimera_left: %d, chimera_right: %d",this->chimera_left_p,this->chimera_right_p);
746   printf(" goodness: %d",this->goodness);
747   printf("\n");
748   return;
749 }
750 
751 
752 Chrnum_T
Stage3_chrnum(T this)753 Stage3_chrnum (T this) {
754   return this->chrnum;
755 }
756 
757 Univcoord_T
Stage3_chroffset(T this)758 Stage3_chroffset (T this) {
759   return this->chroffset;
760 }
761 
762 Univcoord_T
Stage3_chrhigh(T this)763 Stage3_chrhigh (T this) {
764   return this->chrhigh;
765 }
766 
767 Chrpos_T
Stage3_chrlength(T this)768 Stage3_chrlength (T this) {
769   return this->chrlength;
770 }
771 
772 bool
Stage3_altloc_chr(Univcoord_T * alias_start,Univcoord_T * alias_end,T this)773 Stage3_altloc_chr (Univcoord_T *alias_start, Univcoord_T *alias_end, T this) {
774 #if 0
775   *alias_start = alias_starts[this->chrnum];
776   *alias_end = alias_ends[this->chrnum];
777   return altlocp[this->chrnum];
778 #else
779   return false;
780 #endif
781 }
782 
783 
784 Chrpos_T
Stage3_chrstart(T this)785 Stage3_chrstart (T this) {
786   return Pair_genomepos(&(this->pairarray[0]));
787 }
788 
789 Chrpos_T
Stage3_chrend(T this)790 Stage3_chrend (T this) {
791   return Pair_genomepos(&(this->pairarray[this->npairs-1]));
792 }
793 
794 Univcoord_T
Stage3_genomicstart(T this)795 Stage3_genomicstart (T this) {
796   /* Should be chroffset + Pair_genomepos(start) */
797   return this->genomicstart;
798 }
799 
800 Univcoord_T
Stage3_genomicend(T this)801 Stage3_genomicend (T this) {
802   /* Should be chroffset + Pair_genomepos(end) */
803   return this->genomicend;
804 }
805 
806 void
Stage3_set_genomicend(T this,Univcoord_T genomicend)807 Stage3_set_genomicend (T this, Univcoord_T genomicend) {
808   this->genomicend = genomicend;
809   return;
810 }
811 
812 int
Stage3_circularpos(T this)813 Stage3_circularpos (T this) {
814   return this->circularpos;
815 }
816 
817 
818 
819 int
Stage3_translation_start(T this)820 Stage3_translation_start (T this) {
821   return this->translation_start;
822 }
823 
824 int
Stage3_translation_end(T this)825 Stage3_translation_end (T this) {
826   return this->translation_end;
827 }
828 
829 
830 int
Stage3_domain(T this)831 Stage3_domain (T this) {
832   int querystart, queryend;
833 
834   querystart = Pair_querypos(&(this->pairarray[0]));
835   queryend = Pair_querypos(&(this->pairarray[this->npairs-1]));
836 
837   return queryend - querystart + 1;
838 }
839 
840 
841 int
Stage3_largemargin(int * newstart,int * newend,T this,int queryntlength)842 Stage3_largemargin (int *newstart, int *newend, T this, int queryntlength) {
843   int leftmargin, rightmargin;
844   int querystart, queryend;
845 
846   querystart = Pair_querypos(&(this->pairarray[0]));
847   queryend = Pair_querypos(&(this->pairarray[this->npairs-1]));
848 
849   if ((leftmargin = querystart) < 0) {
850     leftmargin = 0;
851   }
852   if ((rightmargin = queryntlength - queryend) < 0) {
853     rightmargin = 0;
854   }
855 
856   /* Return larger margin */
857   *newstart = querystart;
858   *newend = queryend + 1;
859   if (leftmargin > rightmargin) {
860     /* Trim left */
861     return leftmargin;
862   } else {
863     return rightmargin;
864   }
865 }
866 
867 
868 double
Stage3_fracidentity(T this)869 Stage3_fracidentity (T this) {
870   int den;
871 
872   if ((den = this->matches + this->mismatches + this->qindels + this->tindels) == 0) {
873     return 1.0;
874   } else {
875     return (double) this->matches/(double) den;
876   }
877 }
878 
879 Univcoord_T
Stage3_genomicpos(T this,int querypos,bool headp)880 Stage3_genomicpos (T this, int querypos, bool headp) {
881   return this->chroffset + Pair_genomicpos(this->pairarray,this->npairs,querypos,headp);
882 }
883 
884 
885 int
Stage3_chimeric_goodness(int * matches1,int * matches2,T part1,T part2,int breakpoint)886 Stage3_chimeric_goodness (int *matches1, int *matches2, T part1, T part2, int breakpoint) {
887   int goodness1, goodness2, querystart, queryend;
888   int unknowns1, mismatches1, qopens1, qindels1, topens1, tindels1,
889     ncanonical1, nsemicanonical1, nnoncanonical1;
890   int unknowns2, mismatches2, qopens2, qindels2, topens2, tindels2,
891     ncanonical2, nsemicanonical2, nnoncanonical2;
892 
893   querystart = Pair_querypos(&(part1->pairarray[0]));
894   debug10(printf("Chimeric goodness requested for part %d..%d\n",querystart+1,breakpoint));
895   Pair_fracidentity_bounded(&(*matches1),&unknowns1,&mismatches1,&qopens1,&qindels1,&topens1,&tindels1,
896 			    &ncanonical1,&nsemicanonical1,&nnoncanonical1,
897 			    part1->pairarray,part1->npairs,part1->cdna_direction,
898 			    querystart,breakpoint);
899   goodness1 = (*matches1) + MISMATCH*mismatches1 + QOPEN*qopens1 + QINDEL*qindels1 + TOPEN*topens1 + TINDEL*tindels1;
900   debug10(printf("  %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels => %d\n",
901 		*matches1,mismatches1,qopens1,qindels1,topens1,tindels1,goodness1));
902 
903   queryend = Pair_querypos(&(part2->pairarray[part2->npairs-1]));
904   debug10(printf("Chimeric goodness requested for part %d..%d\n",breakpoint+1,queryend+1));
905   Pair_fracidentity_bounded(&(*matches2),&unknowns2,&mismatches2,&qopens2,&qindels2,&topens2,&tindels2,
906 			    &ncanonical2,&nsemicanonical2,&nnoncanonical2,
907 			    part2->pairarray,part2->npairs,part2->cdna_direction,
908 			    breakpoint,queryend);
909   goodness2 = (*matches2) + MISMATCH*mismatches2 + QOPEN*qopens2 + QINDEL*qindels2 + TOPEN*topens2 + TINDEL*tindels2;
910   debug10(printf("  %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels => %d\n",
911 		*matches2,mismatches2,qopens2,qindels2,topens2,tindels2,goodness2));
912 
913   return goodness1 + goodness2;
914 }
915 
916 
917 Chrpos_T
Stage3_genomiclength(T this)918 Stage3_genomiclength (T this) {
919   if (this->genomicstart < this->genomicend) {
920     return this->genomicend - this->genomicstart + 1U;
921   } else {
922     return this->genomicstart - this->genomicend + 1U;
923   }
924 }
925 
926 
927 bool
Stage3_passes_filter(T this,double min_trimmed_coverage,double min_identity)928 Stage3_passes_filter (T this, double min_trimmed_coverage, double min_identity) {
929   int den;
930 
931   if (this->trimmed_coverage < min_trimmed_coverage) {
932     return false;
933   } else if ((den = this->matches + this->mismatches + this->qindels + this->tindels) == 0) {
934     /* fracidentity is 1.0 */
935     return true;
936   } else if ((double) this->matches/(double) den < min_identity) {
937     return false;
938   } else {
939     return true;
940   }
941 }
942 
943 bool
Stage3_passes_filter_chimera(Chimera_T chimera,double min_trimmed_coverage,double min_identity)944 Stage3_passes_filter_chimera (Chimera_T chimera, double min_trimmed_coverage, double min_identity) {
945   int den;
946   Stage3_T from, to;
947 
948   from = Chimera_left_part(chimera);
949   to = Chimera_right_part(chimera);
950 
951   if (from->trimmed_coverage + to->trimmed_coverage < min_trimmed_coverage) {
952     return false;
953   } else if ((den = from->matches + from->mismatches + from->qindels + from->tindels +
954 	      to->matches + to->mismatches + to->qindels + to->tindels) == 0) {
955     /* fracidentity is 1.0 */
956     return true;
957   } else if ((double) (from->matches + to->matches)/(double) den < min_identity) {
958     return false;
959   } else {
960     return true;
961   }
962 }
963 
964 
965 
966 int
Stage3_cmp(const void * a,const void * b)967 Stage3_cmp (const void *a, const void *b) {
968   T x = * (T *) a;
969   T y = * (T *) b;
970   Chrpos_T x_genomiclength, y_genomiclength;
971 
972   if (x->chimera_right_p == true && y->chimera_right_p == false) {
973     return -1;
974   } else if (y->chimera_right_p == true && x->chimera_right_p == false) {
975     return +1;
976   } else if (x->chimera_left_p == true && y->chimera_left_p == false) {
977     return -1;
978   } else if (y->chimera_left_p == true && x->chimera_left_p == false) {
979     return +1;
980   } else if (x->goodness > y->goodness) {
981     return -1;
982   } else if (y->goodness > x->goodness) {
983     return +1;
984 
985     /* If we can achieve same goodness with fewer pairs, then it is a better alignment */
986   } else if (x->npairs < y->npairs) {
987     return -1;
988   } else if (y->npairs < x->npairs) {
989     return +1;
990 
991     /* If we can achieve same goodness with more matches, then it is a better alignment */
992   } else if (x->matches > y->matches) {
993     return -1;
994   } else if (y->matches > x->matches) {
995     return +1;
996 
997   } else if (x->straintype < y->straintype) {
998     return -1;
999   } else if (y->straintype < x->straintype) {
1000     return +1;
1001   } else {
1002     x_genomiclength = Stage3_genomiclength(x);
1003     y_genomiclength = Stage3_genomiclength(y);
1004     if (x_genomiclength < y_genomiclength) {
1005       return -1;
1006     } else if (y_genomiclength < x_genomiclength) {
1007       return +1;
1008     } else if (x->chrnum < y->chrnum) {
1009       return -1;
1010     } else if (y->chrnum < x->chrnum) {
1011       return +1;
1012     } else if (x->genomicstart < y->genomicstart) {
1013       return -1;
1014     } else if (y->genomicstart < x->genomicstart) {
1015       return +1;
1016     } else {
1017       return 0;
1018     }
1019   }
1020 }
1021 
1022 
1023 int
Stage3_position_cmp(const void * a,const void * b)1024 Stage3_position_cmp (const void *a, const void *b) {
1025   T x = * (T *) a;
1026   T y = * (T *) b;
1027   int querypos1, querypos2;
1028 
1029   if (x->genomicstart < y->genomicstart) {
1030     return -1;
1031   } else if (y->genomicstart < x->genomicstart) {
1032     return +1;
1033   } else if (x->genomicend < y->genomicend) {
1034     return -1;
1035   } else if (y->genomicend < x->genomicend) {
1036     return +1;
1037   } else {
1038     querypos1 = Pair_querypos(&(x->pairarray[0]));
1039     querypos2 = Pair_querypos(&(y->pairarray[0]));
1040     if (querypos1 < querypos2) {
1041       return -1;
1042     } else if (querypos2 < querypos1) {
1043       return +1;
1044     } else {
1045       querypos1 = Pair_querypos(&(x->pairarray[x->npairs-1]));
1046       querypos2 = Pair_querypos(&(y->pairarray[y->npairs-1]));
1047       if (querypos1 < querypos2) {
1048 	return -1;
1049       } else if (querypos2 < querypos1) {
1050 	return +1;
1051       } else {
1052 	return 0;
1053       }
1054     }
1055   }
1056 }
1057 
1058 int
Stage3_querystart_cmp(const void * a,const void * b)1059 Stage3_querystart_cmp (const void *a, const void *b) {
1060   T x = * (T *) a;
1061   T y = * (T *) b;
1062   int x_querystart, y_querystart;
1063 
1064   x_querystart = Pair_querypos(&(x->pairarray[0]));
1065   y_querystart = Pair_querypos(&(y->pairarray[0]));
1066 
1067   if (x_querystart < y_querystart) {
1068     return -1;
1069   } else if (y_querystart < x_querystart) {
1070     return +1;
1071   } else {
1072     return 0;
1073   }
1074 }
1075 
1076 int
Stage3_queryend_cmp(const void * a,const void * b)1077 Stage3_queryend_cmp (const void *a, const void *b) {
1078   T x = * (T *) a;
1079   T y = * (T *) b;
1080   int x_queryend, y_queryend;
1081 
1082   x_queryend = Pair_querypos(&(x->pairarray[x->npairs-1]));
1083   y_queryend = Pair_querypos(&(y->pairarray[y->npairs-1]));
1084 
1085   if (x_queryend < y_queryend) {
1086     return -1;
1087   } else if (y_queryend < x_queryend) {
1088     return +1;
1089   } else {
1090     return 0;
1091   }
1092 }
1093 
1094 int
Stage3_chrnum_cmp(const void * a,const void * b)1095 Stage3_chrnum_cmp (const void *a, const void *b) {
1096   T x = * (T *) a;
1097   T y = * (T *) b;
1098 
1099   if (x->chrnum < y->chrnum) {
1100     return -1;
1101   } else if (y->chrnum < x->chrnum) {
1102     return +1;
1103   } else {
1104     return 0;
1105   }
1106 }
1107 
1108 
1109 int
Stage3_chrnum_querystart_cmp(const void * a,const void * b)1110 Stage3_chrnum_querystart_cmp (const void *a, const void *b) {
1111   T x = * (T *) a;
1112   T y = * (T *) b;
1113   int x_querystart, y_querystart, x_length, y_length;
1114 
1115   if (x->chrnum < y->chrnum) {
1116     return -1;
1117   } else if (y->chrnum < x->chrnum) {
1118     return +1;
1119   } else {
1120     x_querystart = Pair_querypos(&(x->pairarray[0]));
1121     y_querystart = Pair_querypos(&(y->pairarray[0]));
1122 
1123     if (x_querystart < y_querystart) {
1124       return -1;
1125     } else if (y_querystart < x_querystart) {
1126       return +1;
1127     } else {
1128       /* Put longer segments at the end so they supersede earlier chimeric matches */
1129       x_length = Pair_querypos(&(x->pairarray[x->npairs-1])) - x_querystart;
1130       y_length = Pair_querypos(&(y->pairarray[y->npairs-1])) - y_querystart;
1131 
1132       if (x_length < y_length) {
1133 	return -1;
1134       } else if (y_length < x_length) {
1135 	return +1;
1136       } else {
1137 	return 0;
1138       }
1139     }
1140   }
1141 }
1142 
1143 int
Stage3_chrnum_queryend_cmp(const void * a,const void * b)1144 Stage3_chrnum_queryend_cmp (const void *a, const void *b) {
1145   T x = * (T *) a;
1146   T y = * (T *) b;
1147   int x_queryend, y_queryend, x_length, y_length;
1148 
1149   if (x->chrnum < y->chrnum) {
1150     return -1;
1151   } else if (y->chrnum < x->chrnum) {
1152     return +1;
1153   } else {
1154     x_queryend = Pair_querypos(&(x->pairarray[x->npairs-1]));
1155     y_queryend = Pair_querypos(&(y->pairarray[y->npairs-1]));
1156 
1157     if (x_queryend < y_queryend) {
1158       return -1;
1159     } else if (y_queryend < x_queryend) {
1160       return +1;
1161     } else {
1162       /* Put longer segments at the end so they supersede earlier chimeric matches */
1163       x_length = x_queryend - Pair_querypos(&(x->pairarray[0]));
1164       y_length = y_queryend - Pair_querypos(&(y->pairarray[0]));
1165 
1166       if (x_length < y_length) {
1167 	return -1;
1168       } else if (y_length < x_length) {
1169 	return +1;
1170       } else {
1171 	return 0;
1172       }
1173     }
1174   }
1175 }
1176 
1177 
1178 int
Stage3_identity_cmp(const void * a,const void * b)1179 Stage3_identity_cmp (const void *a, const void *b) {
1180   T x = * (T *) a;
1181   T y = * (T *) b;
1182 
1183   if (x < y) {
1184     return -1;
1185   } else if (x > y) {
1186     return +1;
1187   } else {
1188     return 0;
1189   }
1190 }
1191 
1192 
1193 bool
Stage3_overlap(T x,T y)1194 Stage3_overlap (T x, T y) {
1195 
1196   if (x->straintype != y->straintype) {
1197     return false;
1198   } else if (x->watsonp != y->watsonp) {
1199     return false;
1200   } else if (x->watsonp) {
1201     if (x->genomicstart >= y->genomicstart && x->genomicstart <= y->genomicend) {
1202       return true;
1203     } else if (y->genomicstart >= x->genomicstart && y->genomicstart <= x->genomicend) {
1204       return true;
1205     } else {
1206       return false;
1207     }
1208   } else {
1209     if (x->genomicstart >= y->genomicend && x->genomicstart <= y->genomicstart) {
1210       return true;
1211     } else if (y->genomicstart >= x->genomicend && y->genomicstart <= x->genomicstart) {
1212       return true;
1213     } else {
1214       return false;
1215     }
1216   }
1217 }
1218 
1219 /************************************************************************
1220  *   Gaps
1221  ************************************************************************/
1222 
1223 /* Note: In going through pairs and path, we have two methods:
1224 
1225    1.  pairptr = path; (to save the pointer)
1226        path = Pairpool_pop(path,&pair);
1227 
1228        pairs = List_push_existing(pairs,pairptr);
1229 
1230    2.  pair = (Pair_T) path->first;
1231 
1232        (refer to path->rest, since we haven't popped path yet)
1233        pairs = List_transfer_one(pairs,&path);  (combines a push and pop)
1234 
1235     In the code below, we sometimes mix these, using method 2 for speed,
1236     and method 1 for clarity.
1237 */
1238 
1239 
#if 0
/* Disabled consistency checker for the gap pairs in an alignment.

   Walks pairs from front to back, moving every element onto path (so
   the result is built by pushing, one element at a time), and verifies:
     - the list neither starts nor ends with a gap pair,
     - no gap pair directly follows another gap pair,
     - each gap pair's queryjump/genomejump agree with the positions of
       its neighbors (a queryjump mismatch is tolerated when the gap
       spans HALFLEN, as a probable skiplength),
     - adjacent non-gap pairs have no positive jump between them; if
       they do, a gapholder is pushed between them.

   Violations are reported and, unless DEBUG is defined, raised as
   gapcheck_error.  NOTE(review): the code assumes Except_raise does not
   return -- confirm.  When DEBUG is defined the walk continues past the
   error, so the code below must stay safe in that case.

   Returns the rebuilt list, or NULL when pairs is NULL. */
static List_T
check_gaps (List_T pairs, Pairpool_T pairpool) {
  List_T path = NULL, pairptr;
  Pair_T pair, leftpair, rightpair;
  int queryjump, genomejump;

  debug(printf("\nBeginning check of gaps\n"));
  debug(printf("length = %d\n",List_length(pairs)));
  debug(Pair_dump_list(pairs,true));

  if (pairs == NULL) {
    return (List_T) NULL;
  }

  pairptr = pairs;
  pairs = Pairpool_pop(pairs,&pair);
  if (pair->gapp == true) {
    fprintf(stderr,"Gap check error: Unexpected gap at start of pairs\n");
    debug(printf("Gap check error: Unexpected gap at start of pairs\n"));
#ifndef DEBUG
    Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
  }

  /* Keep the first element even if it was an unexpected gap.  Otherwise,
     when DEBUG suppresses the raise, path would stay NULL and the
     path->first accesses in the loop below would dereference NULL. */
#ifdef WASTE
  path = Pairpool_push_existing(NULL,pairpool,pair);
#else
  path = List_push_existing(NULL,pairptr);
#endif

  while (pairs != NULL) {
    pairptr = pairs;
    pairs = Pairpool_pop(pairs,&pair);
    if (pair->gapp == true) {
      if (pairs == NULL) {
        /* The gap is the last element.  This must be detected before
           looking at pairs->first, which does not exist. */
        fprintf(stderr,"Gap check error: Unexpected gap at end of pairs\n");
        debug(printf("Gap check error: Unexpected gap at end of pairs\n"));
#ifndef DEBUG
        Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
#ifdef WASTE
        path = Pairpool_push_existing(path,pairpool,pair);
#else
        path = List_push_existing(path,pairptr);
#endif
        break;
      }

      leftpair = path->first;
      rightpair = pairs->first;
      debug(printf("Observed a gap at %d..%d with queryjump = %d, genomejump = %d\n",
                   leftpair->querypos,rightpair->querypos,pair->queryjump,pair->genomejump));

      queryjump = rightpair->querypos - leftpair->querypos - 1;
      genomejump = rightpair->genomepos - leftpair->genomepos - 1;
      /* if (leftpair->cdna == ' ') queryjump++; -- For old dynamic programming */
      /* if (leftpair->genome == ' ') genomejump++; -- For old dynamic programming */

      if (pair->queryjump != queryjump) {
        if (rightpair->querypos >= HALFLEN && leftpair->querypos < HALFLEN) {
          debug(printf("Accept queryjump for gap at %d..%d as probable skiplength.  It's %d, should be %d\n",
                       leftpair->querypos,rightpair->querypos,pair->queryjump,queryjump));
        } else {
          debug(printf("Gap check error: Wrong queryjump for gap at %d..%d.  It's %d, should be %d\n",
                       leftpair->querypos,rightpair->querypos,pair->queryjump,queryjump));
#ifndef DEBUG
          Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
        }
      }
      if (pair->genomejump != genomejump) {
        debug(printf("Gap check error: Wrong genomejump for gap at %d..%d.  It's %d, should be %d\n",
                     leftpair->querypos,rightpair->querypos,pair->genomejump,genomejump));
#ifndef DEBUG
        Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
      }
#ifdef WASTE
      path = Pairpool_push_existing(path,pairpool,pair);
#else
      path = List_push_existing(path,pairptr);
#endif

      /* Process another pair after gap (pairs is known to be non-NULL here) */
      pairptr = pairs;
      pairs = Pairpool_pop(pairs,&pair);
      if (pair->gapp == true) {
        fprintf(stderr,"Gap check error: Unexpected gap after gap\n");
#ifndef DEBUG
        Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
      }
#ifdef WASTE
      path = Pairpool_push_existing(path,pairpool,pair);
#else
      path = List_push_existing(path,pairptr);
#endif

    } else {
      /* Not a gap */
      leftpair = path->first;
      queryjump = pair->querypos - leftpair->querypos - 1;
      genomejump = pair->genomepos - leftpair->genomepos - 1;
      /* if (leftpair->cdna == ' ') queryjump++; -- For old dynamic programming */
      /* if (leftpair->genome == ' ') genomejump++; -- For old dynamic programming */

      if (queryjump <= 0 && genomejump <= 0) {
        /* No positive jump (this also covers queryjump == 0 && genomejump == 0) */
#ifdef WASTE
        path = Pairpool_push_existing(path,pairpool,pair);
#else
        path = List_push_existing(path,pairptr);
#endif
      } else {
        fprintf(stderr,"Gap check error: Unexpected missing gap at %d..%d\n",leftpair->querypos,pair->querypos);
        debug(printf("Gap check error: Unexpected missing gap at %d..%d\n",leftpair->querypos,pair->querypos));
        debug(printf("Gap check error: Pushing a gap at %d..%d because of queryjump = %d, genomejump = %d\n",
                     leftpair->querypos,pair->querypos,queryjump,genomejump));
        /* One place we need accurate queryjump and genomejump */
        path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
                                       /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
#ifdef WASTE
        path = Pairpool_push_existing(path,pairpool,pair);
#else
        path = List_push_existing(path,pairptr);
#endif
#ifndef DEBUG
        Except_raise(&gapcheck_error,__FILE__,__LINE__);
#endif
      }
    }
  }

  debug(printf("Done with check of gaps\n\n"));

  return path;
}
#endif
1377 
1378 
1379 static char complCode[128] = COMPLEMENT_LC;
1380 
1381 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)1382 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
1383 		bool watsonp) {
1384   char c2, c2_alt;
1385   Univcoord_T pos;
1386 
1387   if (watsonp) {
1388     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1389       *g_alt = '*';
1390       return '*';
1391 
1392     } else if (pos >= chrhigh) {
1393       *g_alt = '*';
1394       return '*';
1395 
1396     } else {
1397       debug7(printf("At %u, genomicnt is %c\n",
1398 		    genomicpos,Genome_get_char_blocks(&(*g_alt),pos)));
1399       return Genome_get_char_blocks(&(*g_alt),pos);
1400     }
1401 
1402   } else {
1403     if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1404       *g_alt = '*';
1405       return '*';
1406 
1407     } else if (pos >= chrhigh) {
1408       *g_alt = '*';
1409       return '*';
1410 
1411     } else {
1412       c2 = Genome_get_char_blocks(&c2_alt,pos);
1413     }
1414     debug7(printf("At %u, genomicnt is %c\n",
1415 		  genomicpos,complCode[(int) c2]));
1416     *g_alt = complCode[(int) c2_alt];
1417     return complCode[(int) c2];
1418   }
1419 }
1420 
#if 0
/* Disabled predecessor of get_genomic_nt that could also read from a
   preloaded genomic segment (genomicseg_ptr) when use_genomicseg_p is
   true.  Returns the genomic nucleotide for genomicpos and stores its
   alternate in *g_alt; both are '*' when the position is out of range.

   NOTE(review): this code references genomiclength, which is neither a
   parameter nor a local, so it cannot compile as written if re-enabled.
   The leading assert also requires use_genomicseg_p == false, which
   makes the three genomicseg branches unreachable. */
static char
get_genomic_nt_genomicseg (char *g_alt, Chrpos_T genomicpos,
                           Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
                           char *genomicseg_ptr, Genome_T genome,
                           bool use_genomicseg_p) {
  char g, c2, c2_alt;
  Univcoord_T pos;

  assert(use_genomicseg_p == false);
  /* Need to allow genomicpos < 0 and genomicpos >= genomiclength for iteration on finding chimeras */
  if (use_genomicseg_p == true && genomicpos < 0) {
    *g_alt = '*';
    return '*';

  } else if (use_genomicseg_p == true && genomicpos >= genomiclength) {
    *g_alt = '*';
    return '*';

  } else if (use_genomicseg_p) {
    debug7(printf("At %u, genomicnt is %c\n",genomicpos,genomicseg_ptr[genomicpos]));
    g = *g_alt = genomicseg_ptr[genomicpos];
    return g;

  } else if (watsonp) {
    if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
      *g_alt = '*';
      return '*';

    } else if (pos >= chrhigh) {
      *g_alt = '*';
      return '*';

#if 0
    } else if (genome) {
      debug7(printf("At %u, genomicnt is %c\n",
                    genomicpos,Genome_get_char(genome,pos)));
      return Genome_get_char(genome,pos);
#endif

    } else {
      /* Debug call now uses the same argument list as the real call below */
      debug7(printf("At %u, genomicnt is %c\n",
                    genomicpos,Genome_get_char_blocks(&(*g_alt),pos)));
      return Genome_get_char_blocks(&(*g_alt),pos);
    }

  } else {
    if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
      /* Set *g_alt as well, as the Watson-strand branches do, so the
         caller never reads an unset alternate */
      *g_alt = '*';
      return '*';

    } else if (pos >= chrhigh) {
      *g_alt = '*';
      return '*';

#if 0
    } else if (genome) {
      c2 = Genome_get_char(genome,pos);
#endif

    } else {
      c2 = Genome_get_char_blocks(&c2_alt,pos);
    }
    debug7(printf("At %u, genomicnt is %c\n",
                  genomicpos,complCode[(int) c2]));
    *g_alt = complCode[(int) c2_alt];
    return complCode[(int) c2];
  }
}
#endif
1489 
1490 
#if 0
/* Disabled helper that returns length genomic nucleotides for
   genomicpos.

   When use_genomicseg_p is true, the result points into genomicseg_ptr
   and nothing is allocated.  Otherwise a buffer of length+1 chars is
   obtained with CALLOC and filled from the genome (via
   Genome_fill_buffer_simple when genome is non-NULL, else
   Genome_fill_buffer_blocks); for the non-Watson strand the buffer is
   read starting at chrhigh - genomicpos and then complemented in place.

   NOTE(review): the caller cannot tell from the return value whether
   the result was allocated here or aliases genomicseg_ptr; it has to
   decide from use_genomicseg_p whether to free it. */
static char *
get_genomic_seg (Chrpos_T genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
                 int length, bool watsonp,
                 char *genomicseg_ptr, Genome_T genome,
                 bool use_genomicseg_p) {
  char *segment;

  if (use_genomicseg_p) {
    /* %.*s needs a char *, so pass the address of the first character */
    debug7(printf("At %u, genomicseg is %.*s\n",
                  genomicpos,length,&(genomicseg_ptr[genomicpos])));
    return &(genomicseg_ptr[genomicpos]);

  } else if (watsonp) {
    segment = (char *) CALLOC(length+1,sizeof(char));
    if (genome) {
      Genome_fill_buffer_simple(genome,chroffset + genomicpos,length,segment);
      debug7(printf("At %u, genomicseg is %s\n",genomicpos,segment));
      return segment;
    } else {
      Genome_fill_buffer_blocks(chroffset + genomicpos,length,segment);
      debug7(printf("At %u, genomicseg is %s\n",genomicpos,segment));
      return segment;
    }

  } else {
    /* Allocate the output buffer here too; it was previously written
       through while still uninitialized */
    segment = (char *) CALLOC(length+1,sizeof(char));
    if (genome) {
      Genome_fill_buffer_simple(genome,chrhigh - genomicpos,length,segment);
    } else {
      Genome_fill_buffer_blocks(chrhigh - genomicpos,length,segment);
    }
    make_complement_inplace(segment,length);
    debug7(printf("At %u, genomicnt is %s\n",genomicpos,segment));
    return segment;
  }
}
#endif
1528 
1529 
1530 /* For use by stage3.c procedures */
1531 static List_T
insert_gapholders(List_T pairs,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool,bool finalp)1532 insert_gapholders (List_T pairs, char *queryseq_ptr, char *queryuc_ptr,
1533 		   Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
1534 		   int genestrand, Pairpool_T pairpool, bool finalp) {
1535   List_T path = NULL;
1536   Pair_T pair, leftpair, gappair = NULL;
1537   int queryjump, genomejump, i;
1538   bool firstp = true;
1539   char comp, c, g, g_alt;
1540 
1541   /* Remove all existing gaps */
1542   debug(printf("Beginning deletion/insertion of gaps\n"));
1543 
1544   /* Discard old gap(s) and indels */
1545   while (pairs != NULL) {
1546     /* pairptr = pairs; */
1547     /* pairs = Pairpool_pop(pairs,&pair); */
1548     pair = (Pair_T) pairs->first;
1549     if (pair->knowngapp == true) {
1550       /* Keep known introns */
1551       debug(printf("Keeping a known intron gap with queryjump = %d, genomejump = %d\n",
1552 		   pair->queryjump,pair->genomejump));
1553 #ifdef WASTE
1554       path = Pairpool_push_existing(path,pairpool,pair);
1555 #else
1556       path = List_transfer_one(path,&pairs);
1557 #endif
1558 
1559     } else if (pair->gapp == true) {
1560       debug(printf("Removing a gap with queryjump = %d, genomejump = %d\n",
1561 		   pair->queryjump,pair->genomejump));
1562       pairs = Pairpool_pop(pairs,&pair);
1563 
1564     } else if (pair->cdna == ' ' || pair->genome == ' ') {
1565       /* New */
1566       debug(printf("Removing an indel\n"));
1567       pairs = Pairpool_pop(pairs,&pair);
1568 
1569     } else {
1570 #ifdef WASTE
1571       path = Pairpool_push_existing(path,pairpool,pair);
1572 #else
1573       path = List_transfer_one(path,&pairs);
1574 #endif
1575     }
1576   }
1577 
1578   pairs = List_reverse(path);
1579   path = (List_T) NULL;
1580 
1581   if (pairs != NULL) {
1582     /* pairptr = pairs; */
1583     /* pairs = Pairpool_pop(pairs,&pair); */
1584     pair = (Pair_T) pairs->first;
1585 #ifdef WASTE
1586     path = Pairpool_push_existing(path,pairpool,pair);
1587 #else
1588     path = List_transfer_one(path,&pairs);
1589 #endif
1590     leftpair = pair;
1591   }
1592 
1593   while (pairs != NULL) {
1594     /* pairptr = pairs; */
1595     /* pairs = Pairpool_pop(pairs,&pair); */
1596 
1597     /* queryjump = pair->querypos - leftpair->querypos - 1; */
1598     /* genomejump = pair->genomepos - leftpair->genomepos - 1; */
1599     /* if (leftpair->cdna == ' ') queryjump++; -- For old dynamic programming */
1600     /* if (leftpair->genome == ' ') genomejump++; -- For old dynamic programming */
1601 
1602 #if 1
1603     pair = (Pair_T) pairs->first;
1604     queryjump = pair->querypos - leftpair->querypos - 1;
1605     genomejump = pair->genomepos - leftpair->genomepos - 1;
1606 #else
1607     /* Needed when we did not remove indels */
1608     p = pairs;
1609     while (p != NULL && ((Pair_T) p->first)->cdna == ' ') {
1610       p = p->rest;
1611     }
1612     if (p == NULL) {
1613       queryjump = 0;
1614     } else {
1615       queryjump = ((Pair_T) p->first)->querypos - leftpair->querypos - 1;
1616     }
1617 
1618     p = pairs;
1619     while (p != NULL && ((Pair_T) p->first)->genome == ' ') {
1620       p = p->rest;
1621     }
1622     if (p == NULL) {
1623       genomejump = 0;
1624     } else {
1625       genomejump = ((Pair_T) p->first)->genomepos - leftpair->genomepos - 1;
1626     }
1627 #endif
1628 
1629     pair = (Pair_T) pairs->first;
1630     if (pair->knowngapp == true) {
1631 #ifdef WASTE
1632       path = Pairpool_push_existing(path,pairpool,pair);
1633 #else
1634       path = List_transfer_one(path,&pairs);
1635 #endif
1636 
1637     } else if (leftpair->knowngapp == true) {
1638       /* Ignore queryjump and genomejump information of gap pair */
1639 #ifdef WASTE
1640       path = Pairpool_push_existing(path,pairpool,pair);
1641 #else
1642       path = List_transfer_one(path,&pairs);
1643 #endif
1644 
1645     } else if (queryjump <= 0 && genomejump <= 0) {
1646 #ifdef WASTE
1647       path = Pairpool_push_existing(path,pairpool,pair);
1648 #else
1649       path = List_transfer_one(path,&pairs);
1650 #endif
1651 
1652     } else if (finalp == true && queryjump == genomejump) {
1653       /* Fill gap with nucleotides */
1654       debug(printf("Filling a gap with nucleotides at %d..%d because of queryjump %d == genomejump %d\n",
1655 		   leftpair->querypos,pair->querypos,queryjump,genomejump));
1656       for (i = 1; i <= queryjump; i++) {
1657 	c = queryuc_ptr[leftpair->querypos+i];
1658 	g = get_genomic_nt(&g_alt,leftpair->genomepos+i,chroffset,chrhigh,watsonp);
1659 	/* It is possible for a gap with c == g to occur in the middle of a repetitive oligo, such as poly-A */
1660 	if (Dynprog_consistent_p(c,g,g_alt,genestrand) == true) {
1661 	  comp = MATCH_COMP;
1662 #ifdef PMAP
1663 	} else if (Dynprog_consistent_p(c,g,g_alt) == true) {
1664 	  comp = AMBIGUOUS_COMP;
1665 #endif
1666 	} else {
1667 	  comp = MISMATCH_COMP;
1668 	}
1669 	debug(printf(" => query %c, genomic %c\n",queryseq_ptr[leftpair->querypos+i],g));
1670 
1671 	path = Pairpool_push(path,pairpool,leftpair->querypos+i,leftpair->genomepos+i,queryseq_ptr[leftpair->querypos+i],
1672 			     comp,g,g_alt,/*dynprogindex*/0);
1673       }
1674 #ifdef WASTE
1675       path = Pairpool_push_existing(path,pairpool,pair);
1676 #else
1677       path = List_transfer_one(path,&pairs);
1678 #endif
1679 
1680     } else if (queryjump == 1 && genomejump == 1) {
1681       /* Handle a single mismatch by a simple fill */
1682       c = queryuc_ptr[leftpair->querypos+1];
1683       g = get_genomic_nt(&g_alt,leftpair->genomepos+1,chroffset,chrhigh,watsonp);
1684       /* It is possible for a gap with c == g to occur in the middle of a repetitive oligo, such as poly-A */
1685       if (Dynprog_consistent_p(c,g,g_alt,genestrand) == true) {
1686 	comp = MATCH_COMP;
1687 #ifdef PMAP
1688       } else if (Dynprog_consistent_p(c,g,g_alt) == true) {
1689 	comp = AMBIGUOUS_COMP;
1690 #endif
1691       } else {
1692 	comp = MISMATCH_COMP;
1693       }
1694       debug(printf("Filling a gap at %d..%d because of queryjump = %d, genomejump = %d => query %c, genomic %c\n",
1695 		   leftpair->querypos,pair->querypos,queryjump,genomejump,queryseq_ptr[leftpair->querypos+1],g));
1696       path = Pairpool_push(path,pairpool,leftpair->querypos+1,leftpair->genomepos+1,queryseq_ptr[leftpair->querypos+1],
1697 			   comp,g,g_alt,/*dynprogindex*/0);
1698 #ifdef WASTE
1699       path = Pairpool_push_existing(path,pairpool,pair);
1700 #else
1701       path = List_transfer_one(path,&pairs);
1702 #endif
1703 
1704     } else {
1705       /* Insert new gap.  Need accurate queryjump and genomejump */
1706       debug(printf("Inserting a gap at %d..%d because of queryjump = %d, genomejump = %d\n",
1707 		   leftpair->querypos,pair->querypos,queryjump,genomejump));
1708       debug(printf("queryjump %d = pair->querypos %d - leftpair->querypos %d - 1\n",queryjump,pair->querypos,leftpair->querypos));
1709       debug(printf("genomejump %d = pair->genomepos %u - leftpair->genomepos %u - 1\n",genomejump,pair->genomepos,leftpair->genomepos));
1710 
1711       path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
1712 				     /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
1713       gappair = (Pair_T) path->first;
1714       if (firstp == true) {
1715 	gappair->end_intron_p = true;
1716 	firstp = false;
1717       }
1718 #ifdef WASTE
1719       path = Pairpool_push_existing(path,pairpool,pair);
1720 #else
1721       path = List_transfer_one(path,&pairs);
1722 #endif
1723     }
1724 
1725     leftpair = pair;
1726   }
1727 
1728   if (gappair != NULL) {
1729     gappair->end_intron_p = true;
1730   }
1731   debug(printf("Ending deletion/insertion of gaps\n"));
1732 
1733   /* debug(Pair_dump_list(path,true)); */
1734   return path;
1735 }
1736 
1737 
1738 
1739 /* Should call before peel_rightward and peel_leftward, so we don't
1740    run into gaps that are really indels */
1741 static List_T
assign_gap_types(List_T path,int cdna_direction,bool watsonp,char * queryseq_ptr,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Pairpool_T pairpool)1742 assign_gap_types (List_T path, int cdna_direction, bool watsonp, char *queryseq_ptr,
1743 		  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
1744 		  Pairpool_T pairpool) {
1745   List_T pairs = NULL, pairptr;
1746   Pair_T pair, leftpair, rightpair;
1747   Univcoord_T splicesitepos;
1748   int queryjump, genomejump, leftquerypos, rightquerypos, curquerypos, introntype, intronlength;
1749   Chrpos_T leftgenomepos, rightgenomepos, genomicpos;
1750   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt, c2, c2_alt;
1751 
1752   debug(printf("\n** Starting assign_gap_types\n"));
1753   while (path != NULL) {
1754     /* pairptr = path; */
1755     /* path = Pairpool_pop(path,&pair); */
1756     pair = (Pair_T) path->first;
1757     if (pair->gapp == false) {
1758 #ifdef WASTE
1759       pairs = Pairpool_push_existing(pairs,pairpool,pair);
1760 #else
1761       pairs = List_transfer_one(pairs,&path);
1762 #endif
1763 
1764     } else if (pairs == NULL) {
1765       /* Discard initial gap */
1766       debug7(printf("Discard initial gap\n"));
1767       path = Pairpool_pop(path,&pair);
1768 
1769     } else if (path->rest == NULL) {
1770       /* Discard terminal gap */
1771       debug7(printf("Discard terminal gap\n"));
1772       path = Pairpool_pop(path,&pair);
1773 
1774     } else {
1775       queryjump = pair->queryjump;
1776       genomejump = pair->genomejump;
1777       debug7(printf("  Gap has queryjump %d, genomejump %d\n",queryjump,genomejump));
1778 
1779       if (queryjump == 0 && genomejump == 0) {
1780 	debug7(printf("  Gap is a non-gap\n"));
1781 	/* Discard the gap pair */
1782 	path = Pairpool_pop(path,&pair);
1783 
1784       } else if (genomejump == 0) {
1785 	debug7(printf("  Gap is a cDNA insertion, so replacing it with indels\n"));
1786 	/* pair->comp = INDEL_COMP; */
1787 
1788 	/* Discard the gap pair */
1789 	path = Pairpool_pop(path,&pair);
1790 
1791 	leftpair = path->first;
1792 	rightpair = pairs->first;
1793 	leftquerypos = leftpair->querypos;
1794 	/* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
1795 	rightquerypos = rightpair->querypos;
1796 	rightgenomepos = rightpair->genomepos;
1797 
1798 	debug7(printf("leftquerypos = %d, rightquerypos = %d\n",leftquerypos,rightquerypos));
1799 	for (curquerypos = rightquerypos - 1; curquerypos > leftquerypos; --curquerypos) {
1800 	  debug7(printf("  pushing indel at %d\n",curquerypos));
1801 	  pairs = Pairpool_push(pairs,pairpool,curquerypos,rightgenomepos,
1802 				queryseq_ptr[curquerypos],INDEL_COMP,/*genome*/' ',/*genomealt*/' ',
1803 				/*dynprogindex*/0);
1804 	}
1805 
1806       } else if (queryjump > 0 /* || stage3debug > NO_STAGE3DEBUG */) {
1807 	debug7(printf("  Gap is a dual break\n"));
1808 	pair->comp = DUALBREAK_COMP;
1809 #ifdef WASTE
1810 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
1811 #else
1812 	pairs = List_transfer_one(pairs,&path);
1813 #endif
1814 
1815       } else {
1816 	debug7(printf("Gap is an intron\n"));
1817 
1818 	pairptr = path;		/* save */
1819 	path = Pairpool_pop(path,&pair);
1820 
1821 	leftpair = path->first;
1822 	rightpair = pairs->first;
1823 
1824 	leftquerypos = leftpair->querypos;
1825 	leftgenomepos = leftpair->genomepos;
1826 	/* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
1827 	/* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
1828 	rightquerypos = rightpair->querypos;
1829 	rightgenomepos = rightpair->genomepos;
1830 
1831 	pair->queryjump = rightquerypos - leftquerypos - 1;
1832 	pair->genomejump = rightgenomepos - leftgenomepos - 1;
1833 
1834 	left1 = get_genomic_nt(&left1_alt,leftgenomepos+1,chroffset,chrhigh,watsonp);
1835 	left2 = get_genomic_nt(&left2_alt,leftgenomepos+2,chroffset,chrhigh,watsonp);
1836 	right2 = get_genomic_nt(&right2_alt,rightgenomepos-2,chroffset,chrhigh,watsonp);
1837 	right1 = get_genomic_nt(&right1_alt,rightgenomepos-1,chroffset,chrhigh,watsonp);
1838 	debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
1839 	introntype = Intron_type(left1,left2,right2,right1,
1840 				 left1_alt,left2_alt,right2_alt,right1_alt,
1841 				 cdna_direction);
1842 	debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
1843 		      leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
1844 
1845 	intronlength = rightgenomepos - leftgenomepos - 1;
1846 	if (intronlength < min_intronlength) {
1847 	  debug7(printf("  Gap is too short to be an intron (intronlength %d).  Replacing with pairs from %d downto %d\n",
1848 			intronlength,rightgenomepos-1,leftgenomepos+1));
1849 	  for (genomicpos = rightgenomepos - 1; genomicpos > leftgenomepos; --genomicpos) {
1850 	    c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
1851 	    pairs = Pairpool_push(pairs,pairpool,rightquerypos,genomicpos,' ',/*comp*/SHORTGAP_COMP,c2,c2_alt,
1852 				  /*dynprogindex*/0);
1853 	  }
1854 	  debug7(printf("  Gap is a short gap with queryjump %d, genomejump %d, so discarding the gap pair\n",queryjump,genomejump));
1855 	  /* Discard the gap */
1856 
1857 	} else if (cdna_direction > 0) {
1858 	  pair->introntype = introntype;
1859 	  switch (introntype) {
1860 	  case GTAG_FWD: pair->comp = FWD_CANONICAL_INTRON_COMP; break;
1861 	  case GCAG_FWD: pair->comp = FWD_GCAG_INTRON_COMP; break;
1862 	  case ATAC_FWD: pair->comp = FWD_ATAC_INTRON_COMP; break;
1863 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
1864 	  default:
1865 	    printf("Unexpected intron type %d\n",introntype);
1866 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
1867 	    abort();
1868 	  }
1869 	  debug7(printf("  Gap is a fwd intron (intronlength %d), now of type %c\n",intronlength,pair->comp));
1870 
1871 	  if (watsonp == true) {
1872 	    splicesitepos = leftgenomepos + 1;
1873 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1874 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
1875 	      debug12(printf("1. donor at splicesitepos %u is known\n",splicesitepos));
1876 	      pair->donor_prob = 1.0;
1877 	    } else {
1878 	      pair->donor_prob = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
1879 	      debug12(printf("1. donor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
1880 	    }
1881 
1882 	    splicesitepos = rightgenomepos;
1883 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1884 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
1885 	      debug12(printf("2. acceptor at splicesitepos %u is known\n",splicesitepos));
1886 	      pair->acceptor_prob = 1.0;
1887 	    } else {
1888 	      pair->acceptor_prob = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
1889 	      debug12(printf("2. acceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
1890 	    }
1891 
1892 	  } else {
1893 	    splicesitepos = (chrhigh - chroffset) - leftgenomepos;
1894 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1895 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
1896 	      debug12(printf("3. antidonor at splicesitepos %u is known\n",splicesitepos));
1897 	      pair->donor_prob = 1.0;
1898 	    } else {
1899 	      pair->donor_prob = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
1900 	      debug12(printf("3. antidonor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
1901 	    }
1902 
1903 	    splicesitepos = (chrhigh - chroffset) - rightgenomepos + 1;
1904 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1905 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
1906 	      debug12(printf("4. antiacceptor at splicesitepos %u is known\n",splicesitepos));
1907 	      pair->acceptor_prob = 1.0;
1908 	    } else {
1909 	      pair->acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
1910 	      debug12(printf("4. antiacceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
1911 	    }
1912 	  }
1913 
1914 	  /* Push the gap back on */
1915 #ifdef WASTE
1916 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
1917 #else
1918 	  pairs = List_push_existing(pairs,pairptr);
1919 #endif
1920 
1921 #ifndef PMAP
1922 	} else if (cdna_direction < 0) {
1923 	  pair->introntype = introntype;
1924 	  switch (introntype) {
1925 	  case ATAC_REV: pair->comp = REV_ATAC_INTRON_COMP; break;
1926 	  case GCAG_REV: pair->comp = REV_GCAG_INTRON_COMP; break;
1927 	  case GTAG_REV: pair->comp = REV_CANONICAL_INTRON_COMP; break;
1928 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
1929 	  default:
1930 	    printf("Unexpected intron type %d\n",introntype);
1931 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
1932 	    abort();
1933 	  }
1934 	  debug7(printf("  Gap is a rev intron (intronlength %d), now of type %c\n",intronlength,pair->comp));
1935 
1936 	  if (watsonp == true) {
1937 	    splicesitepos = leftgenomepos + 1;
1938 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1939 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
1940 	      debug12(printf("5. antiacceptor at splicesitepos %u is known\n",splicesitepos));
1941 	      pair->acceptor_prob = 1.0;
1942 	    } else {
1943 	      pair->acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
1944 	      debug12(printf("5. antiacceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
1945 	    }
1946 
1947 	    splicesitepos = rightgenomepos;
1948 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1949 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
1950 	      debug12(printf("6. antidonor at splicesitepos %u is known\n",splicesitepos));
1951 	      pair->donor_prob = 1.0;
1952 	    } else {
1953 	      pair->donor_prob = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
1954 	      debug12(printf("6. antidonor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
1955 	    }
1956 
1957 	  } else {
1958 	    splicesitepos = (chrhigh - chroffset) - leftgenomepos;
1959 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1960 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
1961 	      debug12(printf("7. acceptor at splicesitepos %u is known\n",splicesitepos));
1962 	      pair->acceptor_prob = 1.0;
1963 	    } else {
1964 	      pair->acceptor_prob = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
1965 	      debug12(printf("7. acceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
1966 	    }
1967 
1968 	    splicesitepos = (chrhigh - chroffset) - rightgenomepos + 1;
1969 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
1970 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
1971 	      debug12(printf("8. donor at splicesitepos %u is known\n",splicesitepos));
1972 	      pair->donor_prob = 1.0;
1973 	    } else {
1974 	      pair->donor_prob = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
1975 	      debug12(printf("8. donor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
1976 	    }
1977 	  }
1978 
1979 	  /* Push the gap back on */
1980 #ifdef WASTE
1981 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
1982 #else
1983 	  pairs = List_push_existing(pairs,pairptr);
1984 #endif
1985 
1986 #endif	/* ifndef PMAP */
1987 
1988 	} else {
1989 	  /* cdna_direction == 0 */
1990 	  pair->introntype = introntype;
1991 	  switch (introntype) {
1992 	  case GTAG_FWD: pair->comp = FWD_CANONICAL_INTRON_COMP; break;
1993 	  case GCAG_FWD: pair->comp = FWD_GCAG_INTRON_COMP; break;
1994 	  case ATAC_FWD: pair->comp = FWD_ATAC_INTRON_COMP; break;
1995 #ifndef PMAP
1996 	  case ATAC_REV: pair->comp = REV_ATAC_INTRON_COMP; break;
1997 	  case GCAG_REV: pair->comp = REV_GCAG_INTRON_COMP; break;
1998 	  case GTAG_REV: pair->comp = REV_CANONICAL_INTRON_COMP; break;
1999 #endif
2000 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
2001 	  default:
2002 	    printf("Unexpected intron type %d\n",introntype);
2003 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
2004 	    abort();
2005 	  }
2006 	  pair->donor_prob = 0.0;
2007 	  pair->acceptor_prob = 0.0;
2008 
2009 	  /* Push the gap back on */
2010 #ifdef WASTE
2011 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
2012 #else
2013 	  pairs = List_push_existing(pairs,pairptr);
2014 #endif
2015 	}
2016       }
2017     }
2018   }
2019 
2020   return pairs;
2021 }
2022 
2023 
2024 
2025 static List_T
assign_intron_probs(List_T path,int cdna_direction,bool watsonp,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Pairpool_T pairpool)2026 assign_intron_probs (List_T path, int cdna_direction, bool watsonp,
2027 		     Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
2028 		     Pairpool_T pairpool) {
2029   List_T pairs = NULL, pairptr;
2030   Pair_T pair, leftpair, rightpair;
2031   Univcoord_T splicesitepos;
2032   int queryjump, genomejump, leftquerypos, rightquerypos, introntype, intronlength;
2033   Chrpos_T leftgenomepos, rightgenomepos, genomicpos;
2034   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt, c2, c2_alt;
2035 
2036   debug(printf("\n** Starting assign_intron_probs with watsonp %d and cdna_direction %d\n",watsonp,cdna_direction));
2037   while (path != NULL) {
2038     /* pairptr = path; */
2039     /* path = Pairpool_pop(path,&pair); */
2040     pair = (Pair_T) path->first;
2041     if (pair->gapp == false) {
2042 #ifdef WASTE
2043       pairs = Pairpool_push_existing(pairs,pairpool,pair);
2044 #else
2045       pairs = List_transfer_one(pairs,&path);
2046 #endif
2047 
2048     } else if (pairs == NULL) {
2049       /* Discard initial gap */
2050       path = Pairpool_pop(path,&pair);
2051 
2052     } else if (path->rest == NULL) {
2053       /* Discard terminal gap */
2054       path = Pairpool_pop(path,&pair);
2055 
2056     } else {
2057       queryjump = pair->queryjump;
2058       genomejump = pair->genomejump;
2059 
2060       if (queryjump == 0 && genomejump == 0) {
2061 	debug7(printf("  Gap is a non-gap\n"));
2062 	/* Discard the gap pair */
2063 	path = Pairpool_pop(path,&pair);
2064 
2065       } else if (genomejump == 0) {
2066 #ifdef WASTE
2067 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
2068 #else
2069 	pairs = List_transfer_one(pairs,&path);
2070 #endif
2071 
2072       } else if (queryjump > 0) {
2073 	debug7(printf("  Gap is a dual break\n"));
2074 	pair->comp = DUALBREAK_COMP;
2075 #ifdef WASTE
2076 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
2077 #else
2078 	pairs = List_transfer_one(pairs,&path);
2079 #endif
2080 
2081       } else {
2082 	debug7(printf("Gap is an intron\n"));
2083 
2084 	pairptr = path;		/* save */
2085 	path = Pairpool_pop(path,&pair);
2086 
2087 	leftpair = path->first;
2088 	rightpair = pairs->first;
2089 
2090 	leftquerypos = leftpair->querypos;
2091 	leftgenomepos = leftpair->genomepos;
2092 	/* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
2093 	/* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
2094 	rightquerypos = rightpair->querypos;
2095 	rightgenomepos = rightpair->genomepos;
2096 
2097 	pair->queryjump = rightquerypos - leftquerypos - 1;
2098 	pair->genomejump = rightgenomepos - leftgenomepos - 1;
2099 
2100 	left1 = get_genomic_nt(&left1_alt,leftgenomepos+1,chroffset,chrhigh,watsonp);
2101 	left2 = get_genomic_nt(&left2_alt,leftgenomepos+2,chroffset,chrhigh,watsonp);
2102 	right2 = get_genomic_nt(&right2_alt,rightgenomepos-2,chroffset,chrhigh,watsonp);
2103 	right1 = get_genomic_nt(&right1_alt,rightgenomepos-1,chroffset,chrhigh,watsonp);
2104 	debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
2105 	introntype = Intron_type(left1,left2,right2,right1,
2106 				 left1_alt,left2_alt,right2_alt,right1_alt,
2107 				 cdna_direction);
2108 	debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
2109 		      leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
2110 
2111 	intronlength = (int) (rightgenomepos - leftgenomepos - 1);
2112 	if (intronlength < min_intronlength) {
2113 	  debug7(printf("  Gap is too short to be an intron (intronlength %d).  Replacing with pairs from %d downto %d\n",
2114 			intronlength,rightgenomepos-1,leftgenomepos+1));
2115 	  for (genomicpos = rightgenomepos - 1; genomicpos > leftgenomepos; --genomicpos) {
2116 	    c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2117 	    pairs = Pairpool_push(pairs,pairpool,rightquerypos,genomicpos,' ',/*comp*/SHORTGAP_COMP,c2,c2_alt,
2118 				  /*dynprogindex*/0);
2119 	  }
2120 	  debug7(printf("  Gap is a short gap with queryjump %d, genomejump %d, so discarding the gap pair\n",queryjump,genomejump));
2121 	  /* Discard the gap */
2122 
2123 	} else if (cdna_direction > 0) {
2124 	  pair->introntype = introntype;
2125 	  switch (introntype) {
2126 	  case GTAG_FWD: pair->comp = FWD_CANONICAL_INTRON_COMP; break;
2127 	  case GCAG_FWD: pair->comp = FWD_GCAG_INTRON_COMP; break;
2128 	  case ATAC_FWD: pair->comp = FWD_ATAC_INTRON_COMP; break;
2129 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
2130 	  default:
2131 	    printf("Unexpected intron type %d\n",introntype);
2132 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
2133 	    abort();
2134 	  }
2135 	  debug7(printf("  Gap is a fwd intron (intronlength %d), now of type %c\n",intronlength,pair->comp));
2136 
2137 	  if (watsonp == true) {
2138 	    splicesitepos = leftgenomepos + 1;
2139 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2140 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
2141 	      debug12(printf("1. donor at splicesitepos %u is known\n",splicesitepos));
2142 	      pair->donor_prob = 1.0;
2143 	    } else {
2144 	      pair->donor_prob = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
2145 	      debug12(printf("1. donor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
2146 	    }
2147 
2148 	    splicesitepos = rightgenomepos;
2149 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2150 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
2151 	      debug12(printf("2. acceptor at splicesitepos %u is known\n",splicesitepos));
2152 	      pair->acceptor_prob = 1.0;
2153 	    } else {
2154 	      pair->acceptor_prob = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
2155 	      debug12(printf("2. acceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
2156 	    }
2157 
2158 	  } else {
2159 	    splicesitepos = (chrhigh - chroffset) - leftgenomepos;
2160 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2161 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
2162 	      debug12(printf("3. antidonor at splicesitepos %u is known\n",splicesitepos));
2163 	      pair->donor_prob = 1.0;
2164 	    } else {
2165 	      pair->donor_prob = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
2166 	      debug12(printf("3. antidonor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
2167 	    }
2168 
2169 	    splicesitepos = (chrhigh - chroffset) - rightgenomepos + 1;
2170 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2171 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
2172 	      debug12(printf("4. antiacceptor at splicesitepos %u is known\n",splicesitepos));
2173 	      pair->acceptor_prob = 1.0;
2174 	    } else {
2175 	      pair->acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
2176 	      debug12(printf("4. antiacceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
2177 	    }
2178 	  }
2179 
2180 	  /* Push the gap back on */
2181 #ifdef WASTE
2182 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
2183 #else
2184 	  pairs = List_push_existing(pairs,pairptr);
2185 #endif
2186 
2187 #ifndef PMAP
2188 	} else if (cdna_direction < 0) {
2189 	  pair->introntype = introntype;
2190 	  switch (introntype) {
2191 	  case ATAC_REV: pair->comp = REV_ATAC_INTRON_COMP; break;
2192 	  case GCAG_REV: pair->comp = REV_GCAG_INTRON_COMP; break;
2193 	  case GTAG_REV: pair->comp = REV_CANONICAL_INTRON_COMP; break;
2194 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
2195 	  default:
2196 	    printf("Unexpected intron type %d\n",introntype);
2197 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
2198 	    abort();
2199 	  }
2200 	  debug7(printf("  Gap is a rev intron (intronlength %d), now of type %c\n",intronlength,pair->comp));
2201 
2202 	  if (watsonp == true) {
2203 	    splicesitepos = leftgenomepos + 1;
2204 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2205 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
2206 	      debug12(printf("5. antiacceptor at splicesitepos %u is known\n",splicesitepos));
2207 	      pair->acceptor_prob = 1.0;
2208 	    } else {
2209 	      pair->acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
2210 	      debug12(printf("5. antiacceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
2211 	    }
2212 
2213 	    splicesitepos = rightgenomepos;
2214 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2215 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
2216 	      debug12(printf("6. antidonor at splicesitepos %u is known\n",splicesitepos));
2217 	      pair->donor_prob = 1.0;
2218 	    } else {
2219 	      pair->donor_prob = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
2220 	      debug12(printf("6. antidonor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
2221 	    }
2222 
2223 	  } else {
2224 	    splicesitepos = (chrhigh - chroffset) - leftgenomepos;
2225 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2226 								      splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
2227 	      debug12(printf("7. acceptor at splicesitepos %u is known\n",splicesitepos));
2228 	      pair->acceptor_prob = 1.0;
2229 	    } else {
2230 	      pair->acceptor_prob = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
2231 	      debug12(printf("7. acceptor at splicesitepos %u has prob %f\n",splicesitepos,pair->acceptor_prob));
2232 	    }
2233 
2234 	    splicesitepos = (chrhigh - chroffset) - rightgenomepos + 1;
2235 	    if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
2236 								      splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
2237 	      debug12(printf("8. donor at splicesitepos %u is known\n",splicesitepos));
2238 	      pair->donor_prob = 1.0;
2239 	    } else {
2240 	      pair->donor_prob = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
2241 	      debug12(printf("8. donor at splicesitepos %u has prob %f\n",splicesitepos,pair->donor_prob));
2242 	    }
2243 	  }
2244 
2245 	  /* Push the gap back on */
2246 #ifdef WASTE
2247 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
2248 #else
2249 	  pairs = List_push_existing(pairs,pairptr);
2250 #endif
2251 
2252 #endif	/* ifndef PMAP */
2253 
2254 	} else {
2255 	  /* cdna_direction == 0 */
2256 	  pair->introntype = introntype;
2257 	  switch (introntype) {
2258 	  case GTAG_FWD: pair->comp = FWD_CANONICAL_INTRON_COMP; break;
2259 	  case GCAG_FWD: pair->comp = FWD_GCAG_INTRON_COMP; break;
2260 	  case ATAC_FWD: pair->comp = FWD_ATAC_INTRON_COMP; break;
2261 #ifndef PMAP
2262 	  case ATAC_REV: pair->comp = REV_ATAC_INTRON_COMP; break;
2263 	  case GCAG_REV: pair->comp = REV_GCAG_INTRON_COMP; break;
2264 	  case GTAG_REV: pair->comp = REV_CANONICAL_INTRON_COMP; break;
2265 #endif
2266 	  case NONINTRON: pair->comp = NONINTRON_COMP; break;
2267 	  default:
2268 	    printf("Unexpected intron type %d\n",introntype);
2269 	    fprintf(stderr,"Unexpected intron type %d\n",introntype);
2270 	    abort();
2271 	  }
2272 	  pair->donor_prob = 0.0;
2273 	  pair->acceptor_prob = 0.0;
2274 
2275 	  /* Push the gap back on */
2276 #ifdef WASTE
2277 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
2278 #else
2279 	  pairs = List_push_existing(pairs,pairptr);
2280 #endif
2281 	}
2282       }
2283     }
2284   }
2285 
2286   return pairs;
2287 }
2288 
2289 
2290 /* Modeled after assign_gap_types */
2291 static List_T
remove_indel_gaps(List_T path,Pairpool_T pairpool)2292 remove_indel_gaps (List_T path
2293 #ifdef WASTE
2294 		   , Pairpool_T pairpool
2295 #endif
2296 		   ) {
2297   List_T pairs = NULL, pairptr;
2298   Pair_T pair, leftpair, rightpair;
2299   int queryjump, genomejump;
2300   Chrpos_T leftgenomepos, rightgenomepos;
2301   int intronlength;
2302 
2303   debug(printf("\n** Starting remove_indel_gaps\n"));
2304   while (path != NULL) {
2305     /* pairptr = path; */
2306     /* path = Pairpool_pop(path,&pair); */
2307     pair = (Pair_T) path->first;
2308 
2309     if (pair->gapp == false) {
2310 #ifdef WASTE
2311       pairs = Pairpool_push_existing(pairs,pairpool,pair);
2312 #else
2313       pairs = List_transfer_one(pairs,&path);
2314 #endif
2315 
2316     } else if (pairs == NULL) {
2317       /* Discard initial gap */
2318       path = Pairpool_pop(path,&pair);
2319 
2320     } else if (path->rest == NULL) {
2321       /* Discard terminal gap */
2322       path = Pairpool_pop(path,&pair);
2323 
2324     } else {
2325       queryjump = pair->queryjump;
2326       genomejump = pair->genomejump;
2327 
2328       if (queryjump == 0 && genomejump == 0) {
2329 	debug7(printf("  Gap is a non-gap\n"));
2330 	/* Discard the gap pair */
2331 	path = Pairpool_pop(path,&pair);
2332 
2333       } else if (genomejump == 0) {
2334 	debug7(printf("  Gap is a cDNA insertion\n"));
2335 	/* pair->comp = INDEL_COMP; */
2336 	/* Discard the gap pair */
2337 	path = Pairpool_pop(path,&pair);
2338 
2339       } else if (queryjump > 0) {
2340 	debug7(printf("  Gap is a dual break\n"));
2341 	pair->comp = DUALBREAK_COMP;
2342 #ifdef WASTE
2343 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
2344 #else
2345 	pairs = List_transfer_one(pairs,&path);
2346 #endif
2347 
2348       } else {
2349 	debug7(printf("  Gap is an intron of type %c\n",pair->comp));
2350 
2351 	pairptr = path;		/* save */
2352 	path = Pairpool_pop(path,&pair);
2353 
2354 	leftpair = path->first;
2355 	rightpair = pairs->first;
2356 
2357 	leftgenomepos = leftpair->genomepos;
2358 	/* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
2359 	rightgenomepos = rightpair->genomepos;
2360 
2361 	intronlength = (int) (rightgenomepos - leftgenomepos - 1);
2362 	if (intronlength < min_intronlength) {
2363 	  debug7(printf("  Gap is short (intronlength %d).  Adding pairs from %d downto %d\n",
2364 			intronlength,rightgenomepos-1,leftgenomepos+1));
2365 	  debug7(printf("  Gap is a short gap, so discarding the gap pair\n"));
2366 	  /* Discard the gap pair */
2367 
2368 	} else {
2369 	  debug7(printf("  Gap is not short (intronlength %d)\n",intronlength));
2370 	  /* Push the gap back on */
2371 #ifdef WASTE
2372 	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
2373 #else
2374 	  pairs = List_push_existing(pairs,pairptr);
2375 #endif
2376 	}
2377       }
2378     }
2379   }
2380 
2381   return pairs;
2382 }
2383 
2384 
2385 
2386 #ifdef PMAP
2387 static List_T
undefine_nucleotides(char * queryseq_ptr,int querylength,List_T path,Pairpool_T pairpool,int width)2388 undefine_nucleotides (char *queryseq_ptr, int querylength, List_T path, Pairpool_T pairpool, int width) {
2389   List_T pairs = NULL, pairptr;
2390   Pair_T pair, leftpair, rightpair;
2391   int leftquerypos, rightquerypos, pos;
2392   Chrpos_T leftgenomepos, rightgenomepos;
2393 
2394   debug(printf("\n** Starting undefine_nucleotides\n"));
2395 
2396   if (path != NULL) {
2397     pairptr = path;
2398     path = Pairpool_pop(path,&pair);
2399 #ifdef WASTE
2400     pairs = Pairpool_push_existing(NULL,pairpool,pair);
2401 #else
2402     pairs = List_push_existing(NULL,pairptr);
2403 #endif
2404     rightquerypos = pair->querypos;
2405     rightgenomepos = pair->genomepos;
2406   }
2407 
2408   while (path != NULL) {
2409     pairptr = path;
2410     path = Pairpool_pop(path,&pair);
2411     if (pair->gapp == true) {
2412       leftpair = path->first;
2413       rightpair = pairs->first;
2414 
2415       leftquerypos = leftpair->querypos;
2416       leftgenomepos = leftpair->genomepos;
2417       /* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
2418       /* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
2419 
2420       rightquerypos = rightpair->querypos;
2421       rightgenomepos = rightpair->genomepos;
2422 
2423       debug(printf("Undefining around rightquerypos = %d and leftquerypos = %d\n",rightquerypos,leftquerypos));
2424       for (pos = rightquerypos; pos < rightquerypos + width && pos < querylength; pos++) {
2425 	queryseq_ptr[pos] = BACKTRANSLATE_CHAR;
2426       }
2427       for (pos = leftquerypos; pos > leftquerypos - width && pos >= 0; --pos) {
2428 	queryseq_ptr[pos] = BACKTRANSLATE_CHAR;
2429       }
2430     }
2431 #ifdef WASTE
2432     pairs = Pairpool_push_existing(pairs,pairpool,pair);
2433 #else
2434     pairs = List_push_existing(pairs,pairptr);
2435 #endif
2436   }
2437 
2438   return pairs;
2439 }
2440 #endif
2441 
2442 
2443 static List_T
add_dualbreak(List_T pairs,char * queryseq_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,int cdna_direction,bool watsonp,Pair_T leftpair,Pair_T rightpair,Pairpool_T pairpool,int ngap)2444 add_dualbreak (List_T pairs, char *queryseq_ptr,
2445 	       Univcoord_T chroffset, Univcoord_T chrhigh, int cdna_direction,
2446 	       bool watsonp, Pair_T leftpair, Pair_T rightpair, Pairpool_T pairpool, int ngap) {
2447   int leftquerypos, rightquerypos, k;
2448   Chrpos_T leftgenomepos, rightgenomepos, gapgenomepos, midpoint, genomicpos;
2449   int introntype;
2450   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt;
2451   char c1, c2, c2_alt, comp;
2452 
2453   leftquerypos = leftpair->querypos;
2454   leftgenomepos = leftpair->genomepos;
2455   /* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
2456   /* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
2457   rightquerypos = rightpair->querypos;
2458   rightgenomepos = rightpair->genomepos;
2459 
2460   /* Previously checked for genomicuc_ptr != NULL, but this does not
2461      work with second round of prepare_for_printing */
2462   left1 = get_genomic_nt(&left1_alt,leftgenomepos+1,chroffset,chrhigh,watsonp);
2463   left2 = get_genomic_nt(&left2_alt,leftgenomepos+2,chroffset,chrhigh,watsonp);
2464   right2 = get_genomic_nt(&right2_alt,rightgenomepos-2,chroffset,chrhigh,watsonp);
2465   right1 = get_genomic_nt(&right1_alt,rightgenomepos-1,chroffset,chrhigh,watsonp);
2466 
2467   debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
2468   introntype = Intron_type(left1,left2,right2,right1,
2469 			   left1_alt,left2_alt,right2_alt,right1_alt,
2470 			   cdna_direction);
2471   debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
2472 		leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
2473   switch (introntype) {
2474   case GTAG_FWD: comp = FWD_CANONICAL_INTRON_COMP; break;
2475   case GCAG_FWD: comp = FWD_GCAG_INTRON_COMP; break;
2476   case ATAC_FWD: comp = FWD_ATAC_INTRON_COMP; break;
2477 #ifndef PMAP
2478   case ATAC_REV: comp = REV_ATAC_INTRON_COMP; break;
2479   case GCAG_REV: comp = REV_GCAG_INTRON_COMP; break;
2480   case GTAG_REV: comp = REV_CANONICAL_INTRON_COMP; break;
2481 #endif
2482   case NONINTRON: comp = NONINTRON_COMP; break;
2483   default:
2484     printf("Unexpected intron type %d\n",introntype);
2485     fprintf(stderr,"Unexpected intron type %d\n",introntype);
2486     abort();
2487   }
2488   /* End of check */
2489 
2490 
2491   /* queryjump = rightquerypos - leftquerypos - 1; */
2492   /* genomejump = rightgenomepos - leftgenomepos - 1; */
2493 
2494   if ((int) (rightgenomepos - leftgenomepos - 1) < ngap + ngap) {
2495     midpoint = (rightgenomepos + leftgenomepos) / 2;
2496 
2497     /* First insertion */
2498     for (genomicpos = rightgenomepos - 1; genomicpos >= midpoint; --genomicpos) {
2499       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2500       pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,genomicpos,
2501 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/true);
2502     }
2503 
2504     /* cDNA sequence */
2505     gapgenomepos = genomicpos + 1;
2506     for (k = rightquerypos - 1; k > leftquerypos; --k) {
2507 #if 0				/* PMAP */
2508       c1 = Sequence_codon_char(queryaaseq_ptr[k/3],k%3);
2509 #else
2510       c1 = queryseq_ptr[k];
2511 #endif
2512       pairs = Pairpool_push_gapalign(pairs,pairpool,k,gapgenomepos,
2513 				     c1,EXTRAEXON_COMP,/*introntype*/NONINTRON,c1,c1,
2514 				     /*extraexonp*/true); /* Transfer cDNA char to genome */
2515     }
2516 
2517     /* Second insertion */
2518     for (genomicpos = midpoint - 1; genomicpos > leftgenomepos; --genomicpos) {
2519       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2520       pairs = Pairpool_push_gapalign(pairs,pairpool,leftquerypos,genomicpos,
2521 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/true);
2522     }
2523 
2524   } else {
2525 
2526     /* First insertion */
2527     for (k = 0, genomicpos = rightgenomepos - 1; k < ngap; k++, --genomicpos) {
2528       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2529       pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,genomicpos,
2530 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/true);
2531     }
2532 
2533     /* cDNA sequence */
2534     gapgenomepos = genomicpos + 1;
2535     for (k = rightquerypos - 1; k > leftquerypos; --k) {
2536 #if 0				/* PMAP */
2537       c1 = Sequence_codon_char(queryaaseq_ptr[k/3],k%3);
2538 #else
2539       c1 = queryseq_ptr[k];
2540 #endif
2541       pairs = Pairpool_push_gapalign(pairs,pairpool,k,gapgenomepos,
2542 				     c1,EXTRAEXON_COMP,/*introntype*/NONINTRON,c1,c1,
2543 				     /*extraexonp*/true); /* Transfer cDNA char to genome */
2544     }
2545 
2546     /* Second insertion */
2547     genomicpos = leftgenomepos + ngap;
2548     for (k = 0; k < ngap; k++, --genomicpos) {
2549       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2550       pairs = Pairpool_push_gapalign(pairs,pairpool,leftquerypos,genomicpos,
2551 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/true);
2552     }
2553   }
2554 
2555   return pairs;
2556 }
2557 
2558 
2559 static List_T
add_intron(List_T pairs,Univcoord_T chroffset,Univcoord_T chrhigh,Pair_T leftpair,Pair_T rightpair,char comp,int introntype,int ngap,bool watsonp,Pairpool_T pairpool)2560 add_intron (List_T pairs, Univcoord_T chroffset, Univcoord_T chrhigh,
2561 	    Pair_T leftpair, Pair_T rightpair, char comp, int introntype, int ngap,
2562 	    bool watsonp, Pairpool_T pairpool) {
2563   char c2, c2_alt;
2564   int rightquerypos;
2565   Chrpos_T leftgenomepos, rightgenomepos, gapgenomepos, genomicpos;
2566   int intronlength;
2567   int i;
2568 
2569   leftgenomepos = leftpair->genomepos;
2570   /* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
2571   rightquerypos = rightpair->querypos;
2572   rightgenomepos = rightpair->genomepos;
2573 
2574   intronlength = (int) (rightgenomepos - leftgenomepos - 1);
2575 
2576   debug7(printf("Adding gap of type %c of length %d\n",comp,intronlength));
2577 
2578 #if 0
2579   /* Should not be necessary to fix introns at this point */
2580   if (cdna_direction >= 0) {
2581     switch (*comp) {
2582     case FWD_CANONICAL_INTRON_COMP: case FWD_GCAG_INTRON_COMP: case FWD_ATAC_INTRON_COMP: case NONINTRON: break;
2583     default:
2584       debug7(printf("Unexpected intron comp %c.  Need to fix.\n",*comp));
2585 
2586       left1 = genomicuc_ptr[leftgenomepos+1];
2587       left2 = genomicuc_ptr[leftgenomepos+2];
2588       right2 = genomicuc_ptr[rightgenomepos-2];
2589       right1 = genomicuc_ptr[rightgenomepos-1];
2590 
2591       debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
2592       introntype = Intron_type(left1,left2,right2,right1,
2593 			       left1_alt,left2_alt,right2_alt,right1_alt,
2594 			       cdna_direction);
2595       debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
2596 		    leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
2597       switch (introntype) {
2598       case GTAG_FWD: *comp = FWD_CANONICAL_INTRON_COMP; break;
2599       case GCAG_FWD: *comp = FWD_GCAG_INTRON_COMP; break;
2600       case ATAC_FWD: *comp = FWD_ATAC_INTRON_COMP; break;
2601       case NONINTRON:
2602 	intronlength = (int) (rightgenomepos - leftgenomepos - 1);
2603 	if (intronlength < min_intronlength) {
2604 	  *comp = SHORTGAP_COMP;	/* Will be printed as INDEL_COMP, but need to score as NONINTRON_COMP */
2605 	} else {
2606 	  *comp = NONINTRON_COMP;
2607 	}
2608       }
2609     }
2610   } else {
2611     switch (*comp) {
2612     case REV_CANONICAL_INTRON_COMP: case REV_GCAG_INTRON_COMP: case REV_ATAC_INTRON_COMP: case NONINTRON: break;
2613     default:
2614       debug7(printf("Unexpected intron comp %c.  Need to fix.\n",*comp));
2615 
2616       left1 = genomicuc_ptr[leftgenomepos+1];
2617       left2 = genomicuc_ptr[leftgenomepos+2];
2618       right2 = genomicuc_ptr[rightgenomepos-2];
2619       right1 = genomicuc_ptr[rightgenomepos-1];
2620 
2621       debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
2622       introntype = Intron_type(left1,left2,right2,right1,
2623 			       left1_alt,left2_alt,right2_alt,right1_alt,
2624 			       cdna_direction);
2625       debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
2626 		    leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
2627       switch (introntype) {
2628       case ATAC_REV: *comp = REV_ATAC_INTRON_COMP; break;
2629       case GCAG_REV: *comp = REV_GCAG_INTRON_COMP; break;
2630       case GTAG_REV: *comp = REV_CANONICAL_INTRON_COMP; break;
2631       case NONINTRON:
2632 	intronlength = (int) (rightgenomepos - leftgenomepos - 1);
2633 	if (intronlength < min_intronlength) {
2634 	  *comp = SHORTGAP_COMP;	/* Will be printed as INDEL_COMP, but need to score as NONINTRON_COMP */
2635 	} else {
2636 	  *comp = NONINTRON_COMP;
2637 	}
2638       }
2639     }
2640   }
2641 #endif
2642 
2643   if (intronlength < ngap + ngap + 3) {
2644     for (i = 0, genomicpos = rightgenomepos - 1; i < intronlength; i++, --genomicpos) {
2645       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2646       pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,genomicpos,
2647 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/false);
2648     }
2649   } else {
2650     for (i = 0, genomicpos = rightgenomepos - 1; i < ngap; i++, --genomicpos) {
2651       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2652       pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,genomicpos,
2653 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/false);
2654       debug7(printf("Pushing %c at genomicpos %d\n",c2,genomicpos));
2655     }
2656 
2657     gapgenomepos = genomicpos + 1;
2658     pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,gapgenomepos,' ',INTRONGAP_COMP,/*introntype*/NONINTRON,
2659 				   /*genome*/INTRONGAP_CHAR,/*genomealt*/INTRONGAP_CHAR,/*extraexonp*/false);
2660     pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,gapgenomepos,' ',INTRONGAP_COMP,/*introntype*/NONINTRON,
2661 				   /*genome*/INTRONGAP_CHAR,/*genomealt*/INTRONGAP_CHAR,/*extraexonp*/false);
2662     pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,gapgenomepos,' ',INTRONGAP_COMP,/*introntype*/NONINTRON,
2663 				   /*genome*/INTRONGAP_CHAR,/*genomealt*/INTRONGAP_CHAR,/*extraexonp*/false);
2664 
2665     genomicpos = leftgenomepos + ngap;
2666     for (i = ngap-1; i >= 0; --i, --genomicpos) {
2667       c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
2668       pairs = Pairpool_push_gapalign(pairs,pairpool,rightquerypos,genomicpos,
2669 				     /*cdna*/' ',comp,introntype,c2,c2_alt,/*extraexonp*/false);
2670       debug7(printf("Pushing %c at genomicpos %d\n",c2,genomicpos));
2671     }
2672   }
2673 
2674   return pairs;
2675 }
2676 
2677 
2678 
2679 /************************************************************************
2680  *   Fix adjacent indels
2681  ************************************************************************/
2682 
2683 /* Modeled after print_sam_forward in pair.c */
2684 /* Handles indels next to gaps */
2685 static List_T
fix_adjacent_indels(List_T pairs)2686 fix_adjacent_indels (List_T pairs) {
2687   List_T path = NULL, pairptr;
2688   Pair_T this = NULL, pair;
2689   bool in_exon = false;
2690   int Mlength = 0, Ilength = 0, Dlength = 0;
2691   char last_token_type = ' ';
2692   int last_token_length = 0, i;
2693 
2694   debug4(printf("Starting fix_adjacent_indels: "));
2695 
2696   while (pairs != NULL) {
2697     this = (Pair_T) List_head(pairs);
2698 
2699     if (this->gapp) {
2700       if (in_exon == true) {
2701 
2702 	if (Mlength > 0) {
2703 	  last_token_type = 'M';
2704 	  last_token_length = Mlength;
2705 	  debug4(printf("%dM",Mlength));
2706 	} else if (Ilength > 0) {
2707 	  debug4(printf("%dI",Ilength));
2708 	  if (last_token_type == 'I' || last_token_type == 'D') {
2709 	    debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Ilength,'I'));
2710 	    for (i = 0; i < last_token_length + Ilength; i++) {
2711 	      path = Pairpool_pop(path,&pair);
2712 	    }
2713 	    last_token_type = 'I';
2714 	    last_token_length = 0; /* Since we have already taken care of this */
2715 	  } else {
2716 	    last_token_type = 'I';
2717 	    last_token_length = Ilength;
2718 	  }
2719 	} else if (Dlength > 0) {
2720 	  debug4(printf("%dD",Dlength));
2721 	  if (last_token_type == 'I' || last_token_type == 'D') {
2722 	    debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Dlength,'D'));
2723 	    for (i = 0; i < last_token_length + Dlength; i++) {
2724 	      path = Pairpool_pop(path,&pair);
2725 	    }
2726 	    last_token_type = 'D';
2727 	    last_token_length = 0; /* Since we have already taken care of this */
2728 	  } else {
2729 	    last_token_type = 'D';
2730 	    last_token_length = Dlength;
2731 	  }
2732 	}
2733 
2734 	Mlength = Ilength = Dlength = 0;
2735 	in_exon = false;
2736       }
2737 
2738     } else if (this->comp == INTRONGAP_COMP) {
2739       /* Do nothing */
2740 
2741     } else {
2742       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
2743 	 SHORTGAP_COMP, or MISMATCH_COMP */
2744       if (in_exon == false) {
2745 
2746 	if (last_token_type != ' ') {
2747 	  /* Gap */
2748 	  debug4(printf("?N"));
2749 	  last_token_type = 'N';  /* Could potentially also be considered 'D' */
2750 	  last_token_length = 0;
2751 
2752 #if 0
2753 	  query_gap = this->querypos - exon_queryend;
2754 	  if (query_gap > 0) {
2755 	    /* Dual gap.  Don't try to piece together.  */
2756 	    debug4(printf("%dI",query_gap));
2757 	    last_token_type = 'I';
2758 	    last_token_length = query_gap;
2759 	  }
2760 #endif
2761 	}
2762 
2763 	in_exon = true;
2764       }
2765 
2766       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
2767 	/* Gap in upper or lower sequence */
2768 	if (this->genome == ' ') {
2769 	  if (Mlength > 0) {
2770 	    debug4(printf("%dM",Mlength));
2771 	    last_token_type = 'M';
2772 	    last_token_length = Mlength;
2773 	    Mlength = 0;
2774 
2775 	  } else if (Dlength > 0) {
2776 	    /* unlikely */
2777 	    debug4(printf("%dD",Dlength));
2778 	    if (last_token_type == 'I' || last_token_type == 'D') {
2779 	      debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Dlength,'D'));
2780 	      for (i = 0; i < last_token_length + Dlength; i++) {
2781 		path = Pairpool_pop(path,&pair);
2782 	      }
2783 	      last_token_type = 'D';
2784 	      last_token_length = 0; /* Since we have already taken care of this */
2785 	    } else {
2786 	      last_token_type = 'D';
2787 	      last_token_length = Dlength;
2788 	      Dlength = 0;
2789 	    }
2790 	  }
2791 	  Ilength++;
2792 
2793 	} else if (this->cdna == ' ') {
2794 	  if (Mlength > 0) {
2795 	    debug4(printf("%dM",Mlength));
2796 	    last_token_type = 'M';
2797 	    last_token_length = Mlength;
2798 	    Mlength = 0;
2799 
2800 	  } else if (Ilength > 0) {
2801 	    debug4(printf("%dI",Ilength));
2802 	    if (last_token_type == 'I' || last_token_type == 'D') {
2803 	      debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Ilength,'I'));
2804 	      for (i = 0; i < last_token_length + Ilength; i++) {
2805 		path = Pairpool_pop(path,&pair);
2806 	      }
2807 	      last_token_type = 'I';
2808 	      last_token_length = 0; /* Since we have already taken care of this */
2809 	    } else {
2810 	      last_token_type = 'I';
2811 	      last_token_length = Ilength;
2812 	    }
2813 	    Ilength = 0;
2814 	  }
2815 	  Dlength++;
2816 
2817 	} else {
2818 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
2819 	  exit(9);
2820 	}
2821 
2822       } else {
2823 	/* Count even if unknown base */
2824 
2825 	if (Ilength > 0) {
2826 	  debug4(printf("%dI",Ilength));
2827 	  if (last_token_type == 'I' || last_token_type == 'D') {
2828 	    debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Ilength,'I'));
2829 	    for (i = 0; i < last_token_length + Ilength; i++) {
2830 	      path = Pairpool_pop(path,&pair);
2831 	    }
2832 	    last_token_type = 'I';
2833 	    last_token_length = 0; /* Since we have already taken care of this */
2834 	  } else {
2835 	    last_token_type = 'I';
2836 	    last_token_length = Ilength;
2837 	  }
2838 	  Ilength = 0;
2839 
2840 	} else if (Dlength > 0) {
2841 	  debug4(printf("%dD",Dlength));
2842 	  if (last_token_type == 'I' || last_token_type == 'D') {
2843 	    debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Dlength,'D'));
2844 	    for (i = 0; i < last_token_length + Dlength; i++) {
2845 	      path = Pairpool_pop(path,&pair);
2846 	    }
2847 	    last_token_type = 'D';
2848 	    last_token_length = 0; /* Since we have already taken care of this */
2849 	  } else {
2850 	    last_token_type = 'D';
2851 	    last_token_length = Dlength;
2852 	  }
2853 	  Dlength = 0;
2854 	}
2855 
2856 	Mlength++;
2857       }
2858     }
2859 
2860     pairptr = pairs;
2861     pairs = Pairpool_pop(pairs,&pair);
2862 #ifdef WASTE
2863     path = Pairpool_push_existing(path,pairpool,pair);
2864 #else
2865     path = List_push_existing(path,pairptr);
2866 #endif
2867   }
2868 
2869   if (Mlength > 0) {
2870     debug4(printf("%dM",Mlength));
2871     /* last_token_type = 'M'; */
2872     /* last_token_length = Mlength; */
2873   } else if (Ilength > 0) {
2874     debug4(printf("%dI",Ilength));
2875     if (last_token_type == 'I' || last_token_type == 'D') {
2876       debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Ilength,'I'));
2877       for (i = 0; i < last_token_length + Ilength; i++) {
2878 	path = Pairpool_pop(path,&pair);
2879       }
2880       /* last_token_type = 'I'; */
2881       /* last_token_length = 0; */ /* Since we have already taken care of this */
2882     } else {
2883       /* last_token_type = 'I'; */
2884       /* last_token_length = Ilength; */
2885     }
2886   } else if (Dlength > 0) {
2887     debug4(printf("%dD",Dlength));
2888     if (last_token_type == 'I' || last_token_type == 'D') {
2889       debug4(printf("fix_adjacent_indels found %d%c to %d%c\n",last_token_length,last_token_type,Dlength,'D'));
2890       for (i = 0; i < last_token_length + Dlength; i++) {
2891 	path = Pairpool_pop(path,&pair);
2892       }
2893       /* last_token_type = 'D'; */
2894       /* last_token_length = 0; */ /* Since we have already taken care of this */
2895     } else {
2896       /* last_token_type = 'D'; */
2897       /* last_token_length = Dlength; */
2898     }
2899   }
2900 
2901   debug4(printf("\n"));
2902 
2903   return path;
2904 }
2905 
2906 
2907 
#define NORMAL_STATE 0
#define INSERTION_STATE +1
#define DELETION_STATE -1


#if 0
/* Disabled code.  Pops every cell off PAIRS and pushes the kept ones
   onto a new list, which is returned.  A state variable remembers
   whether the previous pair was part of an insertion run (genome ==
   ' '), a deletion run (cdna == ' '), or neither.  When an indel pair
   of one kind directly follows a run of the other kind, the preceding
   run is popped from the output, the current pair is dropped, the rest
   of the current run is popped from the input, and *foundp is set to
   true.  Aborts on an indel pair that has neither genome nor cdna
   blank. */
static List_T
remove_adjacent_ins_del (bool *foundp, List_T pairs) {
  List_T path = NULL, cell;
  Pair_T this, dropped;
  int state = NORMAL_STATE;

  *foundp = false;
  while (pairs != NULL) {
    cell = pairs;
    pairs = Pairpool_pop(pairs,&this);

    if (this->comp != INDEL_COMP && this->comp != SHORTGAP_COMP) {
      /* Not an indel: keep it and end any run */
      path = List_push_existing(path,cell);
      state = NORMAL_STATE;

    } else if (this->genome == ' ') {
      /* Insertion pair */
      switch (state) {
      case NORMAL_STATE:
        path = List_push_existing(path,cell);
        state = INSERTION_STATE;
        break;

      case INSERTION_STATE:
        /* Continuing the same run */
        path = List_push_existing(path,cell);
        break;

      case DELETION_STATE:
        /* Switch from deletion to insertion */
        /* Remove the deletion run already on the output */
        while (path != NULL &&
               (((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP) &&
               ((Pair_T) path->first)->cdna == ' ') {
          path = Pairpool_pop(path,&dropped);
        }
        /* Remove the rest of the insertion run still on the input */
        while (pairs != NULL &&
               (((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == SHORTGAP_COMP) &&
               ((Pair_T) pairs->first)->genome == ' ') {
          pairs = Pairpool_pop(pairs,&dropped);
        }
        *foundp = true;
        state = NORMAL_STATE;
        break;

      default:
        abort();
      }

    } else if (this->cdna == ' ') {
      /* Deletion pair */
      switch (state) {
      case NORMAL_STATE:
        path = List_push_existing(path,cell);
        state = DELETION_STATE;
        break;

      case DELETION_STATE:
        /* Continuing the same run */
        path = List_push_existing(path,cell);
        break;

      case INSERTION_STATE:
        /* Switch from insertion to deletion */
        /* Remove the insertion run already on the output */
        while (path != NULL &&
               (((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP) &&
               ((Pair_T) path->first)->genome == ' ') {
          path = Pairpool_pop(path,&dropped);
        }
        /* Remove the rest of the deletion run still on the input */
        while (pairs != NULL &&
               (((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == SHORTGAP_COMP) &&
               ((Pair_T) pairs->first)->cdna == ' ') {
          pairs = Pairpool_pop(pairs,&dropped);
        }
        *foundp = true;
        state = NORMAL_STATE;
        break;

      default:
        abort();
      }

    } else {
      abort();
    }
  }

  return path;
}
#endif
2999 
3000 
3001 
3002 /************************************************************************
3003  *   Chop (trimming within end exons)
3004  ************************************************************************/
3005 
3006 /* Called only by GMAP, because nucleotide matches in PMAP have several ambiguous matches. */
3007 static List_T
clean_path_end3(List_T path,int ambig_end_length_3)3008 clean_path_end3 (List_T path, int ambig_end_length_3) {
3009   Pair_T lastpair;
3010 
3011   debug(printf("Starting clean_path_end3\n"));
3012   debug(Pair_dump_list(path,true));
3013   if (ambig_end_length_3 == 0) {
3014     /* Remove any remaining nonmatches, gaps, or indels at 3' end */
3015     if (path != NULL) {
3016       lastpair = path->first;
3017       while (lastpair->gapp || (lastpair->comp != MATCH_COMP && lastpair->comp != DYNPROG_MATCH_COMP /*&& lastpair->comp != AMBIGUOUS_COMP*/)) {
3018 	debug(printf("Removing nonmatch at 3' end: "));
3019 	debug(Pair_dump_one(lastpair,/*zerobasedp*/true));
3020 	debug(printf("\n"));
3021 	path = Pairpool_pop(path,&lastpair);
3022 	if (path == NULL) {
3023 	  return NULL;
3024 	} else {
3025 	  lastpair = path->first;
3026 	}
3027       }
3028     }
3029 
3030 #ifdef PMAP
3031     while (path != NULL) {
3032       lastpair = path->first;
3033       if (lastpair->querypos % 3 == 2) {
3034 	debug(printf("Ending clean_path_end3\n"));
3035 	debug(Pair_dump_list(path,true));
3036 	return path;
3037       } else {
3038 	debug(printf("PMAP popping querypos %d to get to codon boundary\n",lastpair->querypos));
3039 	path = Pairpool_pop(path,&lastpair);
3040       }
3041     }
3042 #endif
3043   }
3044 
3045   debug(printf("Ending clean_path_end3\n"));
3046   debug(Pair_dump_list(path,true));
3047   return path;
3048 }
3049 
3050 
3051 static List_T
clean_pairs_end5(List_T pairs,int ambig_end_length_5)3052 clean_pairs_end5 (List_T pairs, int ambig_end_length_5) {
3053   Pair_T firstpair;
3054 
3055   debug(printf("Starting clean_pairs_end5\n"));
3056   if (ambig_end_length_5 == 0) {
3057     /* Remove any remaining nonmatches, gaps, or indels at 5' end */
3058     if (pairs != NULL) {
3059       firstpair = pairs->first;
3060       while (firstpair->gapp || (firstpair->comp != MATCH_COMP && firstpair->comp != DYNPROG_MATCH_COMP /*&& firstpair->comp != AMBIGUOUS_COMP*/)) {
3061 	debug(printf("Removing nonmatch at 5' end: "));
3062 	debug(Pair_dump_one(firstpair,/*zerobasedp*/true));
3063 	debug(printf("\n"));
3064 	pairs = Pairpool_pop(pairs,&firstpair);
3065 	if (pairs == NULL) {
3066 	  return NULL;
3067 	} else {
3068 	  firstpair = pairs->first;
3069 	}
3070       }
3071     }
3072 
3073 #ifdef PMAP
3074     while (pairs != NULL) {
3075       firstpair = pairs->first;
3076       if (firstpair->querypos % 3 == 0) {
3077 	return pairs;
3078       } else {
3079 	debug(printf("PMAP popping querypos %d to get to codon boundary\n",firstpair->querypos));
3080 	pairs = Pairpool_pop(pairs,&firstpair);
3081       }
3082     }
3083 #endif
3084   }
3085 
3086   debug(printf("Ending clean_pairs_end5\n"));
3087   return pairs;
3088 }
3089 
3090 
3091 static List_T
clip_path_end3_chromosomal_bounds(List_T path,Univcoord_T chroffset,Univcoord_T chrhigh)3092 clip_path_end3_chromosomal_bounds (List_T path, Univcoord_T chroffset, Univcoord_T chrhigh) {
3093   Pair_T pair;
3094 
3095   while (path != NULL && ((Pair_T) path->first)->genomepos >= chrhigh - chroffset) {
3096     /* Will clean gaps also, but that is okay */
3097     path = Pairpool_pop(path,&pair);
3098     debug(printf("Clipping a pair with %u\n",pair->genomepos));
3099   }
3100 
3101   return path;
3102 }
3103 
3104 static List_T
clip_pairs_end5_chromosomal_bounds(List_T pairs)3105 clip_pairs_end5_chromosomal_bounds (List_T pairs) {
3106   Pair_T pair;
3107 
3108   /* genomepos of 0 corresponds to chrhigh (probably should change this to be chrhigh - 1 in the future) */
3109   while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < 1) {
3110     /* Will clean gaps also, but that is okay */
3111     pairs = Pairpool_pop(pairs,&pair);
3112     debug(printf("Clipping a pair with %u\n",pair->genomepos));
3113   }
3114 
3115   return pairs;
3116 }
3117 
3118 
3119 /* Called only by GMAP, because nucleotide matches in PMAP have several ambiguous matches. */
3120 static List_T
clean_path_end3_gap_indels(List_T path)3121 clean_path_end3_gap_indels (List_T path) {
3122   Pair_T lastpair;
3123 
3124   debug(printf("Starting clean_path_end3_gap_indels\n"));
3125   debug(Pair_dump_list(path,true));
3126 
3127   /* Remove any remaining gap/indels at 3' end, which can happen rarely */
3128   if (path != NULL) {
3129     lastpair = path->first;
3130     while (lastpair->gapp == true || lastpair->comp == INDEL_COMP || lastpair->comp == SHORTGAP_COMP) {
3131       debug(printf("Removing gap/indel at 3' end: "));
3132       debug(Pair_dump_one(lastpair,/*zerobasedp*/true));
3133       debug(printf("\n"));
3134       path = Pairpool_pop(path,&lastpair);
3135       if (path == NULL) {
3136 	return NULL;
3137       } else {
3138 	lastpair = path->first;
3139       }
3140     }
3141   }
3142 
3143 #ifdef PMAP
3144   while (path != NULL) {
3145     lastpair = path->first;
3146     if (lastpair->querypos % 3 == 2) {
3147       debug(printf("Ending clean_path_end3_gap_indels\n"));
3148       debug(Pair_dump_list(path,true));
3149       return path;
3150     } else {
3151       debug(printf("PMAP popping querypos %d to get to codon boundary\n",lastpair->querypos));
3152       path = Pairpool_pop(path,&lastpair);
3153     }
3154   }
3155 #endif
3156 
3157   debug(printf("Ending clean_path_end3_gap_indels\n"));
3158   debug(Pair_dump_list(path,true));
3159   return path;
3160 }
3161 
3162 
3163 static List_T
clean_pairs_end5_gap_indels(List_T pairs)3164 clean_pairs_end5_gap_indels (List_T pairs) {
3165   Pair_T firstpair;
3166 
3167   debug(printf("Starting clean_pairs_end5_gap_indels\n"));
3168   /* Remove any remaining gap/indels at 5' end, which can happen rarely */
3169   if (pairs != NULL) {
3170     firstpair = pairs->first;
3171     while (firstpair->gapp == true || firstpair->comp == INDEL_COMP || firstpair->comp == SHORTGAP_COMP) {
3172       debug(printf("Removing gap/indel at 5' end: "));
3173       debug(Pair_dump_one(firstpair,/*zerobasedp*/true));
3174       debug(printf("\n"));
3175       pairs = Pairpool_pop(pairs,&firstpair);
3176       if (pairs == NULL) {
3177 	return NULL;
3178       } else {
3179 	firstpair = pairs->first;
3180       }
3181     }
3182   }
3183 
3184 #ifdef PMAP
3185   while (pairs != NULL) {
3186     firstpair = pairs->first;
3187     if (firstpair->querypos % 3 == 0) {
3188       return pairs;
3189     } else {
3190       debug(printf("PMAP popping querypos %d to get to codon boundary\n",firstpair->querypos));
3191       pairs = Pairpool_pop(pairs,&firstpair);
3192     }
3193   }
3194 #endif
3195 
3196   debug(printf("Ending clean_pairs_end5_gap_indels\n"));
3197   return pairs;
3198 }
3199 
3200 
3201 /* Cleans to any gapp found within 20 bp of the breakpoint */
3202 static List_T
clean_end_chimera(List_T end)3203 clean_end_chimera (List_T end) {
3204   Pair_T lastpair;
3205   List_T peeled = NULL;
3206   int n = 0;
3207 
3208   debug10(printf("Starting clean_path_end_chimera\n"));
3209   while (end != NULL && n < 20) {
3210     lastpair = end->first;
3211     peeled = List_transfer_one(peeled,&end);
3212     if (lastpair->gapp == true) {
3213       debug10(printf("Cleaning end at a gapp\n"));
3214       peeled = (List_T) NULL;
3215     } else if (lastpair->comp == INDEL_COMP) {
3216       debug10(printf("Cleaning end at an indel\n"));
3217       peeled = (List_T) NULL;
3218     } else if (lastpair->comp == SHORTGAP_COMP) {
3219       debug10(printf("Cleaning end at an indel\n"));
3220       peeled = (List_T) NULL;
3221     } else {
3222       n++;
3223     }
3224   }
3225 
3226   end = Pairpool_transfer(end,peeled);
3227   debug10(printf("Ending clean_path_end_chimera\n"));
3228   return end;
3229 }
3230 
3231 
#if 0
/* Disabled code.  Computes match scores for PAIRS, asks
   Changepoint_left and Changepoint_right for an edge at each end, and
   chops pairs outside those edges.  If the edges cross
   (right_edge <= left_edge), only one side is chopped; otherwise each
   side is tested separately and chopped only when
   Pbinom(matches, total, theta) is not above TRIM_END_PVALUE, where
   theta is the match rate of the rest of the alignment minus
   THETA_SLACK, floored at 0.10.  Returns the remaining pairs, or NULL
   for empty input. */
static List_T
chop_ends_by_changepoint (List_T pairs
#ifdef WASTE
                          , Pairpool_T pairpool
#endif
                          ) {
  List_T path;
  Pair_T discard;
  int *matchscores;
  int nmatches, ntotal, nmatches_left, ntotal_left, nmatches_right, ntotal_right;
  int left_edge, right_edge, length, k;
  int side;
  double theta;
  bool chop_left_p = false, chop_right_p = false;

  if (pairs == NULL) {
    return (List_T) NULL;
  }
  matchscores = Pair_matchscores_list(&nmatches,&ntotal,&length,pairs);
  debug18(printf("Overall, %d matches/%d total\n",nmatches,ntotal));

  left_edge = Changepoint_left(&nmatches_left,&ntotal_left,matchscores,length);
  right_edge = Changepoint_right(&nmatches_right,&ntotal_right,matchscores,length);

  debug18(printf("At left edge %d (in 0..%d), %d matches/%d total\n",left_edge,List_length(pairs),nmatches_left,ntotal_left));
  debug18(printf("At right edge %d (in 0..%d), %d matches/%d total\n",right_edge,List_length(pairs),nmatches_right,ntotal_right));

  if (right_edge <= left_edge) {
    debug18(printf("Edges cross.  Need to select one.\n"));
    /* Need to select one side to chop. */
    if (ntotal_left == 0 || ntotal - ntotal_left <= 0) {
      side = +1;                /* chop right side */
    } else if (ntotal_right == 0 || ntotal - ntotal_right <= 0) {
      side = -1;                /* chop left side */
    } else {

#if 0
      theta = (double) (nmatches - nmatches_left)/(double) (ntotal - ntotal_left);
      /* Don't have artificially high expectations for theta, e.g., 1.00 */
      theta -= THETA_SLACK;
      if (theta < 0.10) {
        /* Protect against negative values */
        theta = 0.10;
      }
      debug18(printf("Testing on left: Pbinom(%d,%d,%f) = %g\n",
                    nmatches_left,ntotal_left,theta,Pbinom(nmatches_left,ntotal_left,theta)));
      pbinom_left = Pbinom(nmatches_left,ntotal_left,theta);

      theta = (double) (nmatches - nmatches_right)/(double) (ntotal - ntotal_right);
      /* Don't have artificially high expectations for theta, e.g., 1.00 */
      theta -= THETA_SLACK;
      if (theta < 0.10) {
        /* Protect against negative values */
        theta = 0.10;
      }

      debug18(printf("Testing on right: Pbinom(%d,%d,%f) = %g\n",
                    nmatches_right,ntotal_right,theta,Pbinom(nmatches_right,ntotal_right,theta)));
      pbinom_right = Pbinom(nmatches_right,ntotal_right,theta);

      if (pbinom_left < pbinom_right) {
        if (pbinom_left < TRIM_END_PVALUE) {
          side = -1;            /* chop left side */
        } else {
          side = 0;
        }
      } else if (pbinom_right < pbinom_left) {
        if (pbinom_right < TRIM_END_PVALUE) {
          side = +1;            /* chop right side */
        } else {
          side = 0;
        }
      } else {
        side = 0;
      }
#else
      /* Pick shortest side */
      if (ntotal_left < ntotal_right) {
        debug18(printf("left side is shorter\n"));
        side = -1;              /* chop left side */
      } else {
        debug18(printf("right side is shorter\n"));
        side = +1;              /* chop right side */
      }
#endif

    }

    if (side == -1) {
      debug18(printf("Chopping %d on left.\n",left_edge));
      for (k = 0; k < left_edge; k++) {
        pairs = Pairpool_pop(pairs,&discard);
      }
      chop_left_p = true;

    } else if (side == +1) {
      debug18(printf("Chopping %d - %d on right.\n",length,right_edge));
      /* Reverse so the right end is at the head, chop, then restore order */
      path = List_reverse(pairs);
      for (k = 0; k < length - right_edge; k++) {
        path = Pairpool_pop(path,&discard);
      }

      pairs = (List_T) NULL;
#ifdef WASTE
      while (path) {
        path = Pairpool_pop(path,&discard);
        pairs = Pairpool_push_existing(pairs,pairpool,discard);
      }
#else
      pairs = Pairpool_transfer(pairs,path);
#endif
      chop_right_p = true;
    }

  } else {
    /* Left end: leaves the list reversed in path either way */
    if (ntotal_left == 0 || ntotal - ntotal_left <= 0) {
      path = List_reverse(pairs);
    } else {
      theta = (double) (nmatches - nmatches_left)/(double) (ntotal - ntotal_left);
      /* Don't have artificially high expectations for theta, e.g., 1.00 */
      theta -= THETA_SLACK;
      if (theta < 0.10) {
        /* Protect against negative values */
        theta = 0.10;
      }

      debug18(printf("Testing on left: Pbinom(%d,%d,%f) = %g\n",
                    nmatches_left,ntotal_left,theta,Pbinom(nmatches_left,ntotal_left,theta)));
      if (Pbinom(nmatches_left,ntotal_left,theta) > TRIM_END_PVALUE) {
        path = List_reverse(pairs);
      } else {
        debug18(printf("Chopping %d on left.\n",left_edge));
        for (k = 0; k < left_edge; k++) {
          pairs = Pairpool_pop(pairs,&discard);
        }
        path = (List_T) NULL;
#ifdef WASTE
        while (pairs) {
          pairs = Pairpool_pop(pairs,&discard);
          path = Pairpool_push_existing(path,pairpool,discard);
        }
#else
        path = Pairpool_transfer(path,pairs);
#endif
        debug18(printf("path is now length %d\n",List_length(path)));
        chop_left_p = true;
      }
    }

    /* Right end: turns path back into pairs either way */
    if (ntotal_right == 0 || ntotal - ntotal_right <= 0) {
      pairs = List_reverse(path);
    } else {
      theta = (double) (nmatches - nmatches_right)/(double) (ntotal - ntotal_right);
      /* Don't have artificially high expectations for theta, e.g., 1.00 */
      theta -= THETA_SLACK;
      if (theta < 0.10) {
        /* Protect against negative values */
        theta = 0.10;
      }

      debug18(printf("Testing on right: Pbinom(%d,%d,%f) = %g\n",
                    nmatches_right,ntotal_right,theta,Pbinom(nmatches_right,ntotal_right,theta)));
      if (Pbinom(nmatches_right,ntotal_right,theta) > TRIM_END_PVALUE) {
        pairs = List_reverse(path);
      } else {
        debug18(printf("Chopping %d - %d on right.\n",length,right_edge));
        for (k = 0; k < length - right_edge; k++) {
          path = Pairpool_pop(path,&discard);
        }
        pairs = (List_T) NULL;
#ifdef WASTE
        while (path) {
          path = Pairpool_pop(path,&discard);
          pairs = Pairpool_push_existing(pairs,pairpool,discard);
        }
#else
        pairs = Pairpool_transfer(pairs,path);
#endif
        chop_right_p = true;
      }
    }
  }

  FREE(matchscores);

  debug18(printf("Returning alignment of length %d\n",List_length(pairs)));

  return pairs;
}
#endif
3427 
3428 
#if 0
/* pairs -> pairs */
/* Disabled code.  Examines the run of pairs at each end of the
   alignment up to and including the first gap pair, counting the pairs
   that are not gaps, mismatches, or indels.  An end run with at least
   MINENDEXON such pairs is kept.  A run with some but too few is
   dropped and the corresponding *trim5p / *trim3p is set to true.  A
   run with none has one further pair popped (asserted to be a gap) and
   the flag is set to false.  Returns the resulting pairs, or NULL
   (with both flags false) when nothing is left. */
static List_T
trim_short_end_exons (bool *trim5p, bool *trim3p, List_T pairs, Pairpool_T pairpool, int minendexon) {
  List_T path, endexon, cell;
  Pair_T pair;
  int nmatches;

  debug18(printf("Starting trim_short_end5_exons\n"));
  debug18(Pair_dump_list(pairs,true));

  /* Handle first exon */
  if (pairs == NULL) {
    *trim5p = *trim3p = false;
    return (List_T) NULL;
  }
  pair = pairs->first;

  endexon = (List_T) NULL;
  for (nmatches = 0; pairs != NULL && !pair->gapp; ) {
    cell = pairs;
    pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
    endexon = Pairpool_push_existing(endexon,pairpool,pair);
#else
    endexon = List_push_existing(endexon,cell);
#endif
    if (pair->gapp == true || pair->comp == MISMATCH_COMP || pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Not counted */
    } else {
      nmatches++;
    }
  }

  if (nmatches >= minendexon) {
    debug18(printf("Keeping first exon of length %d\n",nmatches));
    path = endexon;             /* exon already has the gap */
    *trim5p = false;
  } else if (nmatches == 0) {
    debug18(printf("Trimming first exon of length %d.  firstpair must be a gap.\n",nmatches));
    pairs = Pairpool_pop(pairs,&pair); /* discard gap */
    assert(pair->gapp == true);
    path = (List_T) NULL;
    *trim5p = false;
  } else {
    debug18(printf("Trimming first exon of length %d\n",nmatches));
    path = (List_T) NULL;
    *trim5p = true;
  }

  /* Move the rest over, which puts the other end at the head of path */
#ifdef WASTE
  while (pairs != NULL) {
    pairs = Pairpool_pop(pairs,&pair);
    path = Pairpool_push_existing(path,pairpool,pair);

  }
#else
  path = Pairpool_transfer(path,pairs);
#endif

  /* Handle last exon */
  if (path == NULL) {
    *trim5p = *trim3p = false;
    return (List_T) NULL;
  }
  pair = path->first;

  endexon = (List_T) NULL;
  for (nmatches = 0; path != NULL && !pair->gapp; ) {
    cell = path;
    path = Pairpool_pop(path,&pair);
#ifdef WASTE
    endexon = Pairpool_push_existing(endexon,pairpool,pair);
#else
    endexon = List_push_existing(endexon,cell);
#endif
    if (pair->gapp == true || pair->comp == MISMATCH_COMP || pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Not counted */
    } else {
      nmatches++;
    }
  }

  if (nmatches >= minendexon) {
    debug18(printf("Keeping last exon of length %d\n",nmatches));
    pairs = endexon;            /* exon already has the gap */
    *trim3p = false;
  } else if (nmatches == 0) {
    debug18(printf("Trimming last exon of length %d.  firstpair must be a gap.\n",nmatches));
    path = Pairpool_pop(path,&pair); /* discard gap */
    assert(pair->gapp == true);
    pairs = (List_T) NULL;
    *trim3p = false;
  } else {
    debug18(printf("Trimming last exon of length %d\n",nmatches));
    pairs = (List_T) NULL;
    *trim3p = true;
  }

  /* Move the rest back so the result is in pairs order again */
#ifdef WASTE
  while (path != NULL) {
    path = Pairpool_pop(path,&pair);
    pairs = Pairpool_push_existing(pairs,pairpool,pair);
  }
#else
  pairs = Pairpool_transfer(pairs,path);
#endif

  debug18(printf("End of trim_short_end_exons: length = %d\n",List_length(pairs)));
  debug18(Pair_dump_list(pairs,true));
  return pairs;
}
#endif
3542 
3543 
3544 #if 0
3545 /* pairs -> path */
3546 static List_T
3547 trim_short_end5_exons (bool *trim5p, List_T pairs,
3548 #ifdef WASTE
3549 		       Pairpool_T pairpool,
3550 #endif
3551 		       int minendexon) {
3552   List_T path, exon, pairptr;
3553   Pair_T pair;
3554   int exon_nmatches, exon_nmismatches;
3555 
3556   debug18(printf("Starting trim_short_end5_exons\n"));
3557   debug18(Pair_dump_list(pairs,true));
3558 
3559   /* Handle first exon */
3560   if (pairs == NULL) {
3561     *trim5p = false;
3562     return (List_T) NULL;
3563   } else {
3564     pair = pairs->first;
3565   }
3566 
3567   exon = (List_T) NULL;
3568   exon_nmatches = exon_nmismatches = 0;
3569   while (pairs != NULL && !pair->gapp) {
3570     pairptr = pairs;
3571     pairs = Pairpool_pop(pairs,&pair);
3572 #ifdef WASTE
3573     exon = Pairpool_push_existing(exon,pairpool,pair);
3574 #else
3575     exon = List_push_existing(exon,pairptr);
3576 #endif
3577     if (pair->gapp == true) {
3578       /* Skip */
3579     } else if (pair->comp == MISMATCH_COMP || pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
3580       exon_nmismatches++;
3581     } else {
3582       exon_nmatches++;
3583     }
3584   }
3585 
3586   if (exon_nmatches - exon_nmismatches >= minendexon) {
3587     debug18(printf("Keeping first exon of length %d\n",exon_nmatches));
3588     path = exon;		/* exon already has the gap */
3589     *trim5p = false;
3590   } else if (exon_nmatches == 0) {
3591     debug18(printf("Trimming first exon of length %d.  firstpair must be a gap.\n",exon_nmatches));
3592     pairs = Pairpool_pop(pairs,&pair); /* discard gap */
3593     path = (List_T) NULL;
3594     *trim5p = false;
3595   } else {
3596     debug18(printf("Trimming first exon of length %d\n",exon_nmatches));
3597     path = (List_T) NULL;
3598     *trim5p = true;
3599   }
3600 
3601 #ifdef WASTE
3602   while (pairs != NULL) {
3603     pairs = Pairpool_pop(pairs,&pair);
3604     path = Pairpool_push_existing(path,pairpool,pair);
3605 
3606   }
3607 #else
3608   path = Pairpool_transfer(path,pairs);
3609 #endif
3610 
3611   debug18(printf("End of trim_short_end_exons: length = %d\n",List_length(pairs)));
3612   debug18(Pair_dump_list(pairs,true));
3613   return path;
3614 }
3615 #endif
3616 
3617 
3618 #if 0
3619 /* path -> pairs */
3620 static List_T
3621 trim_short_end3_exons (bool *trim3p, List_T path,
3622 #ifdef WASTE
3623 		       Pairpool_T pairpool,
3624 #endif
3625 		       int minendexon) {
3626   List_T pairs, exon, pairptr;
3627   Pair_T pair;
3628   int exon_nmatches, exon_nmismatches;
3629 
3630   debug18(printf("Starting trim_short_end3_exons\n"));
3631   debug18(Pair_dump_list(path,true));
3632 
3633   /* Handle last exon */
3634   if (path == NULL) {
3635     *trim3p = false;
3636     return (List_T) NULL;
3637   } else {
3638     pair = path->first;
3639   }
3640 
3641   exon = (List_T) NULL;
3642   exon_nmatches = exon_nmismatches = 0;
3643   while (path != NULL && !pair->gapp) {
3644     pairptr = path;
3645     path = Pairpool_pop(path,&pair);
3646 #ifdef WASTE
3647     exon = Pairpool_push_existing(exon,pairpool,pair);
3648 #else
3649     exon = List_push_existing(exon,pairptr);
3650 #endif
3651     if (pair->gapp == true) {
3652       /* Skip */
3653     } else if (pair->comp == MISMATCH_COMP || pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
3654       exon_nmismatches++;
3655     } else {
3656       exon_nmatches++;
3657     }
3658   }
3659 
3660   if (exon_nmatches - exon_nmismatches >= minendexon) {
3661     debug18(printf("Keeping last exon of length %d\n",exon_nmatches));
3662     pairs = exon;		/* exon already has the gap */
3663     *trim3p = false;
3664   } else if (exon_nmatches == 0) {
3665     debug18(printf("Trimming last exon of length %d.  firstpair must be a gap.\n",exon_nmatches));
3666     path = Pairpool_pop(path,&pair); /* discard gap */
3667     pairs = (List_T) NULL;
3668     *trim3p = false;
3669   } else {
3670     debug18(printf("Trimming last exon of length %d\n",exon_nmatches));
3671     pairs = (List_T) NULL;
3672     *trim3p = true;
3673   }
3674 
3675 #ifdef WASTE
3676   while (path != NULL) {
3677     path = Pairpool_pop(path,&pair);
3678     pairs = Pairpool_push_existing(pairs,pairpool,pair);
3679   }
3680 #else
3681   pairs = Pairpool_transfer(pairs,path);
3682 #endif
3683 
3684   debug18(printf("End of trim_short_end_exons: length = %d\n",List_length(pairs)));
3685   debug18(Pair_dump_list(pairs,true));
3686   return pairs;
3687 }
3688 #endif
3689 
3690 
3691 
3692 #ifdef STRICT
3693 /* We want pair->queryjump == 0 or pair->genomejump == 0 */
3694 static bool
dual_break_p(List_T pairs)3695 dual_break_p (List_T pairs) {
3696   Pair_T pair;
3697 
3698   while (pairs != NULL) {
3699     pair = (Pair_T) pairs->first;
3700     /* This used to fail when we used UNKNOWNJUMP for gaps */
3701     if (pair->gapp == true && (pair->queryjump != 0 && pair->genomejump != 0)) {
3702       return true;
3703     }
3704     pairs = pairs->rest;
3705   }
3706 
3707   return false;
3708 }
3709 #endif
3710 
3711 
3712 /* We want pair->queryjump >= 0 and pair->genomejump >= 0 */
3713 static bool
negative_break_p(List_T pairs)3714 negative_break_p (List_T pairs) {
3715   Pair_T pair;
3716 
3717   while (pairs != NULL) {
3718     pair = (Pair_T) pairs->first;
3719     /* This used to fail when we used UNKNOWNJUMP for gaps */
3720     if (pair->gapp == true && (pair->queryjump < 0 || pair->genomejump < 0)) {
3721       return true;
3722     }
3723     pairs = pairs->rest;
3724   }
3725 
3726   return false;
3727 }
3728 
3729 
3730 
3731 #if 0
3732 /* We want pair->queryjump == 0 or pair->genomejump == 0 */
3733 static int
3734 dual_break_distance_from_end (int *npairs, int *totaljump, List_T pairs) {
3735   Pair_T pair;
3736   int nmatches, nmismatches;
3737 
3738   /* Handle first pair */
3739   if (pairs == NULL) {
3740     *totaljump = 0;
3741     return 0;
3742   } else {
3743     pair = pairs->first;
3744   }
3745 
3746   nmatches = nmismatches = 0;
3747   *npairs = 0;
3748   while (pairs != NULL && (pair->gapp == false || (pair->queryjump == 0 || pair->genomejump == 0))) {
3749     if (pair->gapp == true) {
3750       /* Skip */
3751     } else if (pair->comp == MISMATCH_COMP || pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
3752       nmismatches++;
3753     } else {
3754       nmatches++;
3755     }
3756 
3757     pairs = pairs->rest;
3758     if (pairs != NULL) {
3759       pair = (Pair_T) pairs->first;
3760     }
3761     *npairs += 1;
3762   }
3763 
3764   /* This used to fail when we used UNKNOWNJUMP for gaps */
3765   if (pair->gapp == true && (pair->queryjump != 0 || pair->genomejump != 0)) {
3766     *npairs += 1;		/* trim gap */
3767     *totaljump = DUALBREAK_QUERYJUMP_FACTOR * pair->queryjump;
3768   } else {
3769     *totaljump = 0;
3770   }
3771 
3772   return nmatches - nmismatches;
3773 }
3774 #endif
3775 
3776 
3777 #ifdef STRICT
3778 /* We want pair->queryjump == 0 or pair->genomejump == 0 */
3779 static void
dual_break_count_npairs(int * npairs5,int * npairs3,List_T pairs)3780 dual_break_count_npairs (int *npairs5, int *npairs3, List_T pairs) {
3781   Pair_T pair;
3782   int npairs = 0;
3783 
3784   assert(pairs != NULL);	/* We previously checked for this */
3785   pair = pairs->first;
3786 
3787   npairs = 0;
3788   while (pairs != NULL && (pair->gapp == false || (pair->queryjump == 0 || pair->genomejump == 0))) {
3789     pairs = pairs->rest;
3790     if (pairs != NULL) {
3791       pair = (Pair_T) pairs->first;
3792     }
3793     npairs += 1;
3794   }
3795   *npairs5 = npairs;
3796 
3797   /* Skip the gap */
3798   pairs = pairs->rest;
3799   npairs += 1;
3800 
3801   /* Count the rest */
3802   while (pairs != NULL) {
3803     pairs = pairs->rest;
3804     npairs += 1;
3805   }
3806   *npairs3 = npairs - (*npairs5) - 1 /* for the gap */;
3807 
3808   return;
3809 }
3810 #endif
3811 
3812 
3813 /* We want pair->queryjump >= 0 and pair->genomejump >= 0 */
3814 static void
negative_break_count_npairs(int * npairs5,int * npairs3,List_T pairs)3815 negative_break_count_npairs (int *npairs5, int *npairs3, List_T pairs) {
3816   Pair_T pair;
3817   int npairs = 0;
3818 
3819   assert(pairs != NULL);	/* We previously checked for this */
3820   pair = pairs->first;
3821 
3822   npairs = 0;
3823   while (pairs != NULL && (pair->gapp == false || (pair->queryjump >= 0 && pair->genomejump >= 0))) {
3824     pairs = pairs->rest;
3825     if (pairs != NULL) {
3826       pair = (Pair_T) pairs->first;
3827     }
3828     npairs += 1;
3829   }
3830   *npairs5 = npairs;
3831 
3832   /* Skip the gap */
3833   pairs = pairs->rest;
3834   npairs += 1;
3835 
3836   /* Count the rest */
3837   while (pairs != NULL) {
3838     pairs = pairs->rest;
3839     npairs += 1;
3840   }
3841   *npairs3 = npairs - (*npairs5) - 1 /* for the gap */;
3842 
3843   return;
3844 }
3845 
3846 
3847 static List_T
trim_npairs(List_T pairs,int npairs)3848 trim_npairs (List_T pairs, int npairs) {
3849   int i;
3850   Pair_T pair;
3851 
3852   for (i = 0; i < npairs; i++) {
3853     pairs = Pairpool_pop(pairs,&pair);
3854   }
3855   return pairs;
3856 }
3857 
3858 
3859 
3860 #if 0
/* Returns true when an end exon with the given number of matches is long
   enough for an intron of length genomejump.  In the active branch the
   required number of matches rises from 6 to 10 as genomejump passes 2000,
   8000, 32000 and 100000. */
static bool
enough_matches (int matches, int genomejump
#if 0
                , double donor_prob, double acceptor_prob
#endif
                ) {
#if 1
  int required;

  if (genomejump > 100000) {
    required = 10;
  } else if (genomejump > 32000) {
    required = 9;
  } else if (genomejump > 8000) {
    required = 8;
  } else if (genomejump > 2000) {
    required = 7;
  } else {
    required = 6;
  }

  return (matches >= required) ? true : false;
#else
  double prob, prob_threshold;

  prob = 1 - pow(1.0-pow(4.0,(double) -matches),(double) genomejump);
  debug3(printf("Probability of exon of length %d with intron of length %d is %g\n",
		 matches,genomejump,prob));

#if 0
  prob_threshold = 1.0 - (1.0 - donor_prob)*(1.0 - acceptor_prob);
  debug3(printf("  Comparing with probability of splice %f and %f => %f\n",
		 donor_prob,acceptor_prob,prob_threshold));
#endif

  if (prob < 0.10) {
    return true;
  } else {
    return false;
  }
#endif
}
3899 #endif
3900 
3901 
3902 static bool
canonicalp(bool knowngapp,char comp,double donor_prob,double acceptor_prob,int cdna_direction)3903 canonicalp (bool knowngapp, char comp, double donor_prob, double acceptor_prob, int cdna_direction) {
3904 
3905   if (knowngapp) {
3906     return true;
3907 
3908   } else if (donor_prob < 0.9 || acceptor_prob < 0.9) {
3909     return false;
3910 
3911   } else if (cdna_direction > 0) {
3912     if (comp == FWD_CANONICAL_INTRON_COMP || comp == FWD_GCAG_INTRON_COMP || comp == FWD_ATAC_INTRON_COMP) {
3913       return true;
3914     } else {
3915       return false;
3916     }
3917   } else if (cdna_direction < 0) {
3918     if (comp == REV_CANONICAL_INTRON_COMP || comp == REV_GCAG_INTRON_COMP || comp == REV_ATAC_INTRON_COMP) {
3919       return true;
3920     } else {
3921       return false;
3922     }
3923   } else {
3924 #if 0
3925     /* Too much freedom.  Also, ambig_splicetypes depend on cdna_direction to be known. */
3926     if (comp == FWD_CANONICAL_INTRON_COMP || comp == FWD_GCAG_INTRON_COMP || comp == FWD_ATAC_INTRON_COMP ||
3927 	comp == REV_CANONICAL_INTRON_COMP || comp == REV_GCAG_INTRON_COMP || comp == REV_ATAC_INTRON_COMP) {
3928       return true;
3929     } else {
3930       return false;
3931     }
3932 #else
3933     return false;
3934 #endif
3935   }
3936 
3937 }
3938 
3939 
/* Copied from stage1hr.c.  A slightly different version is used in splice.c */
/* Decides whether the splice-site probabilities are good enough for an end
 * exon, given its support = nmatches - 2*nmismatches.  The thresholds relax
 * as support grows:
 *
 *   support < 0    : never sufficient
 *   0  .. 6        : distal > 0.95 and medial > 0.90
 *   7  .. 10       : distal > 0.90 and medial > 0.85
 *   11 .. 14       : distal > 0.85 and medial > 0.80
 *   15 .. 18       : distal > 0.50 (medial is not consulted)
 *   19 and above   : always sufficient
 *
 * Returns nonzero when sufficient, 0 otherwise. */
static int
sufficient_splice_prob_local (int nmatches, int nmismatches, double distal_spliceprob,
                              double medial_spliceprob) {
  int support;

  debug3(printf("Checking for sufficient splice prob, based on %d matches, %d mismatches, distal spliceprob %f, and medial spliceprob %f\n",
		nmatches,nmismatches,distal_spliceprob,medial_spliceprob));

  support = nmatches - 2*nmismatches;

  if (support < 0) {
    return (int) false;
  }
  if (support >= 19) {
    return (int) true;
  }
  if (support >= 15) {
    return (distal_spliceprob > 0.50 /*&& medial_spliceprob > 0.50*/);
  }
  if (support >= 11) {
    return (distal_spliceprob > 0.85 && medial_spliceprob > 0.80);
  }
  if (support >= 7) {
    return (distal_spliceprob > 0.90 && medial_spliceprob > 0.85);
  }
  return (distal_spliceprob > 0.95 && medial_spliceprob > 0.90);
}
3961 
/* Strict form of the splice-probability test: true only when the distal
   probability exceeds 0.95 and the medial probability exceeds 0.90,
   regardless of how many matches support the exon. */
static bool
sufficient_splice_prob_strict (double distal_spliceprob, double medial_spliceprob) {
  debug3(printf("Checking for sufficient splice prob, based on spliceprob %f, and medial spliceprob %f\n",
		distal_spliceprob,medial_spliceprob));

  return (distal_spliceprob > 0.95 && medial_spliceprob > 0.90) ? true : false;
}
3972 
3973 
3974 
3975 #if 0
3976 static int
3977 exon_length_5 (List_T pairs) {
3978   int exon_length = 0;
3979   List_T p;
3980 
3981   p = pairs;
3982   while (p != NULL && ((Pair_T) p->first)->gapp == false) {
3983     exon_length++;
3984     p = p->rest;
3985   }
3986 
3987   if (p == NULL) {
3988     debug13(printf("no intron found, so exon_length_5 = %d\n",END_SPLICESITE_EXON_LENGTH));
3989     return END_SPLICESITE_EXON_LENGTH;
3990   } else {
3991     debug13(printf("intron found, with exon_length_5 = %d\n",exon_length));
3992     return exon_length;
3993   }
3994 }
3995 #endif
3996 
3997 
3998 #if 0
3999 static int
4000 exon_length_3 (List_T path) {
4001   int exon_length = 0;
4002   List_T p;
4003 
4004   p = path;
4005   while (p != NULL && ((Pair_T) p->first)->gapp == false) {
4006     exon_length++;
4007     p = p->rest;
4008   }
4009 
4010   if (p == NULL) {
4011     debug13(printf("no intron found, so exon_length_3 = %d\n",END_SPLICESITE_EXON_LENGTH));
4012     return END_SPLICESITE_EXON_LENGTH;
4013   } else {
4014     debug13(printf("intron found, with exon_length_3 = %d\n",exon_length));
4015     return exon_length;
4016   }
4017 }
4018 #endif
4019 
4020 
4021 /* Also handles case where novelsplicingp == false */
4022 /* pairs -> pairs */
4023 static List_T
trim_end5_indels(List_T pairs,int ambig_end_length,Dynprog_T dynprog,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,double defect_rate)4024 trim_end5_indels (List_T pairs, int ambig_end_length,
4025 		  Dynprog_T dynprog, Univcoord_T chroffset, Univcoord_T chrhigh,
4026 		  char *queryseq_ptr, char *queryuc_ptr,
4027 		  bool watsonp, int genestrand, bool jump_late_p,
4028 		  Pairpool_T pairpool, double defect_rate) {
4029   List_T path, exon, pairptr, p;
4030   Pair_T pair;
4031   int indel_score;
4032   int nindels;
4033 
4034   int finalscore, continuous_nmatches, continuous_nmismatches, continuous_nopens, continuous_nindels;
4035   int querydp3_medialgap, genomedp3_medialgap, queryjump, genomejump;
4036   List_T continuous_gappairs_medialgap;
4037   int dynprogindex_minor = 0;
4038 
4039   debug3(printf("Starting trim_end5_indels\n"));
4040 
4041   /* Handle first exon */
4042   if (pairs == NULL) {
4043     /* *trim5p = false; */
4044     return (List_T) NULL;
4045   } else if (ambig_end_length > 0) {
4046     /* Don't mess with ambiguous end */
4047     /* *trim5p = false; */
4048     return pairs;
4049   } else {
4050     pair = pairs->first;
4051     debug3(printf("querystart %d\n",pair->querypos));
4052   }
4053 
4054   exon = (List_T) NULL;
4055   while (pairs != NULL && pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP) {
4056     pairptr = pairs;
4057     pairs = Pairpool_pop(pairs,&pair);
4058 #ifdef WASTE
4059     exon = Pairpool_push_existing(exon,pairpool,pair);
4060 #else
4061     exon = List_push_existing(exon,pairptr);
4062 #endif
4063   }
4064 
4065   while (pairs != NULL && pair->gapp == false && (((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == SHORTGAP_COMP)) {
4066     pairptr = pairs;
4067     pairs = Pairpool_pop(pairs,&pair);
4068 #ifdef WASTE
4069     exon = Pairpool_push_existing(exon,pairpool,pair);
4070 #else
4071     exon = List_push_existing(exon,pairptr);
4072 #endif
4073   }
4074   debug3(printf("5' End exon:\n"));
4075   debug3(Pair_dump_list(exon,true));
4076 
4077 
4078   if (exon == NULL) {
4079     /* *trim5p = false; */
4080     return pairs;
4081 
4082   } else {
4083     p = exon;
4084     nindels = 1;
4085     while (p != NULL && (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP)) {
4086       p = List_next(p);
4087       nindels++;
4088     }
4089 
4090     indel_score = 0;
4091     /* Evaluate region distal to indel */
4092     while (p != NULL) {
4093       pair = (Pair_T) List_head(p);
4094       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
4095 	indel_score += 1;
4096       } else {
4097 	indel_score -= 3;
4098       }
4099       debug3(printf("5' querypos %d => indel_score %d\n",pair->querypos,indel_score));
4100       p = List_next(p);
4101     }
4102 
4103 #if 0
4104     for ( i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
4105       medial = (Pair_T) p->first;
4106       if (medial->gapp) {
4107 	debug3(printf("Saw splice medial to 5' end indel\n"));
4108 	nearindelp = true;
4109       } else if (medial->comp == MATCH_COMP || medial->comp == DYNPROG_MATCH_COMP || medial->comp == AMBIGUOUS_COMP) {
4110 	/* Skip */
4111       } else {
4112 	debug3(printf("Saw mismatch %c medial to 5' end indel\n",medial->comp));
4113       }
4114     }
4115 #endif
4116 
4117     if (pairs == NULL) {
4118       debug3(printf("No indel/gap\n"));
4119       path = exon;
4120       /* *trim5p = false; */
4121 
4122     } else if (exon == NULL) {
4123       debug3(printf("No 5' exon\n"));
4124       path = exon;
4125       /* *trim5p = false; */
4126 
4127 #if 0
4128     } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
4129       debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
4130       path = (List_T) NULL;
4131       /* *trim5p = true; */
4132 #endif
4133 
4134     } else if (((Pair_T) pairs->first)->gapp == true) {
4135       debug3(printf("Peeled all the way to a gap, so not handling with this procedure\n"));
4136       path = exon;
4137 
4138     } else {
4139       querydp3_medialgap = ((Pair_T) pairs->first)->querypos - 1;
4140       genomedp3_medialgap = ((Pair_T) pairs->first)->genomepos - 1;
4141       queryjump = querydp3_medialgap + 1;
4142       genomejump = queryjump /*+ extramaterial_end*/;
4143 
4144       continuous_gappairs_medialgap = Dynprog_end5_gap(&dynprogindex_minor,&finalscore,
4145 						       &continuous_nmatches,&continuous_nmismatches,&continuous_nopens,&continuous_nindels,
4146 						       dynprog,&(queryseq_ptr[querydp3_medialgap]),&(queryuc_ptr[querydp3_medialgap]),
4147 						       queryjump,genomejump,querydp3_medialgap,genomedp3_medialgap,
4148 						       chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
4149 						       extraband_end,defect_rate,/*endalign*/QUERYEND_NOGAPS,/*require_pos_score_p*/true);
4150       debug(printf("CONTINUOUS AT 5 (trim_end5_indels)?\n"));
4151       debug(printf("CONTINUOUS_GAPPAIRS_MEDIALGAP:\n"));
4152       debug(Pair_dump_list(continuous_gappairs_medialgap,true));
4153       debug3(printf("continuous finalscore %d\n",finalscore));
4154 
4155       if (finalscore > 0) {
4156 	debug3(printf("Using continuous\n"));
4157         path = continuous_gappairs_medialgap;
4158 	/* *trim5p = false; */
4159 
4160       } else if (indel_score < 0) {
4161 	debug3(printf("Not enough matches, so trimming it\n"));
4162 	path = (List_T) NULL;
4163 	/* *trim5p = true; */
4164 
4165       } else {
4166 	debug3(printf("Using indel, because indel_score %d > 0\n",indel_score));
4167 	path = exon;		/* exon already has the indel */
4168 	/* *trim5p = false; */
4169       }
4170     }
4171 
4172     path = Pairpool_transfer(path,pairs);
4173 
4174     pairs = List_reverse(path);
4175     pairs = clean_pairs_end5(pairs,ambig_end_length);
4176 
4177     debug3(printf("End of trim_end5_indels: length = %d\n",List_length(pairs)));
4178     debug3(Pair_dump_list(pairs,true));
4179     return pairs;
4180   }
4181 }
4182 
4183 
4184 /* Also handles case where novelsplicingp == false */
4185 /* pairs -> pairs */
4186 static List_T
trim_end5_exons(bool * indelp,bool * trim5p,int ambig_end_length,List_T pairs,Dynprog_T dynprog,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,double defect_rate)4187 trim_end5_exons (bool *indelp, bool *trim5p, int ambig_end_length, List_T pairs,
4188 		 Dynprog_T dynprog, Univcoord_T chroffset, Univcoord_T chrhigh,
4189 		 char *queryseq_ptr, char *queryuc_ptr, int querylength,
4190 		 int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
4191 		 Pairpool_T pairpool, double defect_rate) {
4192   List_T path, exon, pairptr, p;
4193   Pair_T pair, splice = NULL, gappair;
4194   int max_nmatches, max_nmismatches, nmatches, nmismatches;
4195   int max_score, score;
4196   /* bool nearindelp = false; */
4197   double medial_prob;
4198   int peelback, last_indel;
4199 
4200   int finalscore, continuous_nmatches, continuous_nmismatches, continuous_nopens, continuous_nindels;
4201   int querydp3_medialgap, genomedp3_medialgap, queryjump, genomejump;
4202   List_T continuous_gappairs_medialgap;
4203   int dynprogindex_minor = 0;
4204 
4205 
4206   debug3(printf("Starting trim_end5_exons with ambig_end_length %d\n",ambig_end_length));
4207   debug3(Pair_dump_list(pairs,true));
4208 
4209   *indelp = false;
4210 
4211   /* Handle first exon */
4212   if (pairs == NULL) {
4213     *trim5p = false;
4214     debug3(printf("Ending trim_end5_exons because pairs is NULL\n"));
4215     return (List_T) NULL;
4216 
4217   } else if (0 && ambig_end_length > 0) {
4218     /* Previously didn't mess with ambiguous end, but this can lead to bad ends */
4219     *trim5p = false;
4220     debug3(printf("Ending trim_end5_exons because of ambiguous end\n"));
4221     return pairs;
4222 
4223   } else {
4224     pair = pairs->first;
4225     debug3(printf("querystart %d\n",pair->querypos));
4226     /* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
4227 #if 0
4228     if (pair->querypos <= ambig_end_length) {
4229       nmismatches = -1;
4230     } else {
4231       nmismatches = (pair->querypos - ambig_end_length) - 1;
4232     }
4233 #endif
4234   }
4235 
4236   exon = (List_T) NULL;
4237   while (pairs != NULL && !pair->gapp /*&& pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP */) {
4238     pairptr = pairs;
4239     pairs = Pairpool_pop(pairs,&pair);
4240 #ifdef WASTE
4241     exon = Pairpool_push_existing(exon,pairpool,pair);
4242 #else
4243     exon = List_push_existing(exon,pairptr);
4244 #endif
4245   }
4246 
4247 
4248   if (exon == NULL) {
4249     *trim5p = false;
4250     return pairs;
4251 
4252   } else if (pair->gapp == false) {
4253     /* No intron */
4254     gappair = (Pair_T) NULL;
4255 
4256   } else {
4257     /* Look for an indel on the medial side */
4258     gappair = (Pair_T) List_head(exon);
4259 
4260     p = pairs;
4261     peelback = 0;
4262     last_indel = 0;
4263     while (p != NULL && peelback++ < 6) {
4264       if (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP || ((Pair_T) p->first)->gapp) {
4265 	last_indel = peelback;
4266       }
4267       p = p->rest;
4268     }
4269     if (last_indel == 6) {
4270       while (p != NULL && (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP || ((Pair_T) p->first)->gapp)) {
4271 	last_indel = peelback++;
4272 	p = p->rest;
4273       }
4274     }
4275 
4276     if (last_indel > 0) {
4277       peelback = 0;
4278       while (pairs != NULL && peelback++ < last_indel) {
4279 	debug3(printf("  Trimming indel on medial side\n"));
4280 	pairptr = pairs;
4281 	pairs = Pairpool_pop(pairs,&pair);
4282 #ifdef WASTE
4283 	exon = Pairpool_push_existing(exon,pairpool,pair);
4284 #else
4285 	exon = List_push_existing(exon,pairptr);
4286 #endif
4287       }
4288     }
4289   }
4290 
4291   debug3(printf("End exon:\n"));
4292   debug3(Pair_dump_list(exon,true));
4293 
4294 
4295   max_nmatches = max_nmismatches = 0;
4296   nmatches = nmismatches = 0;
4297   max_score = score = 0;
4298   for (p = exon; p != NULL; p = List_next(p)) {
4299     pair = (Pair_T) List_head(p);
4300     if (pair->gapp == true) {
4301       /* Skip the intron gap */
4302     } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
4303       score += 1;
4304       nmatches += 1;
4305     } else {
4306       score -= 3;
4307       nmismatches += 1;
4308     }
4309     if (score > max_score) {
4310       max_score = score;
4311       max_nmatches = nmatches;
4312       max_nmismatches = nmismatches;
4313     }
4314     debug3(printf("5' querypos %d => score %d, max_nmatches %d, max_nmismatches %d\n",
4315 		  pair->querypos,score,max_nmatches,max_nmismatches));
4316   }
4317 
4318 
4319 #if 0
4320   for (p = pairs, i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
4321     medial = (Pair_T) p->first;
4322     if (medial->comp == MATCH_COMP || medial->comp == DYNPROG_MATCH_COMP || medial->comp == AMBIGUOUS_COMP) {
4323       /* Skip */
4324     } else if (medial->comp == INDEL_COMP || medial->comp == SHORTGAP_COMP) {
4325       debug3(printf("Saw indel medial to 5' end intron\n"));
4326       nearindelp = true;
4327     } else {
4328       debug3(printf("Saw mismatch %c medial to 5' end intron\n",medial->comp));
4329     }
4330   }
4331 #endif
4332 
4333   debug3(printf("Before end intron, nmatches %d, nmismatches %d\n",max_nmatches,max_nmismatches));
4334   if (pairs == NULL) {
4335     debug3(printf("No indel/gap\n"));
4336     path = exon;
4337     *trim5p = false;
4338 
4339   } else if (exon == NULL) {
4340     debug3(printf("No 5' exon\n"));
4341     path = exon;
4342     *trim5p = false;
4343 
4344   } else if (gappair == NULL) {
4345     debug3(printf("No gappair\n"));
4346     path = exon;
4347     *trim5p = false;
4348 
4349 #if 0
4350   } else if (exon->rest != NULL && ((Pair_T) exon->rest->first)->disallowedp == true) {
4351       debug3(printf("Intron is disallowed, so trimming it\n"));
4352       path = (List_T) NULL;
4353       *trim5p = true;
4354 #endif
4355 
4356 #if 0
4357   } else if (List_length(exon) - 1 > List_length(pairs)) {
4358     /* Subtract 1 because gap is included in exon */
4359     debug3(printf("Exon is more than halfway across %d - 1 > %d, so keeping it\n",List_length(exon),List_length(pairs)));
4360     path = exon;		/* exon already has the gap */
4361     *trim5p = false;
4362 #endif
4363 
4364 #if 0
4365   } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
4366     debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
4367     path = (List_T) NULL;
4368     *trim5p = true;
4369 #endif
4370 
4371   } else {
4372     splice = gappair;
4373     if (nmatches < END_SUFFICIENT_EXONLENGTH &&
4374 	nmatches < (int) (END_SUFFICIENT_EXONLENGTH_PCT * (double) querylength) &&
4375 	splice->genomejump > maxintronlen_ends) {
4376       debug3(printf("End intron is too long, so trimming it\n"));
4377       debug3(printf("Calculation: nmatches %d < END_SUFFICIENT_EXONLENGTH %d\n",
4378 		    nmatches,END_SUFFICIENT_EXONLENGTH));
4379       debug3(printf("Calculation: nmatches %d < (END_SUFFICIENT_EXONLENGTH_PCT %f * querylength %d)\n",
4380 		    nmatches,END_SUFFICIENT_EXONLENGTH_PCT,querylength));
4381       debug3(printf("Calculation: genomejump %d > maxintronlen_ends %d\n",
4382 		    splice->genomejump,maxintronlen_ends));
4383       path = (List_T) NULL;
4384       *trim5p = true;
4385 
4386     } else if (nmatches < minendexon) {
4387       debug3(printf("End exon is too short, so trimming it\n"));
4388       path = (List_T) NULL;
4389       *trim5p = true;
4390 
4391     } else if (splice->knowngapp == true && max_nmismatches == 0) {
4392       debug3(printf("Intron is known and no mismatches, so keeping it\n"));
4393       path = exon;		/* exon already has the gap */
4394       *trim5p = false;
4395 
4396 #if 0
4397     } else if (splice->genomejump > maxintronlen) {
4398       debug3(printf("Intron length %d is too long, so trimming it\n",splice->genomejump));
4399       path = (List_T) NULL;
4400       *trim5p = true;
4401 #endif
4402 
4403 #if 0
4404     } else if (enough_matches(nmatches-nmismatches,splice->genomejump/*,splice->donor_prob,splice->acceptor_prob*/) == false) {
4405       debug3(printf("nmatches %d - nmismatches %d not enough for genomejump %d, so trimming it\n",
4406 		    nmatches,nmismatches,splice->genomejump));
4407       path = (List_T) NULL;
4408       *trim5p = true;
4409 #endif
4410 
4411 #if 0
4412     } else if (max_score < 12) {
4413       /* This eliminates ambig end information */
4414       debug3(printf("max_score %d < 12, so trimming it\n",max_score));
4415       path = (List_T) NULL;
4416       *trim5p = true;
4417 #endif
4418 
4419     } else if (sufficient_splice_prob_strict(/*distal_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob,
4420 					    /*medial_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob)) {
4421       debug3(printf("Keeping first 5' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
4422       path = exon;		/* exon already has the gap */
4423       *trim5p = false;
4424 
4425     } else {
4426       querydp3_medialgap = ((Pair_T) pairs->first)->querypos - 1;
4427       genomedp3_medialgap = ((Pair_T) pairs->first)->genomepos - 1;
4428       queryjump = querydp3_medialgap + 1;
4429       genomejump = queryjump + extramaterial_end;
4430       debug3(printf("head of pairs is "));
4431       debug3(Pair_dump_one((Pair_T) pairs->first,true));
4432       debug3(printf("\nmedial end starts at query %d, genome %d\n",querydp3_medialgap,genomedp3_medialgap));
4433 
4434 
4435       /* Set require_pos_score_p to be false for debugging purposes, but we don't use the result anyway when finalscore is <= 0 */
4436       continuous_gappairs_medialgap = Dynprog_end5_gap(&dynprogindex_minor,&finalscore,
4437 						       &continuous_nmatches,&continuous_nmismatches,&continuous_nopens,&continuous_nindels,
4438 						       dynprog,&(queryseq_ptr[querydp3_medialgap]),&(queryuc_ptr[querydp3_medialgap]),
4439 						       queryjump,genomejump,querydp3_medialgap,genomedp3_medialgap,
4440 						       chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
4441 						       extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS,/*require_pos_score_p*/false);
4442       debug(printf("CONTINUOUS AT 5 (trim_end5_exons)?\n"));
4443       debug(printf("CONTINUOUS_GAPPAIRS_MEDIALGAP:\n"));
4444       debug(Pair_dump_list(continuous_gappairs_medialgap,true));
4445       debug3(printf("continuous finalscore %d\n",finalscore));
4446 
4447       if (finalscore > 0) {
4448 	debug3(printf("Using continuous\n"));
4449         path = continuous_gappairs_medialgap;
4450 
4451 	if (continuous_nindels > 0) {
4452 	  *trim5p = true;	/* So calling procedure iterates */
4453 	  *indelp = true;	/* So calling procedure will call trim_end5_indels */
4454 	} else {
4455 	  *trim5p = false;
4456 	}
4457 
4458       } else if (sufficient_splice_prob_local(max_nmatches,max_nmismatches,
4459 	                                      /*distal_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob,
4460 					      /*medial_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob)) {
4461         /* Want to keep for comparison of fwd and rev, even if probabilities are poor */
4462         debug3(printf("Keeping first 5' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
4463 	path = exon;		/* exon already has the gap */
4464 	*trim5p = false;
4465 
4466       } else {
4467 	/* TODO: Set ambig_end_length_5 here, so default output shows a donor or acceptor end type */
4468 	debug3(printf("Fall through (bad probabilities %f and %f): trimming noncanonical 5' exon\n",splice->donor_prob,splice->acceptor_prob));
4469 
4470 	medial_prob = (cdna_direction >= 0) ? splice->acceptor_prob : splice->donor_prob;
4471 	if (canonicalp(splice->knowngapp,splice->comp,splice->donor_prob,splice->acceptor_prob,cdna_direction) == true &&
4472 	    medial_prob > 0.95) {
4473 	  *trim5p = false;		/* Not really, since we are trimming, but this stops further work */
4474 	} else {
4475 	  *trim5p = true;
4476 	}
4477 	path = (List_T) NULL;
4478       }
4479     }
4480   }
4481 
4482 #ifdef WASTE
4483   while (pairs != NULL) {
4484     pairs = Pairpool_pop(pairs,&pair);
4485     path = Pairpool_push_existing(path,pairpool,pair);
4486 
4487   }
4488 #else
4489   path = Pairpool_transfer(path,pairs);
4490 #endif
4491 
4492   pairs = List_reverse(path);
4493   pairs = clean_pairs_end5(pairs,ambig_end_length);
4494 
4495   debug3(printf("End of trim_end5_exons: length = %d\n",List_length(pairs)));
4496   debug3(Pair_dump_list(pairs,true));
4497 
4498 #ifdef CHECK_ASSERTIONS
4499   Pair_check_list_pairs(pairs);
4500 #endif
4501 
4502   return pairs;
4503 }
4504 
4505 
4506 /* Also handles case where novelsplicingp == false */
4507 /* path -> path */
4508 static List_T
trim_end3_indels(List_T path,int ambig_end_length,Dynprog_T dynprog,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,double defect_rate)4509 trim_end3_indels (List_T path, int ambig_end_length,
4510 		  Dynprog_T dynprog, Univcoord_T chroffset, Univcoord_T chrhigh,
4511 		  char *queryseq_ptr, char *queryuc_ptr, int querylength,
4512 		  bool watsonp, int genestrand,bool jump_late_p,
4513 		  Pairpool_T pairpool, double defect_rate) {
4514   List_T pairs, exon, pairptr, p;
4515   Pair_T pair;
4516   int indel_score;
4517   int nindels;
4518 
4519   int finalscore, continuous_nmatches, continuous_nmismatches, continuous_nopens, continuous_nindels;
4520   int querydp5_medialgap, genomedp5_medialgap, queryjump, genomejump;
4521   List_T continuous_gappairs_medialgap;
4522   int dynprogindex_minor = 0;
4523 
4524   debug3(printf("Starting trim_end3_indels\n"));
4525 
4526   /* Handle last exon */
4527   if (path == NULL) {
4528     /* *trim3p = false; */
4529     return (List_T) NULL;
4530   } else if (ambig_end_length > 0) {
4531     /* Don't mess with ambiguous end */
4532     /* *trim3p = false; */
4533     return path;
4534   } else {
4535     pair = path->first;
4536     debug3(printf("queryend %d\n",pair->querypos));
4537   }
4538 
4539   exon = (List_T) NULL;
4540   while (path != NULL && pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP) {
4541     pairptr = path;
4542     path = Pairpool_pop(path,&pair);
4543 #ifdef WASTE
4544     exon = Pairpool_push_existing(exon,pairpool,pair);
4545 #else
4546     exon = List_push_existing(exon,pairptr);
4547 #endif
4548   }
4549 
4550   while (path != NULL && pair->gapp == false && (((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
4551     pairptr = path;
4552     path = Pairpool_pop(path,&pair);
4553 #ifdef WASTE
4554     exon = Pairpool_push_existing(exon,pairpool,pair);
4555 #else
4556     exon = List_push_existing(exon,pairptr);
4557 #endif
4558   }
4559   debug3(printf("3' End exon:\n"));
4560   debug3(Pair_dump_list(exon,true));
4561 
4562 
4563   if (exon == NULL) {
4564     /* *trim3p = false; */
4565     return path;
4566 
4567   } else {
4568     p = exon;
4569     nindels = 1;
4570     while (p != NULL && (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP)) {
4571       p = List_next(p);
4572       nindels++;
4573     }
4574 
4575     indel_score = 0;
4576     /* Evaluate region distal to indel */
4577     while (p != NULL) {
4578       pair = (Pair_T) List_head(p);
4579       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
4580 	indel_score += 1;
4581       } else {
4582 	indel_score -= 3;
4583       }
4584       debug3(printf("3' querypos %d => indel_score %d\n",pair->querypos,indel_score));
4585       p = List_next(p);
4586     }
4587 
4588 #if 0
4589     for ( i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
4590       medial = (Pair_T) p->first;
4591       if (medial->gapp) {
4592 	debug3(printf("Saw splice medial to 3' end indeln"));
4593 	nearindelp = true;
4594       } else if (medial->comp == MATCH_COMP || medial->comp == DYNPROG_MATCH_COMP || medial->comp == AMBIGUOUS_COMP) {
4595 	/* Skip */
4596       } else {
4597 	debug3(printf("Saw mismatch medial %c to 3' end indel\n",medial->comp));
4598       }
4599     }
4600 #endif
4601 
4602     if (path == NULL) {
4603       debug3(printf("No indel/gap\n"));
4604       pairs = exon;
4605       /* *trim3p = false; */
4606 
4607     } else if (exon == NULL) {
4608       debug3(printf("No 3' exon\n"));
4609       pairs = exon;
4610       /* *trim3p = false; */
4611 
4612 #if 0
4613     } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
4614       debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
4615       pairs = (List_T) NULL;
4616       /* *trim3p = true; */
4617 #endif
4618 
4619     } else if (((Pair_T) path->first)->gapp == true) {
4620       debug3(printf("Peeled all the way to a gap, so not handling with this procedure\n"));
4621       pairs = exon;
4622 
4623     } else {
4624       querydp5_medialgap = ((Pair_T) path->first)->querypos + 1;
4625       genomedp5_medialgap = ((Pair_T) path->first)->genomepos + 1;
4626       queryjump = querylength - querydp5_medialgap;
4627       genomejump = queryjump /*+ extramaterial_end*/;
4628 
4629       continuous_gappairs_medialgap = Dynprog_end3_gap(&dynprogindex_minor,&finalscore,
4630 						       &continuous_nmatches,&continuous_nmismatches,&continuous_nopens,&continuous_nindels,
4631 						       dynprog,&(queryseq_ptr[querydp5_medialgap]),&(queryuc_ptr[querydp5_medialgap]),
4632 						       /*rlength*/queryjump,/*glength*/genomejump,
4633 						       /*roffset*/querydp5_medialgap,/*goffset*/genomedp5_medialgap,
4634 						       chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
4635 						       extraband_end,defect_rate,/*endalign*/QUERYEND_NOGAPS,/*require_pos_score_p*/true);
4636       debug(printf("CONTINUOUS AT 3 (trim_end3_indels)?\n"));
4637       debug(printf("CONTINUOUS_GAPPAIRS_MEDIALGAP:\n"));
4638       debug(Pair_dump_list(continuous_gappairs_medialgap,true));
4639       debug3(printf("continuous finalscore %d\n",finalscore));
4640 
4641       if (finalscore > 0) {
4642 	debug3(printf("Using continuous\n"));
4643 	pairs = List_reverse(continuous_gappairs_medialgap);
4644 	/* *trim3p = false; */
4645 
4646       } else if (indel_score < 0) {
4647 	debug3(printf("Not enough matches, so trimming it\n"));
4648 	pairs = (List_T) NULL;
4649 	/* *trim3p = true; */
4650 
4651       } else {
4652 	debug3(printf("Using indel, because indel_score %d > 0\n",indel_score));
4653 	pairs = exon;
4654 	/* *trim3p = false; */
4655       }
4656     }
4657 
4658     pairs = Pairpool_transfer(pairs,path);
4659 
4660     path = List_reverse(pairs);
4661     path = clean_path_end3(path,ambig_end_length);
4662 
4663     debug3(printf("End of trim_end3_indels: length = %d\n",List_length(path)));
4664     debug3(Pair_dump_list(path,true));
4665     return path;
4666   }
4667 }
4668 
4669 
4670 /* Also handles case where novelsplicingp == false */
4671 /* path -> path */
/* Examines the terminal exon at the head of path (the debug output
   labels path->first as "queryend") and decides whether to keep it,
   replace it with a continuous extension, or trim it.

   Outputs:
     *indelp  cleared on entry; set to true only when the continuous
              extension from Dynprog_end3_gap is accepted and reports
              continuous_nindels > 0.
     *trim3p  false when the exon is kept or there is nothing to do;
              true when the exon is trimmed or when *indelp is set
              (per the in-line comments, so the caller iterates).  One
              trimming case deliberately reports false; see the
              fall-through branch.
   Returns the resulting path, after List_reverse and clean_path_end3.

   maxintronlen_ends, minendexon, extramaterial_end and extraband_end
   are not declared in this function; they come from file scope. */
static List_T
trim_end3_exons (bool *indelp, bool *trim3p, int ambig_end_length, List_T path,
		 Dynprog_T dynprog, Univcoord_T chroffset, Univcoord_T chrhigh,
		 char *queryseq_ptr, char *queryuc_ptr, int querylength,
		 int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
		 Pairpool_T pairpool, double defect_rate) {
  List_T pairs, exon, pairptr, p;
  Pair_T pair, splice = NULL, gappair;
  int max_nmatches, max_nmismatches, nmatches, nmismatches;
  int max_score, score;
  /* bool nearindelp = false; */
  double medial_prob;
  int peelback, last_indel;

  int finalscore, continuous_nmatches, continuous_nmismatches, continuous_nopens, continuous_nindels;
  int querydp5_medialgap, genomedp5_medialgap, queryjump, genomejump;
  List_T continuous_gappairs_medialgap;
  int dynprogindex_minor = 0;

  debug3(printf("Starting trim_end3_exons with ambig_end_length %d\n",ambig_end_length));

  *indelp = false;

  /* Handle last exon */
  if (path == NULL) {
    *trim3p = false;
    debug3(printf("Ending trim_end3_exons because path is NULL\n"));
    return (List_T) NULL;

  } else if (0 && ambig_end_length > 0) {
    /* This branch is disabled by the constant 0 in its condition */
    /* Previously didn't mess with ambiguous end, but this can lead to bad ends */
    *trim3p = false;
    debug3(printf("Ending trim_end3_exons because of ambiguous end\n"));
    return path;

  } else {
    pair = path->first;
    debug3(printf("queryend %d\n",pair->querypos));
#if 0
    /* Normally expect pair->querypos to be 0, and want to start with -1 because of the gap */
    if (pair->querypos >= (querylength - 1) - ambig_end_length) {
      nmismatches = -1;
    } else {
      nmismatches = (querylength - 1) - ambig_end_length - pair->querypos - 1;
    }
#endif
  }

  /* Move cells from path onto exon.  The condition tests the pair
     popped on the previous iteration, so the loop stops only after a
     gap pair has itself been moved onto exon (or path is exhausted). */
  exon = (List_T) NULL;
  while (path != NULL && !pair->gapp) {
    pairptr = path;
    path = Pairpool_pop(path,&pair);
#ifdef WASTE
    exon = Pairpool_push_existing(exon,pairpool,pair);
#else
    exon = List_push_existing(exon,pairptr);
#endif
  }


  if (exon == NULL) {
    /* Head of path was already a gap, so nothing was peeled */
    *trim3p = false;
    return path;

  } else if (pair->gapp == false) {
    /* No intron */
    gappair = (Pair_T) NULL;

  } else {
    /* Look for an indel on the medial side */
    /* The head of exon is the gap pair that ended the loop above */
    gappair = (Pair_T) List_head(exon);

    /* Scan up to 6 cells of the remaining path; last_indel records the
       1-based depth of the deepest indel, short gap or gap seen */
    p = path;
    peelback = 0;
    last_indel = 0;
    while (p != NULL && peelback++ < 6) {
      if (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP || ((Pair_T) p->first)->gapp) {
	last_indel = peelback;
      }
      p = p->rest;
    }
    /* If the 6th cell qualified, keep extending through the contiguous
       run of such cells that follows it */
    if (last_indel == 6) {
      while (p != NULL && (((Pair_T) p->first)->comp == INDEL_COMP || ((Pair_T) p->first)->comp == SHORTGAP_COMP || ((Pair_T) p->first)->gapp)) {
	last_indel = peelback++;
	p = p->rest;
      }
    }

    /* Move everything up to and including that deepest cell onto exon */
    if (last_indel > 0) {
      peelback = 0;
      while (path != NULL && peelback++ < last_indel) {
	debug3(printf("  Trimming indel on medial side\n"));
	pairptr = path;
	path = Pairpool_pop(path,&pair);
#ifdef WASTE
	exon = Pairpool_push_existing(exon,pairpool,pair);
#else
	exon = List_push_existing(exon,pairptr);
#endif
      }
    }
  }

  debug3(printf("End exon:\n"));
  debug3(Pair_dump_list(exon,true));


  /* Score exon from its head: +1 per match-type pair, -3 per other
     non-gap pair.  max_* capture the counts at the point where the
     running score peaked; nmatches and nmismatches end up as totals
     over the whole of exon. */
  max_nmatches = max_nmismatches = 0;
  nmatches = nmismatches = 0;
  max_score = score = 0;
  for (p = exon; p != NULL; p = List_next(p)) {
    pair = (Pair_T) List_head(p);
    if (pair->gapp == true) {
      /* Skip the intron gap */
    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
      score += 1;
      nmatches += 1;
    } else {
      score -= 3;
      nmismatches += 1;
    }
    if (score > max_score) {
      max_score = score;
      max_nmatches = nmatches;
      max_nmismatches = nmismatches;
    }
    debug3(printf("3' querypos %d => score %d, max_nmatches %d, max_nmismatches %d\n",
	          pair->querypos,score,max_nmatches,max_nmismatches));
  }


#if 0
  for (p = path, i = 0; p != NULL && i < NEARBY_INDEL; p = List_next(p), i++) {
    medial = (Pair_T) p->first;
    if (medial->comp == MATCH_COMP || medial->comp == DYNPROG_MATCH_COMP || medial->comp == AMBIGUOUS_COMP) {
      /* Skip */
    } else if (medial->comp == INDEL_COMP || medial->comp == SHORTGAP_COMP) {
      debug3(printf("Saw indel medial to 3' end intron\n"));
      nearindelp = true;
    } else {
      debug3(printf("Saw mismatch medial %c to 3' end intron\n",medial->comp));
    }
  }
#endif

  /* Decide what replaces the peeled cells: pairs == exon keeps them,
     pairs == NULL trims them */
  debug3(printf("Before end intron, nmatches %d, nmismatches %d\n",max_nmatches,max_nmismatches));
  if (path == NULL) {
    debug3(printf("No gap\n"));
    pairs = exon;
    *trim3p = false;

  } else if (exon == NULL) {
    /* NOTE(review): exon == NULL already returned above, so this branch looks unreachable */
    debug3(printf("No 3' exon\n"));
    pairs = exon;
    *trim3p = false;

  } else if (gappair == NULL) {
    debug3(printf("No gappair\n"));
    pairs = exon;
    *trim3p = false;

#if 0
  } else if (exon->rest != NULL && ((Pair_T) exon->rest->first)->disallowedp == true) {
    debug3(printf("Intron is disallowed, so trimming it\n"));
    pairs = (List_T) NULL;
    *trim3p = true;
#endif

#if 0
  } else if (List_length(exon) - 1 > List_length(path)) {
    /* Subtract 1 because gap is included in exon */
    debug3(printf("Exon is more than halfway across %d - 1 > %d, so keeping it\n",List_length(exon),List_length(path)));
    pairs = exon;		/* exon already has the gap */
    *trim3p = false;
#endif

#if 0
  } else if (nearindelp == true && max_nmatches < INDEL_SPLICE_ENDLENGTH) {
    debug3(printf("near indel with nmatches %d too low, so trimming it\n",max_nmatches));
    pairs = (List_T) NULL;
    *trim3p = true;
#endif

  } else {
    splice = gappair;
    /* The length tests below use nmatches (total over exon), not max_nmatches */
    if (nmatches < END_SUFFICIENT_EXONLENGTH &&
	nmatches < (int) (END_SUFFICIENT_EXONLENGTH_PCT * (double) querylength) &&
	splice->genomejump > maxintronlen_ends) {
      debug3(printf("End intron is too long, so trimming it\n"));
      debug3(printf("Calculation: nmatches %d < END_SUFFICIENT_EXONLENGTH %d\n",
		    nmatches,END_SUFFICIENT_EXONLENGTH));
      debug3(printf("Calculation: nmatches %d < (END_SUFFICIENT_EXONLENGTH_PCT %f * querylength %d)\n",
		    nmatches,END_SUFFICIENT_EXONLENGTH_PCT,querylength));
      debug3(printf("Calculation: genomejump %d > maxintronlen_ends %d\n",
		    splice->genomejump,maxintronlen_ends));
      pairs = (List_T) NULL;
      *trim3p = true;

    } else if (nmatches < minendexon) {
      debug3(printf("End exon is too short, so trimming it\n"));
      pairs = (List_T) NULL;
      *trim3p = true;

    } else if (splice->knowngapp == true && max_nmismatches == 0) {
      debug3(printf("Intron is known and no mismatches, so keeping it\n"));
      pairs = exon;		/* exon already has the gap */
      *trim3p = false;

#if 0
    } else if (splice->genomejump > maxintronlen) {
      debug3(printf("Intron length %d is too long, so trimming it\n",pair->genomejump));
      pairs = (List_T) NULL;
      *trim3p = true;
#endif

#if 0
    } else if (enough_matches(nmatches-nmismatches,splice->genomejump/*,splice->donor_prob,splice->acceptor_prob*/) == false) {
      debug3(printf("nmatches %d - nmismatches %d not enough for genomejump %d, so trimming it\n",
		    nmatches,nmismatches,splice->genomejump));
      pairs = (List_T) NULL;
      *trim3p = true;
#endif

#if 0
    } else if (max_score < 12) {
      /* This eliminates ambig end information */
      debug3(printf("max_score %d < 12, so trimming it\n",max_score));
      pairs = (List_T) NULL;
      *trim3p = true;
#endif

    /* Which of donor_prob/acceptor_prob counts as distal or medial is selected by the sign of cdna_direction */
    } else if (sufficient_splice_prob_strict(/*distal_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob,
					     /*medial_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob)) {
      debug3(printf("Keeping last 3' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
      pairs = exon;		/* exon already has the gap */
      *trim3p = false;

    } else {
      /* Splice probabilities fail the strict test: try extending the
         alignment continuously from just past the head of path to the
         end of the query, allowing extramaterial_end extra genomic
         positions */
      querydp5_medialgap = ((Pair_T) path->first)->querypos + 1;
      genomedp5_medialgap = ((Pair_T) path->first)->genomepos + 1;
      queryjump = querylength - querydp5_medialgap;
      genomejump = queryjump + extramaterial_end;
      debug3(printf("head of path is "));
      debug3(Pair_dump_one((Pair_T) path->first,true));
      debug3(printf("\nmedial end starts at query %d, genome %d\n",querydp5_medialgap,genomedp5_medialgap));

      /* Set require_pos_score_p to be false for debugging purposes, but we don't use the result anyway when finalscore is <= 0 */
      continuous_gappairs_medialgap = Dynprog_end3_gap(&dynprogindex_minor,&finalscore,
						       &continuous_nmatches,&continuous_nmismatches,&continuous_nopens,&continuous_nindels,
						       dynprog,&(queryseq_ptr[querydp5_medialgap]),&(queryuc_ptr[querydp5_medialgap]),
						       queryjump,genomejump,querydp5_medialgap,genomedp5_medialgap,
						       chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
						       extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS,/*require_pos_score_p*/false);
      debug(printf("CONTINUOUS AT 3 (trim_end3_exons)?\n"));
      debug(printf("CONTINUOUS_GAPPAIRS_MEDIALGAP:\n"));
      debug(Pair_dump_list(continuous_gappairs_medialgap,true));
      debug3(printf("continuous finalscore %d\n",finalscore));

      if (finalscore > 0) {
	/* The continuous extension replaces the peeled exon */
	debug3(printf("Using continuous\n"));
	pairs = List_reverse(continuous_gappairs_medialgap);
	if (continuous_nindels > 0) {
	  *trim3p = true;	/* So calling procedure iterates */
	  *indelp = true; /* So calling procedure will call trim_end3_indels */
	} else {
	  *trim3p = false;
	}

      } else if (sufficient_splice_prob_local(max_nmatches,max_nmismatches,
					      /*distal_spliceprob*/cdna_direction >= 0 ? splice->acceptor_prob : splice->donor_prob,
					      /*medial_spliceprob*/cdna_direction >= 0 ? splice->donor_prob : splice->acceptor_prob)) {
	/* Want to keep for comparison of fwd and rev, even if probabilities are poor */
	debug3(printf("Keeping last 3' exon with %d matches and %d mismatches\n",max_nmatches,max_nmismatches));
	pairs = exon;		/* exon already has the gap */
	*trim3p = false;

      } else {
	/* TODO: Set ambig_end_length_3 here, so default output shows a donor or acceptor end type */
	debug3(printf("Fall through (bad probabilities %f and %f): trimming noncanonical 3' exon\n",splice->donor_prob,splice->acceptor_prob));

	/* The exon is trimmed in both cases below; only the value
	   reported through *trim3p differs */
	medial_prob = (cdna_direction >= 0) ? splice->donor_prob : splice->acceptor_prob;
	if (canonicalp(splice->knowngapp,splice->comp,splice->donor_prob,splice->acceptor_prob,cdna_direction) == true &&
	    medial_prob > 0.95) {
	  *trim3p = false;		/* Not really, since we are trimming, but this stops further work */
	} else {
	  *trim3p = true;
	}
	pairs = (List_T) NULL;
      }
    }
  }

  /* Rejoin the chosen end (possibly NULL) with the remainder of path */
#ifdef WASTE
  while (path != NULL) {
    path = Pairpool_pop(path,&pair);
    pairs = Pairpool_push_existing(pairs,pairpool,pair);
  }
#else
  pairs = Pairpool_transfer(pairs,path);
#endif

  path = List_reverse(pairs);
  path = clean_path_end3(path,ambig_end_length);

  debug3(printf("End of trim_end3_exons: length = %d\n",List_length(path)));
  debug3(Pair_dump_list(path,true));

#ifdef CHECK_ASSERTIONS
  Pair_check_list_path(path);
#endif

  return path;
}
4985 
4986 
4987 
4988 /* This procedure fills in introns and replaces non-canonical introns
4989    with deletions, so it should be called after all dynamic
4990    programming procedures */
4991 static List_T
fill_in_gaps(List_T path,Pairpool_T pairpool,char * queryseq_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,int cdna_direction,bool watsonp,int ngap)4992 fill_in_gaps (List_T path, Pairpool_T pairpool, char *queryseq_ptr,
4993 	      Univcoord_T chroffset, Univcoord_T chrhigh,
4994 	      int cdna_direction, bool watsonp, int ngap) {
4995   List_T pairs = NULL;
4996   Pair_T pair, leftpair, rightpair;
4997   int leftquerypos, rightquerypos, introntype, k;
4998   Chrpos_T leftgenomepos, rightgenomepos, genomicpos;
4999   int intronlength;
5000 
5001   char comp;
5002   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt, c1, c2, c2_alt;
5003   bool intronp, introntype_found_p;
5004 
5005 
5006   if (path == NULL) {
5007     return (List_T) NULL;
5008   } else {
5009     pair = path->first;
5010   }
5011 
5012   while (path != NULL && ((Pair_T) path->first)->gapp == true) {
5013     /* Gap at beginning of alignment.  Can occur after smoothing. */
5014     debug7(printf("Gap %p at beginning of alignment\n",pair));
5015     path = Pairpool_pop(path,&pair);
5016   }
5017 
5018   while (path != NULL) {
5019     /* pairptr = path; */
5020     /* path = Pairpool_pop(path,&pair); */
5021     pair = (Pair_T) path->first;
5022 
5023 #ifdef PMAP
5024     if (pair->cdna == BACKTRANSLATE_CHAR) {
5025       pair->cdna = 'N';
5026     }
5027 #endif
5028     if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
5029 #ifdef WASTE
5030       pairs = Pairpool_push_existing(pairs,pairpool,pair);
5031 #else
5032       pairs = List_transfer_one(pairs,&path);
5033 #endif
5034 
5035     } else if (pair->gapp == false) {
5036 #ifdef WASTE
5037       pairs = Pairpool_push_existing(pairs,pairpool,pair);
5038 #else
5039       pairs = List_transfer_one(pairs,&path);
5040 #endif
5041 
5042     } else if (path->rest == NULL) {
5043       /* Gap at end of alignment.  Can occur after smoothing. */
5044       debug7(printf("Gap at end of alignment\n"));
5045       path = Pairpool_pop(path,&pair);
5046 
5047     } else if (pairs == NULL) {
5048       /* Gap at beginning of alignment.  Skip. */
5049       debug7(printf("Gap at beginning of alignment\n"));
5050       path = Pairpool_pop(path,&pair);
5051 
5052     } else {
5053       /* pairptr = path; -- save */
5054       path = Pairpool_pop(path,&pair);
5055 
5056       /* Discard gap; do not push */
5057       leftpair = path->first;
5058       rightpair = pairs->first;
5059 
5060       if (pair->comp == DUALBREAK_COMP) {
5061 	pairs = add_dualbreak(pairs,queryseq_ptr,
5062 			      chroffset,chrhigh,cdna_direction,watsonp,
5063 			      leftpair,rightpair,pairpool,ngap);
5064       } else {
5065 
5066 	leftquerypos = leftpair->querypos;
5067 	leftgenomepos = leftpair->genomepos;
5068 	/* if (leftpair->cdna == ' ') leftquerypos--; -- For old dynamic programming */
5069 	/* if (leftpair->genome == ' ') leftgenomepos--; -- For old dynamic programming */
5070 	rightquerypos = rightpair->querypos;
5071 	rightgenomepos = rightpair->genomepos;
5072 	intronlength = (int) (rightgenomepos - leftgenomepos - 1);
5073 
5074 	introntype_found_p = false;
5075 	if (pair->knowngapp == true) {
5076 	  debug7(printf("known intron is true, so an intron\n"));
5077 	  intronp = true;
5078 	} else if (splicingp == false) {
5079 	  debug7(printf("splicingp is false, so not an intron, but an indel\n"));
5080 	  intronp = false;
5081 #if 0
5082 	} else if (sensedir == SENSE_NULL) {
5083 	  /* Can lead to very large deletions */
5084 	  debug7(printf("sensedir == SENSE_NULL, so not an intron, but an indel\n"));
5085 	  intronp = false;
5086 #endif
5087 	} else if (intronlength < min_intronlength) {
5088 	  debug7(printf("intronlength %d < min_intronlength %d, so not an intron, but an indel\n",
5089 			intronlength,min_intronlength));
5090 	  intronp = false;
5091 	} else if (intronlength >= max_deletionlength) {
5092 	  debug7(printf("intronlength %d >= max_deletionlength %d, so an intron, not an indel\n",
5093 			intronlength,max_deletionlength));
5094 	  intronp = true;
5095 	} else {
5096 	  /* Possible short intron */
5097 	  left1 = get_genomic_nt(&left1_alt,leftgenomepos+1,chroffset,chrhigh,watsonp);
5098 	  left2 = get_genomic_nt(&left2_alt,leftgenomepos+2,chroffset,chrhigh,watsonp);
5099 	  right2 = get_genomic_nt(&right2_alt,rightgenomepos-2,chroffset,chrhigh,watsonp);
5100 	  right1 = get_genomic_nt(&right1_alt,rightgenomepos-1,chroffset,chrhigh,watsonp);
5101 	  debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
5102 	  introntype = Intron_type(left1,left2,right2,right1,
5103 				   left1_alt,left2_alt,right2_alt,right1_alt,
5104 				   cdna_direction);
5105 	  introntype_found_p = true;
5106 	  debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
5107 			leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
5108 
5109 	  if ((cdna_direction >= 0 && introntype == GTAG_FWD)
5110 #ifndef PMAP
5111 	      || (cdna_direction <= 0 && introntype == GTAG_REV)
5112 #endif
5113 	      ) {
5114 	    intronp = true;
5115 	  } else {
5116 	    intronp = false;
5117 	  }
5118 	}
5119 
5120 	if (intronp == false) {
5121 	  debug7(printf("  Gap is not an intron (intronlength %d).  Adding pairs from %d downto %d\n",
5122 			intronlength,rightgenomepos-1,leftgenomepos+1));
5123 	  for (k = rightquerypos - 1; k > leftquerypos; --k) {
5124 #if 0				/* PMAP */
5125 	    c1 = Sequence_codon_char(queryaaseq_ptr[k/3],k%3);
5126 #else
5127 	    c1 = queryseq_ptr[k];
5128 #endif
5129 	    pairs = Pairpool_push(pairs,pairpool,k,rightgenomepos,c1,INDEL_COMP,' ',' ',/*dynprogindex*/0);
5130 	  }
5131 
5132 	  for (genomicpos = rightgenomepos - 1; genomicpos > leftgenomepos; --genomicpos) {
5133 	    c2 = get_genomic_nt(&c2_alt,genomicpos,chroffset,chrhigh,watsonp);
5134 	    pairs = Pairpool_push(pairs,pairpool,rightquerypos,genomicpos,' ',/*comp*/SHORTGAP_COMP,c2,c2_alt,
5135 				  /*dynprogindex*/0);
5136 	  }
5137 	} else {
5138 	  if (introntype_found_p == false) {
5139 	    left1 = get_genomic_nt(&left1_alt,leftgenomepos+1,chroffset,chrhigh,watsonp);
5140 	    left2 = get_genomic_nt(&left2_alt,leftgenomepos+2,chroffset,chrhigh,watsonp);
5141 	    right2 = get_genomic_nt(&right2_alt,rightgenomepos-2,chroffset,chrhigh,watsonp);
5142 	    right1 = get_genomic_nt(&right1_alt,rightgenomepos-1,chroffset,chrhigh,watsonp);
5143 	    debug7(printf("  Dinucleotides are %c%c..%c%c\n",left1,left2,right2,right1));
5144 	    introntype = Intron_type(left1,left2,right2,right1,
5145 				     left1_alt,left2_alt,right2_alt,right1_alt,
5146 				     cdna_direction);
5147 	    debug7(printf("  Introntype at %u..%u is %s (cdna_direction %d)\n",
5148 			  leftgenomepos,rightgenomepos,Intron_type_string(introntype),cdna_direction));
5149 	  }
5150 
5151 	  if (cdna_direction == 0) {
5152 	    /* cdna_direction of 0 should happen only from Stage3_merge_local_splice */
5153 	    comp = NONINTRON_COMP;
5154 	  } else {
5155 	    comp = pair->comp;
5156 	  }
5157 
5158 	  debug7(printf("Adding an intron at %d..%d, currently of type %c, with introntype %d\n",
5159 			leftpair->querypos,rightpair->querypos,comp,introntype));
5160 	  pairs = add_intron(pairs,chroffset,chrhigh,leftpair,rightpair,comp,introntype,ngap,
5161 			     watsonp,pairpool);
5162 	}
5163       }
5164     }
5165   }
5166 
5167   debug7(printf("Final length: %d\n",List_length(pairs)));
5168 
5169   return pairs;
5170 }
5171 
5172 static List_T
add_queryseq_offset(List_T path,int queryseq_offset,Pairpool_T pairpool)5173 add_queryseq_offset (List_T path, int queryseq_offset
5174 #ifdef WASTE
5175 		     , Pairpool_T pairpool
5176 #endif
5177 		     ) {
5178   List_T pairs = NULL;
5179   Pair_T pair;
5180 
5181   while (path != NULL) {
5182     /* pairptr = path; */
5183     /* path = Pairpool_pop(path,&pair); */
5184 
5185     /* Previously excluded cases where pair->gapp was true, but this failed on chimeric paths */
5186     /* Now excluding again, because we are running this before chimera detection and merging */
5187     pair = (Pair_T) path->first;
5188     if (pair->gapp == false) {
5189       pair->querypos += queryseq_offset;
5190     }
5191 #ifdef WASTE
5192     pairs = Pairpool_push_existing(pairs,pairpool,pair);
5193 #else
5194     pairs = List_transfer_one(pairs,&path);
5195 #endif
5196   }
5197 
5198   return pairs;
5199 }
5200 
5201 
5202 static void
add_skiplength(List_T pairs,int skiplength)5203 add_skiplength (List_T pairs, int skiplength) {
5204   List_T p;
5205   Pair_T pair;
5206 
5207   for (p = pairs; p != NULL; p = p->rest) {
5208     pair = (Pair_T) p->first;
5209     if (pair->gapp == true) {
5210       /* Skip */
5211     } else if (pair->querypos >= HALFLEN) {
5212       pair->querypos += skiplength;
5213     }
5214   }
5215   return;
5216 }
5217 
5218 
5219 static void
Stage3_free_pairarray(T * old)5220 Stage3_free_pairarray (T *old) {
5221 #if 0
5222   if ((*old)->pairarray_freeable_p == true) {
5223     FREE_OUT((*old)->pairarray);
5224     (*old)->pairarray_freeable_p = false;
5225   }
5226 #else
5227   FREE_OUT((*old)->pairarray);
5228 #endif
5229   return;
5230 }
5231 
5232 
/* Does not alter pairs, except for adding subseq_offset and skiplength
   to querypos, in case we need to re-compute alignment for chimera */
5235 static struct Pair_T *
make_pairarray(int * npairs,List_T * pairs,int cdna_direction,bool watsonp,Pairpool_T pairpool,char * queryseq_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,int ngap,int subseq_offset,int skiplength)5236 make_pairarray (int *npairs, List_T *pairs, int cdna_direction, bool watsonp,
5237 		Pairpool_T pairpool, char *queryseq_ptr,
5238 		Univcoord_T chroffset, Univcoord_T chrhigh,
5239 		int ngap, int subseq_offset, int skiplength) {
5240   struct Pair_T *pairarray;
5241   List_T printpairs, printpath, path, p;
5242   Pair_T oldpair, newpair;
5243 
5244 
5245 #if 0
5246   /* Better to use clip_pairs_end5_chromosomal_bound and
5247      clip_path_end3_chromosomal_bound instead */
5248   printpairs = Pairpool_copy_bounded(*pairs,pairpool,/*chrlength*/chrhigh - chroffset);
5249 #else
5250   printpairs = Pairpool_copy(*pairs,pairpool);
5251 #endif
5252 
5253   printpath = List_reverse(printpairs);
5254   printpairs = fill_in_gaps(printpath,pairpool,queryseq_ptr,
5255 			    chroffset,chrhigh,cdna_direction,watsonp,ngap);
5256 
5257   if (subseq_offset != 0) {
5258     path = List_reverse(*pairs);
5259 #ifdef WASTE
5260     *pairs = add_queryseq_offset(path,subseq_offset,pairpool);
5261 #else
5262     *pairs = add_queryseq_offset(path,subseq_offset);
5263 #endif
5264 
5265     printpath = List_reverse(printpairs);
5266 #ifdef WASTE
5267     printpairs = add_queryseq_offset(printpath,subseq_offset,pairpool);
5268 #else
5269     printpairs = add_queryseq_offset(printpath,subseq_offset);
5270 #endif
5271   }
5272 
5273   if (skiplength != 0) {
5274     add_skiplength(*pairs,skiplength);
5275     add_skiplength(printpairs,skiplength);
5276   }
5277 
5278   if ((*npairs = List_length(printpairs)) == 0) {
5279     return (struct Pair_T *) NULL;
5280   } else {
5281     /* Used to be Pair_block_copy */
5282     newpair = pairarray = (struct Pair_T *) MALLOC_OUT(*npairs*sizeof(struct Pair_T));
5283     for (p = printpairs; p != NULL; p = p->rest) {
5284       oldpair = (Pair_T) p->first;
5285       memcpy(newpair++,oldpair,sizeof(struct Pair_T));
5286     }
5287 
5288     /* No need to free newpairs, since they belong to pairpool */
5289     Pair_set_genomepos(pairarray,*npairs,chroffset,chrhigh,watsonp);
5290 
5291     return pairarray;
5292   }
5293 }
5294 
5295 
5296 /* Does not alter pairs, except for adding subseq_offset to querypos,
5297    in case we need to re-compute alignment for chimera */
5298 static bool
make_pairarray_merge(T this_left,int cdna_direction,bool watsonp,Pairpool_T pairpool,char * queryseq_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,int ngap,int subseq_offset,int skiplength,bool new_gap_p)5299 make_pairarray_merge (T this_left, int cdna_direction, bool watsonp,
5300 		      Pairpool_T pairpool, char *queryseq_ptr,
5301 		      Univcoord_T chroffset, Univcoord_T chrhigh,
5302 		      int ngap, int subseq_offset, int skiplength, bool new_gap_p) {
5303   struct Pair_T *pairarray, *pairarray_save;
5304   List_T printpairs, printpath, path, p;
5305   Pair_T oldpair, newpair;
5306   int ncanonical, nsemicanonical;
5307   double min_splice_prob;
5308 
5309   pairarray_save = this_left->pairarray;
5310 
5311   if (new_gap_p == true) {
5312     path = List_reverse(this_left->pairs);
5313     this_left->pairs = assign_gap_types(path,cdna_direction,this_left->watsonp,queryseq_ptr,
5314 					this_left->chrnum,this_left->chroffset,this_left->chrhigh,
5315 					pairpool);
5316   }
5317 
5318   debug10(Pair_dump_list(this_left->pairs,true));
5319 
5320   this_left->cdna_direction = cdna_direction;
5321 
5322   printpairs = Pairpool_copy(this_left->pairs,pairpool);
5323 
5324   printpath = List_reverse(printpairs);
5325   printpairs = fill_in_gaps(printpath,pairpool,queryseq_ptr,
5326 			    chroffset,chrhigh,cdna_direction,watsonp,ngap);
5327 
5328   if (List_length(printpairs) == 0) {
5329     this_left->pairarray = pairarray_save;
5330     /* this_left->pairarray_freeable_p = false; */
5331     return false;
5332 
5333   } else {
5334     if (subseq_offset != 0) {
5335       path = List_reverse(this_left->pairs);
5336 #ifdef WASTE
5337       this_left->pairs = add_queryseq_offset(path,subseq_offset,pairpool);
5338 #else
5339       this_left->pairs = add_queryseq_offset(path,subseq_offset);
5340 #endif
5341 
5342       printpath = List_reverse(printpairs);
5343 #ifdef WASTE
5344       printpairs = add_queryseq_offset(printpath,subseq_offset,pairpool);
5345 #else
5346       printpairs = add_queryseq_offset(printpath,subseq_offset);
5347 #endif
5348     }
5349 
5350     if (skiplength != 0) {
5351       add_skiplength(this_left->pairs,skiplength);
5352       add_skiplength(printpairs,skiplength);
5353     }
5354 
5355     Stage3_free_pairarray(&this_left);
5356     this_left->npairs = List_length(printpairs);
5357 
5358     /* Used to be Pair_block_copy */
5359     newpair = pairarray = (struct Pair_T *) MALLOC_OUT(this_left->npairs*sizeof(struct Pair_T));
5360     for (p = printpairs; p != NULL; p = p->rest) {
5361       oldpair = (Pair_T) p->first;
5362       memcpy(newpair++,oldpair,sizeof(struct Pair_T));
5363     }
5364 
5365     /* No need to free newpairs, since they belong to pairpool */
5366     Pair_set_genomepos(pairarray,this_left->npairs,chroffset,chrhigh,watsonp);
5367     this_left->pairarray = pairarray;
5368     this_left->pairarray_freeable_p = true;
5369 
5370     this_left->goodness =
5371       Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
5372 			      &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
5373 			      &ncanonical,&nsemicanonical,&this_left->noncanonical,
5374 			      &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
5375 
5376     return true;
5377   }
5378 
5379 }
5380 
5381 
5382 static void
make_pairarrays_chimera(T this_left,T this_right,char * queryseq_ptr,Pairpool_T pairpool,int gaplength,int ngap)5383 make_pairarrays_chimera (T this_left, T this_right,
5384 			 char *queryseq_ptr, Pairpool_T pairpool, int gaplength, int ngap) {
5385   List_T printpairs_left, printpath_left, printpairs_right, printpath_right, p;
5386   Pair_T oldpair, newpair;
5387   int newnpairs;
5388   int ncanonical, nsemicanonical;
5389   double min_splice_prob;
5390 
5391 
5392   /* Revise statistics */
5393   Pair_fracidentity(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
5394 		    &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
5395 		    &ncanonical,&nsemicanonical,&this_left->noncanonical,
5396 		    &min_splice_prob,this_left->pairs,this_left->cdna_direction);
5397 
5398   Pair_fracidentity(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
5399 		    &this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
5400 		    &ncanonical,&nsemicanonical,&this_right->noncanonical,
5401 		    &min_splice_prob,this_right->pairs,this_right->cdna_direction);
5402 
5403 
5404   printpairs_left = Pairpool_copy(this_left->pairs,pairpool);
5405   printpath_left = List_reverse(printpairs_left);
5406   printpairs_left = fill_in_gaps(printpath_left,pairpool,queryseq_ptr,
5407 				 this_left->chroffset,this_left->chrhigh,
5408 				 this_left->cdna_direction,this_left->watsonp,ngap);
5409 
5410   printpairs_right = Pairpool_copy(this_right->pairs,pairpool);
5411   printpath_right = List_reverse(printpairs_right);
5412   printpairs_right = fill_in_gaps(printpath_right,pairpool,queryseq_ptr,
5413 				  this_right->chroffset,this_right->chrhigh,
5414 				  this_right->cdna_direction,this_right->watsonp,ngap);
5415 
5416 
5417   /* Do not use subseq_offset or skiplength for chimeras, since we are
5418      working on the original queryseq, not querysubseq */
5419 
5420   this_left->npairs = List_length(printpairs_left);
5421   this_right->npairs = List_length(printpairs_right);
5422 
5423   Stage3_free_pairarray(&this_left);
5424   Stage3_free_pairarray(&this_right);
5425   if ((newnpairs = this_left->npairs + this_right->npairs) == 0) {
5426     this_left->pairarray = (struct Pair_T *) NULL;
5427     this_right->pairarray = (struct Pair_T *) NULL;
5428     this_left->pairarray_freeable_p = false;
5429     this_right->pairarray_freeable_p = false;
5430 
5431   } else {
5432     /* Need to have a single pairarray for this_left, so we can translate protein correctly */
5433     newpair = this_left->pairarray = (struct Pair_T *) MALLOC_OUT((newnpairs + gaplength)*sizeof(struct Pair_T));
5434     this_left->pairarray_freeable_p = true;
5435 
5436     for (p = printpairs_left; p != NULL; p = p->rest) {
5437       oldpair = (Pair_T) p->first;
5438       memcpy(newpair++,oldpair,sizeof(struct Pair_T));
5439     }
5440     Pair_set_genomepos(this_left->pairarray,this_left->npairs,this_left->chroffset,this_left->chrhigh,
5441 		       this_left->watsonp);
5442     this_left->goodness =
5443       Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
5444 			      &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
5445 			      &ncanonical,&nsemicanonical,&this_left->noncanonical,
5446 			      &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
5447 
5448 
5449     newpair = this_right->pairarray = &(this_left->pairarray[this_left->npairs + gaplength]);
5450     this_right->pairarray_freeable_p = false; /* This is the only case of setting pairarray_freeable_p to be false */
5451 
5452     for (p = printpairs_right; p != NULL; p = p->rest) {
5453       oldpair = (Pair_T) p->first;
5454       memcpy(newpair++,oldpair,sizeof(struct Pair_T));
5455     }
5456     Pair_set_genomepos(this_right->pairarray,this_right->npairs,this_right->chroffset,this_right->chrhigh,
5457 		       this_right->watsonp);
5458     this_right->goodness =
5459       Pair_fracidentity_array(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
5460 			      &this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
5461 			      &ncanonical,&nsemicanonical,&this_right->noncanonical,
5462 			      &min_splice_prob,this_right->pairarray,this_right->npairs,this_right->cdna_direction);
5463   }
5464 
5465   return;
5466 }
5467 
5468 
5469 void
Stage3_count_paths(int * npaths_primary,int * npaths_altloc,List_T stage3list)5470 Stage3_count_paths (int *npaths_primary, int *npaths_altloc, List_T stage3list) {
5471   T this;
5472 
5473   *npaths_primary = *npaths_altloc = 0;
5474 
5475   while (stage3list != NULL) {
5476     this = (T) List_head(stage3list);
5477     if (altlocp[this->chrnum] == true) {
5478       (*npaths_altloc) += 1;
5479     } else {
5480       (*npaths_primary) += 1;
5481     }
5482     stage3list = List_next(stage3list);
5483   }
5484 
5485   return;
5486 }
5487 
5488 
5489 #define MAPQ_MAXIMUM_SCORE 40
5490 
5491 void
Stage3_compute_mapq(List_T stage3list)5492 Stage3_compute_mapq (List_T stage3list) {
5493   T this;
5494   List_T p;
5495   int best_absmq_score;
5496   float total = 0.0, q;
5497 
5498   if (stage3list != NULL) {
5499     /* Use the first entry to initialize best_absmq_score */
5500     p = stage3list;
5501     this = (T) List_head(p);
5502     best_absmq_score = this->absmq_score = this->matches - 10*this->mismatches;
5503     p = List_next(p);
5504 
5505     while (p != NULL) {
5506       this = (T) List_head(p);
5507       if ((this->absmq_score = this->matches - 10*this->mismatches) > best_absmq_score) {
5508 	best_absmq_score = this->absmq_score;
5509       }
5510       p = List_next(p);
5511     }
5512   }
5513 
5514   for (p = stage3list; p != NULL; p = List_next(p)) {
5515     this = (T) List_head(p);
5516     this->absmq_score -= best_absmq_score;
5517     total += fasterexp(this->absmq_score);
5518   }
5519 
5520   for (p = stage3list; p != NULL; p = List_next(p)) {
5521     this = (T) List_head(p);
5522 
5523     if ((q = 1.0 - fasterexp(this->absmq_score) / total) < 1.0e-4 /* 10^-4.0 */) {
5524       this->mapq_score = 40;
5525     } else {
5526       this->mapq_score = rint(-10.0 * log10(q));
5527     }
5528 
5529     this->absmq_score += MAPQ_MAXIMUM_SCORE;
5530     if (this->absmq_score < 0) {
5531       this->absmq_score = 0;
5532     }
5533 
5534   }
5535 }
5536 
5537 
5538 void
Stage3_recompute_coverage(List_T stage3list,Sequence_T queryseq)5539 Stage3_recompute_coverage (List_T stage3list, Sequence_T queryseq) {
5540   List_T p;
5541   T stage3;
5542   Pair_T start, end;
5543   int querypos1, querypos2;
5544   int trim_start, trim_end, skiplength;
5545 
5546   trim_start = Sequence_trim_start(queryseq);
5547   trim_end = Sequence_trim_end(queryseq);
5548   skiplength = Sequence_skiplength(queryseq);
5549 
5550   for (p = stage3list; p != NULL; p = List_next(p)) {
5551     stage3 = (T) List_head(p);
5552     if (stage3->npairs == 0) {
5553       stage3->trimmed_coverage = 0.0;
5554 
5555     } else {
5556       start = &(stage3->pairarray[0]);
5557       end = &(stage3->pairarray[stage3->npairs - 1]);
5558 
5559       querypos1 = start->querypos;
5560       querypos2 = end->querypos;
5561 
5562 #if 0
5563       if (querypos2 + 1 > trim_end) {
5564 	effective_trim_end = querypos2 + 1;
5565       } else {
5566 	effective_trim_end = trim_end;
5567       }
5568       if (querypos1 < trim_start) {
5569 	effective_trim_start = querypos1;
5570       } else {
5571 	effective_trim_start = trim_start;
5572       }
5573 #endif
5574 
5575       stage3->trimmed_coverage = (double) (querypos2 - querypos1 + 1)/(double) (trim_end - trim_start + skiplength);
5576     }
5577   }
5578 
5579   return;
5580 }
5581 
5582 
5583 static List_T
pick_cdna_direction(int * winning_cdna_direction,int * sensedir,List_T pairs_fwd,List_T pairs_rev,double defect_rate_fwd,double defect_rate_rev,int nknown_fwd,int ncanonical_fwd,int nsemicanonical_fwd,int nnoncanonical_fwd,int nbadintrons_fwd,int nknown_rev,int ncanonical_rev,int nsemicanonical_rev,int nnoncanonical_rev,int nbadintrons_rev,double max_intron_score_fwd,double avg_donor_score_fwd,double avg_acceptor_score_fwd,double max_intron_score_rev,double avg_donor_score_rev,double avg_acceptor_score_rev,int nmatches_fwd,int nmismatches_fwd,int nmatches_rev,int nmismatches_rev,int nindels_fwd,int nindels_rev,int indel_alignment_score_fwd,int indel_alignment_score_rev,int sense_filter)5584 pick_cdna_direction (int *winning_cdna_direction, int *sensedir,
5585 		     List_T pairs_fwd, List_T pairs_rev, double defect_rate_fwd, double defect_rate_rev,
5586 		     int nknown_fwd, int ncanonical_fwd, int nsemicanonical_fwd,
5587 		     int nnoncanonical_fwd, int nbadintrons_fwd,
5588 		     int nknown_rev, int ncanonical_rev, int nsemicanonical_rev,
5589 		     int nnoncanonical_rev, int nbadintrons_rev,
5590 		     double max_intron_score_fwd, double avg_donor_score_fwd, double avg_acceptor_score_fwd,
5591 		     double max_intron_score_rev, double avg_donor_score_rev, double avg_acceptor_score_rev,
5592 #ifdef COMPLEX_DIRECTION
5593 		     int nmatches_fwd, int nmismatches_fwd, int nmatches_rev, int nmismatches_rev, int nindels_fwd, int nindels_rev,
5594 		     int indel_alignment_score_fwd, int indel_alignment_score_rev,
5595 #endif
5596 		     int sense_filter) {
5597 #if 0
5598   int canonical_score_fwd, canonical_score_rev;
5599 #endif
5600 
5601   if (pairs_fwd) {
5602     /* canonical_score_fwd = ncanonical_fwd - nbadintrons_fwd + nsemicanonical_fwd - nnoncanonical_fwd; */
5603     debug11(printf("nknown_fwd %d, ncanonical_fwd %d, nbadintrons_fwd %d, nsemicanonical_fwd %d, nnoncanonical_fwd %d\n",
5604 		   nknown_fwd,ncanonical_fwd,nbadintrons_fwd,nsemicanonical_fwd,nnoncanonical_fwd));
5605   }
5606   if (pairs_rev) {
5607     /* canonical_score_rev = ncanonical_rev - nbadintrons_rev + nsemicanonical_rev - nnoncanonical_rev; */
5608     debug11(printf("nknown_rev %d, ncanonical_rev %d, nbadintrons_rev %d, nsemicanonical_rev %d, nnoncanonical_rev %d\n",
5609 		   nknown_rev,ncanonical_rev,nbadintrons_rev,nsemicanonical_rev,nnoncanonical_rev));
5610   }
5611 
5612   if (pairs_fwd == NULL && pairs_rev == NULL) {
5613     debug11(printf("pairs_fwd is NULL and pairs_rev is NULL\n"));
5614     *winning_cdna_direction = 0;
5615     *sensedir = SENSE_NULL;
5616     return (List_T) NULL;
5617 
5618   } else if (pairs_rev == NULL) {
5619     debug11(printf("pairs_rev is NULL, so fwd wins\n"));
5620     *winning_cdna_direction = +1;
5621     *sensedir = SENSE_FORWARD;
5622     return pairs_fwd;
5623 
5624   } else if (pairs_fwd == NULL) {
5625     debug11(printf("pairs_fwd is NULL, so rev wins\n"));
5626     *winning_cdna_direction = -1;
5627     *sensedir = SENSE_ANTI;
5628     return pairs_rev;
5629 
5630 #if 0
5631   } else if (indel_alignment_score_fwd >= 0 && indel_alignment_score_rev < 0 && nbadintrons_rev > 0) {
5632     debug11(printf("indel_alignment_score_fwd %d positive and indel_alignment_score_rev %d negative for a bad intron, so fwd wins\n",
5633 		   indel_alignment_score_fwd,indel_alignment_score_rev));
5634     *winning_cdna_direction = +1;
5635 
5636   } else if (indel_alignment_score_rev >= 0 && indel_alignment_score_fwd < 0 && nbadintrons_fwd > 0) {
5637     debug11(printf("indel_alignment_score_fwd %d negative for a bad intron and indel_alignment_score_rev %d positive, so rev wins\n",
5638 		   indel_alignment_score_fwd,indel_alignment_score_rev));
5639     *winning_cdna_direction = -1;
5640 #endif
5641 
5642 #if 0
5643     /* Cannot use, because favors a terminal over a splice */
5644   } else if (nmismatches_fwd < nmismatches_rev) {
5645     debug11(printf("nmismatches fwd %d < nmismatches rev %d, so fwd wins\n",
5646 		   nmismatches_fwd,nmismatches_rev));
5647     *winning_cdna_direction = +1;
5648 
5649   } else if (nmismatches_fwd > nmismatches_rev) {
5650     debug11(printf("nmismatches fwd %d > nmismatches rev %d, so rev wins\n",
5651 		   nmismatches_fwd,nmismatches_rev));
5652     *winning_cdna_direction = -1;
5653 #endif
5654 
5655   } else if (defect_rate_fwd > DEFECT_MEDQ && defect_rate_rev > DEFECT_MEDQ &&
5656 	     avg_donor_score_fwd > 0.9 && avg_donor_score_rev < 0.5 &&
5657 	     avg_acceptor_score_fwd > 0.9 && avg_acceptor_score_rev < 0.5) {
5658     debug11(printf("defect_rate %f, %f and intronscores fwd %f,%f > intronscores rev %f,%f, so fwd wins\n",
5659 		   defect_rate_fwd,defect_rate_rev,avg_donor_score_fwd,avg_acceptor_score_fwd,
5660 		   avg_donor_score_rev,avg_acceptor_score_rev));
5661     /* intronscores reveal a clear sensedir */
5662     *winning_cdna_direction = +1;
5663 
5664   } else if (defect_rate_fwd > DEFECT_MEDQ && defect_rate_rev > DEFECT_MEDQ &&
5665 	     avg_donor_score_rev > 0.9 && avg_donor_score_fwd < 0.5 &&
5666 	     avg_acceptor_score_rev > 0.9 && avg_acceptor_score_fwd < 0.5) {
5667     debug11(printf("defect_rate %f, %f and intronscores rev %f,%f > intronscores fwd %f,%f, so fwd wins\n",
5668 		   defect_rate_fwd,defect_rate_rev,avg_donor_score_rev,avg_acceptor_score_rev,
5669 		   avg_donor_score_fwd,avg_acceptor_score_fwd));
5670     /* intronscores reveal a clear sensedir */
5671     *winning_cdna_direction = -1;
5672 
5673   } else if (nknown_fwd > 0 && nknown_rev == 0) {
5674     debug11(printf("nknown_fwd %d && nknown_rev %d, so fwd wins\n",
5675 		   nknown_fwd,nknown_rev));
5676     *winning_cdna_direction = +1;
5677 
5678   } else if (nknown_rev > 0 && nknown_fwd == 0) {
5679     debug11(printf("nknown_fwd %d && nknown_rev %d, so rev wins\n",
5680 		   nknown_fwd,nknown_rev));
5681     *winning_cdna_direction = -1;
5682 
5683   } else if (ncanonical_fwd > 0 && ncanonical_rev == 0) {
5684     debug11(printf("ncanonical_fwd %d && ncanonical_rev %d, so fwd wins\n",
5685 		   ncanonical_fwd,ncanonical_rev));
5686     *winning_cdna_direction = +1;
5687 
5688   } else if (ncanonical_rev > 0 && ncanonical_fwd == 0) {
5689     debug11(printf("ncanonical_fwd %d && ncanonical_rev %d, so rev wins\n",
5690 		   ncanonical_fwd,ncanonical_rev));
5691     *winning_cdna_direction = -1;
5692 
5693 #if 0
5694   } else if (canonical_score_fwd > canonical_score_rev + 1) {
5695     debug11(printf("canonical_score_fwd %d > canonical_score_rev %d + 1, so fwd wins\n",
5696 		   canonical_score_fwd,canonical_score_rev));
5697     *winning_cdna_direction = +1;
5698 
5699   } else if (canonical_score_rev > canonical_score_fwd + 1) {
5700     debug11(printf("canonical_score_rev %d > canonical_score_fwd %d + 1, so rev wins\n",
5701 		   canonical_score_rev,canonical_score_fwd));
5702     *winning_cdna_direction = -1;
5703 #endif
5704 
5705 #if 0
5706   } else if (alignment_score_fwd > alignment_score_rev + SCORE_SIGDIFF) {
5707     debug11(printf("alignment_score_fwd %d >> alignment_score_rev %d, so fwd wins\n",
5708 		   alignment_score_fwd,alignment_score_rev));
5709     *winning_cdna_direction = +1;
5710 
5711   } else if (alignment_score_rev > alignment_score_fwd + SCORE_SIGDIFF) {
5712     debug11(printf("alignment_score_rev %d << alignment_score_fwd %d, so rev wins\n",
5713 		   alignment_score_rev,alignment_score_fwd));
5714     *winning_cdna_direction = -1;
5715 #endif
5716 
5717   } else if (nnoncanonical_fwd == 0 && nnoncanonical_rev > 0) {
5718     debug11(printf("nnoncanonical_fwd %d < nnoncanonical_rev %d, so fwd wins\n",
5719 		   nnoncanonical_fwd,nnoncanonical_rev));
5720     *winning_cdna_direction = +1;
5721 
5722   } else if (nnoncanonical_rev == 0 && nnoncanonical_fwd > 0) {
5723     debug11(printf("nnoncanonical_rev %d < nnoncanonical_fwd %d, so rev wins\n",
5724 		   nnoncanonical_rev,nnoncanonical_fwd));
5725     *winning_cdna_direction = -1;
5726 
5727   } else if (nbadintrons_fwd == 0 && nbadintrons_rev > 0) {
5728     debug11(printf("nbadintrons_fwd %d < nbadintrons_rev %d, so fwd wins\n",
5729 		   nbadintrons_fwd,nbadintrons_rev));
5730     *winning_cdna_direction = +1;
5731 
5732   } else if (nbadintrons_rev == 0 && nbadintrons_fwd > 0) {
5733     debug11(printf("nbadintrons_rev %d < nbadintrons_fwd %d, so rev wins\n",
5734 		   nbadintrons_rev,nbadintrons_fwd));
5735     *winning_cdna_direction = -1;
5736 
5737   } else if (avg_donor_score_fwd > avg_donor_score_rev + PROB_SIGDIFF &&
5738 	     avg_acceptor_score_fwd > avg_acceptor_score_rev + PROB_SIGDIFF) {
5739     debug11(printf("intronscores fwd %f+%f > intronscores rev %f+%f, so fwd wins\n",
5740 		   avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
5741     /* intronscores reveal a preferred sensedir */
5742     *winning_cdna_direction = +1;
5743 
5744   } else if (avg_donor_score_rev > avg_donor_score_fwd + PROB_SIGDIFF &&
5745 	     avg_acceptor_score_rev > avg_acceptor_score_fwd + PROB_SIGDIFF) {
5746     debug11(printf("intronscores rev %f+%f > intronscores fwd %f+%f, so fwd wins\n",
5747 		   avg_donor_score_rev,avg_acceptor_score_rev,avg_donor_score_fwd,avg_acceptor_score_fwd));
5748     /* intronscores reveal a preferred sensedir */
5749     *winning_cdna_direction = -1;
5750 
5751 #if 0
5752   } else if (alignment_score_fwd > alignment_score_rev && alignment_score_fwd > 0) {
5753     debug11(printf("alignment_score_fwd %d > alignment_score_rev %d, so fwd wins\n",
5754 		   alignment_score_fwd,alignment_score_rev));
5755     *winning_cdna_direction = +1;
5756 
5757   } else if (alignment_score_rev > alignment_score_fwd && alignment_score_rev > 0) {
5758     debug11(printf("alignment_score_rev %d < alignment_score_fwd %d, so rev wins\n",
5759 		   alignment_score_rev,alignment_score_fwd));
5760     *winning_cdna_direction = -1;
5761 #endif
5762 
5763   } else {
5764     debug11(printf("scores all equal, so fwd wins, but setting cdna_direction to be 0\n"));
5765     /* No clear intron direction, so allow under all sense_filters */
5766     *winning_cdna_direction = 0;
5767     *sensedir = SENSE_NULL;
5768     return pairs_fwd;
5769   }
5770 
5771   debug11(printf("max_intron_score_fwd = %f, max_intron_score_rev = %f\n",max_intron_score_fwd,max_intron_score_rev));
5772 
5773   if (*winning_cdna_direction == +1) {
5774     if (ncanonical_fwd == 0 && nsemicanonical_fwd == 0 && nnoncanonical_fwd == 0) {
5775       *sensedir = SENSE_NULL;
5776     } else if (max_intron_score_fwd < 1.8) {
5777       *sensedir = SENSE_NULL;
5778     } else {
5779       *sensedir = SENSE_FORWARD;
5780     }
5781 #ifndef PMAP
5782     if (sense_filter < 0) {
5783       return (List_T) NULL;
5784     }
5785 #endif
5786     debug11(printf("winning_cdna_direction = %d, sensedir = %d\n",
5787 		   *winning_cdna_direction,*sensedir));
5788     return pairs_fwd;
5789 
5790   } else if (*winning_cdna_direction == -1) {
5791     if (ncanonical_rev == 0 && nsemicanonical_rev == 0 && nnoncanonical_rev == 0) {
5792       *sensedir = SENSE_NULL;
5793     } else if (max_intron_score_rev < 1.8) {
5794       *sensedir = SENSE_NULL;
5795     } else {
5796       *sensedir = SENSE_ANTI;
5797     }
5798 #ifndef PMAP
5799     if (sense_filter > 0) {
5800       return (List_T) NULL;
5801     }
5802 #endif
5803     debug11(printf("winning_cdna_direction = %d, sensedir = %d\n",
5804 		   *winning_cdna_direction,*sensedir));
5805     return pairs_rev;
5806 
5807   } else {
5808     fprintf(stderr,"Unexpected value %d for winning_cdna_direction\n",*winning_cdna_direction);
5809     abort();
5810   }
5811 }
5812 
5813 
5814 static int
initial_cdna_direction(List_T pairs_fwd,List_T pairs_rev,double avg_donor_score_fwd,double avg_acceptor_score_fwd,double avg_donor_score_rev,double avg_acceptor_score_rev,int nmatches_fwd,int nmismatches_fwd,int nmatches_rev,int nmismatches_rev,int nindels_fwd,int nindels_rev,int indel_alignment_score_fwd,int indel_alignment_score_rev)5815 initial_cdna_direction (List_T pairs_fwd, List_T pairs_rev,
5816 			double avg_donor_score_fwd, double avg_acceptor_score_fwd,
5817 			double avg_donor_score_rev, double avg_acceptor_score_rev
5818 #ifdef COMPLEX_DIRECTION
5819 			, int nmatches_fwd, int nmismatches_fwd, int nmatches_rev, int nmismatches_rev, int nindels_fwd, int nindels_rev,
5820 			int indel_alignment_score_fwd, int indel_alignment_score_rev
5821 #endif
5822 			) {
5823 
5824   if (pairs_fwd == NULL && pairs_rev == NULL) {
5825     debug11(printf("pairs_fwd is NULL and pairs_rev is NULL\n"));
5826     return 0;
5827 
5828   } else if (pairs_rev == NULL) {
5829     debug11(printf("pairs_rev is NULL, so fwd wins\n"));
5830     return +1;
5831 
5832   } else if (pairs_fwd == NULL) {
5833     debug11(printf("pairs_fwd is NULL, so rev wins\n"));
5834     return -1;
5835 
5836   } else if (avg_donor_score_fwd > 0.9 && avg_acceptor_score_fwd > 0.9 &&
5837 	     (avg_donor_score_rev < 0.5 || avg_acceptor_score_rev < 0.5)) {
5838     debug11(printf("intronscores fwd %f,%f > intronscores rev %f,%f, so fwd wins\n",
5839 		   avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
5840     /* intronscores reveal a clear sensedir */
5841     return +1;
5842 
5843   } else if (avg_donor_score_rev > 0.9 && avg_acceptor_score_rev > 0.9 &&
5844 	     (avg_donor_score_fwd < 0.5 || avg_acceptor_score_fwd < 0.5)) {
5845     debug11(printf("intronscores rev %f,%f > intronscores fwd %f,%f, so fwd wins\n",
5846 		   avg_donor_score_rev,avg_acceptor_score_rev,avg_donor_score_fwd,avg_acceptor_score_fwd));
5847     /* intronscores reveal a clear sensedir */
5848     return -1;
5849 
5850 #if 0
5851   } else if (alignment_score_fwd > alignment_score_rev + SCORE_SIGDIFF) {
5852     /* Cannot use alignment score until we do a full alignment */
5853     debug11(printf("alignment_score_fwd %d >> alignment_score_rev %d, so fwd wins\n",
5854 		   alignment_score_fwd,alignment_score_rev));
5855     return +1;
5856 
5857   } else if (alignment_score_rev > alignment_score_fwd + SCORE_SIGDIFF) {
5858     debug11(printf("alignment_score_rev %d << alignment_score_fwd %d, so rev wins\n",
5859 		   alignment_score_rev,alignment_score_fwd));
5860     return -1;
5861 #endif
5862 
5863 #if 0
5864   } else if (nnoncanonical_fwd < nnoncanonical_rev) {
5865     /* Not a good test until we do full dynamic programming */
5866     debug11(printf("nnoncanonical_fwd %d < nnoncanonical_rev %d, so fwd wins\n",
5867 		   nnoncanonical_fwd,nnoncanonical_rev));
5868     return +1;
5869 
5870   } else if (nnoncanonical_rev < nnoncanonical_fwd) {
5871     debug11(printf("nnoncanonical_rev %d < nnoncanonical_fwd %d, so rev wins\n",
5872 		   nnoncanonical_rev,nnoncanonical_fwd));
5873     return -1;
5874 #endif
5875 
5876 #if 0
5877   } else if (avg_donor_score_fwd + avg_acceptor_score_fwd > avg_donor_score_rev + avg_acceptor_score_rev + PROB_SIGDIFF) {
5878     /* Not a good test until we do full dynamic programming */
5879     debug11(printf("intronscores fwd %f+%f > intronscores rev %f+%f, so fwd wins\n",
5880 		   avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
5881     /* intronscores reveal a preferred sensedir */
5882     return +1;
5883 
5884   } else if (avg_donor_score_rev + avg_acceptor_score_rev > avg_donor_score_fwd + avg_acceptor_score_fwd + PROB_SIGDIFF) {
5885     debug11(printf("intronscores rev %f+%f > intronscores fwd %f+%f, so fwd wins\n",
5886 		   avg_donor_score_rev,avg_acceptor_score_rev,avg_donor_score_fwd,avg_acceptor_score_fwd));
5887     /* intronscores reveal a preferred sensedir */
5888     return -1;
5889 #endif
5890 
5891   } else {
5892     return 0;
5893   }
5894 }
5895 
5896 
5897 
5898 T
Stage3_new(struct Pair_T * pairarray,List_T pairs,int npairs,int goodness,int cdna_direction,int sensedir,int matches,int unknowns,int mismatches,int qopens,int qindels,int topens,int tindels,int ncanonical,int nsemicanonical,int nnoncanonical,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool watsonp,int genestrand,int querylength,int skiplength,int trimlength,int straintype,char * strain,IIT_T altstrain_iit)5899 Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness, int cdna_direction, int sensedir,
5900 	    int matches, int unknowns, int mismatches, int qopens, int qindels,
5901 	    int topens, int tindels, int ncanonical, int nsemicanonical, int nnoncanonical,
5902 	    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
5903 	    bool watsonp, int genestrand, int querylength, int skiplength, int trimlength,
5904 	    int straintype, char *strain, IIT_T altstrain_iit) {
5905   T new;
5906   Pair_T start, end;
5907   int *typematches, nmatches;
5908   int alias;
5909 
5910   List_T cigar_tokens;
5911   bool intronp;
5912   int hardclip_start, hardclip_end;
5913 
5914 
5915   /* pairs can be NULL from Stage3_split */
5916   if (pairs == NULL || npairs == 0) {
5917     return (T) NULL;
5918   }
5919 
5920   start = &(pairarray[0]);
5921   end = &(pairarray[npairs-1]);
5922   assert(start->gapp == false);
5923   assert(end->gapp == false);
5924 
5925   hardclip_start = start->querypos;
5926   hardclip_end = (querylength - 1) - end->querypos;
5927 
5928   cigar_tokens = Pair_compute_cigar(&intronp,&hardclip_start,&hardclip_end,pairarray,npairs,querylength,
5929 				    watsonp,/*chimera_part*/0);
5930   if (Pair_cigar_length(cigar_tokens) + hardclip_start + hardclip_end != querylength) {
5931     fprintf(stderr,"Could not compute a valid cigar from the following alignment: %d + %d + %d != %d\n",
5932 	    Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5933     Pair_dump_array_stderr(pairarray,npairs,/*zerobasedp*/true);
5934 #ifdef CHECK_ASSERTIONS
5935     abort();
5936 #endif
5937     Pair_tokens_free(&cigar_tokens);
5938     return (T) NULL;
5939 
5940   } else {
5941     Pair_tokens_free(&cigar_tokens);
5942 
5943     new = (T) MALLOC_OUT(sizeof(*new)); /* Matches FREE_OUT in Stage3_free */
5944     debug99(printf("Creating %p\n",new));
5945   }
5946 
5947   new->pairarray = pairarray;
5948   new->pairarray_freeable_p = true;
5949   new->chimera_left_p = false;
5950   new->chimera_right_p = false;
5951 
5952   new->pairs = pairs;
5953   new->npairs = npairs;
5954 
5955   new->matches = matches;
5956   new->unknowns = unknowns;
5957   new->mismatches = mismatches;
5958   new->qopens = qopens;
5959   new->qindels = qindels;
5960   new->topens = topens;
5961   new->tindels = tindels;
5962 
5963   new->noncanonical = nsemicanonical + nnoncanonical;
5964   new->goodness = goodness;
5965 
5966 #ifdef PMAP
5967   /* Should be +1 */
5968   new->cdna_direction = cdna_direction;
5969   new->sensedir = sensedir;
5970 #else
5971   if (cdna_direction == 0 && require_splicedir_p == true) {
5972     new->cdna_direction = Pair_guess_cdna_direction_array(&new->sensedir,pairarray,npairs,/*invertedp*/false,
5973 							  chroffset,watsonp);
5974   } else if (ncanonical == 0 && nsemicanonical == 0 /*&& nnoncanonical == 0*/) {
5975     new->cdna_direction = 0;
5976     new->sensedir = SENSE_NULL;	/* was sensedir, but this gives bad XS output */
5977   } else {
5978     new->cdna_direction = cdna_direction;
5979     new->sensedir = sensedir;
5980   }
5981 #endif
5982 
5983 #if 0
5984   nexons = Pair_nexons_approx(pairs);
5985   if (nexons > 2) {
5986     /* Favor spliced transcripts, but only if we're sure they're
5987        spliced (i.e., 3 or more exons).  A random intron shouldn't
5988        get credit. */
5989     new->goodness += nexons;
5990   }
5991 #endif
5992 
5993   new->translation_start = 0;
5994   new->translation_end = 0;
5995   new->translation_length = 0;
5996 
5997   /* new->stage2_source = stage2_source; */
5998   /* new->stage2_indexsize = stage2_indexsize; */
5999 
6000   new->straintype = straintype;
6001   new->strain = strain;
6002 
6003   new->chrnum = chrnum;
6004   new->chroffset = chroffset;
6005   new->chrhigh = chrhigh;
6006   new->chrlength = chrlength;
6007   new->circularpos = Pair_circularpos(&alias,pairarray,npairs,chrlength,watsonp,querylength);
6008 
6009   new->watsonp = watsonp;
6010   new->genestrand = genestrand;
6011 
6012   new->genomicstart = chroffset + Pair_genomepos(start);
6013   new->genomicend = chroffset + Pair_genomepos(end);
6014 
6015   /* new->stage3_runtime = stage3_runtime; */
6016 
6017   new->trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trimlength + skiplength);
6018 
6019   debug0(printf("Creating stage3 at chr %d:%u..%u, goodness %d, matches %d, npairs %d\n",
6020 		chrnum,Stage3_chrstart(new),Stage3_chrend(new),new->goodness,new->matches,new->npairs));
6021 
6022   if (straintype == 0) {
6023     return new;
6024   } else {
6025     if (watsonp) {
6026       typematches = IIT_get_typed(&nmatches,altstrain_iit,/*divstring*/NULL,
6027 				  new->genomicstart,new->genomicend,straintype,/*sortp*/false);
6028     } else {
6029       typematches = IIT_get_typed(&nmatches,altstrain_iit,/*divstring*/NULL,
6030 				  new->genomicend,new->genomicstart,straintype,/*sortp*/false);
6031     }
6032     if (typematches == NULL) {
6033       Stage3_free(&new);
6034       return NULL;
6035     } else {
6036       FREE(typematches);
6037       return new;
6038     }
6039   }
6040 
6041   return new;
6042 }
6043 
6044 
6045 T
Stage3_new_from_pairs(List_T pairs,int cdna_direction,bool watsonp,int genestrand,int sensedir,Pairpool_T pairpool,Sequence_T queryseq,int query_subseq_offset,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)6046 Stage3_new_from_pairs (List_T pairs, int cdna_direction, bool watsonp, int genestrand, int sensedir,
6047 		       Pairpool_T pairpool, Sequence_T queryseq, int query_subseq_offset,
6048 		       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
6049   struct Pair_T *pairarray;
6050   int npairs;
6051   int goodness, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
6052     ncanonical, nsemicanonical, nnoncanonical;
6053   double min_splice_prob;
6054 
6055   if (pairs == NULL) {
6056     return (T) NULL;
6057   } else {
6058     /* Use chroffset+chrlength instead of chrhigh, because we are calling this procedure after Pair_split_circular */
6059     pairarray = make_pairarray(&npairs,&pairs,cdna_direction,watsonp,
6060 			       pairpool,/*queryseq_ptr*/Sequence_fullpointer(queryseq),
6061 			       chroffset,/*chrhigh*/chroffset+chrlength,
6062 			       ngap,query_subseq_offset,/*skiplength*/Sequence_skiplength(queryseq));
6063 
6064     goodness = Pair_fracidentity_array(&matches,&unknowns,&mismatches,
6065 				       &qopens,&qindels,&topens,&tindels,
6066 				       &ncanonical,&nsemicanonical,&nnoncanonical,
6067 				       &min_splice_prob,pairarray,npairs,cdna_direction);
6068 
6069     return Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
6070 		      matches,unknowns,mismatches,
6071 		      qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
6072 		      chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
6073 		      /*querylength*/Sequence_fulllength(queryseq),
6074 		      /*skiplength*/Sequence_skiplength(queryseq),
6075 		      /*trimlength*/Sequence_trimlength(queryseq),
6076 		      /*straintype*/0,/*strain*/NULL,/*altstrain_iit*/NULL);
6077   }
6078 }
6079 
6080 
6081 #if 0
6082 void
6083 Stage3_check_for_shared_pairs (T this, T that) {
6084   List_T p, q;
6085   Pair_T pair1, pair2;
6086 
6087   if (this == that) {
6088     return;
6089   } else {
6090     for (p = this->pairs; p != NULL; p = List_next(p)) {
6091       for (q = that->pairs; q != NULL; q = List_next(q)) {
6092 	if (List_head(p) == List_head(q)) {
6093 	  pair1 = (Pair_T) List_head(p);
6094 	  pair2 = (Pair_T) List_head(q);
6095 	  Pair_dump_one(pair1,true);
6096 	  printf("\n");
6097 	  Pair_dump_one(pair2,true);
6098 	  printf("\n");
6099 	  abort();
6100 	}
6101       }
6102     }
6103   }
6104 
6105   return;
6106 }
6107 
6108 
6109 void
6110 Stage3_list_check_for_shared_pairs (List_T stage3list) {
6111   List_T p, q, r, s;
6112   Pair_T pair1, pair2;
6113 
6114   for (r = stage3list; r != NULL; r = List_next(r)) {
6115     for (s = stage3list; s != NULL; s = List_next(s)) {
6116       Stage3_check_for_shared_pairs(List_head(r),List_head(s));
6117     }
6118   }
6119 
6120   return;
6121 }
6122 #endif
6123 
6124 
6125 #if 0
6126 static T
6127 Stage3_copy (T old, Pairpool_T pairpool) {
6128   T new = (T) MALLOC_OUT(sizeof(*new));
6129 
6130   new->pairarray = (struct Pair_T *) MALLOC_OUT(old->npairs*sizeof(struct Pair_T));
6131   memcpy(new->pairarray,old->pairarray,old->npairs*sizeof(struct Pair_T));
6132   new->npairs = old->npairs;
6133   new->pairarray_freeable_p = true;
6134 
6135   new->chimera_left_p = false;
6136   new->chimera_right_p = false;
6137 
6138   new->pairs = Pairpool_copy(old->pairs,pairpool);
6139   new->matches = old->matches;
6140   new->unknowns = old->unknowns;
6141   new->mismatches = old->mismatches;
6142   new->qopens = old->qopens;
6143   new->qindels = old->qindels;
6144   new->topens = old->topens;
6145   new->tindels = old->tindels;
6146 
6147   new->noncanonical = old->noncanonical;
6148   new->goodness = old->goodness;
6149 
6150   new->cdna_direction = old->cdna_direction;
6151   new->sensedir = old->sensedir;
6152 
6153   new->translation_start = old->translation_start;
6154   new->translation_end = old->translation_end;
6155   new->translation_length = old->translation_length;
6156 
6157   new->straintype = old->straintype;
6158   new->strain = old->strain;
6159 
6160   new->chrnum = old->chrnum;
6161   new->chroffset = old->chroffset;
6162   new->chrhigh = old->chrhigh;
6163   new->chrlength = old->chrlength;
6164   new->circularpos = old->circularpos;
6165 
6166   new->watsonp = old->watsonp;
6167   new->genestrand = old->genestrand;
6168 
6169   new->genomicstart = old->genomicstart;
6170   new->genomicend = old->genomicend;
6171 
6172   /* new->stage3_runtime = stage3_runtime; */
6173 
6174   new->trimmed_coverage = old->trimmed_coverage;
6175 
6176   return new;
6177 }
6178 #endif
6179 
6180 
6181 void
Stage3_free(T * old)6182 Stage3_free (T *old) {
6183 
6184   debug99(printf("Freeing %p\n",*old));
6185   if (*old) {
6186     /* Don't free strain.  Belongs to altstrain_iit. */
6187     if ((*old)->pairarray_freeable_p == true) {
6188       FREE_OUT((*old)->pairarray);
6189     }
6190     FREE_OUT(*old);
6191   }
6192   return;
6193 }
6194 
6195 #if 0
6196 /* Needed for mutation analysis in align_relative */
6197 void
6198 Stage3_genomicbounds (Univcoord_T *genomicstart, Univcoord_T *genomiclength, T this) {
6199   *genomicstart = this->chroffset;
6200   *genomiclength = this->genomiclength;
6201   return;
6202 }
6203 #endif
6204 
6205 
6206 bool
Stage3_test_bounds(T this,int minpos,int maxpos)6207 Stage3_test_bounds (T this, int minpos, int maxpos) {
6208   int nstart;
6209 
6210   if (Pairpool_count_bounded(&nstart,this->pairs,minpos,maxpos) >= 25) {
6211     return true;
6212   } else {
6213     return false;
6214   }
6215 }
6216 
6217 
6218 #ifdef PMAP
6219 void
Stage3_translate_cdna(T this,Sequence_T queryaaseq,bool strictp)6220 Stage3_translate_cdna (T this, Sequence_T queryaaseq, bool strictp) {
6221   Translation_via_cdna(&this->translation_start,&this->translation_end,&this->translation_length,
6222 		       &this->relaastart,&this->relaaend,
6223 		       this->pairarray,this->npairs,Sequence_fullpointer(queryaaseq),strictp);
6224   return;
6225 }
6226 
6227 void
Stage3_backtranslate_cdna(T this)6228 Stage3_backtranslate_cdna (T this) {
6229   Backtranslation_cdna(this->pairarray,this->npairs,this->translation_start,this->translation_end);
6230   return;
6231 }
6232 
6233 #elif defined(GSNAP)
6234 
6235 /* No need to perform translation */
6236 
6237 #else
6238 
6239 static void
truncate_fulllength(Stage3_T this,bool translatep,int cds_startpos,int querylength,bool strictp)6240 truncate_fulllength (Stage3_T this, bool translatep, int cds_startpos, int querylength, bool strictp) {
6241 
6242   if (translatep == true) {
6243     if (this->cdna_direction < 0) {
6244       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6245 			      &this->relaastart,&this->relaaend,
6246 			      this->pairarray,this->npairs,/*backwardsp*/true,/*revcompp*/true,/*fulllengthp*/true,
6247 			      cds_startpos,querylength,strictp);
6248     } else {
6249       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6250 			      &this->relaastart,&this->relaaend,
6251 			      this->pairarray,this->npairs,/*backwardsp*/false,/*revcompp*/false,/*fulllengthp*/true,
6252 			      cds_startpos,querylength,strictp);
6253     }
6254   }
6255 
6256   this->npairs = Pair_clip_bounded_array(this->pairarray,this->npairs,
6257 					 /*minpos*/Stage3_translation_start(this),
6258 					 /*maxpos*/Stage3_translation_end(this));
6259   return;
6260 }
6261 
6262 
6263 void
Stage3_translate_genomic(T this,int npairs,bool fulllengthp,int cds_startpos,int querylength,bool truncatep,bool strictp)6264 Stage3_translate_genomic (T this, int npairs, bool fulllengthp, int cds_startpos, int querylength, bool truncatep, bool strictp) {
6265 
6266   if (this->cdna_direction < 0) {
6267     Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6268 			    &this->relaastart,&this->relaaend,
6269 			    this->pairarray,npairs,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6270 			    cds_startpos,querylength,strictp);
6271   } else {
6272     Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6273 			    &this->relaastart,&this->relaaend,
6274 			    this->pairarray,npairs,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6275 			    cds_startpos,querylength,strictp);
6276   }
6277   if (truncatep == true) {
6278     truncate_fulllength(this,/*translatep*/false,cds_startpos,querylength,strictp);
6279     if (this->cdna_direction < 0) {
6280       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6281 			      &this->relaastart,&this->relaaend,
6282 			      this->pairarray,npairs,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6283 			      cds_startpos,querylength,strictp);
6284     } else {
6285       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6286 			      &this->relaastart,&this->relaaend,
6287 			      this->pairarray,npairs,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6288 			      cds_startpos,querylength,strictp);
6289     }
6290   }
6291 
6292   return;
6293 }
6294 #endif
6295 
6296 
6297 #if 0
6298 /* Called previously by align_relative in gmap.c */
6299 #ifndef GSNAP
6300 void
6301 Stage3_translate_cdna_via_reference (T this, T reference) {
6302   /* bool fixshiftp = !literalrefp; */
6303 
6304   if (this->watsonp == reference->watsonp) {
6305     if (reference->cdna_direction < 0) {
6306       Translation_via_reference(&this->relaastart,&this->relaaend,
6307 				this->pairarray,this->npairs,this->watsonp,/*backwardsp*/true,/*revcompp*/true,
6308 				reference->pairarray,reference->npairs,reference->watsonp);
6309     } else {
6310       Translation_via_reference(&this->relaastart,&this->relaaend,
6311 				this->pairarray,this->npairs,this->watsonp,/*backwardsp*/false,/*revcompp*/false,
6312 				reference->pairarray,reference->npairs,reference->watsonp);
6313     }
6314   } else {
6315     if (reference->cdna_direction < 0) {
6316       Translation_via_reference(&this->relaastart,&this->relaaend,
6317 				this->pairarray,this->npairs,this->watsonp,/*backwardsp*/false,/*revcompp*/false,
6318 				reference->pairarray,reference->npairs,reference->watsonp);
6319     } else {
6320       Translation_via_reference(&this->relaastart,&this->relaaend,
6321 				this->pairarray,this->npairs,this->watsonp,/*backwardsp*/true,/*revcompp*/true,
6322 				reference->pairarray,reference->npairs,reference->watsonp);
6323     }
6324   }
6325 
6326   return;
6327 }
6328 #endif
6329 #endif
6330 
6331 
6332 #if 0
6333 /* Called previously by align_relative in gmap.c */
6334 void
6335 Stage3_fix_cdna_direction (T this, T reference) {
6336   if (this->cdna_direction == 0) {
6337     if (reference->cdna_direction > 0) {
6338       if (this->watsonp == reference->watsonp) {
6339 	this->cdna_direction = +1;
6340       } else {
6341 	this->cdna_direction = -1;
6342       }
6343     } else if (reference->cdna_direction < 0) {
6344       if (this->watsonp == reference->watsonp) {
6345 	this->cdna_direction = -1;
6346       } else {
6347 	this->cdna_direction = +1;
6348       }
6349     }
6350   }
6351   return;
6352 }
6353 #endif
6354 
6355 
6356 
6357 #ifndef GSNAP
6358 void
Stage3_translate(T this,Sequence_T queryseq,int querylength,bool fulllengthp,int cds_startpos,bool truncatep,bool strictp)6359 Stage3_translate (T this,
6360 #ifdef PMAP
6361 		  Sequence_T queryseq,
6362 #endif
6363 		  int querylength, bool fulllengthp,
6364 		  int cds_startpos, bool truncatep, bool strictp) {
6365 
6366 #ifdef PMAP
6367   Translation_via_cdna(&this->translation_start,&this->translation_end,&this->translation_length,
6368 		       &this->relaastart,&this->relaaend,
6369 		       this->pairarray,this->npairs,Sequence_fullpointer(queryseq),strictp);
6370   Backtranslation_cdna(this->pairarray,this->npairs,this->translation_start,this->translation_end);
6371 #else
6372   if (this->cdna_direction < 0) {
6373     Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6374 			    &this->relaastart,&this->relaaend,
6375 			    this->pairarray,this->npairs,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6376 			    cds_startpos,querylength,strictp);
6377   } else {
6378     Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6379 			    &this->relaastart,&this->relaaend,
6380 			    this->pairarray,this->npairs,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6381 			    cds_startpos,querylength,strictp);
6382   }
6383 
6384   if (truncatep == true) {
6385     truncate_fulllength(this,/*translatep*/false,cds_startpos,querylength,strictp);
6386     if (this->cdna_direction < 0) {
6387       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6388 			      &this->relaastart,&this->relaaend,
6389 			      this->pairarray,this->npairs,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6390 			      cds_startpos,querylength,strictp);
6391     } else {
6392       Translation_via_genomic(&this->translation_start,&this->translation_end,&this->translation_length,
6393 			      &this->relaastart,&this->relaaend,
6394 			      this->pairarray,this->npairs,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6395 			      cds_startpos,querylength,strictp);
6396     }
6397   }
6398 #endif
6399 
6400   return;
6401 }
6402 #endif
6403 
6404 
6405 #ifndef GSNAP
6406 void
Stage3_translate_chimera(T this,T mate,Sequence_T queryseq,int querylength,bool fulllengthp,int cds_startpos,bool truncatep,bool strictp)6407 Stage3_translate_chimera (T this, T mate,
6408 #ifdef PMAP
6409 			  Sequence_T queryseq,
6410 #endif
6411 			  int querylength, bool fulllengthp,
6412 			  int cds_startpos, bool truncatep, bool strictp) {
6413   int npairs1, npairs2;
6414   int translation_start, translation_end, translation_length, relaastart, relaaend;
6415 
6416   npairs1 = this->npairs;
6417   npairs2 = mate->npairs;
6418 
6419 #ifdef PMAP
6420   Translation_via_cdna(&translation_start,&translation_end,&translation_length,
6421 		       &relaastart,&relaaend,
6422 		       this->pairarray,npairs1 + npairs2,Sequence_fullpointer(queryseq),strictp);
6423   Backtranslation_cdna(this->pairarray,npairs1 + npairs2,translation_start,translation_end);
6424 #else
6425   if (this->cdna_direction < 0) {
6426     Translation_via_genomic(&translation_start,&translation_end,&translation_length,
6427 			    &relaastart,&relaaend,
6428 			    this->pairarray,npairs1 + npairs2,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6429 			    cds_startpos,querylength,strictp);
6430   } else {
6431     Translation_via_genomic(&translation_start,&translation_end,&translation_length,
6432 			    &relaastart,&relaaend,
6433 			    this->pairarray,npairs1 + npairs2,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6434 			    cds_startpos,querylength,strictp);
6435   }
6436 
6437   if (truncatep == true) {
6438     truncate_fulllength(this,/*translatep*/false,cds_startpos,querylength,strictp);
6439     if (this->cdna_direction < 0) {
6440       Translation_via_genomic(&translation_start,&translation_end,&translation_length,
6441 			      &relaastart,&relaaend,
6442 			      this->pairarray,npairs1 + npairs2,/*backwardsp*/true,/*revcompp*/true,fulllengthp,
6443 			      cds_startpos,querylength,strictp);
6444     } else {
6445       Translation_via_genomic(&translation_start,&translation_end,&translation_length,
6446 			      &relaastart,&relaaend,
6447 			      this->pairarray,npairs1 + npairs2,/*backwardsp*/false,/*revcompp*/false,fulllengthp,
6448 			      cds_startpos,querylength,strictp);
6449     }
6450   }
6451 
6452 #endif
6453 
6454   if (translation_start < npairs1) {
6455     this->translation_start = translation_start;
6456     mate->translation_start = 0;
6457   } else {
6458     this->translation_start = npairs1 - 1;
6459     mate->translation_start = translation_start - npairs1;
6460   }
6461   if (translation_end < npairs1) {
6462     this->translation_end = translation_end;
6463     mate->translation_end = 0;
6464   } else {
6465     this->translation_end = npairs1 - 1;
6466     mate->translation_end = translation_end - npairs1;
6467   }
6468 
6469   /* Additional checks to stay within array bounds */
6470   if (this->translation_end >= this->npairs) {
6471     this->translation_end = this->npairs - 1;
6472   }
6473   if (this->translation_start > this->translation_end) {
6474     this->translation_start = this->translation_end;
6475   }
6476 
6477   if (mate->translation_end >= mate->npairs) {
6478     mate->translation_end = mate->npairs - 1;
6479   }
6480   if (mate->translation_start > mate->translation_end) {
6481     mate->translation_start = mate->translation_end;
6482   }
6483 
6484   debug(printf("Converted translation %d..%d in %d+%d pairs to %d..%d and %d..%d\n",
6485 	       translation_start,translation_end,this->npairs,mate->npairs,
6486 	       this->translation_start,this->translation_end,mate->translation_start,mate->translation_end));
6487 
6488   this->translation_length = Pair_translation_length(this->pairarray,this->npairs);
6489   mate->translation_length = Pair_translation_length(mate->pairarray,mate->npairs);
6490   debug(printf("Original translation length %d => %d plus %d\n",
6491 	       translation_length,this->translation_length,mate->translation_length));
6492 
6493   this->relaastart = this->pairarray[this->translation_start].aapos;
6494   this->relaaend = this->pairarray[this->translation_end].aapos;
6495 
6496   mate->relaastart = mate->pairarray[mate->translation_start].aapos;
6497   mate->relaaend = mate->pairarray[mate->translation_end].aapos;
6498 
6499   return;
6500 }
6501 #endif
6502 
6503 
6504 #ifndef GSNAP
6505 void
Stage3_print_pathsummary(Filestring_T fp,T this,int pathnum,Univ_IIT_T chromosome_iit,Univ_IIT_T contig_iit,IIT_T altstrain_iit,Sequence_T queryseq,char * dbversion)6506 Stage3_print_pathsummary (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Univ_IIT_T contig_iit,
6507 			  IIT_T altstrain_iit, Sequence_T queryseq, char *dbversion) {
6508   Pair_T start, end;
6509   bool referencealignp;
6510 
6511   debug99(printf("Printing %p\n",this));
6512   start = &(this->pairarray[0]);
6513   end = &(this->pairarray[this->npairs-1]);
6514   referencealignp = this->straintype == 0 ? true : false;
6515   Pair_print_pathsummary(fp,pathnum,start,end,this->chrnum,this->chroffset,
6516 			 chromosome_iit,referencealignp,altstrain_iit,this->strain,contig_iit,
6517 			 dbversion,Sequence_fulllength_given(queryseq),Sequence_skiplength(queryseq),
6518 			 Sequence_trim_start(queryseq),Sequence_trim_end(queryseq),
6519 			 Pair_nexons(this->pairarray,this->npairs),this->matches,this->unknowns,this->mismatches,
6520 			 this->qopens,this->qindels,this->topens,this->tindels,
6521 			 this->watsonp,this->cdna_direction,
6522 			 this->translation_start,this->translation_end,this->translation_length,
6523 			 /*relaastart*/0,/*relaaend*/0);
6524   Translation_print_comparison(fp,this->pairarray,this->npairs,this->relaastart,this->relaaend);
6525   FPRINTF(fp,"\n");
6526 
6527   return;
6528 }
6529 #endif
6530 
6531 
6532 void
Stage3_print_pslformat_nt(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T usersegment,Sequence_T queryaaseq)6533 Stage3_print_pslformat_nt (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryaaseq) {
6534   Pair_T start, end;
6535 
6536   start = &(this->pairarray[0]);
6537   end = &(this->pairarray[this->npairs-1]);
6538 
6539   Pair_print_pslformat_nt(fp,this->pairarray,this->npairs,start,end,queryaaseq,this->chrnum,
6540 			  chromosome_iit,usersegment,
6541 			  /* Pair_nexons(this->pairarray,this->npairs), */
6542 			  this->matches,this->unknowns,this->mismatches,
6543 			  this->watsonp);
6544   return;
6545 }
6546 
6547 #ifdef PMAP
6548 void
Stage3_print_pslformat_pro(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T usersegment,Sequence_T queryaaseq,bool strictp)6549 Stage3_print_pslformat_pro (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryaaseq, bool strictp) {
6550   Pair_T start, end;
6551 
6552 #if 0
6553   Stage3_translate_cdna(this,queryaaseq,strictp);
6554   Stage3_backtranslate_cdna(this);
6555 #endif
6556 
6557   start = &(this->pairarray[0]);
6558   end = &(this->pairarray[this->npairs-1]);
6559 
6560   Pair_print_pslformat_pro(fp,this->pairarray,this->npairs,start,end,queryaaseq,this->chrnum,
6561 			   chromosome_iit,usersegment,
6562 			   /* Pair_nexons(this->pairarray,this->npairs), */
6563 			   this->watsonp,this->cdna_direction);
6564   return;
6565 }
6566 #endif
6567 
6568 
6569 void
Stage3_print_gff3(Filestring_T fp,T this,int pathnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,Sequence_T queryseq,int querylength,Printtype_T printtype,char * sourcename)6570 Stage3_print_gff3 (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
6571 		   Sequence_T queryseq, int querylength, Printtype_T printtype, char *sourcename) {
6572   Pair_T start, end;
6573   bool gff_gene_format_p, gff_estmatch_format_p;
6574   char *gff3_annotation = NULL;
6575 
6576   if (printtype == GFF3_GENE) {
6577     gff_gene_format_p = true;
6578     gff_estmatch_format_p = false;
6579   } else if (printtype == GFF3_MATCH_CDNA) {
6580     gff_gene_format_p = false;
6581     gff_estmatch_format_p = false;
6582   } else if (printtype == GFF3_MATCH_EST) {
6583     gff_gene_format_p = false;
6584     gff_estmatch_format_p = true;
6585   } else {
6586     fprintf(stderr,"Unexpected printtype %d\n",printtype);
6587     abort();
6588   }
6589 
6590   start = &(this->pairarray[0]);
6591   end = &(this->pairarray[this->npairs-1]);
6592 
6593   if (gff3_fasta_annotation_type == NO_ANNOTATION) {
6594     gff3_annotation = (char *) NULL;
6595   } else if (gff3_fasta_annotation_type == INSERT_ANNOTATION) {
6596     gff3_annotation = Sequence_restofheader_wannot(queryseq);
6597   } else if (gff3_fasta_annotation_type == KEYVALUE_ANNOTATION) {
6598     gff3_annotation = Sequence_restofheader_keyvalue(queryseq);
6599   }
6600 
6601   Pair_print_gff3(fp,this->pairarray,this->npairs,pathnum,
6602 		  Sequence_accession(queryseq),gff3_annotation,start,end,
6603 		  this->chrnum,chromosome_iit,usersegment,
6604 		  this->translation_end,querylength,Sequence_skiplength(queryseq),
6605 		  this->matches,this->mismatches,this->qindels,this->tindels,this->unknowns,
6606 		  this->watsonp,this->cdna_direction,gff_gene_format_p,gff_estmatch_format_p,
6607 		  sourcename);
6608 
6609   if (gff3_annotation != NULL) {
6610     FREE(gff3_annotation);
6611   }
6612 
6613   return;
6614 }
6615 
6616 
6617 #ifndef PMAP
6618 void
Stage3_print_bedpe(Filestring_T fp,T this,Univ_IIT_T chromosome_iit)6619 Stage3_print_bedpe (Filestring_T fp, T this, Univ_IIT_T chromosome_iit) {
6620   Pair_print_bedpe(fp,this->pairarray,this->npairs,this->chrnum,this->watsonp,chromosome_iit);
6621   return;
6622 }
6623 #endif
6624 
6625 
6626 
6627 #ifndef GSNAP
6628 #ifndef PMAP
6629 /* Only for GMAP program */
6630 void
Stage3_print_sam(Filestring_T fp,char * abbrev,T this,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,int mapq_score,Univ_IIT_T chromosome_iit,Sequence_T usersegment,Sequence_T queryseq,int chimera_part,Chimera_T chimera,int quality_shift,bool sam_paired_p,char * sam_read_group_id)6631 Stage3_print_sam (Filestring_T fp, char *abbrev, T this, int pathnum, int npaths_primary, int npaths_altloc,
6632 		  int absmq_score, int second_absmq, int mapq_score,
6633 		  Univ_IIT_T chromosome_iit, Sequence_T usersegment,
6634 		  Sequence_T queryseq, int chimera_part, Chimera_T chimera,
6635 		  int quality_shift, bool sam_paired_p, char *sam_read_group_id) {
6636   int querylength;
6637   Chrpos_T chrpos;
6638   Pair_T pair;
6639 
6640   querylength = Sequence_fulllength_given(queryseq);
6641   if (this->watsonp == true) {
6642     pair = &(this->pairarray[0]);
6643     chrpos = pair->genomepos + 1U;
6644   } else {
6645     pair = &(this->pairarray[this->npairs-1]);
6646     chrpos = pair->genomepos + 1U;
6647   }
6648 
6649   if (this->circularpos > 0) {
6650     Pair_print_sam(fp,abbrev,this->pairarray,this->npairs,
6651 		   Sequence_accession(queryseq),/*acc2*/NULL,this->chrnum,chromosome_iit,usersegment,
6652 		   Sequence_fullpointer(queryseq),Sequence_quality_string(queryseq),
6653 		   /*hardclip5*/0,/*hardclip3*/querylength-this->circularpos,
6654 		   querylength,this->watsonp,this->sensedir,chimera_part,chimera,
6655 		   quality_shift,Sequence_firstp(queryseq),
6656 		   pathnum,npaths_primary,npaths_altloc,absmq_score,second_absmq,chrpos,this->chrlength,
6657 		   mapq_score,sam_paired_p,sam_read_group_id);
6658     Pair_print_sam(fp,abbrev,this->pairarray,this->npairs,
6659 		   Sequence_accession(queryseq),/*acc2*/NULL,this->chrnum,chromosome_iit,usersegment,
6660 		   Sequence_fullpointer(queryseq),Sequence_quality_string(queryseq),
6661 		   /*hardclip5*/this->circularpos,/*hardclip3*/0,
6662 		   querylength,this->watsonp,this->sensedir,chimera_part,chimera,
6663 		   quality_shift,Sequence_firstp(queryseq),
6664 		   pathnum,npaths_primary,npaths_altloc,absmq_score,second_absmq,/*chrpos*/1,this->chrlength,
6665 		   mapq_score,sam_paired_p,sam_read_group_id);
6666   } else {
6667     Pair_print_sam(fp,abbrev,this->pairarray,this->npairs,
6668 		   Sequence_accession(queryseq),/*acc2*/NULL,this->chrnum,chromosome_iit,usersegment,
6669 		   Sequence_fullpointer(queryseq),Sequence_quality_string(queryseq),
6670 		   /*hardclip5*/0,/*hardclip3*/0,querylength,
6671 		   this->watsonp,this->sensedir,chimera_part,chimera,
6672 		   quality_shift,Sequence_firstp(queryseq),
6673 		   pathnum,npaths_primary,npaths_altloc,absmq_score,second_absmq,chrpos,this->chrlength,
6674 		   mapq_score,sam_paired_p,sam_read_group_id);
6675   }
6676 
6677   return;
6678 }
6679 #endif
6680 #endif
6681 
6682 
6683 void
Stage3_print_iit_map(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T queryseq)6684 Stage3_print_iit_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq) {
6685   Pair_T start, end;
6686 
6687   start = &(this->pairarray[0]);
6688   end = &(this->pairarray[this->npairs-1]);
6689 
6690   Pair_print_iit_map(fp,queryseq,Sequence_accession(queryseq),start,end,
6691 		     this->chrnum,chromosome_iit);
6692   return;
6693 }
6694 
6695 void
Stage3_print_iit_exon_map(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T queryseq)6696 Stage3_print_iit_exon_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq) {
6697   Pair_T start, end;
6698 
6699   start = &(this->pairarray[0]);
6700   end = &(this->pairarray[this->npairs-1]);
6701 
6702   Pair_print_iit_exon_map(fp,this->pairarray,this->npairs,queryseq,Sequence_accession(queryseq),
6703 			  start,end,this->chrnum,chromosome_iit);
6704   return;
6705 }
6706 
6707 void
Stage3_print_splicesites(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T queryseq)6708 Stage3_print_splicesites (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq) {
6709   Pair_print_splicesites(fp,this->pairarray,this->npairs,Sequence_accession(queryseq),
6710 			 Pair_nexons(this->pairarray,this->npairs),this->chrnum,
6711 			 chromosome_iit,this->watsonp);
6712   return;
6713 }
6714 
6715 void
Stage3_print_introns(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,Sequence_T queryseq)6716 Stage3_print_introns (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq) {
6717   Pair_print_introns(fp,this->pairarray,this->npairs,Sequence_accession(queryseq),
6718 		     Pair_nexons(this->pairarray,this->npairs),this->chrnum,
6719 		     chromosome_iit);
6720   return;
6721 }
6722 
6723 
6724 #if 0
6725 /* Called previously by align_relative in gmap.c */
/* NOTE: this whole procedure is compiled out by the enclosing #if 0 and
   is kept for reference only.

   Prints a path summary for this alignment, followed by a translation
   comparison and, when showalignp is true, the full alignment.  The
   chromosome number and offset passed to the printing routines are
   taken from reference, not from this. */
6726 #ifndef GSNAP
6727 void
6728 Stage3_print_mutations (Filestring_T fp, T this, T reference, Univ_IIT_T chromosome_iit, Sequence_T queryseq,
6729 			char *dbversion, bool showalignp, int invertmode, bool nointronlenp, int wraplength) {
6730   Pair_T start, end;
6731   bool referencealignp;
6732 
  /* First and last pairs of the alignment */
6733   start = &(this->pairarray[0]);
6734   end = &(this->pairarray[this->npairs-1]);
6735 
6736   /*  Pair_dump_array(this->pairarray,this->npairs,false); */
6737 
  /* A straintype of 0 is treated as an alignment to the reference */
6738   referencealignp = this->straintype == 0 ? true : false;
6739   Pair_print_pathsummary(fp,/*pathnum*/1,start,end,reference->chrnum,reference->chroffset,
6740 			 chromosome_iit,referencealignp,/*altstrain_iit*/NULL,this->strain,/*contig_iit*/NULL,
6741 			 dbversion,Sequence_fulllength_given(queryseq),Sequence_skiplength(queryseq),
6742 			 Sequence_trim_start(queryseq),Sequence_trim_end(queryseq),
6743 			 Pair_nexons(this->pairarray,this->npairs),this->matches,this->unknowns,this->mismatches,
6744 			 this->qopens,this->qindels,this->topens,this->tindels,
6745 			 this->watsonp,this->cdna_direction,0,0,0,this->relaastart,this->relaaend);
6746   Translation_print_comparison(fp,this->pairarray,this->npairs,this->relaastart,this->relaaend);
6747   FPRINTF(fp,"\n");
6748 
6749   if (showalignp == true) {
6750     Pair_print_alignment(fp,this->pairarray,this->npairs,reference->chrnum,reference->chroffset,
6751 			 chromosome_iit,this->watsonp,invertmode,nointronlenp,wraplength);
6752   }
6753   debug1(Pair_dump_array(this->pairarray,this->npairs,/*zerobasedp*/true));
6754   debug1(Pair_check_array(this->pairarray,this->npairs));
6755 
6756   return;
6757 }
6758 #endif
6759 #endif
6760 
6761 
6762 
6763 static void
print_map(Filestring_T fp,T this,IIT_T map_iit,int * map_divint_crosstable,Univ_IIT_T chromosome_iit,int pathnum,bool map_bothstrands_p,int nflanking,bool print_comment_p)6764 print_map (Filestring_T fp, T this, IIT_T map_iit, int *map_divint_crosstable,
6765 	   Univ_IIT_T chromosome_iit, int pathnum, bool map_bothstrands_p,
6766 	   int nflanking, bool print_comment_p) {
6767   int chrlow, chrhigh;
6768   Pair_T start, end;
6769   int chrpos1, chrpos2;
6770   int *iit_matches = NULL, nmatches, *leftflanks, nleftflanks, *rightflanks, nrightflanks;
6771   int divno, sign;
6772   char *chr;
6773 
6774   if ((divno = map_divint_crosstable[this->chrnum]) <= 0) {
6775     FPRINTF(fp,"  *Map hits for path %d (0):\n\n",pathnum);
6776     return;
6777   } else {
6778     chr = Chrnum_to_string(this->chrnum,chromosome_iit);
6779   }
6780 
6781   start = &(this->pairarray[0]);
6782   end = &(this->pairarray[this->npairs-1]);
6783 
6784   if (this->watsonp) {
6785     chrlow = chrpos1 = Pair_genomepos(start);
6786     chrhigh = chrpos2 = Pair_genomepos(end);
6787     sign = +1;
6788 
6789   } else {
6790     chrhigh = chrpos1 = Pair_genomepos(start);
6791     chrlow = chrpos2 = Pair_genomepos(end);
6792     sign = -1;
6793   }
6794 
6795   if (map_bothstrands_p == true) {
6796     iit_matches = IIT_get_with_divno(&nmatches,map_iit,divno,chrlow,chrhigh,/*sortp*/false);
6797     if (nflanking > 0) {
6798       IIT_get_flanking_with_divno(&leftflanks,&nleftflanks,&rightflanks,&nrightflanks,map_iit,
6799 				  divno,chrlow,chrhigh,nflanking,/*sign*/0);
6800     }
6801     if (nflanking > 0) {
6802       FPRINTF(fp,"  Map hits for path %d (%d|%d|%d):\n",pathnum,nleftflanks,nmatches,nrightflanks);
6803     } else {
6804       FPRINTF(fp,"  Map hits for path %d (%d):\n",pathnum,nmatches);
6805     }
6806     if (nflanking > 0) {
6807       IIT_print_header(fp,map_iit,leftflanks,nleftflanks,chr,
6808 		       /*reversep*/true,/*relativep*/false,/*left*/0U,print_comment_p);
6809       FPRINTF(fp,"    ====================\n");
6810     }
6811     IIT_print_header(fp,map_iit,iit_matches,nmatches,chr,
6812 		     /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6813     if (nflanking > 0) {
6814       FPRINTF(fp,"    ====================\n");
6815       IIT_print_header(fp,map_iit,rightflanks,nrightflanks,chr,
6816 		       /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6817     }
6818 
6819   } else {
6820     iit_matches = IIT_get_signed_with_divno(&nmatches,map_iit,divno,chrlow,chrhigh,/*sortp*/true,sign);
6821     if (nflanking > 0) {
6822       IIT_get_flanking_with_divno(&leftflanks,&nleftflanks,&rightflanks,&nrightflanks,map_iit,
6823 				  divno,chrlow,chrhigh,nflanking,sign);
6824     }
6825     if (nflanking > 0) {
6826       FPRINTF(fp,"  Map hits for path %d (%d|%d|%d):\n",pathnum,nleftflanks,nmatches,nrightflanks);
6827     } else {
6828       FPRINTF(fp,"  Map hits for path %d (%d):\n",pathnum,nmatches);
6829     }
6830     if (nflanking > 0) {
6831       IIT_print_header(fp,map_iit,leftflanks,nleftflanks,chr,
6832 		       /*reversep*/true,/*relativep*/false,/*left*/0U,print_comment_p);
6833       FPRINTF(fp,"    ====================\n");
6834     }
6835     IIT_print_header(fp,map_iit,iit_matches,nmatches,chr,
6836 		     /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6837     if (nflanking > 0) {
6838       FPRINTF(fp,"    ====================\n");
6839       IIT_print_header(fp,map_iit,rightflanks,nrightflanks,chr,
6840 		       /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6841     }
6842   }
6843   FPRINTF(fp,"\n");
6844 
6845   if (nflanking > 0) {
6846     FREE(rightflanks);
6847     FREE(leftflanks);
6848   }
6849 
6850   FREE(iit_matches);
6851   FREE(chr);
6852   return;
6853 }
6854 
6855 
6856 /* Doesn't handle nflanking */
6857 static void
print_exon_map(Filestring_T fp,T this,IIT_T map_iit,int * map_divint_crosstable,Univ_IIT_T chromosome_iit,int pathnum,bool map_bothstrands_p,bool print_comment_p)6858 print_exon_map (Filestring_T fp, T this, IIT_T map_iit, int *map_divint_crosstable,
6859 		Univ_IIT_T chromosome_iit, int pathnum, bool map_bothstrands_p, bool print_comment_p) {
6860   Uintlist_T exonbounds;
6861   Chrpos_T position1, position2;
6862   int *iit_matches = NULL, nmatches;
6863   int divno, exonno = 0;
6864   char *chr;
6865 
6866   if ((divno = map_divint_crosstable[this->chrnum]) <= 0) {
6867     FPRINTF(fp,"  *Map hits for path %d (0):\n\n",pathnum);
6868     return;
6869   } else {
6870     chr = Chrnum_to_string(this->chrnum,chromosome_iit);
6871   }
6872 
6873   exonbounds = Pair_exonbounds(this->pairarray,this->npairs);
6874 
6875   while (exonbounds != NULL) {
6876     exonbounds = Uintlist_pop(exonbounds,&position1);
6877     exonbounds = Uintlist_pop(exonbounds,&position2);
6878 
6879     if (map_bothstrands_p == true) {
6880       if (position1 < position2) {
6881 	iit_matches = IIT_get(&nmatches,map_iit,chr,position1,position2,/*sortp*/true);
6882       } else {
6883 	iit_matches = IIT_get(&nmatches,map_iit,chr,position2,position1,/*sortp*/true);
6884       }
6885       FPRINTF(fp,"  Map hits for path %d, exon %d (%d):\n",pathnum,++exonno,nmatches);
6886       IIT_print_header(fp,map_iit,iit_matches,nmatches,chr,
6887 		       /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6888 
6889     } else {
6890       if (position1 < position2) {
6891 	iit_matches = IIT_get_signed_with_divno(&nmatches,map_iit,divno,position1,position2,
6892 						/*sortp*/true,/*sign*/+1);
6893       } else {
6894 	iit_matches = IIT_get_signed_with_divno(&nmatches,map_iit,divno,position2,position1,
6895 						/*sortp*/true,/*sign*/-1);
6896       }
6897       FPRINTF(fp,"  Map hits for path %d, exon %d (%d):\n",pathnum,++exonno,nmatches);
6898       IIT_print_header(fp,map_iit,iit_matches,nmatches,chr,
6899 		       /*reversep*/false,/*relativep*/false,/*left*/0U,print_comment_p);
6900     }
6901     FPRINTF(fp,"\n");
6902     FREE(iit_matches);
6903   }
6904 
6905   return;
6906 }
6907 
6908 void
Stage3_print_map(Filestring_T fp,T this,IIT_T map_iit,int * map_divint_crosstable,Univ_IIT_T chromosome_iit,int pathnum,bool map_exons_p,bool map_bothstrands_p,int nflanking,bool print_comment_p)6909 Stage3_print_map (Filestring_T fp, T this, IIT_T map_iit, int *map_divint_crosstable, Univ_IIT_T chromosome_iit,
6910 		  int pathnum, bool map_exons_p, bool map_bothstrands_p, int nflanking,
6911 		  bool print_comment_p) {
6912   if (map_exons_p == true) {
6913     print_exon_map(fp,this,map_iit,map_divint_crosstable,
6914 		   chromosome_iit,pathnum,map_bothstrands_p,print_comment_p);
6915   } else {
6916     print_map(fp,this,map_iit,map_divint_crosstable,
6917 	      chromosome_iit,pathnum,map_bothstrands_p,nflanking,print_comment_p);
6918   }
6919   return;
6920 }
6921 
6922 
6923 
6924 /* queryaaseq is used only by PMAP */
6925 void
Stage3_print_alignment(Filestring_T fp,T this,Genome_T genome,Univ_IIT_T chromosome_iit,Printtype_T printtype,bool continuousp,bool continuous_by_exon_p,bool genomefirstp,int invertmode,bool nointronlenp,int wraplength)6926 Stage3_print_alignment (Filestring_T fp, T this, Genome_T genome,
6927 			Univ_IIT_T chromosome_iit, Printtype_T printtype,
6928 			bool continuousp, bool continuous_by_exon_p, bool genomefirstp,
6929 			int invertmode, bool nointronlenp, int wraplength) {
6930   if (continuous_by_exon_p == true) {
6931     Pair_print_exonsummary(fp,this->pairarray,this->npairs,this->chrnum,this->chroffset,
6932 			   genome,chromosome_iit,this->watsonp,this->cdna_direction,genomefirstp,invertmode);
6933     Pair_print_continuous_byexon(fp,this->pairarray,this->npairs,this->watsonp,invertmode);
6934 
6935   } else if (continuousp == true) {
6936     Pair_print_continuous(fp,this->pairarray,this->npairs,this->watsonp,
6937 			  genomefirstp,invertmode,nointronlenp);
6938   } else {
6939     /* Assumes Stage3_print_pathsummary already called */
6940     Pair_print_exonsummary(fp,this->pairarray,this->npairs,this->chrnum,this->chroffset,
6941 			   genome,chromosome_iit,this->watsonp,this->cdna_direction,genomefirstp,invertmode);
6942     if (printtype == ALIGNMENT) {
6943       Pair_print_alignment(fp,this->pairarray,this->npairs,this->chrnum,this->chroffset,
6944 			   chromosome_iit,this->watsonp,invertmode,nointronlenp,wraplength);
6945     }
6946   }
6947   debug1(Pair_dump_array(this->pairarray,this->npairs,/*zerobasedp*/true));
6948   debug1(Pair_check_array(this->pairarray,this->npairs));
6949   return;
6950 }
6951 
6952 
6953 void
Stage3_print_coordinates(Filestring_T fp,T this,Univ_IIT_T chromosome_iit,int invertmode)6954 Stage3_print_coordinates (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, int invertmode) {
6955   Pair_print_coordinates(fp,this->pairarray,this->npairs,this->chrnum,this->chroffset,
6956 			 chromosome_iit,this->watsonp,invertmode);
6957   return;
6958 }
6959 
6960 
6961 void
Stage3_print_cdna(Filestring_T fp,T this,int wraplength)6962 Stage3_print_cdna (Filestring_T fp, T this, int wraplength) {
6963 #ifdef PMAP
6964   Pair_print_nucleotide_cdna(fp,this->pairarray,this->npairs,wraplength);
6965 #else
6966   if (this->cdna_direction >= 0) {
6967     Pair_print_protein_cdna(fp,this->pairarray,this->npairs,wraplength,/*forwardp*/true);
6968   } else {
6969     Pair_print_protein_cdna(fp,&(this->pairarray[this->npairs-1]),this->npairs,wraplength,/*forwardp*/false);
6970   }
6971 #endif
6972   return;
6973 }
6974 
6975 void
Stage3_print_protein_genomic(Filestring_T fp,T this,int wraplength)6976 Stage3_print_protein_genomic (Filestring_T fp, T this, int wraplength) {
6977   if (this->cdna_direction >= 0) {
6978     Pair_print_protein_genomic(fp,this->pairarray,this->npairs,wraplength,/*forwardp*/true);
6979   } else {
6980     Pair_print_protein_genomic(fp,&(this->pairarray[this->npairs-1]),this->npairs,wraplength,/*forwardp*/false);
6981   }
6982   return;
6983 }
6984 
6985 
6986 void
Stage3_print_compressed(Filestring_T fp,T this,Sequence_T queryseq,Univ_IIT_T chromosome_iit,char * dbversion,Sequence_T usersegment,int pathnum,int npaths,bool checksump,int chimerapos,int chimeraequivpos,double donor_prob,double acceptor_prob,int chimera_cdna_direction)6987 Stage3_print_compressed (Filestring_T fp, T this, Sequence_T queryseq, Univ_IIT_T chromosome_iit,
6988 			 char *dbversion, Sequence_T usersegment, int pathnum, int npaths,
6989 			 bool checksump, int chimerapos, int chimeraequivpos,
6990 			 double donor_prob, double acceptor_prob, int chimera_cdna_direction) {
6991   Pair_T start, end;
6992 
6993 #if 0
6994 #ifdef PMAP
6995   Stage3_translate_cdna(this,queryseq,strictp);
6996   Stage3_backtranslate_cdna(this);
6997 #else
6998   if (truncatep == true) {
6999     truncate_fulllength(this,/*translatep*/true,/*cds_startpos*/-1,
7000 			Sequence_fulllength_given(queryseq),strictp);
7001   }
7002 #endif
7003 #endif
7004 
7005   start = &(this->pairarray[0]);
7006   end = &(this->pairarray[this->npairs-1]);
7007   Pair_print_compressed(fp,pathnum,npaths,start,end,queryseq,dbversion,usersegment,
7008 			Pair_nexons(this->pairarray,this->npairs),
7009 			Stage3_fracidentity(this),this->pairarray,this->npairs,
7010 			this->chrnum,this->chroffset,chromosome_iit,
7011 			Sequence_fulllength_given(queryseq),Sequence_skiplength(queryseq),
7012 			Sequence_trim_start(queryseq),Sequence_trim_end(queryseq),
7013 			checksump,chimerapos,chimeraequivpos,donor_prob,acceptor_prob,
7014 			chimera_cdna_direction,this->strain,this->cdna_direction);
7015   return;
7016 }
7017 
7018 
7019 
7020 #if 0
/* NOTE: compiled out by the enclosing #if 0; kept for reference only.

   Classifies the dinucleotide on the left side (left1,left2) and the one
   on the right side (right2,right1) of an intron, and returns the
   bitwise AND of the two codes.  An unrecognized dinucleotide on either
   side is coded as 0x00, which makes the result 0.  The CT, GC and AT
   alternatives guarded by #ifndef PMAP are recognized only when PMAP is
   not defined. */
7021 static int
7022 compute_introntype (char left1, char left2, char right2, char right1) {
7023   int leftdi, rightdi;
7024 
  /* Code for the first two nucleotides of the intron */
7025   if (left1 == 'G' && left2 == 'T') {
7026     leftdi = LEFT_GT;
7027   } else if (left1 == 'G' && left2 == 'C') {
7028     leftdi = LEFT_GC;
7029   } else if (left1 == 'A' && left2 == 'T') {
7030     leftdi = LEFT_AT;
7031 #ifndef PMAP
7032   } else if (left1 == 'C' && left2 == 'T') {
7033     leftdi = LEFT_CT;
7034 #endif
7035   } else {
7036     leftdi = 0x00;
7037   }
7038 
  /* Code for the last two nucleotides of the intron */
7039   if (right2 == 'A' && right1 == 'G') {
7040     rightdi = RIGHT_AG;
7041   } else if (right2 == 'A' && right1 == 'C') {
7042     rightdi = RIGHT_AC;
7043 #ifndef PMAP
7044   } else if (right2 == 'G' && right1 == 'C') {
7045     rightdi = RIGHT_GC;
7046   } else if (right2 == 'A' && right1 == 'T') {
7047     rightdi = RIGHT_AT;
7048 #endif
7049   } else {
7050     rightdi = 0x00;
7051   }
7052 
  /* Only the bits common to both codes survive */
7053   return leftdi & rightdi;
7054 }
7055 #endif
7056 
7057 #if 0
7058 static char uppercaseCode[128] = UPPERCASE_U2T;
7059 #endif
7060 
7061 #if 0
/* NOTE: compiled out by the enclosing #if 0; kept for reference only.
   It refers to uppercaseCode, whose declaration is likewise under #if 0.

   Older version of peel_leftward.  Moves pairs from the head of path
   onto a separate list, which is handed back through *peeled_path, and
   returns what remains of path.

   Outputs, as set by the code below:
     *mismatchp is set to true if any peeled pair has cdna and genome
       characters that differ after mapping through uppercaseCode.
     *querydp5 and *genomedp5 are set only when something was peeled: to
       one past the position of the pair left at the head of path, or,
       if path is empty, to the position of the pair at the head of the
       peeled list.
     If endgappairs is non-NULL, *endgappairs, *gappair,
       *querydp5_medialgap and *genomedp5_medialgap describe a gap or
       indel left at the head of path plus the pairs peeled beyond it.
     With quit_on_gap_p true, running into a gap returns everything to
       path and sets *peeled_path to NULL.

   NOTE(review): throughmismatchp is not referenced in the body, and
   nconsecutive is used only inside a nested #if 0. */
7062 static List_T
7063 peel_leftward_old (bool *mismatchp, List_T *peeled_path, List_T path, int *querydp5, int *genomedp5,
7064 #ifdef WASTE
7065 		   Pairpool_T pairpool,
7066 #endif
7067 		   int maxpeelback, bool throughmismatchp, bool quit_on_gap_p,
7068 		   List_T *endgappairs, Pair_T *gappair, int *querydp5_medialgap, int *genomedp5_medialgap) {
7069   List_T peeled = NULL, rest = NULL, pairptr;
7070   Pair_T pair, nextpair, rightpair;
7071   int npeelback = 0, nconsecutive = 0, init_dynprogindex = DYNPROGINDEX_MINOR;
7072   bool stopp;
7073   int nmatches;
7074 #if 0
7075   int nincursion = 0;
7076 #endif
7077 
7078   *mismatchp = false;
7079   debug(printf("Peeling leftward:"));
7080   if (path == NULL) {
7081     debug(printf(" path is empty\n"));
7082   } else {
7083     pair = path->first;
7084     if (pair->gapp == true) {
7085       /* Throw away known gap */
7086       debug(printf(" Known_gap"));
7087       pairptr = path;
7088       path = Pairpool_pop(path,&pair);
7089 #ifdef WASTE
7090       peeled = Pairpool_push_existing(peeled,pairpool,pair);
7091 #else
7092       peeled = List_push_existing(peeled,pairptr);
7093 #endif
7094     }
7095     rest = path->rest;
7096 
    /* Main peel: stops when the pair after the head is a gap, has a
       blank cdna or genome character, or is protected, or once
       maxpeelback pairs have been peeled */
7097     stopp = false;
7098     while (rest != NULL && stopp == false) {
7099       nextpair = rest->first;
7100       if (nextpair->gapp == true || nextpair->cdna == ' ' || nextpair->genome == ' ' || nextpair->protectedp == true) {
7101 	stopp = true;
7102       } else {
7103 	pairptr = path;
7104 	path = Pairpool_pop(path,&pair);
7105 #ifdef WASTE
7106 	peeled = Pairpool_push_existing(peeled,pairpool,pair);
7107 #else
7108 	peeled = List_push_existing(peeled,pairptr);
7109 #endif
7110 	debug(printf(" Peel [");
7111 	      Pair_dump_one(pair,/*zerobasedp*/true);
7112 	      printf("]"));
7113 
7114 	if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
7115 	  *mismatchp = true;
7116 	}
7117 
7118 	if (++npeelback >= maxpeelback) {
7119 	  stopp = true;
7120 	}
7121 
	/* Remember the first non-positive dynprogindex encountered */
7122 	if (init_dynprogindex > 0 && pair->dynprogindex <= 0) {
7123 	  init_dynprogindex = pair->dynprogindex;
7124 	}
7125 
7126 	rest = path->rest;
7127       }
7128     }
7129 
7130     /* Continue to peelback through little skips and mismatches */
7131     debug(printf("\n||"));
7132 
7133     stopp = false;
7134     while (rest != NULL && stopp == false) {
7135       nextpair = rest->first;
7136       if (nextpair->gapp == true) {
7137 	/* Peel this one, but then stop at end of loop */
7138       } else if (nextpair->protectedp == true) {
7139 	/* Stop because it's protected */
7140 	stopp = true;
7141       } else if (nextpair->cdna != ' ' && nextpair->genome != ' ') {
7142 	/* Stop because it looks okay */
7143 	stopp = true;
7144       }
7145 
      /* The head pair is peeled on every iteration, including the one
	 on which stopp was just set */
7146       pairptr = path;
7147       path = Pairpool_pop(path,&pair);
7148 #ifdef WASTE
7149       peeled = Pairpool_push_existing(peeled,pairpool,pair);
7150 #else
7151       peeled = List_push_existing(peeled,pairptr);
7152 #endif
7153       debug(printf(" Extrapeel [");
7154 	    Pair_dump_one(pair,/*zerobasedp*/true);
7155 	    printf("]"));
7156 
7157       if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
7158 	*mismatchp = true;
7159       }
7160 
7161 #if 0
7162       if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP || pair->comp == MISMATCH_COMP) {
7163 	nconsecutive = 0;
7164       } else if (++nconsecutive >= SUFFCONSECUTIVE) {
7165 	stopp = true;
7166       }
7167 #endif
7168 
7169 #if 0
7170       if (pair->dynprogindex != init_dynprogindex) {
7171 	if (++nincursion >= MAXINCURSION) {
7172 	  stopp = true;
7173 	}
7174       }
7175 #endif
7176 
7177       if (pair->gapp == true) {
7178 	stopp = true;
7179       }
7180 
7181       rest = path->rest;
7182     }
7183   }
7184 
7185 #ifdef PMAP
7186   /* Reverse process to codon boundary.  Cases:
7187 
7188      X X X | X -
7189      0 1 2   3 4
7190 
7191      X X X | - X
7192      0 1 2   3 3
7193 
7194      X X - X | X
7195      0 1 2 2   3
7196 
7197      Rule: nextpair->querypos % 3 == 0 */
7198 
7199   debug(printf("\n<<"));
7200   if (peeled != NULL) {
7201     rest = peeled->rest;
7202     stopp = false;
7203     while (rest != NULL && stopp == false) {
7204       pairptr = peeled;
7205       peeled = Pairpool_pop(peeled,&pair);
7206 #ifdef WASTE
7207       path = Pairpool_push_existing(path,pairpool,pair);
7208 #else
7209       path = List_push_existing(path,pairptr);
7210 #endif
7211       debug(printf(" Mod3putback [");
7212 	    Pair_dump_one(pair,/*zerobasedp*/true);
7213 	    printf("]"));
7214       nextpair = rest->first;
7215       if (nextpair->querypos % 3 == 0) {
7216 	stopp = true;
7217       }
7218       rest = peeled->rest;
7219     }
7220   }
7221 #endif
7222 
7223   if (peeled == NULL) {
7224     /* Do not alter querydp5 or genomedp5 */
7225   } else {
7226     rightpair = peeled->first;
    /* NOTE(review): rightpair is refreshed from path->first at the
       bottom of this loop, not from peeled->first -- confirm this was
       intended */
7227     while (peeled != NULL && (rightpair->gapp == true || rightpair->comp == INDEL_COMP || rightpair->comp == SHORTGAP_COMP)) {
7228       debug(printf("Ran into gap; undoing peel, case 1, rightpair gapp %d, comp %c\n",
7229 		   rightpair->gapp,rightpair->comp));
7230       if (endgappairs != NULL) {
7231 	path = Pairpool_transfer(path,*endgappairs);
7232 	*endgappairs = (List_T) NULL;
7233       }
7234 
7235       if (quit_on_gap_p == true) {
7236 	path = Pairpool_transfer(path,peeled);
7237 	*peeled_path = (List_T) NULL;
7238 	return path;
7239 
7240       } else {
7241 	/* Put back 1 */
7242 	/* if ((pairptr = peeled) != NULL) { */
7243 	pairptr = peeled;
7244 	peeled = Pairpool_pop(peeled,&pair);
7245 	path = List_push_existing(path,pairptr);
7246 	debug(printf(" Putback [");
7247 	      Pair_dump_one(pair,/*zerobasedp*/true);
7248 	      printf("]"));
7249 	  /* } */
7250 
7251 #if 0
7252 	/* Put back 2 */
7253 	if ((pairptr = peeled) != NULL) {
7254 	  peeled = Pairpool_pop(peeled,&pair);
7255 	  path = List_push_existing(path,pairptr);
7256 	  debug(printf(" Putback [");
7257 		Pair_dump_one(pair,/*zerobasedp*/true);
7258 		printf("]"));
7259 	}
7260 #endif
7261       }
7262 
7263       rightpair = path->first;
7264     }
7265 
7266     if (path != NULL) {
7267       rightpair = path->first;
7268       *querydp5 = rightpair->querypos + 1;
7269       *genomedp5 = rightpair->genomepos + 1;
7270     } else if (peeled != NULL) {
7271       rightpair = peeled->first;
7272       *querydp5 = rightpair->querypos;
7273       *genomedp5 = rightpair->genomepos;
7274     } else {
      /* NOTE(review): the message names peel_rightward although this
	 procedure is peel_leftward_old */
7275       fprintf(stderr,"In peel_rightward, path and peeled are both NULL\n");
7276       abort();
7277     }
7278   }
7279 
7280   if (endgappairs != NULL) {
7281     if (path == NULL || (pair = path->first) == NULL || (pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP)) {
      /* No gap or indel at the head of path: the medial-gap coordinates
	 are the same as the ordinary ones */
7282       *endgappairs = NULL;
7283       *querydp5_medialgap = *querydp5;
7284       *genomedp5_medialgap = *genomedp5;
7285     } else {
7286       pairptr = path;
7287       path = Pairpool_pop(path,&pair);
7288 #ifdef WASTE
7289       *endgappairs = Pairpool_push_existing(NULL,pairpool,pair);
7290 #else
7291       *endgappairs = List_push_existing(NULL,pairptr);
7292 #endif
7293       debug(printf(" Peeling gap [");
7294 	    Pair_dump_one(pair,/*zerobasedp*/true);
7295 	    printf("]"));
7296       *gappair = pair;
7297       debug(printf(" gapcomp: '%c'",pair->comp));
7298 
      /* Peel beyond the gap until 3 matching pairs have been taken or
	 path runs out */
7299       nmatches = 0;
7300       while (path != NULL && nmatches < 3) {
7301 	pairptr = path;
7302 	path = Pairpool_pop(path,&pair);
7303 #ifdef WASTE
7304 	*endgappairs = Pairpool_push_existing(*endgappairs,pairpool,pair);
7305 #else
7306 	*endgappairs = List_push_existing(*endgappairs,pairptr);
7307 #endif
7308 	debug(printf(" Peeling after gap [");
7309 	      Pair_dump_one(pair,/*zerobasedp*/true);
7310 	      printf("]"));
7311 	if (uppercaseCode[(int) pair->cdna] == uppercaseCode[(int) pair->genome]) {
7312 	  nmatches++;
7313 	}
7314       }
7315 
7316       rightpair = (*endgappairs)->first;
7317       if (rightpair->gapp == true || rightpair->comp == INDEL_COMP || rightpair->comp == SHORTGAP_COMP) {
7318 	debug(printf("Ran into gap; undoing peel, case 2\n"));
7319 	path = Pairpool_transfer(path,*endgappairs);
7320 	*endgappairs = (List_T) NULL;
7321 
7322 	if (quit_on_gap_p == true) {
7323 	  path = Pairpool_transfer(path,peeled);
7324 	  *peeled_path = (List_T) NULL;
7325 	  return path;
7326 
7327 	} else {
7328 	  /* Put back 1 */
7329 	  if ((pairptr = peeled) != NULL) {
7330 	    peeled = Pairpool_pop(peeled,&pair);
7331 	    path = List_push_existing(path,pairptr);
7332 	    debug(printf(" Putback [");
7333 		  Pair_dump_one(pair,/*zerobasedp*/true);
7334 		  printf("]"));
7335 	  }
7336 
7337 	  /* Put back 2 */
7338 	  if ((pairptr = peeled) != NULL) {
7339 	    peeled = Pairpool_pop(peeled,&pair);
7340 	    path = List_push_existing(path,pairptr);
7341 	    debug(printf(" Putback [");
7342 		  Pair_dump_one(pair,/*zerobasedp*/true);
7343 		  printf("]"));
7344 	  }
7345 	}
7346       }
7347 
7348       if (path != NULL) {
7349 	rightpair = path->first;
7350 	*querydp5_medialgap = rightpair->querypos + 1;
7351 	*genomedp5_medialgap = rightpair->genomepos + 1;
7352       } else if (peeled != NULL) {
7353 	rightpair = peeled->first;
7354 	*querydp5_medialgap = rightpair->querypos;
7355 	*genomedp5_medialgap = rightpair->genomepos;
7356       } else {
	/* NOTE(review): as above, the message names peel_rightward */
7357 	fprintf(stderr,"In peel_rightward for medialgap, path and peeled are both NULL\n");
7358 	abort();
7359       }
7360     }
7361   }
7362 
7363   /* assert(peeled == NULL || path == NULL || ((Pair_T) path->first)->comp != INDEL_COMP); */
7364   debug(
7365 	if (path == NULL) {
7366 	  printf(" => Top of path is NULL.");
7367 	} else {
7368 	  pair = path->first;
7369 	  printf(" => Top of path is ");
7370 	  Pair_dump_one(pair,/*zerobasedp*/true);
7371 	}
7372 	printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
7373 	);
7374 
7375   *peeled_path = peeled;
7376   return path;
7377 }
7378 #endif
7379 
7380 
7381 static List_T
peel_leftward(int * n_peeled_indels,bool * protectedp,List_T * peeled_path,List_T path,int * querydp5,Chrpos_T * genomedp5,int maxpeelback,bool stop_at_indels_p)7382 peel_leftward (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
7383 	       int maxpeelback, bool stop_at_indels_p) {
7384   List_T peeled = NULL;
7385   Pair_T pair, rightpair;
7386   int npeelback = 0, niter;
7387 #if 0
7388   int nincursion = 0;
7389 #endif
7390 
7391   *n_peeled_indels = 0;
7392   /* *protectedp = false; -- set by calling procedure */
7393 
7394   debug(printf("Peeling leftward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));
7395 
7396   /* Remove initial gaps */
7397   while (path != NULL &&
7398 	 ( ((Pair_T) path->first)->gapp == true ||
7399 	   ((Pair_T) path->first)->comp == INDEL_COMP ||
7400 	   ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7401     path = Pairpool_pop(path,&pair);
7402   }
7403 
7404   if (path == NULL) {
7405     debug(printf(" path is empty\n"));
7406 
7407   } else if (stop_at_indels_p == true) {
7408     pair = path->first;
7409     if (pair->gapp == true) {
7410       /* Peel known gap */
7411       debug(printf(" Known_gap"));
7412       peeled = List_transfer_one(peeled,&path);
7413     }
7414 
7415     /* Peel initial indels anyway */
7416     while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
7417       debug(printf(" Peel [");
7418 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7419 	    printf("]"));
7420       peeled = List_transfer_one(peeled,&path);
7421     }
7422 
7423     while (npeelback < maxpeelback && path != NULL &&
7424 	   ((Pair_T) path->first)->gapp == false &&
7425 	   ((Pair_T) path->first)->comp != INDEL_COMP &&
7426 	   ((Pair_T) path->first)->comp != SHORTGAP_COMP) {
7427       debug(printf(" Peel [");
7428 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7429 	    printf("]"));
7430       if (((Pair_T) path->first)->protectedp == true) {
7431 	*protectedp = true;
7432       }
7433       peeled = List_transfer_one(peeled,&path);
7434       npeelback++;
7435     }
7436 
7437   } else {
7438     /* Don't stop at indels, but do stop at gaps */
7439     pair = path->first;
7440     if (pair->gapp == true) {
7441       /* Peel known gap */
7442       debug(printf(" Known_gap"));
7443       peeled = List_transfer_one(peeled,&path);
7444     }
7445 
7446     niter = 0;
7447     while (npeelback < maxpeelback && niter < MAXITER && path != NULL &&
7448 	   ((Pair_T) path->first)->gapp == false) {
7449       debug(printf(" Peel [");
7450 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7451 	    printf("]"));
7452       if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
7453 	npeelback++;
7454       } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
7455 	*n_peeled_indels += 1;
7456 	npeelback--;
7457       } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
7458 	*n_peeled_indels += 1;
7459 	npeelback--;
7460       } else {
7461 	npeelback--;
7462       }
7463       if (((Pair_T) path->first)->protectedp == true) {
7464 	*protectedp = true;
7465       }
7466       niter++;
7467       peeled = List_transfer_one(peeled,&path);
7468     }
7469 
7470     if (path != NULL && ((Pair_T) path->first)->gapp == true) {
7471       debug(printf(" Hit gap [");
7472 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7473 	    printf("]"));
7474     }
7475   }
7476 
7477   if (path != NULL &&
7478       ( ((Pair_T) path->first)->gapp == true ||
7479 	((Pair_T) path->first)->comp == INDEL_COMP ||
7480 	((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7481     /* Don't leave a gap or indel on the top of the path */
7482     while (peeled != NULL &&
7483 	   ( ((Pair_T) peeled->first)->gapp == true ||
7484 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
7485 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
7486       debug(printf(" Putback [");
7487 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7488 	    printf("]"));
7489       path = List_transfer_one(path,&peeled);
7490     }
7491     if (peeled != NULL) {
7492       debug(printf(" Putback [");
7493 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7494 	    printf("]"));
7495       path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
7496     }
7497   }
7498 
7499   if (path != NULL) {
7500     rightpair = path->first;
7501     *querydp5 = rightpair->querypos + 1;
7502     *genomedp5 = rightpair->genomepos + 1;
7503   } else if (peeled != NULL) {
7504     rightpair = peeled->first;
7505     *querydp5 = rightpair->querypos;
7506     *genomedp5 = rightpair->genomepos;
7507   } else {
7508     /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
7509     /* abort(); */
7510   }
7511 
7512   debug(
7513 	if (path == NULL) {
7514 	  printf(" => Top of path is NULL.");
7515 	} else {
7516 	  pair = path->first;
7517 	  printf(" => Top of path is ");
7518 	  Pair_dump_one(pair,/*zerobasedp*/true);
7519 	}
7520 	printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
7521 	);
7522 
7523   *peeled_path = peeled;
7524   return path;
7525 }
7526 
7527 
7528 static List_T
peel_leftward_intron(int * n_peeled_indels,bool * protectedp,List_T * peeled_path,List_T path,int * querydp5,Chrpos_T * genomedp5,Chrpos_T genomedp3,bool stop_at_indels_p,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int minpeelback,int min_mismatches)7529 peel_leftward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
7530 		      Chrpos_T genomedp3, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
7531 		      int minpeelback, int min_mismatches) {
7532   List_T peeled = NULL;
7533   Pair_T pair, rightpair;
7534   int npeelback = 0, nmismatches = 0, niter;
7535   char cdna, intron_nt, intron_nt_alt;
7536 #if 0
7537   int nincursion = 0;
7538 #endif
7539   /* int maxpeelback = 12; */
7540 
7541   *n_peeled_indels = 0;
7542   /* *protectedp = false; -- set by calling procedure */
7543 
7544   debug(printf("Peeling leftward with genomedp3 %d and stop_at_indels_p %d:",genomedp3,stop_at_indels_p));
7545 
7546   /* Remove initial gaps */
7547   while (path != NULL &&
7548 	 ( ((Pair_T) path->first)->gapp == true ||
7549 	   ((Pair_T) path->first)->comp == INDEL_COMP ||
7550 	   ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7551     path = Pairpool_pop(path,&pair);
7552   }
7553 
7554   if (path == NULL) {
7555     debug(printf(" path is empty\n"));
7556 
7557   } else if (stop_at_indels_p == true) {
7558     pair = path->first;
7559     if (pair->gapp == true) {
7560       /* Peel known gap */
7561       debug(printf(" Known_gap"));
7562       peeled = List_transfer_one(peeled,&path);
7563     }
7564 
7565     /* Peel initial indels anyway */
7566     while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
7567       debug(printf(" Peel [");
7568 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7569 	    printf("]"));
7570       peeled = List_transfer_one(peeled,&path);
7571     }
7572 
7573     while (/*npeelback < maxpeelback &&*/
7574 	   (npeelback < minpeelback || nmismatches < min_mismatches) && path != NULL &&
7575 	   ((Pair_T) path->first)->gapp == false &&
7576 	   ((Pair_T) path->first)->comp != INDEL_COMP &&
7577 	   ((Pair_T) path->first)->comp != SHORTGAP_COMP) {
7578       debug(printf(" Peel [");
7579 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7580 	    printf("]"));
7581 
7582       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
7583       if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
7584 	nmismatches++;
7585 	debug(printf(" (1) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
7586       }
7587 
7588       if (((Pair_T) path->first)->protectedp == true) {
7589 	*protectedp = true;
7590       }
7591       peeled = List_transfer_one(peeled,&path);
7592       npeelback++;
7593     }
7594 
7595   } else {
7596     /* Don't stop at indels, but do stop at gaps */
7597     pair = path->first;
7598     if (pair->gapp == true) {
7599       /* Peel known gap */
7600       debug(printf(" Known_gap"));
7601       peeled = List_transfer_one(peeled,&path);
7602     }
7603 
7604     niter = 0;
7605     while (/*npeelback < maxpeelback &&*/
7606 	   (npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && path != NULL &&
7607 	   ((Pair_T) path->first)->gapp == false) {
7608       debug(printf(" Peel [");
7609 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7610 	    printf("]"));
7611 
7612       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
7613       if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
7614 	nmismatches++;
7615 	debug(printf(" (2) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
7616       }
7617 
7618       if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
7619 	npeelback++;
7620       } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
7621 	*n_peeled_indels += 1;
7622 	npeelback--;
7623       } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
7624 	*n_peeled_indels += 1;
7625 	npeelback--;
7626       } else {
7627 	npeelback--;
7628       }
7629       if (((Pair_T) path->first)->protectedp == true) {
7630 	*protectedp = true;
7631       }
7632       niter++;
7633       peeled = List_transfer_one(peeled,&path);
7634     }
7635 
7636     if (path != NULL && ((Pair_T) path->first)->gapp == true) {
7637       debug(printf(" Hit gap [");
7638 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7639 	    printf("]"));
7640     }
7641   }
7642 
7643   if (path != NULL &&
7644       ( ((Pair_T) path->first)->gapp == true ||
7645 	((Pair_T) path->first)->comp == INDEL_COMP ||
7646 	((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7647     /* Don't leave a gap or indel on the top of the path */
7648     while (peeled != NULL &&
7649 	   ( ((Pair_T) peeled->first)->gapp == true ||
7650 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
7651 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
7652       debug(printf(" Putback [");
7653 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7654 	    printf("]"));
7655       path = List_transfer_one(path,&peeled);
7656     }
7657     if (peeled != NULL) {
7658       debug(printf(" Putback [");
7659 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7660 	    printf("]"));
7661       path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
7662     }
7663   }
7664 
7665   if (path != NULL) {
7666     rightpair = path->first;
7667     *querydp5 = rightpair->querypos + 1;
7668     *genomedp5 = rightpair->genomepos + 1;
7669   } else if (peeled != NULL) {
7670     rightpair = peeled->first;
7671     *querydp5 = rightpair->querypos;
7672     *genomedp5 = rightpair->genomepos;
7673   } else {
7674     /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
7675     /* abort(); */
7676   }
7677 
7678   debug(
7679 	if (path == NULL) {
7680 	  printf(" => Top of path is NULL.");
7681 	} else {
7682 	  pair = path->first;
7683 	  printf(" => Top of path is ");
7684 	  Pair_dump_one(pair,/*zerobasedp*/true);
7685 	}
7686 	printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
7687 	);
7688 
7689   *peeled_path = peeled;
7690   return path;
7691 }
7692 
7693 
7694 #if 0
7695 /* Not sure if we need this, or if it causes GMAP to fail on some alignments */
7696 static List_T
7697 peel_leftward_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
7698 			  int maxpeelback, bool stop_at_indels_p) {
7699   List_T peeled = NULL;
7700   Pair_T pair, rightpair;
7701   int npeelback = 0, niter;
7702 #if 0
7703   int nincursion = 0;
7704 #endif
7705   int last_querypos;
7706   Chrpos_T last_genomepos;
7707 
7708 
7709   *n_peeled_indels = 0;
7710   /* *protectedp = false; -- set by calling procedure */
7711 
7712   debug(printf("Peeling leftward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));
7713 
7714   /* Remove initial gaps */
7715   while (path != NULL &&
7716 	 ( ((Pair_T) path->first)->gapp == true ||
7717 	   ((Pair_T) path->first)->comp == INDEL_COMP ||
7718 	   ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7719     path = Pairpool_pop(path,&pair);
7720   }
7721 
7722   if (path == NULL) {
7723     debug(printf(" path is empty\n"));
7724 
7725   } else if (stop_at_indels_p == true) {
7726     pair = path->first;
7727     if (pair->gapp == true) {
7728       /* Peel known gap */
7729       debug(printf(" Known_gap"));
7730       peeled = List_transfer_one(peeled,&path);
7731     }
7732 
7733     /* Peel initial indels anyway */
7734     while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
7735       debug(printf(" Peel [");
7736 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7737 	    printf("]"));
7738       peeled = List_transfer_one(peeled,&path);
7739     }
7740 
7741     if (path != NULL) {
7742       last_querypos = ((Pair_T) path->first)->querypos;
7743       last_genomepos = ((Pair_T) path->first)->genomepos;
7744     }
7745     while (npeelback < maxpeelback && path != NULL &&
7746 	   ((Pair_T) path->first)->gapp == false &&
7747 	   ((Pair_T) path->first)->comp != INDEL_COMP &&
7748 	   ((Pair_T) path->first)->comp != SHORTGAP_COMP &&
7749 	   ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
7750 	   ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
7751       debug(printf(" Peel [");
7752 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7753 	    printf("]"));
7754       if (((Pair_T) path->first)->protectedp == true) {
7755 	*protectedp = true;
7756       }
7757       last_querypos = ((Pair_T) path->first)->querypos;
7758       last_genomepos = ((Pair_T) path->first)->genomepos;
7759       peeled = List_transfer_one(peeled,&path);
7760       npeelback++;
7761     }
7762 
7763   } else {
7764     /* Don't stop at indels, but do stop at gaps */
7765     pair = path->first;
7766     if (pair->gapp == true) {
7767       /* Peel known gap */
7768       debug(printf(" Known_gap"));
7769       peeled = List_transfer_one(peeled,&path);
7770     }
7771 
7772     niter = 0;
7773     if (path != NULL) {
7774       last_querypos = ((Pair_T) path->first)->querypos;
7775       last_genomepos = ((Pair_T) path->first)->genomepos;
7776     }
7777     while (npeelback < maxpeelback && niter < MAXITER && path != NULL &&
7778 	   ((Pair_T) path->first)->gapp == false &&
7779 	   ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
7780 	   ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
7781       debug(printf(" Peel [");
7782 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7783 	    printf("]"));
7784       if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
7785 	npeelback++;
7786       } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
7787 	*n_peeled_indels += 1;
7788 	npeelback--;
7789       } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
7790 	*n_peeled_indels += 1;
7791 	npeelback--;
7792       } else {
7793 	npeelback--;
7794       }
7795       if (((Pair_T) path->first)->protectedp == true) {
7796 	*protectedp = true;
7797       }
7798       niter++;
7799       last_querypos = ((Pair_T) path->first)->querypos;
7800       last_genomepos = ((Pair_T) path->first)->genomepos;
7801       peeled = List_transfer_one(peeled,&path);
7802     }
7803 
7804     if (path != NULL && ((Pair_T) path->first)->gapp == true) {
7805       debug(printf(" Hit gap [");
7806 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7807 	    printf("]"));
7808     }
7809   }
7810 
7811   if (path != NULL &&
7812       ( ((Pair_T) path->first)->gapp == true ||
7813 	((Pair_T) path->first)->comp == INDEL_COMP ||
7814 	((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7815     /* Don't leave a gap or indel on the top of the path */
7816     while (peeled != NULL &&
7817 	   ( ((Pair_T) peeled->first)->gapp == true ||
7818 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
7819 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
7820       debug(printf(" Putback [");
7821 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7822 	    printf("]"));
7823       path = List_transfer_one(path,&peeled);
7824     }
7825     if (peeled != NULL) {
7826       debug(printf(" Putback [");
7827 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
7828 	    printf("]"));
7829       path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
7830     }
7831   }
7832 
7833   if (path != NULL) {
7834     rightpair = path->first;
7835     *querydp5 = rightpair->querypos + 1;
7836     *genomedp5 = rightpair->genomepos + 1;
7837   } else if (peeled != NULL) {
7838     rightpair = peeled->first;
7839     *querydp5 = rightpair->querypos;
7840     *genomedp5 = rightpair->genomepos;
7841   } else {
7842     /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
7843     /* abort(); */
7844   }
7845 
7846   debug(
7847 	if (path == NULL) {
7848 	  printf(" => Top of path is NULL.");
7849 	} else {
7850 	  pair = path->first;
7851 	  printf(" => Top of path is ");
7852 	  Pair_dump_one(pair,/*zerobasedp*/true);
7853 	}
7854 	printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
7855 	);
7856 
7857   *peeled_path = peeled;
7858   return path;
7859 }
7860 #endif
7861 
7862 
7863 #if 0
7864 /* Not sure if we need this, or if it causes GMAP to fail on some alignments */
7865 static List_T
7866 peel_leftward_intron_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_path, List_T path, int *querydp5, Chrpos_T *genomedp5,
7867 				 Chrpos_T genomedp3, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
7868 				 int minpeelback, int min_mismatches) {
7869   List_T peeled = NULL;
7870   Pair_T pair, rightpair;
7871   int npeelback = 0, nmismatches = 0, niter;
7872   char cdna, intron_nt, intron_nt_alt;
7873 #if 0
7874   int nincursion = 0;
7875 #endif
7876   int maxpeelback = 12;
7877   int last_querypos;
7878   Chrpos_T last_genomepos;
7879 
7880 
7881   *n_peeled_indels = 0;
7882   /* *protectedp = false; -- set by calling procedure */
7883 
7884   debug(printf("Peeling leftward with genomedp3 %d and stop_at_indels_p %d:",genomedp3,stop_at_indels_p));
7885 
7886   /* Remove initial gaps */
7887   while (path != NULL &&
7888 	 ( ((Pair_T) path->first)->gapp == true ||
7889 	   ((Pair_T) path->first)->comp == INDEL_COMP ||
7890 	   ((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
7891     path = Pairpool_pop(path,&pair);
7892   }
7893 
7894   if (path == NULL) {
7895     debug(printf(" path is empty\n"));
7896 
7897   } else if (stop_at_indels_p == true) {
7898     pair = path->first;
7899     if (pair->gapp == true) {
7900       /* Peel known gap */
7901       debug(printf(" Known_gap"));
7902       peeled = List_transfer_one(peeled,&path);
7903     }
7904 
7905     /* Peel initial indels anyway */
7906     while (path != NULL && ( ((Pair_T) path->first)->comp == INDEL_COMP || ((Pair_T) path->first)->comp == SHORTGAP_COMP )) {
7907       debug(printf(" Peel [");
7908 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7909 	    printf("]"));
7910       peeled = List_transfer_one(peeled,&path);
7911     }
7912 
7913     if (path != NULL) {
7914       last_querypos = ((Pair_T) path->first)->querypos;
7915       last_genomepos = ((Pair_T) path->first)->genomepos;
7916     }
7917     while (/*npeelback < maxpeelback &&*/
7918 	   (npeelback < minpeelback || nmismatches < min_mismatches) && path != NULL &&
7919 	   ((Pair_T) path->first)->gapp == false &&
7920 	   ((Pair_T) path->first)->comp != INDEL_COMP &&
7921 	   ((Pair_T) path->first)->comp != SHORTGAP_COMP &&
7922 	   ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
7923 	   ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
7924       debug(printf(" Peel [");
7925 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7926 	    printf("]"));
7927 
7928       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
7929       if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
7930 	nmismatches++;
7931 	debug(printf(" (1) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
7932       }
7933 
7934       if (((Pair_T) path->first)->protectedp == true) {
7935 	*protectedp = true;
7936       }
7937       last_querypos = ((Pair_T) path->first)->querypos;
7938       last_genomepos = ((Pair_T) path->first)->genomepos;
7939       peeled = List_transfer_one(peeled,&path);
7940       npeelback++;
7941     }
7942 
7943   } else {
7944     /* Don't stop at indels, but do stop at gaps */
7945     pair = path->first;
7946     if (pair->gapp == true) {
7947       /* Peel known gap */
7948       debug(printf(" Known_gap"));
7949       peeled = List_transfer_one(peeled,&path);
7950     }
7951 
7952     niter = 0;
7953     if (path != NULL) {
7954       last_querypos = ((Pair_T) path->first)->querypos;
7955       last_genomepos = ((Pair_T) path->first)->genomepos;
7956     }
7957     while (/*npeelback < maxpeelback &&*/
7958 	   (npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && path != NULL &&
7959 	   ((Pair_T) path->first)->gapp == false &&
7960 	   ((Pair_T) path->first)->querypos + 1 >= last_querypos &&
7961 	   ((Pair_T) path->first)->genomepos + 1 >= last_genomepos) {
7962       debug(printf(" Peel [");
7963 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7964 	    printf("]"));
7965 
7966       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp3--,chroffset,chrhigh,watsonp);
7967       if ((cdna = ((Pair_T) path->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
7968 	nmismatches++;
7969 	debug(printf(" (2) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp3+1));
7970       }
7971 
7972       if (((Pair_T) path->first)->comp == MATCH_COMP || ((Pair_T) path->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) path->first)->comp == AMBIGUOUS_COMP) {
7973 	npeelback++;
7974       } else if (((Pair_T) path->first)->comp == INDEL_COMP) {
7975 	*n_peeled_indels += 1;
7976 	npeelback--;
7977       } else if (((Pair_T) path->first)->comp == SHORTGAP_COMP) {
7978 	*n_peeled_indels += 1;
7979 	npeelback--;
7980       } else {
7981 	npeelback--;
7982       }
7983       if (((Pair_T) path->first)->protectedp == true) {
7984 	*protectedp = true;
7985       }
7986       niter++;
7987       last_querypos = ((Pair_T) path->first)->querypos;
7988       last_genomepos = ((Pair_T) path->first)->genomepos;
7989       peeled = List_transfer_one(peeled,&path);
7990     }
7991 
7992     if (path != NULL && ((Pair_T) path->first)->gapp == true) {
7993       debug(printf(" Hit gap [");
7994 	    Pair_dump_one(path->first,/*zerobasedp*/true);
7995 	    printf("]"));
7996     }
7997   }
7998 
7999   if (path != NULL &&
8000       ( ((Pair_T) path->first)->gapp == true ||
8001 	((Pair_T) path->first)->comp == INDEL_COMP ||
8002 	((Pair_T) path->first)->comp == SHORTGAP_COMP)) {
8003     /* Don't leave a gap or indel on the top of the path */
8004     while (peeled != NULL &&
8005 	   ( ((Pair_T) peeled->first)->gapp == true ||
8006 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
8007 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
8008       debug(printf(" Putback [");
8009 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8010 	    printf("]"));
8011       path = List_transfer_one(path,&peeled);
8012     }
8013     if (peeled != NULL) {
8014       debug(printf(" Putback [");
8015 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8016 	    printf("]"));
8017       path = List_transfer_one(path,&peeled); /* This should be match or mismatch */
8018     }
8019   }
8020 
8021   if (path != NULL) {
8022     rightpair = path->first;
8023     *querydp5 = rightpair->querypos + 1;
8024     *genomedp5 = rightpair->genomepos + 1;
8025   } else if (peeled != NULL) {
8026     rightpair = peeled->first;
8027     *querydp5 = rightpair->querypos;
8028     *genomedp5 = rightpair->genomepos;
8029   } else {
8030     /* fprintf(stderr,"In peel_leftward, path and peeled are both NULL\n"); */
8031     /* abort(); */
8032   }
8033 
8034   debug(
8035 	if (path == NULL) {
8036 	  printf(" => Top of path is NULL.");
8037 	} else {
8038 	  pair = path->first;
8039 	  printf(" => Top of path is ");
8040 	  Pair_dump_one(pair,/*zerobasedp*/true);
8041 	}
8042 	printf("\n => querydp5 = %d, genomedp5 = %d\n",*querydp5,*genomedp5);
8043 	);
8044 
8045   *peeled_path = peeled;
8046   return path;
8047 }
8048 #endif
8049 
8050 
8051 #if 0
/* Legacy rightward peel, kept only inside a disabled (#if 0) region.

   Moves pairs from the front of pairs onto a peeled list in two
   phases, then optionally peels a following gap plus a few pairs into
   *endgappairs.

   mismatchp:      set to true if any pair peeled in the two main
                   phases has cdna and genome that differ under
                   uppercaseCode; false otherwise.
   peeled_pairs:   receives the peeled list (NULL on the early-return
                   paths taken when quit_on_gap_p is true).
   querydp3/genomedp3:  updated only when something was peeled: the
                   position just before the new top of pairs, or that
                   of the top peeled pair if pairs is empty.
   maxpeelback:    limit on pairs peeled in the first phase.
   quit_on_gap_p:  if true, running into a gap or indel undoes the
                   whole peel and returns immediately.
   endgappairs, gappair, querydp3_medialgap, genomedp3_medialgap:
                   outputs for the optional medial-gap peel; only
                   written when endgappairs is non-NULL.

   NOTE(review): throughmismatchp and the local nconsecutive are not
   referenced by any compiled statement in this body.

   Returns the remaining pairs. */
static List_T
peel_rightward_old (bool *mismatchp, List_T *peeled_pairs, List_T pairs, int *querydp3, int *genomedp3,
#ifdef WASTE
		    Pairpool_T pairpool,
#endif
		    int maxpeelback, bool throughmismatchp, bool quit_on_gap_p,
		    List_T *endgappairs, Pair_T *gappair, int *querydp3_medialgap, int *genomedp3_medialgap) {
  List_T peeled = NULL, rest = NULL, pairptr;
  Pair_T pair, nextpair, leftpair;
  int npeelback = 0, nconsecutive = 0, init_dynprogindex = DYNPROGINDEX_MINOR;
  bool stopp;
  int nmatches;
#if 0
  int incursion = 0;
#endif

  *mismatchp = false;
  debug(printf("Peeling rightward:"));
  if (pairs == NULL) {
    debug(printf(" pairs is empty\n"));
  } else {
    pair = pairs->first;
    if (pair->gapp == true) {
      /* Throw away known gap */
      debug(printf(" Known_gap"));
      pairptr = pairs;
      pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
      peeled = Pairpool_push_existing(peeled,pairpool,pair);
#else
      peeled = List_push_existing(peeled,pairptr);
#endif
    }
    /* NOTE(review): if the gap just popped was the only element, pairs
       looks like it would be NULL here and this dereference would be
       invalid -- confirm against Pairpool_pop. */
    rest = pairs->rest;

    /* Phase 1: peel the head of pairs as long as the pair behind it
       is an ordinary, unprotected aligned pair, up to maxpeelback */
    stopp = false;
    while (rest != NULL && stopp == false) {
      nextpair = rest->first;
      if (nextpair->gapp == true || nextpair->cdna == ' ' || nextpair->genome == ' ' || nextpair->protectedp == true) {
	stopp = true;
      } else {
	pairptr = pairs;
	pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
	peeled = Pairpool_push_existing(peeled,pairpool,pair);
#else
	peeled = List_push_existing(peeled,pairptr);
#endif
	debug(printf(" Peel [");
	      Pair_dump_one(pair,/*zerobasedp*/true);
	      printf("]"));

	if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
	  *mismatchp = true;
	}

	if (++npeelback >= maxpeelback) {
	  stopp = true;
	}

	/* Remember the first non-positive dynprogindex seen; it is
	   only consumed by a disabled (#if 0) region below */
	if (init_dynprogindex > 0 && pair->dynprogindex <= 0) {
	  init_dynprogindex = pair->dynprogindex;
	}

	rest = pairs->rest;
      }
    }

    /* Continue to peelback through little skips and mismatches */
    debug(printf("\n||"));

    /* Phase 2: the head is peeled on every iteration, including the
       one in which a stop condition is detected */
    stopp = false;
    while (rest != NULL && stopp == false) {
      nextpair = rest->first;
      if (nextpair->gapp == true) {
	/* Peel this one, but then stop at end of loop */
      } else if (nextpair->protectedp == true) {
	/* Stop because it's protected */
	stopp = true;
      } else if (nextpair->cdna != ' ' && nextpair->genome != ' ') {
	/* Stop because it looks okay */
	stopp = true;
      }

      pairptr = pairs;
      pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
      peeled = Pairpool_push_existing(peeled,pairpool,pair);
#else
      peeled = List_push_existing(peeled,pairptr);
#endif
      debug(printf(" Extrapeel [");
	    Pair_dump_one(pair,/*zerobasedp*/true);
	    printf("]"));

      if (uppercaseCode[(int) pair->cdna] != uppercaseCode[(int) pair->genome]) {
	*mismatchp = true;
      }

#if 0
      if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP || pair->comp == MISMATCH_COMP) {
	nconsecutive = 0;
      } else if (++nconsecutive >= SUFFCONSECUTIVE) {
	stopp = true;
      }
#endif

#if 0
      if (pair->dynprogindex != init_dynprogindex) {
	if (++nincursion >= MAXINCURSION) {
	  stopp = true;
	}
      }
#endif

      /* Stop once the pair just peeled is itself a gap */
      if (pair->gapp == true) {
	stopp = true;
      }

      rest = pairs->rest;
    }
  }

#ifdef PMAP
  /* Reverse process to codon boundary.  Cases:

     - X | X X X
     5 5   6 7 8

     X - | X X X
     5 6   6 7 8

     X | X - X X
     5   6 7 7 8

     Rule: pair->querypos % 3 == 0 */

  debug(printf("\n<<"));
  stopp = false;
  while (peeled != NULL && stopp == false) {
    pairptr = peeled;
    peeled = Pairpool_pop(peeled,&pair);
#ifdef WASTE
    pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
    pairs = List_push_existing(pairs,pairptr);
#endif
    debug(printf(" Mod3putback [");
	  Pair_dump_one(pair,/*zerobasedp*/true);
	  printf("]"));
    if (pair->querypos % 3 == 0) {
      stopp = true;
    }
  }
#endif

  if (peeled == NULL) {
    /* Do not alter querydp3 or genomedp3 */
  } else {
    leftpair = peeled->first;
    /* NOTE(review): leftpair is refreshed from pairs (the pair that was
       just put back) at the bottom of this loop, not from peeled, so
       once entered the loop appears to keep putting pairs back until
       peeled is empty -- confirm whether that is intended. */
    while (peeled != NULL && (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP)) {
      debug(printf("Ran into gap; undoing peel, case 3, leftpair gapp %d, comp %c\n",
		   leftpair->gapp,leftpair->comp));
      if (endgappairs != NULL) {
	pairs = Pairpool_transfer(pairs,*endgappairs);
	*endgappairs = (List_T) NULL;
      }

      if (quit_on_gap_p == true) {
	/* Undo the entire peel and report nothing peeled */
	pairs = Pairpool_transfer(pairs,peeled);
	*peeled_pairs = (List_T) NULL;
	return pairs;

      } else {
	/* Put back 1 */
	/* if ((pairptr = peeled) != NULL) { */
	pairptr = peeled;
	peeled = Pairpool_pop(peeled,&pair);
	pairs = List_push_existing(pairs,pairptr);
	debug(printf(" Putback [");
	      Pair_dump_one(pair,/*zerobasedp*/true);
	      printf("]"));
	  /* } */

#if 0
	/* Put back 2 */
	if ((pairptr = peeled) != NULL) {
	  peeled = Pairpool_pop(peeled,&pair);
	  pairs = List_push_existing(pairs,pairptr);
	  debug(printf(" Putback [");
		Pair_dump_one(pair,/*zerobasedp*/true);
		printf("]"));
	}
#endif
      }

      leftpair = pairs->first;
    }

    if (pairs != NULL) {
      leftpair = pairs->first;
      *querydp3 = leftpair->querypos - 1;
      *genomedp3 = leftpair->genomepos - 1;
    } else if (peeled != NULL) {
      leftpair = peeled->first;
      *querydp3 = leftpair->querypos;
      *genomedp3 = leftpair->genomepos;
    } else {
      /* NOTE(review): the message names peel_leftward although this is
	 the rightward routine */
      fprintf(stderr,"In peel_leftward, pairs and peeled are both NULL\n");
      abort();
    }
  }

  /* Optional medial-gap peel: when the top of pairs is a gap or indel,
     move it and the pairs after it (until 3 matches have been seen or
     pairs runs out) into *endgappairs */
  if (endgappairs != NULL) {
    if (pairs == NULL || (pair = pairs->first) == NULL || (pair->gapp == false && pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP)) {
      *endgappairs = NULL;
      *querydp3_medialgap = *querydp3;
      *genomedp3_medialgap = *genomedp3;
    } else {
      pairptr = pairs;
      pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
      *endgappairs = Pairpool_push_existing(NULL,pairpool,pair);
#else
      *endgappairs = List_push_existing(NULL,pairptr);
#endif
      debug(printf(" Peeling gap [");
	    Pair_dump_one(pair,/*zerobasedp*/true);
	    printf("]"));
      *gappair = pair;
      debug(printf(" gapcomp: '%c'",pair->comp));

      nmatches = 0;
      while (pairs != NULL && nmatches < 3) {
	pairptr = pairs;
	pairs = Pairpool_pop(pairs,&pair);
#ifdef WASTE
	*endgappairs = Pairpool_push_existing(*endgappairs,pairpool,pair);
#else
	*endgappairs = List_push_existing(*endgappairs,pairptr);
#endif
	debug(printf(" Peeling after gap [");
	      Pair_dump_one(pair,/*zerobasedp*/true);
	      printf("]"));
	if (uppercaseCode[(int) pair->cdna] == uppercaseCode[(int) pair->genome]) {
	  nmatches++;
	}
      }

      /* If the last pair taken is again a gap or indel, give the
	 medial-gap pairs back and retreat */
      leftpair = (*endgappairs)->first;
      if (leftpair->gapp == true || leftpair->comp == INDEL_COMP || leftpair->comp == SHORTGAP_COMP) {
	debug(printf("Ran into gap; undoing peel, case 4\n"));
	pairs = Pairpool_transfer(pairs,*endgappairs);
	*endgappairs = (List_T) NULL;

	if (quit_on_gap_p == true) {
	  pairs = Pairpool_transfer(pairs,peeled);
	  *peeled_pairs = (List_T) NULL;
	  return pairs;

	} else {
	  /* Put back 1 */
	  if ((pairptr = peeled) != NULL) {
	    peeled = Pairpool_pop(peeled,&pair);
	    pairs = List_push_existing(pairs,pairptr);
	    debug(printf(" Putback [");
		  Pair_dump_one(pair,/*zerobasedp*/true);
		  printf("]"));
	  }

	  /* Put back 2 */
	  if ((pairptr = peeled) != NULL) {
	    peeled = Pairpool_pop(peeled,&pair);
	    pairs = List_push_existing(pairs,pairptr);
	    debug(printf(" Putback [");
		  Pair_dump_one(pair,/*zerobasedp*/true);
		  printf("]"));
	  }
	}
      }

      if (pairs != NULL) {
	leftpair = pairs->first;
	*querydp3_medialgap = leftpair->querypos - 1;
	*genomedp3_medialgap = leftpair->genomepos - 1;
      } else if (peeled != NULL) {
	leftpair = peeled->first;
	*querydp3_medialgap = leftpair->querypos;
	*genomedp3_medialgap = leftpair->genomepos;
      } else {
	/* NOTE(review): message names peel_leftward here as well */
	fprintf(stderr,"In peel_leftward for medialgap, pairs and peeled are both NULL\n");
	abort();
      }
    }
  }

  /* assert(peeled == NULL || pairs == NULL || ((Pair_T) pairs->first)->comp != INDEL_COMP); */
  debug(
	if (pairs == NULL) {
	  printf(" => Top of pairs is NULL.");
	} else {
	  pair = pairs->first;
	  printf(" => Top of pairs is ");
	  Pair_dump_one(pair,/*zerobasedp*/true);
	}
	printf("\n => querydp3 = %d, genomedp3 = %d\n",*querydp3,*genomedp3);
	);

  *peeled_pairs = peeled;
  return pairs;
}
8363 #endif
8364 
8365 
8366 static List_T
peel_rightward(int * n_peeled_indels,bool * protectedp,List_T * peeled_pairs,List_T pairs,int * querydp3,Chrpos_T * genomedp3,int maxpeelback,bool stop_at_indels_p)8367 peel_rightward (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
8368 		int maxpeelback, bool stop_at_indels_p) {
8369   List_T peeled = NULL;
8370   Pair_T pair, leftpair;
8371   int npeelback = 0, niter;
8372 #if 0
8373   int incursion = 0;
8374 #endif
8375 
8376   *n_peeled_indels = 0;
8377   /* *protectedp = false; -- set by calling procedure */
8378 
8379   debug(printf("Peeling rightward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));
8380 
8381   /* Remove initial gaps */
8382   while (pairs != NULL &&
8383 	 ( ((Pair_T) pairs->first)->gapp == true ||
8384 	   ((Pair_T) pairs->first)->comp == INDEL_COMP ||
8385 	   ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
8386     pairs = Pairpool_pop(pairs,&pair);
8387   }
8388 
8389   if (pairs == NULL) {
8390     debug(printf(" pairs is empty\n"));
8391 
8392   } else if (stop_at_indels_p == true) {
8393     pair = pairs->first;
8394     if (pair->gapp == true) {
8395       /* Peel known gap */
8396       debug(printf(" Known_gap"));
8397       peeled = List_transfer_one(peeled,&pairs);
8398     }
8399 
8400     /* Peel initial indels anyway */
8401     while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == INDEL_COMP )) {
8402       debug(printf(" Peel [");
8403 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8404 	    printf("]"));
8405       peeled = List_transfer_one(peeled,&pairs);
8406     }
8407 
8408     while (npeelback < maxpeelback && pairs != NULL &&
8409 	   ((Pair_T) pairs->first)->gapp == false &&
8410 	   ((Pair_T) pairs->first)->comp != INDEL_COMP &&
8411 	   ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
8412       debug(printf(" Peel [");
8413 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8414 	    printf("]"));
8415       if (((Pair_T) pairs->first)->protectedp == true) {
8416 	*protectedp = true;
8417       }
8418       peeled = List_transfer_one(peeled,&pairs);
8419       npeelback++;
8420     }
8421 
8422   } else {
8423     /* Don't stop at indels, but do stop at gaps */
8424     pair = pairs->first;
8425     if (pair->gapp == true) {
8426       /* Peel known gap */
8427       debug(printf(" Known_gap"));
8428       peeled = List_transfer_one(peeled,&pairs);
8429     }
8430 
8431     niter = 0;
8432     while (npeelback < maxpeelback && niter < MAXITER && pairs != NULL &&
8433 	   ((Pair_T) pairs->first)->gapp == false) {
8434       debug(printf(" Peel [");
8435 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8436 	    printf("]"));
8437       if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
8438 	npeelback++;
8439       } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
8440 	*n_peeled_indels += 1;
8441 	npeelback--;
8442       } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
8443 	*n_peeled_indels += 1;
8444 	npeelback--;
8445       } else {
8446 	npeelback--;
8447       }
8448       if (((Pair_T) pairs->first)->protectedp == true) {
8449 	*protectedp = true;
8450       }
8451       niter++;
8452       peeled = List_transfer_one(peeled,&pairs);
8453     }
8454 
8455     if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
8456       debug(printf(" Hit gap [");
8457 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8458 	    printf("]"));
8459     }
8460   }
8461 
8462   if (pairs != NULL &&
8463       ( ((Pair_T) pairs->first)->gapp == true ||
8464 	((Pair_T) pairs->first)->comp == INDEL_COMP ||
8465 	((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
8466     /* Don't leave a gap or indel on the top of the pairs */
8467     while (peeled != NULL &&
8468 	   ( ((Pair_T) peeled->first)->gapp == true ||
8469 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
8470 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
8471       debug(printf(" Putback [");
8472 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8473 	    printf("]"));
8474       pairs = List_transfer_one(pairs,&peeled);
8475     }
8476     if (peeled != NULL) {
8477       debug(printf(" Putback [");
8478 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8479 	    printf("]"));
8480       pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
8481     }
8482   }
8483 
8484   if (pairs != NULL) {
8485     leftpair = pairs->first;
8486     *querydp3 = leftpair->querypos - 1;
8487     *genomedp3 = leftpair->genomepos - 1;
8488   } else if (peeled != NULL) {
8489     leftpair = peeled->first;
8490     *querydp3 = leftpair->querypos;
8491     *genomedp3 = leftpair->genomepos;
8492   } else {
8493     /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
8494     /* abort(); */
8495   }
8496 
8497   debug(
8498 	if (pairs == NULL) {
8499 	  printf(" => Top of pairs is NULL.");
8500 	} else {
8501 	  pair = pairs->first;
8502 	  printf(" => Top of pairs is ");
8503 	  Pair_dump_one(pair,/*zerobasedp*/true);
8504 	}
8505 	printf("\n => querydp3 = %d, genomedp3 = %d\n",*querydp3,*genomedp3);
8506 	);
8507 
8508   *peeled_pairs = peeled;
8509   return pairs;
8510 }
8511 
8512 
8513 /* Instead of maxpeelback, follow the 5' intron until we get enough mismatches */
8514 static List_T
peel_rightward_intron(int * n_peeled_indels,bool * protectedp,List_T * peeled_pairs,List_T pairs,int * querydp3,Chrpos_T * genomedp3,Chrpos_T genomedp5,bool stop_at_indels_p,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int minpeelback,int min_mismatches)8515 peel_rightward_intron (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
8516 		       Chrpos_T genomedp5, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
8517 		       int minpeelback, int min_mismatches) {
8518   List_T peeled = NULL;
8519   Pair_T pair, leftpair;
8520   int npeelback = 0, nmismatches = 0, niter;
8521   char cdna, intron_nt, intron_nt_alt;
8522 #if 0
8523   int incursion = 0;
8524 #endif
8525   /* int maxpeelback = 12; */
8526 
8527   *n_peeled_indels = 0;
8528   /* *protectedp = false; -- set by calling procedure */
8529 
8530   debug(printf("Peeling rightward with genomedp5 %d and stop_at_indels_p %d:",genomedp5,stop_at_indels_p));
8531 
8532   /* Remove initial gaps */
8533   while (pairs != NULL &&
8534 	 ( ((Pair_T) pairs->first)->gapp == true ||
8535 	   ((Pair_T) pairs->first)->comp == INDEL_COMP ||
8536 	   ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
8537     pairs = Pairpool_pop(pairs,&pair);
8538   }
8539 
8540   if (pairs == NULL) {
8541     debug(printf(" pairs is empty\n"));
8542 
8543   } else if (stop_at_indels_p == true) {
8544     pair = pairs->first;
8545     if (pair->gapp == true) {
8546       /* Peel known gap */
8547       debug(printf(" Known_gap"));
8548       peeled = List_transfer_one(peeled,&pairs);
8549     }
8550 
8551     /* Peel initial indels anyway */
8552     while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == INDEL_COMP )) {
8553       debug(printf(" Peel [");
8554 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8555 	    printf("]"));
8556       peeled = List_transfer_one(peeled,&pairs);
8557     }
8558 
8559     while (/*npeelback < maxpeelback &&*/
8560 	   (npeelback < minpeelback || nmismatches < min_mismatches) && pairs != NULL &&
8561 	   ((Pair_T) pairs->first)->gapp == false &&
8562 	   ((Pair_T) pairs->first)->comp != INDEL_COMP &&
8563 	   ((Pair_T) pairs->first)->comp != SHORTGAP_COMP) {
8564       debug(printf(" Peel [");
8565 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8566 	    printf("]"));
8567 
8568       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
8569       if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
8570 	nmismatches++;
8571 	debug(printf(" (3) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
8572       }
8573 
8574       if (((Pair_T) pairs->first)->protectedp == true) {
8575 	*protectedp = true;
8576       }
8577       peeled = List_transfer_one(peeled,&pairs);
8578       npeelback++;
8579     }
8580 
8581   } else {
8582     /* Don't stop at indels, but do stop at gaps */
8583     pair = pairs->first;
8584     if (pair->gapp == true) {
8585       /* Peel known gap */
8586       debug(printf(" Known_gap"));
8587       peeled = List_transfer_one(peeled,&pairs);
8588     }
8589 
8590     niter = 0;
8591     while (/*npeelback < maxpeelback &&*/
8592 	   (npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && pairs != NULL &&
8593 	   ((Pair_T) pairs->first)->gapp == false) {
8594       debug(printf(" Peel [");
8595 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8596 	    printf("]"));
8597 
8598       intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
8599       if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
8600 	nmismatches++;
8601 	debug(printf(" (4) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
8602       }
8603 
8604       if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
8605 	npeelback++;
8606       } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
8607 	*n_peeled_indels += 1;
8608 	npeelback--;
8609       } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
8610 	*n_peeled_indels += 1;
8611 	npeelback--;
8612       } else {
8613 	npeelback--;
8614       }
8615       if (((Pair_T) pairs->first)->protectedp == true) {
8616 	*protectedp = true;
8617       }
8618       niter++;
8619       peeled = List_transfer_one(peeled,&pairs);
8620     }
8621 
8622     if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
8623       debug(printf(" Hit gap [");
8624 	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
8625 	    printf("]"));
8626     }
8627   }
8628 
8629   if (pairs != NULL &&
8630       ( ((Pair_T) pairs->first)->gapp == true ||
8631 	((Pair_T) pairs->first)->comp == INDEL_COMP ||
8632 	((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
8633     /* Don't leave a gap or indel on the top of the pairs */
8634     while (peeled != NULL &&
8635 	   ( ((Pair_T) peeled->first)->gapp == true ||
8636 	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
8637 	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
8638       debug(printf(" Putback [");
8639 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8640 	    printf("]"));
8641       pairs = List_transfer_one(pairs,&peeled);
8642     }
8643     if (peeled != NULL) {
8644       debug(printf(" Putback [");
8645 	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
8646 	    printf("]"));
8647       pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
8648     }
8649   }
8650 
8651   if (pairs != NULL) {
8652     leftpair = pairs->first;
8653     *querydp3 = leftpair->querypos - 1;
8654     *genomedp3 = leftpair->genomepos - 1;
8655   } else if (peeled != NULL) {
8656     leftpair = peeled->first;
8657     *querydp3 = leftpair->querypos;
8658     *genomedp3 = leftpair->genomepos;
8659   } else {
8660     /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
8661     /* abort(); */
8662   }
8663 
8664   debug(
8665 	if (pairs == NULL) {
8666 	  printf(" => Top of pairs is NULL.");
8667 	} else {
8668 	  pair = pairs->first;
8669 	  printf(" => Top of pairs is ");
8670 	  Pair_dump_one(pair,/*zerobasedp*/true);
8671 	}
8672 	printf("\n => querydp3 = %d, genomedp3 = %d\n",*querydp3,*genomedp3);
8673 	);
8674 
8675   *peeled_pairs = peeled;
8676   return pairs;
8677 }
8678 
8679 
#if 0
/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
/* Disabled variant of rightward peeling that additionally requires
   contiguity: a pair is peeled only while its querypos and genomepos
   are each at most one more than those of the previously peeled pair
   (initially, those of the pair on top of pairs).

   Leading gap, indel, and short-gap pairs are first popped and
   discarded.  If stop_at_indels_p is true, up to maxpeelback pairs are
   peeled, halting at a gap, indel, or short gap.  Otherwise peeling
   halts only at a gap (or after MAXITER iterations); peeled indels and
   short gaps are tallied in *n_peeled_indels, and only match-type pairs
   advance the peelback count, while every other pair decrements it.

   If peeling stops with a gap or indel on top of pairs, any gaps and
   indels on top of peeled, plus one further pair, are put back.

   *querydp3 and *genomedp3 receive one less than the positions of the
   pair on top of the remaining pairs; or, if pairs is exhausted, the
   positions of the pair on top of peeled; they are left unchanged if
   both lists are empty.  *protectedp is only ever set to true here.
   The peeled list is returned through *peeled_pairs, and the remaining
   pairs are the return value. */
static List_T
peel_rightward_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
			   int maxpeelback, bool stop_at_indels_p) {
  List_T peeled = NULL;
  Pair_T pair, leftpair;
  int npeelback = 0, niter;
  int last_querypos;
  Chrpos_T last_genomepos;


  *n_peeled_indels = 0;
  /* *protectedp = false; -- set by calling procedure */

  debug(printf("Peeling rightward with maxpeelback %d and stop_at_indels_p %d:",maxpeelback,stop_at_indels_p));

  /* Remove initial gaps */
  while (pairs != NULL &&
	 ( ((Pair_T) pairs->first)->gapp == true ||
	   ((Pair_T) pairs->first)->comp == INDEL_COMP ||
	   ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
    pairs = Pairpool_pop(pairs,&pair);
  }

  if (pairs == NULL) {
    debug(printf(" pairs is empty\n"));

  } else if (stop_at_indels_p == true) {
    /* The loop above already removed leading gaps and indels, so the
       next two steps are defensive only */
    pair = pairs->first;
    if (pair->gapp == true) {
      /* Peel known gap */
      debug(printf(" Known_gap"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    /* Peel initial indels anyway (both indels and short gaps) */
    while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    /* Seed the contiguity check with the pair on top.  The loop below
       reads last_querypos/last_genomepos only when pairs != NULL. */
    if (pairs != NULL) {
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
    }
    while (npeelback < maxpeelback && pairs != NULL &&
	   ((Pair_T) pairs->first)->gapp == false &&
	   ((Pair_T) pairs->first)->comp != INDEL_COMP &&
	   ((Pair_T) pairs->first)->comp != SHORTGAP_COMP &&
	   ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
	   ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
      if (((Pair_T) pairs->first)->protectedp == true) {
	*protectedp = true;
      }
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
      peeled = List_transfer_one(peeled,&pairs);
      npeelback++;
    }

  } else {
    /* Don't stop at indels, but do stop at gaps */
    pair = pairs->first;
    if (pair->gapp == true) {
      /* Peel known gap */
      debug(printf(" Known_gap"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    niter = 0;
    /* Seed the contiguity check with the pair on top */
    if (pairs != NULL) {
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
    }
    while (npeelback < maxpeelback && niter < MAXITER && pairs != NULL &&
	   ((Pair_T) pairs->first)->gapp == false &&
	   ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
	   ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
      /* Only match-type pairs count toward the peelback; anything else
	 takes one away, so peeling extends past it */
      if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
	npeelback++;
      } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
	*n_peeled_indels += 1;
	npeelback--;
      } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
	*n_peeled_indels += 1;
	npeelback--;
      } else {
	npeelback--;
      }
      if (((Pair_T) pairs->first)->protectedp == true) {
	*protectedp = true;
      }
      niter++;
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
      peeled = List_transfer_one(peeled,&pairs);
    }

    if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
      debug(printf(" Hit gap [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
    }
  }

  if (pairs != NULL &&
      ( ((Pair_T) pairs->first)->gapp == true ||
	((Pair_T) pairs->first)->comp == INDEL_COMP ||
	((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
    /* Don't leave a gap or indel on the top of the pairs */
    while (peeled != NULL &&
	   ( ((Pair_T) peeled->first)->gapp == true ||
	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
      debug(printf(" Putback [");
	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
	    printf("]"));
      pairs = List_transfer_one(pairs,&peeled);
    }
    if (peeled != NULL) {
      debug(printf(" Putback [");
	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
	    printf("]"));
      pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
    }
  }

  if (pairs != NULL) {
    leftpair = pairs->first;
    *querydp3 = leftpair->querypos - 1;
    *genomedp3 = leftpair->genomepos - 1;
  } else if (peeled != NULL) {
    leftpair = peeled->first;
    *querydp3 = leftpair->querypos;
    *genomedp3 = leftpair->genomepos;
  } else {
    /* Both lists are empty: leave *querydp3 and *genomedp3 as the caller set them */
    /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
    /* abort(); */
  }

  debug(
	if (pairs == NULL) {
	  printf(" => Top of pairs is NULL.");
	} else {
	  pair = pairs->first;
	  printf(" => Top of pairs is ");
	  Pair_dump_one(pair,/*zerobasedp*/true);
	}
	printf("\n => querydp3 = %d, genomedp3 = %u\n",*querydp3,*genomedp3);
	);

  *peeled_pairs = peeled;
  return pairs;
}
#endif
8847 
8848 
#if 0
/* Not sure if we need this, or if it causes GMAP to fail on some alignments */
/* Instead of maxpeelback, follow the 5' intron until we get enough mismatches */
/* Disabled variant of intron-following rightward peeling that
   additionally requires contiguity: a pair is peeled only while its
   querypos and genomepos are each at most one more than those of the
   previously peeled pair (initially, those of the pair on top of
   pairs).

   The query nucleotide of each peeled pair is compared against
   successive genomic nucleotides starting at genomedp5 (fetched with
   get_genomic_nt).  Peeling continues while fewer than minpeelback
   pairs have been counted or fewer than min_mismatches disagreements
   have been seen.  There is no maxpeelback cap.

   Leading gap, indel, and short-gap pairs are first popped and
   discarded.  If stop_at_indels_p is true, peeling halts at a gap,
   indel, or short gap.  Otherwise it halts only at a gap (or after
   MAXITER iterations); peeled indels and short gaps are tallied in
   *n_peeled_indels, and only match-type pairs advance the peelback
   count, while every other pair decrements it.

   If peeling stops with a gap or indel on top of pairs, any gaps and
   indels on top of peeled, plus one further pair, are put back.

   *querydp3 and *genomedp3 receive one less than the positions of the
   pair on top of the remaining pairs; or, if pairs is exhausted, the
   positions of the pair on top of peeled; they are left unchanged if
   both lists are empty.  *protectedp is only ever set to true here.
   The peeled list is returned through *peeled_pairs, and the remaining
   pairs are the return value. */
static List_T
peel_rightward_intron_contiguous (int *n_peeled_indels, bool *protectedp, List_T *peeled_pairs, List_T pairs, int *querydp3, Chrpos_T *genomedp3,
				  Chrpos_T genomedp5, bool stop_at_indels_p, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
				  int minpeelback, int min_mismatches) {
  List_T peeled = NULL;
  Pair_T pair, leftpair;
  int npeelback = 0, nmismatches = 0, niter;
  char cdna, intron_nt, intron_nt_alt;
  int last_querypos;
  Chrpos_T last_genomepos;


  *n_peeled_indels = 0;
  /* *protectedp = false; -- set by calling procedure */

  debug(printf("Peeling rightward with genomedp5 %u and stop_at_indels_p %d:",genomedp5,stop_at_indels_p));

  /* Remove initial gaps */
  while (pairs != NULL &&
	 ( ((Pair_T) pairs->first)->gapp == true ||
	   ((Pair_T) pairs->first)->comp == INDEL_COMP ||
	   ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
    pairs = Pairpool_pop(pairs,&pair);
  }

  if (pairs == NULL) {
    debug(printf(" pairs is empty\n"));

  } else if (stop_at_indels_p == true) {
    /* The loop above already removed leading gaps and indels, so the
       next two steps are defensive only */
    pair = pairs->first;
    if (pair->gapp == true) {
      /* Peel known gap */
      debug(printf(" Known_gap"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    /* Peel initial indels anyway (both indels and short gaps) */
    while (pairs != NULL && ( ((Pair_T) pairs->first)->comp == INDEL_COMP || ((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    /* Seed the contiguity check with the pair on top.  The loop below
       reads last_querypos/last_genomepos only when pairs != NULL. */
    if (pairs != NULL) {
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
    }
    while ((npeelback < minpeelback || nmismatches < min_mismatches) && pairs != NULL &&
	   ((Pair_T) pairs->first)->gapp == false &&
	   ((Pair_T) pairs->first)->comp != INDEL_COMP &&
	   ((Pair_T) pairs->first)->comp != SHORTGAP_COMP &&
	   ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
	   ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));

      /* Compare the query nucleotide against the next intron position */
      intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
      if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
	nmismatches++;
	debug(printf(" (3) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
      }

      if (((Pair_T) pairs->first)->protectedp == true) {
	*protectedp = true;
      }
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
      peeled = List_transfer_one(peeled,&pairs);
      npeelback++;
    }

  } else {
    /* Don't stop at indels, but do stop at gaps */
    pair = pairs->first;
    if (pair->gapp == true) {
      /* Peel known gap */
      debug(printf(" Known_gap"));
      peeled = List_transfer_one(peeled,&pairs);
    }

    niter = 0;
    /* Seed the contiguity check with the pair on top */
    if (pairs != NULL) {
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
    }
    while ((npeelback < minpeelback || nmismatches < min_mismatches) && niter < MAXITER && pairs != NULL &&
	   ((Pair_T) pairs->first)->gapp == false &&
	   ((Pair_T) pairs->first)->querypos <= last_querypos + 1 &&
	   ((Pair_T) pairs->first)->genomepos <= last_genomepos + 1) {
      debug(printf(" Peel [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));

      /* Compare the query nucleotide against the next intron position */
      intron_nt = get_genomic_nt(&intron_nt_alt,genomedp5++,chroffset,chrhigh,watsonp);
      if ((cdna = ((Pair_T) pairs->first)->cdna) != intron_nt && cdna != intron_nt_alt) {
	nmismatches++;
	debug(printf(" (4) Intron mismatch #%d: %c != %c or %c at %u\n",nmismatches,cdna,intron_nt,intron_nt_alt,genomedp5-1));
      }

      /* Only match-type pairs count toward the peelback; anything else
	 takes one away, so peeling extends past it */
      if (((Pair_T) pairs->first)->comp == MATCH_COMP || ((Pair_T) pairs->first)->comp == DYNPROG_MATCH_COMP || ((Pair_T) pairs->first)->comp == AMBIGUOUS_COMP) {
	npeelback++;
      } else if (((Pair_T) pairs->first)->comp == INDEL_COMP) {
	*n_peeled_indels += 1;
	npeelback--;
      } else if (((Pair_T) pairs->first)->comp == SHORTGAP_COMP) {
	*n_peeled_indels += 1;
	npeelback--;
      } else {
	npeelback--;
      }
      if (((Pair_T) pairs->first)->protectedp == true) {
	*protectedp = true;
      }
      niter++;
      last_querypos = ((Pair_T) pairs->first)->querypos;
      last_genomepos = ((Pair_T) pairs->first)->genomepos;
      peeled = List_transfer_one(peeled,&pairs);
    }

    if (pairs != NULL && ((Pair_T) pairs->first)->gapp == true) {
      debug(printf(" Hit gap [");
	    Pair_dump_one(pairs->first,/*zerobasedp*/true);
	    printf("]"));
    }
  }

  if (pairs != NULL &&
      ( ((Pair_T) pairs->first)->gapp == true ||
	((Pair_T) pairs->first)->comp == INDEL_COMP ||
	((Pair_T) pairs->first)->comp == SHORTGAP_COMP )) {
    /* Don't leave a gap or indel on the top of the pairs */
    while (peeled != NULL &&
	   ( ((Pair_T) peeled->first)->gapp == true ||
	     ((Pair_T) peeled->first)->comp == INDEL_COMP ||
	     ((Pair_T) peeled->first)->comp == SHORTGAP_COMP)) {
      debug(printf(" Putback [");
	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
	    printf("]"));
      pairs = List_transfer_one(pairs,&peeled);
    }
    if (peeled != NULL) {
      debug(printf(" Putback [");
	    Pair_dump_one(peeled->first,/*zerobasedp*/true);
	    printf("]"));
      pairs = List_transfer_one(pairs,&peeled); /* This should be match or mismatch */
    }
  }

  if (pairs != NULL) {
    leftpair = pairs->first;
    *querydp3 = leftpair->querypos - 1;
    *genomedp3 = leftpair->genomepos - 1;
  } else if (peeled != NULL) {
    leftpair = peeled->first;
    *querydp3 = leftpair->querypos;
    *genomedp3 = leftpair->genomepos;
  } else {
    /* Both lists are empty: leave *querydp3 and *genomedp3 as the caller set them */
    /* fprintf(stderr,"In peel_rightward, pairs and peeled are both NULL\n"); */
    /* abort(); */
  }

  debug(
	if (pairs == NULL) {
	  printf(" => Top of pairs is NULL.");
	} else {
	  pair = pairs->first;
	  printf(" => Top of pairs is ");
	  Pair_dump_one(pair,/*zerobasedp*/true);
	}
	printf("\n => querydp3 = %d, genomedp3 = %u\n",*querydp3,*genomedp3);
	);

  *peeled_pairs = peeled;
  return pairs;
}
#endif
9036 
9037 
9038 /************************************************************************
9039  *  Traversal functions
9040  ************************************************************************/
9041 
9042 /* For peel_rightward and peel_leftward, we set quit_on_gap_p = true,
9043    because we want to merge gaps in initial smoothing steps */
9044 
9045 static List_T
traverse_single_gap(bool * filledp,int * dynprogindex,List_T pairs,List_T * path,Pair_T leftpair,Pair_T rightpair,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprog,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,int maxpeelback,double defect_rate,bool forcep,bool finalp)9046 traverse_single_gap (bool *filledp, int *dynprogindex, List_T pairs, List_T *path,
9047 		     Pair_T leftpair, Pair_T rightpair,
9048 		     Univcoord_T chroffset, Univcoord_T chrhigh,
9049 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
9050 		     bool watsonp, int genestrand, bool jump_late_p,
9051 		     Pairpool_T pairpool, Dynprog_T dynprog,
9052 		     Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3,
9053 		     int maxpeelback, double defect_rate, bool forcep, bool finalp) {
9054   List_T gappairs, peeled_pairs, peeled_path;
9055   int queryjump, genomejump;
9056   int querydp5, querydp3;
9057   Chrpos_T genomedp5, genomedp3;
9058   int nmatches, nmismatches, nopens, nindels;
9059   int unknowns, qopens, qindels, topens, tindels, ncanonical, nsemicanonical, nnoncanonical;
9060   int finalscore, origscore;
9061   bool protectedp;
9062   int n_peeled_indels;
9063   double min_splice_prob;
9064   /* int origqueryjump, origgenomejump; */
9065 
9066   debug(printf("\nTRAVERSE_SINGLE_GAP\n"));
9067   querydp5 = leftpair->querypos + 1;
9068   genomedp5 = leftpair->genomepos + 1;
9069   /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
9070   /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
9071   querydp3 = rightpair->querypos - 1;
9072   genomedp3 = rightpair->genomepos - 1;
9073 
9074   /* origqueryjump = querydp3 - querydp5 + 1; */
9075   /* origgenomejump = genomedp3 - genomedp5 + 1; */
9076 
9077   /* Used to peelback only half as much as for a paired gap, to save
9078      on dynamic programming, but not any more. */
9079   protectedp = false;
9080   pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9081 			 maxpeelback,/*stop_at_indels_p*/true);
9082   *path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9083 			maxpeelback,/*stop_at_indels_p*/true);
9084 
9085   if (last_genomedp5 != NULL) {
9086     if (querydp5 < 0) {
9087       querydp5 = 0;
9088     }
9089     if (querydp3 >= querylength) {
9090       querydp3 = querylength - 1;
9091     }
9092     if (0 && finalp == false && genomedp5 == last_genomedp5[querydp5] && genomedp3 == last_genomedp3[querydp3]) {
9093       debug(printf("Already solved for %u..%u at %d..%d\n",genomedp5,genomedp3,querydp5,querydp3));
9094 
9095       pairs = Pairpool_transfer(pairs,peeled_pairs);
9096       *path = Pairpool_transfer(*path,peeled_path);
9097 
9098       *filledp = false;		/* This replaces the gap */
9099       return pairs;
9100     }
9101   }
9102 
9103   queryjump = querydp3 - querydp5 + 1;
9104   genomejump = genomedp3 - genomedp5 + 1;
9105 
9106   if (queryjump <= 0 || genomejump <= 0) {
9107     /* This prevents cases like queryjump 0, genomejump 1 from being solved */
9108     debug(printf("Unable to perform dynamic programming\n"));
9109     *filledp = false;
9110 
9111     pairs = Pairpool_transfer(pairs,peeled_pairs);
9112     *path = Pairpool_transfer(*path,peeled_path);
9113 
9114     return pairs;
9115 
9116   } else {
9117     gappairs = Dynprog_single_gap(&(*dynprogindex),&finalscore,
9118 				  &nmatches,&nmismatches,&nopens,&nindels,dynprog,
9119 				  &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9120 				  queryjump,genomejump,querydp5,genomedp5,
9121 				  chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
9122 				  extraband_single,defect_rate,/*widebandp*/true);
9123     if (protectedp == true) {
9124       debug(printf("Protecting gappairs\n"));
9125       Pair_protect_list(gappairs);
9126     }
9127     debug(Pair_dump_list(gappairs,true));
9128   }
9129   debug(printf("Gap pairs:\n"));
9130   debug(Pair_dump_list(gappairs,true));
9131   debug(printf("  Final score: %d\n",finalscore));
9132 
9133 #if 0
9134   /* Old behavior: Depends on amount peeled */
9135   if (!forcep && nmismatches + nopens > nmatches) {
9136     /* Put back peeled pairs */
9137     debug(printf("Bad alignment, so undoing this solution\n"));
9138     pairs = Pairpool_transfer(pairs,peeled_pairs);
9139     *path = Pairpool_transfer(*path,peeled_path);
9140     *filledp = false;
9141   } else {
9142     pairs = Pairpool_transfer(pairs,gappairs);
9143     *filledp = true;
9144   }
9145 #else
9146   /* New behavior: Compares new score to orig score */
9147   if (forcep == true && gappairs != NULL) {
9148     /* Need to heed forcep, otherwise we get into bad problems with add_dual_break */
9149     /* But check that gappairs is not NULL, which would eliminate peeled pairs */
9150     /* Intended for build_dual_breaks */
9151     debug(printf("forcep is true, so transferring gappairs\n"));
9152     pairs = Pairpool_transfer(pairs,gappairs);
9153     *filledp = true;
9154   } else {
9155     Pair_fracidentity(&nmatches,&unknowns,&nmismatches,&qopens,&qindels,
9156 		      &topens,&tindels,&ncanonical,&nsemicanonical,&nnoncanonical,
9157 		      &min_splice_prob,peeled_pairs,/*cdna_direction*/0);
9158     origscore = Dynprog_score(nmatches,nmismatches,qopens,qindels,topens,tindels,defect_rate);
9159 
9160     Pair_fracidentity(&nmatches,&unknowns,&nmismatches,&qopens,&qindels,
9161 		      &topens,&tindels,&ncanonical,&nsemicanonical,&nnoncanonical,
9162 		      &min_splice_prob,peeled_path,/*cdna_direction*/0);
9163     origscore += Dynprog_score(nmatches,nmismatches,qopens,qindels,topens,tindels,defect_rate);
9164     debug(printf("  Orig score: %d, ",origscore));
9165 
9166     queryjump = (rightpair->querypos - leftpair->querypos - 1);
9167     if (queryjump > 0) {
9168       origscore += Dynprog_score(/*nmatches*/0,/*nmatches*/0,/*qopens*/1,/*qindels*/queryjump,
9169 				 /*topens*/0,/*tindels*/0,defect_rate);
9170     }
9171     genomejump = (rightpair->genomepos - leftpair->genomepos - 1);
9172     if (genomejump > 0) {
9173       origscore += Dynprog_score(/*nmatches*/0,/*nmatches*/0,/*qopens*/0,/*qindels*/0,
9174 				 /*topens*/1,/*tindels*/genomejump,defect_rate);
9175     }
9176     debug(printf("queryjump = %d, genomejump = %d, Orig score: %d\n",queryjump,genomejump,origscore));
9177 
9178     if (queryjump < 0 || genomejump < 0) {
9179       /* Cannot accept the previous queryjump or genomejump */
9180       debug(printf("Existing queryjump %d or genomejump %d is negative, so accepting this solution\n",
9181 		   queryjump,genomejump));
9182       pairs = Pairpool_transfer(pairs,gappairs);
9183       *filledp = true;
9184 #ifdef PMAP
9185     } else if (finalscore < 0 || finalscore < 2*origscore/3) { /* Allow for wobble */
9186 #else
9187     } else if (finalscore < 0 || finalscore < origscore) {
9188 #endif
9189       /* Put back peeled pairs */
9190       debug(printf("Forcep %d, and Bad alignment, so undoing this solution\n",forcep));
9191       pairs = Pairpool_transfer(pairs,peeled_pairs);
9192       *path = Pairpool_transfer(*path,peeled_path);
9193       *filledp = false;
9194     } else {
9195       debug(printf("Forcep %d, and Good alignment, so accepting this solution\n",forcep));
9196       pairs = Pairpool_transfer(pairs,gappairs);
9197       *filledp = true;
9198     }
9199   }
9200 
9201   if (last_genomedp5 != NULL && *filledp == true) {
9202     /* Save coordinates so we don't recompute this solution */
9203     last_genomedp5[querydp5] = genomedp5;
9204     last_genomedp3[querydp3] = genomedp3;
9205   }
9206 
9207 #endif
9208 
9209   return pairs;
9210 }
9211 
9212 static List_T
traverse_cdna_gap(bool * filledp,bool * incompletep,int * dynprogindex_minor,int * dynprogindex_major,List_T pairs,List_T * path,Pair_T leftpair,Pair_T rightpair,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,int maxpeelback,double defect_rate,bool finalp)9213 traverse_cdna_gap (bool *filledp, bool *incompletep, int *dynprogindex_minor, int *dynprogindex_major,
9214 		   List_T pairs, List_T *path, Pair_T leftpair, Pair_T rightpair,
9215 		   Univcoord_T chroffset, Univcoord_T chrhigh,
9216 		   char *queryseq_ptr, char *queryuc_ptr, int querylength,
9217 		   bool watsonp, int genestrand, bool jump_late_p, Pairpool_T pairpool,
9218 		   Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
9219 		   Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3,
9220 		   int maxpeelback, double defect_rate, bool finalp) {
9221   List_T gappairs, peeled_pairs = NULL, peeled_path = NULL;
9222   int queryjump, genomejump;
9223   int querydp5, querydp3;
9224   Chrpos_T genomedp5, genomedp3;
9225   int finalscore;
9226   int nmatches, nmismatches, nopens, nindels;
9227   bool protectedp;
9228   int n_peeled_indels;
9229 
9230   debug(printf("\nTRAVERSE_CDNA_GAP\n"));
9231   querydp5 = leftpair->querypos + 1;
9232   genomedp5 = leftpair->genomepos + 1;
9233   /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
9234   /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
9235   querydp3 = rightpair->querypos - 1;
9236   genomedp3 = rightpair->genomepos - 1;
9237 
9238 #if 0
9239   if (leftpair->dynprogindex < 0 && leftpair->dynprogindex == rightpair->dynprogindex) {
9240     debug(printf("Re-peeling prior solution\n"));
9241     /* throughmismatchp = false; */
9242   } else {
9243     debug(printf("No prior solution\n"));
9244     /* throughmismatchp = true; */
9245   }
9246 #endif
9247 
9248   protectedp = false;
9249   pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9250 			 maxpeelback,/*stop_at_indels_p*/true);
9251   *path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9252 			maxpeelback,/*stop_at_indels_p*/true);
9253 
9254   if (last_genomedp5 != NULL) {
9255     if (querydp5 < 0) {
9256       querydp5 = 0;
9257     }
9258     if (querydp3 >= querylength) {
9259       querydp3 = querylength - 1;
9260     }
9261     if (0 && finalp == false && genomedp5 == last_genomedp5[querydp5] && genomedp3 == last_genomedp3[querydp3]) {
9262       debug(printf("Already solved for %u..%u at %d..%d\n",genomedp5,genomedp3,querydp5,querydp3));
9263 
9264       pairs = Pairpool_transfer(pairs,peeled_pairs);
9265       *path = Pairpool_transfer(*path,peeled_path);
9266 
9267       *filledp = false;		/* This replaces the gap */
9268       return pairs;
9269     } else {
9270       last_genomedp5[querydp5] = genomedp5;
9271       last_genomedp3[querydp3] = genomedp3;
9272     }
9273   }
9274 
9275 
9276 #if 0
9277   if (peeled_pairs == NULL || peeled_path == NULL) {
9278     debug(printf("Skipping this because unable to peel\n"));
9279     *filledp = false;
9280     pairs = Pairpool_transfer(pairs,peeled_pairs);
9281     *path = Pairpool_transfer(*path,peeled_path);
9282     return pairs;
9283   }
9284 #endif
9285 
9286   queryjump = querydp3 - querydp5 + 1;
9287   genomejump = genomedp3 - genomedp5 + 1;
9288 
9289   if (queryjump > 0 && queryjump <= genomejump + MININTRONLEN) {
9290     debug(printf("Really a single gap, not a cDNA gap, since queryjump %d <= genomejump %d + minintronlen %d\n",
9291 		 queryjump,genomejump,MININTRONLEN));
9292     gappairs = Dynprog_single_gap(&(*dynprogindex_minor),&finalscore,
9293 				  &nmatches,&nmismatches,&nopens,&nindels,dynprogM,
9294 				  &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9295 				  queryjump,genomejump,querydp5,genomedp5,
9296 				  chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
9297 				  extraband_single,defect_rate,/*widebandp*/true);
9298     debug(Pair_dump_list(gappairs,true));
9299     debug(printf("  Score: %d\n",finalscore));
9300     pairs = Pairpool_transfer(pairs,gappairs);
9301     *filledp = true;
9302 
9303   } else {
9304     /* Set queryjump approximately equal to genomejump to have square
9305        dynamic programming matrices */
9306     queryjump = genomejump + extramaterial_paired;
9307     gappairs = Dynprog_cdna_gap(&(*dynprogindex_major),&finalscore,&(*incompletep),dynprogL,dynprogR,
9308 				&(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9309 				&(queryseq_ptr[querydp3]),&(queryuc_ptr[querydp3]),
9310 #if 0
9311 				&(genomicseg_ptr[genomedp5]),&(genomicuc_ptr[genomedp5]),
9312 #endif
9313 				/*length1L*/queryjump,/*length1R*/queryjump,/*length2*/genomejump,
9314 				/*offset1L*/querydp5,/*revoffset1R*/querydp3,/*offset2*/genomedp5,
9315 				chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
9316 				extraband_paired,defect_rate);
9317     debug(Pair_dump_list(gappairs,true));
9318     *filledp = true;
9319     if (gappairs == NULL) {
9320       pairs = Pairpool_transfer(pairs,peeled_pairs);
9321       *path = Pairpool_transfer(*path,peeled_path);
9322       pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/UNKNOWNJUMP,/*genomejump*/UNKNOWNJUMP,
9323 				      /*leftpair*/(*path)->first,/*rightpair*/pairs->first,/*knownp*/false);
9324     } else {
9325       pairs = Pairpool_transfer(pairs,gappairs);
9326     }
9327   }
9328 
9329   return pairs;
9330 }
9331 
9332 
9333 /* genome_gap is usually an intron */
9334 /* Do not set shiftp to false */
9335 static List_T
traverse_genome_gap(bool * filledp,bool * shiftp,int * dynprogindex_minor,int * dynprogindex_major,List_T pairs,List_T * path,Pair_T leftpair,Pair_T rightpair,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,int maxpeelback,double defect_rate,bool finalp,bool simplep)9336 traverse_genome_gap (bool *filledp, bool *shiftp, int *dynprogindex_minor, int *dynprogindex_major,
9337 		     List_T pairs, List_T *path, Pair_T leftpair, Pair_T rightpair,
9338 		     Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
9339 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
9340 		     int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
9341 		     Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
9342 		     Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3,
9343 		     int maxpeelback, double defect_rate, bool finalp, bool simplep) {
9344   List_T gappairs, peeled_pairs = NULL, peeled_path = NULL, p;
9345   Pair_T pair;
9346   int queryjump, genomejump;
9347   int querydp5, querydp3;
9348   Chrpos_T genomedp5, genomedp3, orig_genomedp5, orig_genomedp3;
9349   int minpeelback, min_mismatches;
9350   int new_leftgenomepos, new_rightgenomepos;
9351   double left_prob, right_prob;
9352   int finalscore, nmatches, nmismatches, nopens, nindels, exonhead, introntype;
9353   int acceptable_nmismatches;
9354   bool stop_at_indels_p, protectedp;
9355   int n_peeled_indels_rightward, n_peeled_indels_leftward;
9356 #ifndef GSNAP
9357   double prob2, prob3;
9358 #endif
9359 #ifndef PMAP
9360   List_T micropairs;
9361   int microintrontype;
9362 #endif
9363 
9364 #ifdef SHORTCUT
9365   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt;
9366 #endif
9367 
9368   debug(printf("\nTRAVERSE_GENOME_GAP\n"));
9369 
9370   stop_at_indels_p = false;	/* ? true when finalp == true */
9371 
9372   querydp5 = leftpair->querypos + 1;
9373   genomedp5 = leftpair->genomepos + 1;
9374   /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
9375   /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
9376   querydp3 = rightpair->querypos - 1;
9377   genomedp3 = rightpair->genomepos - 1;
9378 
9379 #if 0
9380   if (leftpair->dynprogindex < 0 && leftpair->dynprogindex == rightpair->dynprogindex) {
9381     debug(printf("Re-peeling prior solution\n"));
9382     /* throughmismatchp = false; */
9383   } else {
9384     debug(printf("No prior solution\n"));
9385     /* throughmismatchp = true; */
9386   }
9387 #endif
9388 
9389 #ifdef SHORTCUT
9390   queryjump = querydp3 - querydp5 + 1;
9391   genomejump = genomedp3 - genomedp5 + 1;
9392 
9393   if (querydp5 != querydp3 + 1) {
9394     protectedp = false;
9395     pairs = peel_rightward(&n_peeled_indels_rightward,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9396 			   maxpeelback,stop_at_indels_p);
9397     *path = peel_leftward(&n_peeled_indels_leftward,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9398 			  maxpeelback,stop_at_indels_p);
9399 
9400   } else {
9401 
9402 #if 0
9403     left1 = genomicuc_ptr[genomedp5];
9404     left2 = genomicuc_ptr[genomedp5+1];
9405     right2 = genomicuc_ptr[genomedp3-1];
9406     right1 = genomicuc_ptr[genomedp3];
9407 #else
9408     left1 = get_genomic_nt(&left1_alt,genomedp5,chroffset,chrhigh,watsonp);
9409     left2 = get_genomic_nt(&left2_alt,genomedp5+1,chroffset,chrhigh,watsonp);
9410     right2 = get_genomic_nt(&right2_alt,genomedp3-1,chroffset,chrhigh,watsonp);
9411     right1 = get_genomic_nt(&right1_alt,genomedp3,chroffset,chrhigh,watsonp);
9412 #endif
9413 
9414     introntype = Intron_type(left1,left2,right2,right1,
9415 			     left1_alt,left2_alt,right2_alt,right1_alt,
9416 			     cdna_direction);
9417     debug(printf("Introntype at %u..%u is %s\n",genomedp5-1,genomedp3+1,Intron_type_string(introntype)));
9418 
9419     protectedp = false;
9420     pairs = peel_rightward(&n_peeled_indels_rightward,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9421 			   maxpeelback,stop_at_indels_p);
9422     *path = peel_leftward(&n_peeled_indels_leftward,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9423 			  maxpeelback,stop_at_indels_p);
9424 
9425     if (finalp == false && novelsplicingp == true /* && mismatch_rightward_p == false && mismatch_leftward_p == false */) {
9426       debug(printf("No mismatches seen\n"));
9427       if ((cdna_direction > 0 && introntype == GTAG_FWD)
9428 #ifndef PMAP
9429 	  || (cdna_direction < 0 && introntype == GTAG_REV)
9430 #endif
9431 	  ) {
9432 	debug(printf("Skipping because intron is already canonical\n"));
9433 	*filledp = false;		/* Calling procedure will replace the gap */
9434 	pairs = Pairpool_transfer(pairs,peeled_pairs);
9435 	*path = Pairpool_transfer(*path,peeled_path);
9436 	return pairs;
9437       }
9438     }
9439   }
9440 
9441 #else  /* not a SHORTCUT */
9442 
9443   orig_genomedp5 = genomedp5;
9444   orig_genomedp3 = genomedp3;
9445 
9446   if (defect_rate < DEFECT_HIGHQ) {
9447     minpeelback = 6;
9448     min_mismatches = 2;
9449   } else if (defect_rate < DEFECT_MEDQ) {
9450     minpeelback = 8;
9451     min_mismatches = 3;
9452   } else {
9453     minpeelback = 10;
9454     min_mismatches = 4;
9455   }
9456 
9457   queryjump = querydp3 - querydp5 + 1;
9458   protectedp = false;
9459 
9460   if (queryjump == 0) {
9461     pairs = peel_rightward(&n_peeled_indels_rightward,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9462 			   maxpeelback,stop_at_indels_p);
9463     *path = peel_leftward(&n_peeled_indels_leftward,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9464 			  maxpeelback,stop_at_indels_p);
9465   } else {
9466     pairs = peel_rightward_intron(&n_peeled_indels_rightward,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9467 				  orig_genomedp5,stop_at_indels_p,chroffset,chrhigh,watsonp,minpeelback,min_mismatches);
9468     *path = peel_leftward_intron(&n_peeled_indels_leftward,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9469 				 orig_genomedp3,stop_at_indels_p,chroffset,chrhigh,watsonp,minpeelback,min_mismatches);
9470   }
9471 
9472   if (last_genomedp5 != NULL) {
9473     if (querydp5 < 0) {
9474       querydp5 = 0;
9475     }
9476     if (querydp3 >= querylength) {
9477       querydp3 = querylength - 1;
9478     }
9479     if (0 && finalp == false && genomedp5 == last_genomedp5[querydp5] && genomedp3 == last_genomedp3[querydp3]) {
9480       debug(printf("Already solved for %u..%u at %d..%d\n",genomedp5,genomedp3,querydp5,querydp3));
9481 
9482       pairs = Pairpool_transfer(pairs,peeled_pairs);
9483       *path = Pairpool_transfer(*path,peeled_path);
9484 
9485       *filledp = false;		/* This replaces the gap */
9486       return pairs;
9487     } else {
9488       last_genomedp5[querydp5] = genomedp5;
9489       last_genomedp3[querydp3] = genomedp3;
9490     }
9491   }
9492 
9493 #endif
9494 
9495 
9496   queryjump = querydp3 - querydp5 + 1;
9497   genomejump = genomedp3 - genomedp5 + 1;
9498 
9499   /* genomedp5 + genomejump - 1 >= genomedp3 - genomejump + 1) ?  but doesn't work on AA669154, chr1*/
9500   if (queryjump > 0 && genomejump <= queryjump + MININTRONLEN) {
9501     debug(printf("Really a single gap, not an intron\n"));
9502     gappairs = Dynprog_single_gap(&(*dynprogindex_minor),&finalscore,
9503 				  &nmatches,&nmismatches,&nopens,&nindels,dynprogM,
9504 				  &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9505 				  queryjump,genomejump,querydp5,genomedp5,
9506 				  chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
9507 				  extraband_single,defect_rate,/*widebandp*/true);
9508     if (protectedp == true) {
9509       debug(printf("Protecting gappairs\n"));
9510       Pair_protect_list(gappairs);
9511     }
9512     debug(Pair_dump_list(gappairs,true));
9513     debug(printf("  Score: %d\n",finalscore));
9514 
9515     pairs = Pairpool_transfer(pairs,gappairs);
9516     *filledp = true;
9517 
9518   } else {
9519     /* Set genomejump approximately equal to queryjump to have square
9520        dynamic programming matrices */
9521     /* The canonical reward for finalp == true is too high */
9522     genomejump = queryjump + extramaterial_paired;
9523     gappairs = Dynprog_genome_gap(&(*dynprogindex_major),&finalscore,&new_leftgenomepos,&new_rightgenomepos,
9524 				  &left_prob,&right_prob,&nmatches,&nmismatches,&nopens,&nindels,
9525 				  &exonhead,&introntype,dynprogL,dynprogR,
9526 				  &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9527 				  queryjump,genomejump,genomejump,querydp5,/*goffsetL*/genomedp5,/*rev_goffsetR*/genomedp3,
9528 				  chrnum,chroffset,chrhigh,
9529 				  cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
9530 				  extraband_paired + n_peeled_indels_leftward + n_peeled_indels_rightward,
9531 				  defect_rate,maxpeelback,/*halfp*/false,/*finalp*/false);
9532     if (protectedp == true) {
9533       debug(printf("Protecting gappairs\n"));
9534       Pair_protect_list(gappairs);
9535     }
9536 
9537     if (gappairs == NULL) {
9538       if (simplep == true) {
9539 	*shiftp = true;
9540 	debug(printf("Shift, since gappairs is NULL and simplep is true\n"));
9541       } else {
9542 	/* *shiftp = false; */
9543 	debug(printf("No shift, since gappairs is NULL: intron is disallowed?\n"));
9544       }
9545 
9546     } else if (new_leftgenomepos != (int) leftpair->genomepos || new_rightgenomepos != (int) rightpair->genomepos) {
9547       *shiftp = true;
9548       debug(printf("Shift in intron location from %d..%d to %d..%d\n",
9549 		   leftpair->genomepos,rightpair->genomepos,new_leftgenomepos,new_rightgenomepos));
9550     } else {
9551       /* *shiftp = false; */
9552       debug(printf("No shift in intron location\n"));
9553     }
9554     debug(Pair_dump_list(gappairs,true));
9555     debug(printf("  gappairs score (%d..%d, %u..%u, dir %d): %d\n",
9556 		 querydp5,querydp3,genomedp5,genomedp3,cdna_direction,finalscore));
9557     debug(fprintf(stderr,"  gappairs score (%d..%d, %u..%u, dir %d): %d\n",
9558 		  querydp5,querydp3,genomedp5,genomedp3,cdna_direction,finalscore));
9559 
9560 #if 0
9561     /* prob = 1.0 - (1.0 - left_prob)*(1.0 - right_prob); */
9562     if (finalp == true && novelsplicingp == true && (left_prob < 0.90 || right_prob < 0.90)) {
9563       /* Bad intron.  See if alternative with indel is better.  Check
9564 	 only on finalp, because earlier steps may need to iterate. */
9565       debug(printf("Checking alternative because found a bad intron with probs %f and %f\n",
9566 		   left_prob,right_prob));
9567       gappairs_alt = Dynprog_genome_gap(&(*dynprogindex_major),&finalscore_alt,&new_leftgenomepos,&new_rightgenomepos,
9568 					&left_prob_alt,&right_prob_alt,&nmatches,&nmismatches_alt,&nopens,&nindels,
9569 					&exonhead,&introntype,dynprogL,dynprogR,
9570 					&(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9571 					queryjump,genomejump,genomejump,querydp5,genomedp5,genomedp3,
9572 					chrnum,chroffset,chrhigh,cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
9573 					extraband_paired + n_peeled_indels_leftward + n_peeled_indels_rightward,
9574 					defect_rate,maxpeelback,/*halfp*/false,/*finalp*/true);
9575       if (protectedp == true) {
9576 	debug(printf("Protecting gappairs_alt\n"));
9577 	Pair_protect_list(gappairs_alt);
9578       }
9579 
9580       debug(Pair_dump_list(gappairs_alt,true));
9581       debug(printf("  gappairs_alt score: %d, left prob %f, right prob %f\n",finalscore_alt,left_prob_alt,right_prob_alt));
9582       debug(fprintf(stderr,"  gappairs_alt score: %d\n",finalscore_alt));
9583       if (gappairs_alt != NULL && left_prob_alt > left_prob && right_prob_alt > right_prob) {
9584 	debug(printf(" switching to alt\n"));
9585 	gappairs = gappairs_alt;
9586       } else {
9587 	debug(printf(" keeping original\n"));
9588       }
9589     }
9590 #endif
9591 
9592     if (defect_rate < DEFECT_HIGHQ) {
9593       acceptable_nmismatches = 2;
9594     } else if (defect_rate < DEFECT_MEDQ) {
9595       acceptable_nmismatches = 2;
9596     } else {
9597       acceptable_nmismatches = 3;
9598     }
9599 
9600     debug(printf("nmismatches = %d, nopens = %d, nindels = %d.  acceptable nmismatches = %d\n",
9601 		 nmismatches,nopens,nindels,acceptable_nmismatches));
9602 
9603     if (gappairs == NULL) {
9604       *filledp = false;
9605       if (simplep == true) {
9606 	/* Put back peeled pairs */
9607 	debug(printf("gappairs is false, but simple, so allowed\n"));
9608 	for (p = peeled_pairs; p != NULL; p = p->rest) {
9609 	  pair = (Pair_T) p->first;
9610 	}
9611 	for (p = peeled_path; p != NULL; p = p->rest) {
9612 	  pair = (Pair_T) p->first;
9613 	}
9614       } else {
9615 	/* Put back peeled pairs, but mark pairs as disallowed */
9616 	debug(printf("gappairs is false, so disallowed\n"));
9617 	for (p = peeled_pairs; p != NULL; p = p->rest) {
9618 	  pair = (Pair_T) p->first;
9619 	  pair->disallowedp = true;
9620 	}
9621 	for (p = peeled_path; p != NULL; p = p->rest) {
9622 	  pair = (Pair_T) p->first;
9623 	  pair->disallowedp = true;
9624 	}
9625       }
9626 
9627       pairs = Pairpool_transfer(pairs,peeled_pairs);
9628       *path = Pairpool_transfer(*path,peeled_path);
9629       introntype = NONINTRON;
9630 
9631 #if 0
9632     } else if (!finalp && finalscore < 0) {
9633       *filledp = false;
9634       /* Put back peeled pairs */
9635       debug(printf("Not forced and finalscore is negative\n"));
9636       pairs = Pairpool_transfer(pairs,peeled_pairs);
9637       *path = Pairpool_transfer(*path,peeled_path);
9638       introntype = NONINTRON;
9639 #endif
9640 
9641 #if 0
9642     } else if (defect_rate > DEFECT_MEDQ) {
9643       /* Should look for them, especially for GSNAP short reads */
9644       /* Don't look for microexons in low-quality sequences */
9645       debug(printf("Don't look for microexon in low-quality sequence\n"));
9646       *filledp = true;
9647       pairs = Pairpool_transfer(pairs,gappairs);
9648 #endif
9649 
9650     } else if (introntype != NONINTRON && nmismatches <= acceptable_nmismatches && nopens <= 1 && nindels <= 3) {
9651       debug(printf("introntype != NONINTRON and nmismatches, nopens, nindels low\n"));
9652       *filledp = true;
9653       pairs = Pairpool_transfer(pairs,gappairs);
9654 
9655     } else {
9656 #ifdef PMAP
9657       *filledp = true;
9658       pairs = Pairpool_transfer(pairs,gappairs);
9659 #else
9660       *filledp = true;
9661 
9662 #ifdef GSNAP
9663       micropairs = (List_T) NULL;
9664       microintrontype = NONINTRON;
9665 #else
9666       /* Expensive call, because it requires BoyerMoore */
9667       debug(printf("Calling microexon because introntype == %d or nmismatches %d > acceptable %d or nopens %d > 1 or nindels %d > 3\n",
9668 		   introntype,nmismatches,acceptable_nmismatches,nopens,nindels));
9669       micropairs = Dynprog_microexon_int(&prob2,&prob3,&(*dynprogindex_major),&microintrontype,
9670 					 /*sequence1*/&(queryseq_ptr[querydp5]),
9671 					 /*sequenceuc1*/&(queryuc_ptr[querydp5]),
9672 					 /*length1*/queryjump,/*length2L:genomejump,*//*length2R:genomejump,*/
9673 					 /*offset1*/querydp5,/*offset2L*/genomedp5,/*revoffset2R*/genomedp3,
9674 					 cdna_direction,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,
9675 					 watsonp,genestrand,pairpool);
9676 #endif
9677 
9678       if (micropairs == NULL) {
9679 	debug(printf("No microexon found\n"));
9680 	pairs = Pairpool_transfer(pairs,gappairs);
9681 	/* *shiftp = false; */
9682       } else {
9683 	debug(Pair_dump_list(micropairs,/*zerobasedp*/true));
9684 	debug(printf("\n"));
9685 
9686 #if 0
9687 	if (1 || (nindels == 0 && nmismatches < 4)) {
9688 	  /* Have a higher standard */
9689 	  if (prob2 >= 0.95 && prob3 >= 0.95) {
9690 	    debug(printf("Transferring microexon pairs\n"));
9691 	    pairs = Pairpool_transfer(pairs,micropairs);
9692 	    introntype = microintrontype;
9693 	    *shiftp = true;
9694 	  } else {
9695 	    pairs = Pairpool_transfer(pairs,gappairs);
9696 	  }
9697 	} else {
9698 	  /* Have a lower standard */
9699 	  if (prob2 >= 0.90 || prob3 >= 0.90) {
9700 	    debug(printf("Transferring microexon pairs\n"));
9701 	    pairs = Pairpool_transfer(pairs,micropairs);
9702 	    introntype = microintrontype;
9703 	    *shiftp = true;
9704 	  } else {
9705 	    pairs = Pairpool_transfer(pairs,gappairs);
9706 	  }
9707 	}
9708 #else
9709 	/* Just transfer */
9710 	debug(printf("Transferring microexon pairs\n"));
9711 	pairs = Pairpool_transfer(pairs,micropairs);
9712 	introntype = microintrontype;
9713 	*shiftp = true;
9714 #endif
9715 
9716       }
9717 #endif
9718     }
9719   }
9720 
9721   return pairs;
9722 }
9723 
9724 
9725 static List_T
traverse_dual_genome_gap(int * dynprogindex,List_T pairs,List_T * path,Pair_T leftpair,Pair_T rightpair,bool left_end_intron_p,bool right_end_intron_p,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,int midquerypos,Chrpos_T midgenomepos,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,int maxpeelback,double defect_rate,bool finalp)9726 traverse_dual_genome_gap (int *dynprogindex, List_T pairs, List_T *path,
9727 			  Pair_T leftpair, Pair_T rightpair, bool left_end_intron_p, bool right_end_intron_p,
9728 			  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
9729 			  int midquerypos, Chrpos_T midgenomepos,
9730 			  char *queryseq_ptr, char *queryuc_ptr, int querylength,
9731 			  int cdna_direction, bool watsonp, int genestrand,
9732 			  bool jump_late_p, Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogR,
9733 			  Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3,
9734 			  int maxpeelback, double defect_rate, bool finalp) {
9735   List_T single_gappairs, dual_gappairs_1 = NULL, dual_gappairs_2 = NULL,
9736     right_gappairs = NULL, left_gappairs = NULL, peeled_pairs, peeled_path;
9737   int queryjump, genomejump;
9738   int querydp5, querydp3;
9739   Chrpos_T genomedp5, genomedp3, orig_genomedp5, orig_genomedp3;
9740   int minpeelback, min_mismatches;
9741   int new_leftgenomepos, new_rightgenomepos;
9742   double single_left_prob, single_right_prob, dual_left_prob_1, dual_right_prob_1, dual_left_prob_2, dual_right_prob_2;
9743   int querydp5_dual, querydp3_dual, genomedp5_dual, genomedp3_dual;
9744   int querydp5_left, querydp3_left, genomedp5_left, genomedp3_left;
9745   int querydp5_right, querydp3_right, genomedp5_right, genomedp3_right;
9746   int single_nmatches = 0, dual_nmatches_1 = 0, dual_nmatches_2 = 0, left_nmatches = 0, right_nmatches = 0;
9747   int single_score, dual_score_1, dual_score_2, single_goodness, dual_goodness,
9748     nmismatches, nopens, nindels, exonhead, right_exonhead, left_exonhead;
9749   int left_score, right_score, left_goodness = 0, right_goodness = 0;
9750   int middle_exonlength;
9751   int single_introntype, dual_introntype_1, dual_introntype_2, left_introntype, right_introntype;
9752 #ifdef DEBUG
9753   int interexon_region;
9754   double middle_exonprob;
9755 #endif
9756   bool singlep = false, single_canonical_p, dual_canonical_p, protectedp;
9757   int n_peeled_indels;
9758 
9759   debug(printf("\nTRAVERSE_DUAL_GENOME_GAP: left_end_intron_p %d, right_end_intron_p %d\n",
9760 	       left_end_intron_p,right_end_intron_p));
9761 #if 0
9762   if (cdna_direction > 0) {
9763     canonical_introntype = GTAG_FWD;
9764     semicanonical_introntype_1 = ATAC_FWD;
9765     semicanonical_introntype_2 = GCAG_FWD;
9766 #ifndef PMAP
9767   } else {
9768     canonical_introntype = GTAG_REV;
9769     semicanonical_introntype_1 = ATAC_REV;
9770     semicanonical_introntype_2 = GCAG_REV;
9771 #endif
9772   }
9773 #endif
9774 
9775   querydp5 = leftpair->querypos + 1;
9776   genomedp5 = leftpair->genomepos + 1;
9777   /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
9778   /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
9779   querydp3 = rightpair->querypos - 1;
9780   genomedp3 = rightpair->genomepos - 1;
9781 
9782 
9783   orig_genomedp5 = genomedp5;
9784   orig_genomedp3 = genomedp3;
9785 
9786   if (defect_rate < DEFECT_HIGHQ) {
9787     minpeelback = 6;
9788     min_mismatches = 2;
9789   } else if (defect_rate < DEFECT_MEDQ) {
9790     minpeelback = 8;
9791     min_mismatches = 3;
9792   } else {
9793     minpeelback = 10;
9794     min_mismatches = 4;
9795   }
9796 
9797   protectedp = false;
9798   pairs = peel_rightward_intron(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
9799 				orig_genomedp5,/*stop_at_indels_p*/false,chroffset,chrhigh,watsonp,minpeelback,min_mismatches);
9800   *path = peel_leftward_intron(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
9801 			       orig_genomedp3,/*stop_at_indels_p*/false,chroffset,chrhigh,watsonp,minpeelback,min_mismatches);
9802 
9803   if (last_genomedp5 != NULL) {
9804     if (querydp5 < 0) {
9805       querydp5 = 0;
9806     }
9807     if (querydp3 >= querylength) {
9808       querydp3 = querylength - 1;
9809     }
9810     if (0 && finalp == false && genomedp5 == last_genomedp5[querydp5] && genomedp3 == last_genomedp3[querydp3]) {
9811       /* Don't want to abort this procedure early */
9812       debug(printf("Already solved for %u..%u at %d..%d\n",genomedp5,genomedp3,querydp5,querydp3));
9813 
9814       pairs = Pairpool_transfer(pairs,peeled_pairs);
9815       *path = Pairpool_transfer(*path,peeled_path);
9816 
9817       return pairs;
9818     } else {
9819       last_genomedp5[querydp5] = genomedp5;
9820       last_genomedp3[querydp3] = genomedp3;
9821     }
9822   }
9823 
9824   queryjump = querydp3 - querydp5 + 1;
9825   genomejump = queryjump + extramaterial_paired;
9826 
9827   if (queryjump > nullgap) {
9828     pairs = Pairpool_transfer(pairs,peeled_pairs);
9829     *path = Pairpool_transfer(*path,peeled_path);
9830     pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/UNKNOWNJUMP,/*genomejump*/UNKNOWNJUMP,
9831 				    /*leftpair*/(*path)->first,/*rightpair*/pairs->first,/*knownp*/false);
9832 
9833     return pairs;
9834   }
9835 
9836   if (genomedp5 + genomejump - 1 >= genomedp3 - genomejump + 1) {
9837     debug(printf("Bounds don't make sense for dual intron gap: %d + %d - 1 >= %d - %d + 1\n\n",
9838 		 genomedp5,genomejump,genomedp3,genomejump));
9839     pairs = Pairpool_transfer(pairs,peeled_pairs);
9840     *path = Pairpool_transfer(*path,peeled_path);
9841     pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/UNKNOWNJUMP,/*genomejump*/UNKNOWNJUMP,
9842 				    /*leftpair*/(*path)->first,/*rightpair*/pairs->first,/*knownp*/false);
9843 
9844     return pairs;
9845   }
9846 
9847   /* Want finalp == true to get best chance for canonical splice site */
9848   single_gappairs = Dynprog_genome_gap(&(*dynprogindex),&single_score,&new_leftgenomepos,&new_rightgenomepos,
9849 				       &single_left_prob,&single_right_prob,&single_nmatches,&nmismatches,&nopens,&nindels,
9850 				       &exonhead,&single_introntype,dynprogL,dynprogR,
9851 				       &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
9852 				       queryjump,genomejump,genomejump,querydp5,genomedp5,genomedp3,
9853 				       chrnum,chroffset,chrhigh,
9854 				       cdna_direction,watsonp,genestrand,jump_late_p,pairpool,extraband_paired,
9855 				       defect_rate,maxpeelback,/*halfp*/false,/*finalp*/true);
9856 
9857   debug(Pair_check_list_pairs(single_gappairs));
9858 
9859   /* Okay to have one indel, because may need to shift an island */
9860   if (nopens <= 1) {
9861     single_goodness = (single_nmatches + nindels) + MISMATCH*nmismatches;
9862   } else {
9863     single_goodness = single_nmatches + MISMATCH*nmismatches + QOPEN*(nopens-1) + QINDEL*nindels;
9864   }
9865 
9866 #if 0
9867   if (single_gappairs == NULL) {
9868     single_canonical_p = false;
9869   } else if (single_introntype == canonical_introntype) {
9870     single_canonical_p = true;
9871   } else if (single_introntype == semicanonical_introntype_1 ||
9872 	     single_introntype == semicanonical_introntype_2) {
9873     single_canonical_p = false;
9874   } else {
9875     single_canonical_p = false;
9876   }
9877 #else
9878   if (single_left_prob > 0.9 && single_right_prob > 0.9) {
9879     single_canonical_p = true;
9880   } else {
9881     single_canonical_p = false;
9882   }
9883 #endif
9884 
9885 
9886   /* Right of short exon */
9887   querydp5_dual = midquerypos;
9888   genomedp5_dual = midgenomepos;
9889   querydp3_dual = querydp3;	/* From peel_rightward_intron */
9890   genomedp3_dual = genomedp3;	/* From peel_rightward_intron */
9891 
9892   queryjump = querydp3_dual - querydp5_dual + 1;
9893   genomejump = queryjump + extramaterial_paired;
9894 
9895   if (genomedp5_dual + genomejump - 1 >= genomedp3_dual) {
9896     /* Bounds don't make sense */
9897     debug(printf("Bounds don't make sense on right of dual intron gap: %d + %d - 1 >= %d\n\n",
9898 		 genomedp5_dual,genomejump,genomedp3_dual));
9899     dual_gappairs_2 = NULL;
9900 
9901   } else {
9902     /* Want finalp == true to get best chance for canonical splice site */
9903     dual_gappairs_2 = Dynprog_genome_gap(&(*dynprogindex),&dual_score_2,&new_leftgenomepos,&new_rightgenomepos,
9904 					 &dual_left_prob_2,&dual_right_prob_2,&dual_nmatches_2,&nmismatches,&nopens,&nindels,
9905 					 &right_exonhead,&dual_introntype_2,dynprogL,dynprogR,
9906 					 &(queryseq_ptr[querydp5_dual]),&(queryuc_ptr[querydp5_dual]),
9907 					 queryjump,genomejump,genomejump,
9908 					 querydp5_dual,genomedp5_dual,genomedp3_dual,
9909 					 chrnum,chroffset,chrhigh,
9910 					 cdna_direction,watsonp,genestrand,jump_late_p,pairpool,extraband_paired,
9911 					 defect_rate,maxpeelback,/*halfp*/true,/*finalp*/true);
9912 
9913     dual_goodness = dual_nmatches_2 + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
9914 
9915     /* Left of short exon */
9916     querydp5_dual = querydp5;	/* From peel_leftward_intron */
9917     genomedp5_dual = genomedp5;	/* From peel_leftward_intron */
9918     querydp3_dual = midquerypos-1;
9919     genomedp3_dual = midgenomepos-1;
9920 
9921     queryjump = querydp3_dual - querydp5_dual + 1;
9922     genomejump = queryjump + extramaterial_paired;
9923 
9924     if (genomedp5_dual + genomejump - 1 >= genomedp3_dual) {
9925       /* Bounds don't make sense */
9926       debug(printf("Bounds don't make sense on left of dual intron gap: %d + %d - 1 >= %d\n\n",
9927 		   genomedp5_dual,genomejump,genomedp3_dual));
9928       dual_gappairs_1 = NULL;
9929 
9930     } else {
9931       /* Want finalp == true to get best chance for canonical splice site */
9932       dual_gappairs_1 = Dynprog_genome_gap(&(*dynprogindex),&dual_score_1,&new_leftgenomepos,&new_rightgenomepos,
9933 					   &dual_left_prob_1,&dual_right_prob_1,&dual_nmatches_1,&nmismatches,&nopens,&nindels,
9934 					   &left_exonhead,&dual_introntype_1,dynprogL,dynprogR,
9935 					   &(queryseq_ptr[querydp5_dual]),&(queryuc_ptr[querydp5_dual]),
9936 					   queryjump,genomejump,genomejump,
9937 					   querydp5_dual,genomedp5_dual,genomedp3_dual,
9938 					   chrnum,chroffset,chrhigh,
9939 					   cdna_direction,watsonp,genestrand,jump_late_p,pairpool,extraband_paired,
9940 					   defect_rate,maxpeelback,/*halfp*/true,/*finalp*/true);
9941 
9942       dual_goodness += dual_nmatches_1 + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
9943 
9944       if (dual_gappairs_1 == NULL || dual_gappairs_2 == NULL) {
9945 	dual_canonical_p = false;
9946 #if 0
9947       } else if (dual_introntype_1 == canonical_introntype && dual_introntype_2 == canonical_introntype) {
9948 	dual_canonical_p = true;
9949 #endif
9950 #if 0
9951       } else if (dual_left_prob_1 > 0.9 && dual_right_prob_1 > 0.9 &&
9952 		 dual_left_prob_2 > 0.9 && dual_right_prob_2 > 0.9) {
9953 	dual_canonical_p = true;
9954 #endif
9955       } else if (dual_left_prob_1 > 0.9 || dual_right_prob_1 > 0.9 ||
9956 		 dual_left_prob_2 > 0.9 || dual_right_prob_2 > 0.9) {
9957 	dual_canonical_p = true;
9958       } else {
9959 	dual_canonical_p = false;
9960       }
9961     }
9962   }
9963 
9964   if (dual_gappairs_2 == NULL || dual_gappairs_1 == NULL) {
9965     debug(printf("Single score wins because dual_guappairs_2 is NULL or dual_gappairs_1 is NULL\n"));
9966     debug(printf("Loser: dual_gappairs\n"));
9967     debug(Pair_dump_list(dual_gappairs_2,true));
9968     debug(Pair_dump_list(dual_gappairs_1,true));
9969     debug(printf("Winner: single gap pairs\n"));
9970     debug(Pair_dump_list(single_gappairs,true));
9971     /* pairs = Pairpool_transfer(pairs,single_gappairs); -- Wait until we check for left_goodness and right_goodness */
9972     singlep = true;
9973 
9974   } else {
9975     middle_exonlength = right_exonhead-left_exonhead;
9976     debug(printf("Middle exon is %d - %d = %d bp in interexon region of %d bp\n",
9977 		 right_exonhead,left_exonhead,right_exonhead-left_exonhead,new_rightgenomepos-new_leftgenomepos));
9978     if (middle_exonlength <= 0) {
9979       debug(middle_exonprob = 0.0);
9980     } else {
9981       debug(interexon_region = new_rightgenomepos - new_leftgenomepos);
9982 
9983 #if 0
9984       if (dual_introntype_2 == canonical_introntype) {
9985 	middle_exonlength += DUAL_HALFCANONICAL_POINTS;
9986 	debug(printf("Add canonical credit of %d for right intron\n",DUAL_HALFCANONICAL_POINTS));
9987       }
9988       if (dual_introntype_1 == canonical_introntype) {
9989 	middle_exonlength += DUAL_HALFCANONICAL_POINTS;
9990 	debug(printf("Add canonical credit of %d for left intron\n",DUAL_HALFCANONICAL_POINTS));
9991       }
9992 #else
9993       if (dual_left_prob_2 > 0.9 && dual_right_prob_2 > 0.9) {
9994 	middle_exonlength += DUAL_HALFCANONICAL_POINTS;
9995 	debug(printf("Add canonical credit of %d for right intron\n",DUAL_HALFCANONICAL_POINTS));
9996       }
9997       if (dual_left_prob_1 > 0.9 && dual_right_prob_1 > 0.9) {
9998 	middle_exonlength += DUAL_HALFCANONICAL_POINTS;
9999 	debug(printf("Add canonical credit of %d for left intron\n",DUAL_HALFCANONICAL_POINTS));
10000       }
10001 #endif
10002 
10003 #ifdef DEBUG
10004       middle_exonprob = 1.0-pow(1.0-pow(4.0,-(double) middle_exonlength),(double) interexon_region);
10005 #endif
10006 
10007       debug(printf("Single score = %d (%d matches).  Single canonical: %d.  Dual score = %d & %d (%d & %d matches).  Dual canonical: %d.  ",
10008 		   single_score,single_nmatches,single_canonical_p,
10009 		   dual_score_1,dual_score_2,dual_nmatches_1,dual_nmatches_2,
10010 		   dual_canonical_p));
10011       debug(printf("Single goodness = %d.  Dual goodness = %d.  ",
10012 		   single_goodness,dual_goodness));
10013       debug(printf("Probability is %g (ignoring).  ",middle_exonprob));
10014     }
10015 
10016     /* Want high threshold for accepting dual intron */
10017     if (dual_canonical_p == true && /*middle_exonprob < 0.001 &&*/
10018 	single_canonical_p == false && single_goodness <= dual_goodness) {
10019       debug(printf("Dual scores win\n"));
10020       debug(printf("Loser: single_gappairs\n"));
10021       debug(Pair_dump_list(single_gappairs,true));
10022       debug(printf("Winner: Transferring dual_gappairs_2 onto pairs\n"));
10023       Pair_protect_list(dual_gappairs_2);
10024       debug(Pair_dump_list(dual_gappairs_2,true));
10025       pairs = Pairpool_transfer(pairs,dual_gappairs_2);
10026       debug(printf("Winner: Transferring dual_gappairs_1 onto pairs\n"));
10027       Pair_protect_list(dual_gappairs_1);
10028       debug(Pair_dump_list(dual_gappairs_1,true));
10029       pairs = Pairpool_transfer(pairs,dual_gappairs_1);
10030     } else {
10031       debug(printf("Single score wins\n"));
10032       debug(printf("Loser: dual_gappairs\n"));
10033       debug(Pair_dump_list(dual_gappairs_2,true));
10034       debug(Pair_dump_list(dual_gappairs_1,true));
10035       debug(printf("Winner: single gappairs\n"));
10036       debug(Pair_dump_list(single_gappairs,true));
10037       /* pairs = Pairpool_transfer(pairs,single_gappairs); -- Wait until we check for left_goodness and right_goodness */
10038       singlep = true;
10039     }
10040   }
10041 
10042   if (singlep == true) {
10043     if (single_gappairs == NULL) {
10044       /* Need to handle the possibility that Dynprog_genome_gap artificially returns NULL for single_gappairs */
10045       pairs = Pairpool_transfer(pairs,peeled_pairs);
10046       *path = Pairpool_transfer(*path,peeled_path);
10047 
10048     } else {
10049       if (right_end_intron_p == true) {
10050 	/* Keep left intron only and extend right from short exon */
10051 	querydp5_right = querydp5;
10052 	genomedp5_right = genomedp5;
10053 	querydp3_right = midquerypos;
10054 	genomedp3_right = midgenomepos;
10055 
10056 	queryjump = querydp3_right - querydp5_right + 1;
10057 	genomejump = queryjump + extramaterial_paired;
10058 
10059 	if (genomedp5_right + genomejump - 1 >= genomedp3_right) {
10060 	  /* Bounds don't make sense */
10061 	  debug(printf("Bounds don't make sense if we omit right intron: %d + %d - 1 >= %d\n\n",
10062 		       genomedp5_right,genomejump,genomedp3_right));
10063 	  right_gappairs = NULL;
10064 
10065 	} else {
10066 	  right_gappairs = Dynprog_genome_gap(&(*dynprogindex),&right_score,&new_leftgenomepos,&new_rightgenomepos,
10067 					      &single_left_prob,&single_right_prob,&right_nmatches,&nmismatches,&nopens,&nindels,
10068 					      &right_exonhead,&right_introntype,dynprogL,dynprogR,
10069 					      &(queryseq_ptr[querydp5_right]),&(queryuc_ptr[querydp5_right]),
10070 					      queryjump,genomejump,genomejump,
10071 					      querydp5_right,genomedp5_right,genomedp3_right,
10072 					      chrnum,chroffset,chrhigh,
10073 					      cdna_direction,watsonp,genestrand,jump_late_p,pairpool,extraband_paired,
10074 					      defect_rate,maxpeelback,/*halfp*/false,/*finalp*/false);
10075 
10076 	  right_goodness = right_nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
10077 	  debug(printf("Right goodness (keeping left intron only) = %d\n",right_goodness));
10078 
10079 	  if (right_goodness > single_goodness) {
10080 	    debug(printf("New winner: right gappairs\n"));
10081 	    debug(Pair_dump_list(right_gappairs,true));
10082 	    single_gappairs = right_gappairs;
10083 	    single_goodness = right_goodness;
10084 	  }
10085 	}
10086       }
10087 
10088       if (left_end_intron_p == true) {
10089 	/* Keep right intron only and extend left from short exon */
10090 	querydp5_left = midquerypos;
10091 	genomedp5_left = midgenomepos;
10092 	querydp3_left = querydp3;
10093 	genomedp3_left = genomedp3;
10094 
10095 	queryjump = querydp3_left - querydp5_left + 1;
10096 	genomejump = queryjump + extramaterial_paired;
10097 
10098 	if (genomedp5_left + genomejump - 1 >= genomedp3_left) {
10099 	  /* Bounds don't make sense */
10100 	  debug(printf("Bounds don't make sense if we omit left intron: %d + %d - 1 >= %d\n\n",
10101 		       genomedp5_left,genomejump,genomedp3_left));
10102 	  left_gappairs = NULL;
10103 
10104 	} else {
10105 	  left_gappairs = Dynprog_genome_gap(&(*dynprogindex),&left_score,&new_leftgenomepos,&new_rightgenomepos,
10106 					     &single_left_prob,&single_right_prob,&left_nmatches,&nmismatches,&nopens,&nindels,
10107 					     &left_exonhead,&left_introntype,dynprogL,dynprogR,
10108 					     &(queryseq_ptr[querydp5_left]),&(queryuc_ptr[querydp5_left]),
10109 					     queryjump,genomejump,genomejump,
10110 					     querydp5_left,genomedp5_left,genomedp3_left,
10111 					     chrnum,chroffset,chrhigh,
10112 					     cdna_direction,watsonp,genestrand,jump_late_p,pairpool,extraband_paired,
10113 					     defect_rate,maxpeelback,/*halfp*/false,/*finalp*/false);
10114 
10115 	  left_goodness = left_nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
10116 	  debug(printf("Left goodness (keeping right intron only) = %d\n",left_goodness));
10117 
10118 	  if (left_goodness > single_goodness) {
10119 	    debug(printf("New winner: left gappairs\n"));
10120 	    debug(Pair_dump_list(left_gappairs,true));
10121 	    single_gappairs = left_gappairs;
10122 	    single_goodness = left_goodness;
10123 	  }
10124 	}
10125       }
10126 
10127       /* Finally transfer best single result */
10128       if (single_gappairs == right_gappairs) {
10129 	pairs = Pairpool_transfer(pairs,peeled_pairs);
10130       }
10131       pairs = Pairpool_transfer(pairs,single_gappairs);
10132       if (single_gappairs == left_gappairs) {
10133 	*path = Pairpool_transfer(*path,peeled_path);
10134       }
10135     }
10136 
10137   }
10138 
10139   return pairs;
10140 }
10141 
10142 
#if 0
/* Reports whether the intron described by gappair is acceptable at an
   end of the alignment.  A known gap is always accepted.  Otherwise
   the intron must either carry the canonical comp for the given
   cdna_direction (forward for > 0, reverse for < 0, either one for 0)
   or have both donor_prob and acceptor_prob of at least 0.90. */
static bool
good_end_intron_p (Pair_T gappair, int cdna_direction) {
  bool canonicalp;

  if (gappair->knowngapp == true) {
    return true;
  }

  /* Which canonical comp counts depends on the assumed direction */
  if (cdna_direction > 0) {
    canonicalp = (gappair->comp == FWD_CANONICAL_INTRON_COMP) ? true : false;
  } else if (cdna_direction < 0) {
    canonicalp = (gappair->comp == REV_CANONICAL_INTRON_COMP) ? true : false;
  } else {
    canonicalp = (gappair->comp == FWD_CANONICAL_INTRON_COMP ||
		  gappair->comp == REV_CANONICAL_INTRON_COMP) ? true : false;
  }
  if (canonicalp == true) {
    return true;
  }

  /* Not canonical: fall back on the splice site probabilities */
  if (gappair->donor_prob >= 0.90 && gappair->acceptor_prob >= 0.90) {
    return true;
  }
  return false;
}
#endif
10171 
10172 
10173 /* Note on QUERYEND_INDELS.  Profiling shows that using
10174    QUERYEND_INDELS caused compute_scores_lookup to be called too
10175    often, slowing program down. */
10176 
10177 
10178 /* to_queryend_p must be true for distalmedial_ending, since we are
10179    comparing alternatives.  But it can be false for extend_ending,
10180    which just tries to improve the ends. */
10181 
10182 static List_T
distalmedial_ending5(bool * knownsplicep,bool * chop_exon_p,int * dynprogindex_minor,int * finalscore,int * ambig_end_length,double * ambig_prob,List_T * pairs,int leftquerypos,Pair_T rightpair,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprog,int maxpeelback,double defect_rate)10183 distalmedial_ending5 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_minor,
10184 		      int *finalscore, int *ambig_end_length, double *ambig_prob,
10185 		      List_T *pairs, int leftquerypos, Pair_T rightpair,
10186 		      Univcoord_T chroffset, Univcoord_T chrhigh,
10187 		      char *queryseq_ptr, char *queryuc_ptr,
10188 		      bool watsonp, int genestrand, bool jump_late_p, Pairpool_T pairpool,
10189 		      Dynprog_T dynprog, int maxpeelback, double defect_rate) {
10190   List_T peeled_pairs, continuous_gappairs_medialgap = NULL;
10191   int queryjump, genomejump;
10192   int querydp5, querydp3_distalgap, querydp3_medialgap;
10193   Chrpos_T genomedp3_distalgap, genomedp3_medialgap;
10194   int continuous_goodness_distalgap = 0, continuous_goodness_medialgap = 0,
10195     nmatches, nmismatches, nopens, nindels;
10196   bool protectedp;
10197   int n_peeled_indels;
10198   bool knownsplice_medial_p = false;
10199 
10200   debug(printf("\nDISTALMEDIAL_ENDING5\n"));
10201 
10202   querydp5 = leftquerypos + 1;
10203 #if 0
10204   genomedp5 = leftgenomepos + 1; /* 0 */
10205 #endif
10206   querydp3_distalgap = querydp3_medialgap = rightpair->querypos - 1;
10207   genomedp3_distalgap = genomedp3_medialgap = rightpair->genomepos - 1;
10208 
10209   /* Used to peelback only half as much as for a paired gap, to save
10210      on dynamic programming, but not any more. */
10211   protectedp = false;
10212   *pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,*pairs,&querydp3_distalgap,&genomedp3_distalgap,
10213 			  maxpeelback,/*stop_at_indels_p*/true);
10214 
10215   continuous_goodness_distalgap = Pair_fracidentity_score(peeled_pairs);
10216   /* continuous_goodness_distalgap += Pair_fracidentity_score(endgappairs); */
10217   debug(printf("continuous_goodness_distalgap (%d pairs) is %d\n",
10218 	       List_length(peeled_pairs),continuous_goodness_distalgap));
10219 
10220 #if 0
10221   /* gappair wasn't initialized */
10222   if (good_end_intron_p(gappair,cdna_direction) == false) {
10223     debug(printf("Subtracting points from continuous distal because noncanonical\n"));
10224     continuous_goodness_distalgap -= CANONICAL_POINTS;
10225   } else if (gappair->comp == DUALBREAK_COMP) {
10226     debug(printf("Subtracting points from continuous distal because of dual break\n"));
10227     continuous_goodness_distalgap -= (CANONICAL_POINTS + CANONICAL_POINTS);
10228   }
10229 #endif
10230 
10231   /* Solve if gap were not present */
10232   queryjump = querydp3_medialgap - querydp5 + 1;
10233   genomejump = queryjump + extramaterial_end; /* proposed */
10234   /* Previously, we limited genomejump = min(2*queryjump,queryjump+extramaterial_end) */
10235 
10236 #ifdef EXTRACT_GENOMICSEG
10237   genomedp5 = genomedp3_medialgap - genomejump + 1;
10238   /* Make sure we don't go past the beginning */
10239   if (genomedp5 < 0) {
10240     genomedp5 = 0;
10241     genomejump = genomedp3_medialgap - genomedp5 + 1;
10242   }
10243 #endif
10244 
10245   debug(printf("Stage 3: traverse_ending5: Dynamic programming at 5' end (medial to gap): querydp5 = %d, querydp3 = %d, genomedp3 = %d\n",
10246 	       querydp5,querydp3_medialgap,genomedp3_medialgap));
10247 
10248   continuous_gappairs_medialgap = Dynprog_end5_gap(&(*dynprogindex_minor),&(*finalscore),
10249 						   &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10250 						   &(queryseq_ptr[querydp3_medialgap]),&(queryuc_ptr[querydp3_medialgap]),
10251 						   queryjump,genomejump,querydp3_medialgap,genomedp3_medialgap,
10252 						   chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
10253 						   extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS,/*require_pos_score_p*/false);
10254   *ambig_end_length = 0;
10255   *ambig_prob = 0.0;
10256 
10257   continuous_goodness_medialgap = nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
10258   debug(printf("Continuous_goodness_medialgap %d = %d + %d*%d + %d*%d + %d*%d\n",
10259 	       continuous_goodness_medialgap,nmatches,MISMATCH,nmismatches,QOPEN,nopens,QINDEL,nindels));
10260 
10261   if (continuous_goodness_distalgap > continuous_goodness_medialgap) {
10262     debug(printf("Continuous distal wins: %d > %d\n",continuous_goodness_distalgap,continuous_goodness_medialgap));
10263     *ambig_end_length = 0;
10264     *ambig_prob = 0.0;
10265 
10266     /* *pairs = Pairpool_transfer(*pairs,endgappairs); */
10267     *chop_exon_p = false;
10268     /* Let previous value of knownsplicep stand */
10269     debug(printf("Returning peeled pairs:\n"));
10270     debug(Pair_dump_list(peeled_pairs,true));
10271     debug(printf("\n"));
10272     return peeled_pairs;
10273 
10274   } else {
10275     debug(printf("Continuous medial wins: %d > %d\n",
10276 		 continuous_goodness_medialgap,continuous_goodness_distalgap));
10277     *chop_exon_p = true;
10278     *knownsplicep = knownsplice_medial_p;
10279     return continuous_gappairs_medialgap;
10280   }
10281 }
10282 
10283 
10284 static List_T
extend_ending5(bool * knownsplicep,int * dynprogindex_minor,int * finalscore,int * ambig_end_length,Splicetype_T * ambig_splicetype,double * ambig_prob,List_T * pairs,int leftquerypos,Pair_T rightpair,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,char * queryseq_ptr,char * queryuc_ptr,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprog,int maxpeelback,double defect_rate,Endalign_T endalign,bool forcep)10285 extend_ending5 (bool *knownsplicep, int *dynprogindex_minor,
10286 		int *finalscore, int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
10287 		List_T *pairs, int leftquerypos, Pair_T rightpair,
10288 		Univcoord_T chroffset, Univcoord_T chrhigh,
10289 		Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
10290 		char *queryseq_ptr, char *queryuc_ptr,
10291 		int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
10292 		Pairpool_T pairpool, Dynprog_T dynprog, int maxpeelback,
10293 		double defect_rate, Endalign_T endalign, bool forcep) {
10294   List_T continuous_gappairs_distalgap = NULL, peeled_pairs = NULL;
10295   int queryjump, genomejump;
10296   int querydp5, querydp3_distalgap;
10297   Chrpos_T genomedp3_distalgap;
10298   int nmatches, nmismatches, nopens, nindels;
10299   bool protectedp = false;
10300   int n_peeled_indels = 0;
10301   Pair_T firstpair;
10302 
10303 
10304   debug(printf("\nEXTEND_ENDING5 with endalign %s and maxpeelback %d\n",
10305 	       Dynprog_endalign_string(endalign),maxpeelback));
10306 
10307   querydp5 = leftquerypos + 1;
10308 #if 0
10309   genomedp5 = leftgenomepos + 1; /* 0 */
10310 #endif
10311   querydp3_distalgap = rightpair->querypos - 1;
10312   genomedp3_distalgap = rightpair->genomepos - 1;
10313 
10314   /* Used to peelback only half as much as for a paired gap, to save
10315      on dynamic programming, but not any more. */
10316 
10317   if (endalign == QUERYEND_NOGAPS) {
10318     /* Don't peelback on extension */
10319   } else if (maxpeelback == 0) {
10320     /* Actually, we should peelback after trim_ends, because indel placement could be wrong */
10321     /* Don't peelback on BEST_LOCAL after trim_ends */
10322   } else {
10323     protectedp = false;
10324     *pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,*pairs,&querydp3_distalgap,&genomedp3_distalgap,
10325 			    maxpeelback,/*stop_at_indels_p*/true);
10326   }
10327 
10328   queryjump = querydp3_distalgap - querydp5 + 1;
10329   genomejump = queryjump + extramaterial_end; /* proposed */
10330   /* Previously, we limited genomejump = min(2*queryjump,queryjump+extramaterial_end) */
10331 
10332 #if 0
10333   genomedp5 = genomedp3_distalgap - genomejump + 1;
10334 #endif
10335 #ifdef EXTRACT_GENOMICSEG
10336   /* Make sure we don't go past the beginning */
10337   if (genomedp5 < 0) {
10338     genomedp5 = 0;
10339     genomejump = genomedp3_distalgap - genomedp5 + 1;
10340   }
10341 #endif
10342 
10343   debug(printf("Stage 3 (dir %d), extend_ending5: Dynamic programming at 5' end (distal to gap): querydp5 = %d, querydp3 = %d, genomedp3 = %d\n",
10344 	       cdna_direction,querydp5,querydp3_distalgap,genomedp3_distalgap));
10345 
10346 
10347   if (endalign == QUERYEND_GAP && splicesites != NULL) {
10348     continuous_gappairs_distalgap = Dynprog_end5_known(&(*knownsplicep),&(*dynprogindex_minor),&(*finalscore),
10349 						       &(*ambig_end_length),&(*ambig_splicetype),
10350 						       &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10351 						       &(queryseq_ptr[querydp3_distalgap]),&(queryuc_ptr[querydp3_distalgap]),
10352 						       queryjump,genomejump,querydp3_distalgap,genomedp3_distalgap,
10353 						       chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
10354 						       cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
10355 						       extraband_end,defect_rate);
10356     if (*ambig_end_length > 0) {
10357       *ambig_prob = 2.0;
10358     }
10359   } else {
10360     continuous_gappairs_distalgap = Dynprog_end5_gap(&(*dynprogindex_minor),&(*finalscore),
10361 						     &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10362 						     &(queryseq_ptr[querydp3_distalgap]),&(queryuc_ptr[querydp3_distalgap]),
10363 						     /*rlength*/queryjump,/*glength*/genomejump,
10364 						     /*rev_roffset*/querydp3_distalgap,/*rev_goffset*/genomedp3_distalgap,
10365 						     chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
10366 						     extraband_end,defect_rate,endalign,/*require_pos_score_p*/false);
10367     *ambig_end_length = 0;
10368     *ambig_prob = 0.0;
10369     *knownsplicep = false;
10370   }
10371 
10372   debug(printf("  finalscore: %d\n",*finalscore));
10373   if (continuous_gappairs_distalgap == NULL) {
10374     return peeled_pairs;
10375   } else {
10376     firstpair = List_head(continuous_gappairs_distalgap);
10377     if (0 && firstpair->querypos != querydp3_distalgap) {
10378       /* Not a good test anymore, since we are halting peelbacks at gaps */
10379       /* Must have an indel between the gappairs and the rest of the read */
10380       debug(printf("Detected indel between gappairs %d and the rest of the read %d\n",
10381 		   firstpair->querypos,querydp3_distalgap));
10382       return peeled_pairs;
10383 
10384     } else if (forcep == true) {
10385       /* For example, needed for extending middles of chimeras */
10386       return continuous_gappairs_distalgap;
10387     } else if (*finalscore <= 0) {
10388       /* Need this branch to eliminate bad exons at ends */
10389       *knownsplicep = false;
10390       return peeled_pairs;
10391     } else {
10392       return continuous_gappairs_distalgap;
10393     }
10394   }
10395 }
10396 
10397 
10398 static List_T
distalmedial_ending3(bool * knownsplicep,bool * chop_exon_p,int * dynprogindex_minor,int * finalscore,int * ambig_end_length,double * ambig_prob,List_T * path,Pair_T leftpair,int rightquerypos,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprog,int maxpeelback,double defect_rate)10399 distalmedial_ending3 (bool *knownsplicep, bool *chop_exon_p, int *dynprogindex_minor,
10400 		      int *finalscore, int *ambig_end_length, double *ambig_prob,
10401 		      List_T *path, Pair_T leftpair, int rightquerypos,
10402 		      Univcoord_T chroffset, Univcoord_T chrhigh,
10403 		      char *queryseq_ptr, char *queryuc_ptr,
10404 		      bool watsonp, int genestrand, bool jump_late_p,
10405 		      Pairpool_T pairpool, Dynprog_T dynprog, int maxpeelback, double defect_rate) {
10406   List_T peeled_path, continuous_gappairs_medialgap = NULL;
10407   int queryjump, genomejump;
10408   int querydp5_distalgap, querydp3, querydp5_medialgap;
10409   Chrpos_T genomedp5_distalgap, genomedp5_medialgap;
10410   int continuous_goodness_distalgap = 0, continuous_goodness_medialgap = 0,
10411     nmatches, nmismatches, nopens, nindels;
10412   bool protectedp;
10413   int n_peeled_indels;
10414   bool knownsplice_medial_p = false;
10415 
10416 
10417   debug(printf("\nDISTALMEDIAL_ENDING3\n"));
10418 
10419   querydp5_distalgap = leftpair->querypos + 1;
10420   genomedp5_distalgap = leftpair->genomepos + 1;
10421   /* if (leftpair->cdna == ' ') querydp5_distalgap--; -- For old dynamic programming */
10422   /* if (leftpair->genome == ' ') genomedp5_distalgap--; -- For old dynamic programming */
10423 
10424   querydp5_medialgap = querydp5_distalgap;
10425   genomedp5_medialgap = genomedp5_distalgap;
10426   querydp3 = rightquerypos - 1;
10427   /* genomedp3 = rightgenomepos - 1; */
10428 
10429   /* Used to peelback only half as much as for a paired gap, to save
10430      on dynamic programming, but not any more. */
10431   protectedp = false;
10432   *path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5_distalgap,&genomedp5_distalgap,
10433 			maxpeelback,/*stop_at_indels_p*/true);
10434 
10435   continuous_goodness_distalgap = Pair_fracidentity_score(peeled_path);
10436   /* continuous_goodness_distalgap += Pair_fracidentity_score(endgappairs); */
10437   debug(printf("continuous_goodness_distalgap (%d pairs) is %d\n",
10438 	       List_length(peeled_path),continuous_goodness_distalgap));
10439 
10440 #if 0
10441   /* gappair wasn't initialized */
10442   if (good_end_intron_p(gappair,cdna_direction) == false) {
10443     debug(printf("Subtracting points from continuous distal because noncanonical\n"));
10444     continuous_goodness_distalgap -= CANONICAL_POINTS;
10445   } else if (gappair->comp == DUALBREAK_COMP) {
10446     debug(printf("Subtracting points from continuous distal because of dual break\n"));
10447     continuous_goodness_distalgap -= (CANONICAL_POINTS + CANONICAL_POINTS);
10448   }
10449 #endif
10450 
10451   /* Solve if gap were not present */
10452   queryjump = querydp3 - querydp5_medialgap + 1;
10453   genomejump = queryjump + extramaterial_end; /* proposed */
10454   /* Previously, we limited genomejump = min(2*queryjump,queryjump+extramaterial_end) */
10455 
10456 #ifdef EXTRACT_GENOMICSEG
10457   genomedp3 = genomedp5_medialgap + genomejump - 1;
10458   /* Make sure we don't go past the end */
10459   if (genomedp3 > genomiclength - 1) {
10460     genomedp3 = genomiclength - 1;
10461     genomejump = genomedp3 - genomedp5_medialgap + 1;
10462   }
10463 #endif
10464 
10465   debug(printf("Stage 3: distalmedial_ending3: Dynamic programming at 3' end (medial to gap): querydp5 = %d, querydp3 = %d, genomedp5 = %u\n",
10466 	       querydp5_medialgap,querydp3,genomedp5_medialgap));
10467 
10468   debug(printf("Before solving the 3' end, here is the path:\n"));
10469   debug(Pair_dump_list(*path,true));
10470   debug(printf("\n"));
10471 
10472   continuous_gappairs_medialgap = Dynprog_end3_gap(&(*dynprogindex_minor),&(*finalscore),
10473 						   &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10474 						   &(queryseq_ptr[querydp5_medialgap]),&(queryuc_ptr[querydp5_medialgap]),
10475 						   queryjump,genomejump,querydp5_medialgap,genomedp5_medialgap,
10476 						   chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
10477 						   extraband_end,defect_rate,/*endalign*/QUERYEND_INDELS,
10478 						   /*require_pos_score_p*/false);
10479   *ambig_end_length = 0;
10480   *ambig_prob = 0.0;
10481 
10482   debug(printf("Medial gap\n"));
10483   debug(Pair_dump_list(continuous_gappairs_medialgap,true));
10484 
10485   continuous_goodness_medialgap = nmatches + MISMATCH*nmismatches + QOPEN*nopens + QINDEL*nindels;
10486   debug(printf("Continuous_goodness_medialgap %d = %d + %d*%d + %d*%d + %d*%d\n",
10487 	       continuous_goodness_medialgap,nmatches,MISMATCH,nmismatches,QOPEN,nopens,QINDEL,nindels));
10488 
10489   if (continuous_goodness_distalgap > continuous_goodness_medialgap) {
10490     debug(printf("Continuous distal wins: %d > %d\n",continuous_goodness_distalgap,continuous_goodness_medialgap));
10491     *ambig_end_length = 0;
10492     *ambig_prob = 0.0;
10493 
10494     /* *path = Pairpool_transfer(*path,endgappairs); */
10495     *chop_exon_p = false;
10496     /* Let previous value of knownsplicep stand */
10497     debug(printf("Returning peeled path:\n"));
10498     debug(Pair_dump_list(peeled_path,true));
10499     debug(printf("\n"));
10500     return peeled_path;
10501 
10502   } else {
10503     debug(printf("Continuous medial wins: %d > %d\n",continuous_goodness_medialgap,continuous_goodness_distalgap));
10504     *chop_exon_p = true;
10505     *knownsplicep = knownsplice_medial_p;
10506     return List_reverse(continuous_gappairs_medialgap);
10507   }
10508 }
10509 
10510 
10511 static List_T
extend_ending3(bool * knownsplicep,int * dynprogindex_minor,int * finalscore,int * ambig_end_length,Splicetype_T * ambig_splicetype,double * ambig_prob,List_T * path,Pair_T leftpair,int rightquerypos,int querylength,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,char * queryseq_ptr,char * queryuc_ptr,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprog,int maxpeelback,double defect_rate,Endalign_T endalign,bool forcep)10512 extend_ending3 (bool *knownsplicep, int *dynprogindex_minor, int *finalscore,
10513 		int *ambig_end_length, Splicetype_T *ambig_splicetype, double *ambig_prob,
10514 		List_T *path, Pair_T leftpair, int rightquerypos,
10515 		int querylength, Univcoord_T chroffset, Univcoord_T chrhigh,
10516 		Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
10517 		char *queryseq_ptr, char *queryuc_ptr,
10518 		int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
10519 		Pairpool_T pairpool, Dynprog_T dynprog, int maxpeelback,
10520 		double defect_rate, Endalign_T endalign, bool forcep) {
10521   List_T continuous_gappairs_distalgap = NULL, peeled_path = NULL;
10522   int queryjump, genomejump;
10523   int querydp5_distalgap, querydp3;
10524   Chrpos_T genomedp5_distalgap;
10525   int nmatches, nmismatches, nopens, nindels;
10526   bool protectedp = false;
10527   int n_peeled_indels = 0;
10528   Pair_T firstpair;
10529 
10530   debug(printf("\nEXTEND_ENDING3 with endalign %s and maxpeelback %d\n",
10531 	       Dynprog_endalign_string(endalign),maxpeelback));
10532 
10533   querydp5_distalgap = leftpair->querypos + 1;
10534   genomedp5_distalgap = leftpair->genomepos + 1;
10535   /* if (leftpair->cdna == ' ') querydp5_distalgap--; -- For old dynamic programming */
10536   /* if (leftpair->genome == ' ') genomedp5_distalgap--; -- For old dynamic programming */
10537   querydp3 = rightquerypos - 1;
10538   /* genomedp3 = rightgenomepos - 1; */
10539   debug(printf("Set dynprog 3 end to be querydp3 = %d\n",querydp3));
10540 
10541   /* Used to peelback only half as much as for a paired gap, to save
10542      on dynamic programming, but not any more. */
10543   if (endalign == QUERYEND_NOGAPS) {
10544     /* Don't peelback on extension */
10545   } else if (maxpeelback == 0) {
10546     /* Actually, we should peelback after trim_ends, because indel placement could be wrong */
10547     /* Don't peelback on BEST_LOCAL after trim_ends */
10548   } else {
10549     protectedp = false;
10550     *path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5_distalgap,&genomedp5_distalgap,
10551 			  maxpeelback,/*stop_at_indels_p*/true);
10552   }
10553 
10554   queryjump = querydp3 - querydp5_distalgap + 1;
10555   genomejump = queryjump + extramaterial_end; /* proposed */
10556   /* Previously, we limited genomejump = min(2*queryjump,queryjump+extramaterial_end) */
10557 
10558   /* genomedp3 = genomedp5_distalgap + genomejump - 1; */
10559 #ifdef EXTRACT_GENOMICSEG
10560   /* Make sure we don't go past the end */
10561   if (genomedp3 > genomiclength - 1) {
10562     genomedp3 = genomiclength - 1;
10563     genomejump = genomedp3 - genomedp5_distalgap + 1;
10564   }
10565 #endif
10566 
10567   debug(printf("Stage 3 (dir %d), extend_ending3: Dynamic programming at 3' end (distal to gap): querydp5 = %d, querydp3 = %d, genomedp5 = %d\n",
10568 	       cdna_direction,querydp5_distalgap,querydp3,genomedp5_distalgap));
10569 
10570   if (endalign == QUERYEND_GAP && splicesites != NULL) {
10571     continuous_gappairs_distalgap = Dynprog_end3_known(&(*knownsplicep),&(*dynprogindex_minor),&(*finalscore),
10572 						       &(*ambig_end_length),&(*ambig_splicetype),
10573 						       &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10574 						       &(queryseq_ptr[querydp5_distalgap]),&(queryuc_ptr[querydp5_distalgap]),
10575 						       queryjump,genomejump,querydp5_distalgap,genomedp5_distalgap,
10576 						       querylength,chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
10577 						       cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
10578 						       extraband_end,defect_rate);
10579     if (*ambig_end_length > 0) {
10580       *ambig_prob = 2.0;
10581     }
10582   } else {
10583     continuous_gappairs_distalgap = Dynprog_end3_gap(&(*dynprogindex_minor),&(*finalscore),
10584 						     &nmatches,&nmismatches,&nopens,&nindels,dynprog,
10585 						     &(queryseq_ptr[querydp5_distalgap]),&(queryuc_ptr[querydp5_distalgap]),
10586 						     queryjump,genomejump,querydp5_distalgap,genomedp5_distalgap,
10587 						     chroffset,chrhigh,watsonp,genestrand,jump_late_p,pairpool,
10588 						     extraband_end,defect_rate,endalign,/*require_pos_score_p*/false);
10589     *ambig_end_length = 0;
10590     *ambig_prob = 0.0;
10591     *knownsplicep = false;
10592   }
10593 
10594   debug(printf("  finalscore: %d\n",*finalscore));
10595   if (continuous_gappairs_distalgap == NULL) {
10596     return peeled_path;
10597   } else {
10598     continuous_gappairs_distalgap = List_reverse(continuous_gappairs_distalgap);
10599     firstpair = List_head(continuous_gappairs_distalgap);
10600     if (0 && firstpair->querypos != querydp5_distalgap) {
10601       /* Not a good test anymore, since we are halting peelbacks at gaps */
10602       /* Must have an indel between the gappairs and the rest of the read */
10603       debug(printf("Detected indel between gappairs %d and the rest of the read %d\n",
10604 		   firstpair->querypos,querydp5_distalgap));
10605       return peeled_path;
10606 
10607     } else if (forcep == true) {
10608       /* For example, needed for extending middle of chimeras */
10609       return continuous_gappairs_distalgap;
10610     } else if (*finalscore <= 0) {
10611       /* Need this branch to eliminate bad exons at ends */
10612       *knownsplicep = false;
10613       return peeled_path;
10614     } else {
10615       return continuous_gappairs_distalgap;
10616     }
10617   }
10618 }
10619 
10620 
10621 #ifndef GSNAP
10622 /* Modified from trim_novel_spliceends.  Note that code for 5' end
10623    here is taken from the 3' end of trim_novel_spliceends, and vice
10624    versa */
10625 static void
find_dual_break_spliceends(List_T path,List_T pairs,Doublelist_T * spliceprobs5,Doublelist_T * spliceprobs3,Univcoordlist_T * splice_positions_5,Univcoordlist_T * splice_positions_3,int cdna_direction,bool watsonp,Univcoord_T chroffset,Univcoord_T chrhigh)10626 find_dual_break_spliceends (List_T path, List_T pairs,
10627 			    Doublelist_T *spliceprobs5, Doublelist_T *spliceprobs3,
10628 			    Univcoordlist_T *splice_positions_5, Univcoordlist_T *splice_positions_3,
10629 			    int cdna_direction, bool watsonp,
10630 			    Univcoord_T chroffset, Univcoord_T chrhigh) {
10631   List_T p;
10632   int exondist, i;
10633   int querypos;
10634 
10635   Pair_T pair;
10636   Univcoord_T genomicpos, start_genomicpos, middle_genomicpos, end_genomicpos;
10637   Univcoord_T start, middle, end; /* start to middle has mismatches, while middle to end has none */
10638   double donor_prob, acceptor_prob;
10639 
10640 
10641   debug13(printf("\nEntered find_dual_break_spliceends with cdna_direction %d\n",cdna_direction));
10642   *splice_positions_5 = *splice_positions_3 = (Univcoordlist_T) NULL;
10643   *spliceprobs5 = *spliceprobs3 = (Doublelist_T) NULL;
10644 
10645 
10646   /* 5' intron end */
10647   if (path != NULL) {
10648     p = path;
10649     while (p != NULL && ((Pair_T) p->first)->gapp == true) {
10650       p = List_next(p);
10651     }
10652 
10653     if (p != NULL) {
10654       pair = (Pair_T) List_head(p);
10655       querypos = pair->querypos + 1; /* Because start_genomicpos = start + 1 */
10656       start = middle = end = pair->genomepos;
10657       debug13(printf("Initializing start and end to be %u\n",start));
10658 
10659       i = 0;
10660       while (i < END_SPLICESITE_SEARCH) {
10661 	if ((p = List_next(p)) == NULL) {
10662 	  break;
10663 	} else if (pair->gapp == true) {
10664 	  break;
10665 	} else {
10666 	  end = pair->genomepos;
10667 	  debug13(printf("Resetting end to be %u\n",end));
10668 	}
10669 	pair = (Pair_T) List_head(p);
10670 	i++;
10671       }
10672 
10673       start = middle + 5;
10674       querypos += 5;
10675 
10676       /* Find distance from end to intron, if any */
10677       exondist = 0;
10678       while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
10679 	     exondist < END_MIN_EXONLENGTH) {
10680 	p = List_next(p);
10681 	exondist++;
10682       }
10683       debug13(printf("exondist is %d\n",exondist));
10684 
10685       if (cdna_direction > 0) {
10686 	if (watsonp) {
10687 	  /* splicetype5 = splicetype5_mm = DONOR; */
10688 
10689 	  start_genomicpos = start + 1;
10690 	  middle_genomicpos = middle + 1;
10691 	  end_genomicpos = end + 1;
10692 
10693 	  /* assert(start_genomicpos >= end_genomicpos); */
10694 	  genomicpos = start_genomicpos;
10695 	  while (genomicpos >= middle_genomicpos &&
10696 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10697 	    donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
10698 	    debug13(printf("5', watson, sense anti %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10699 	    if (donor_prob > 0.5) {
10700 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,genomicpos - 1);
10701 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,donor_prob);
10702 	      debug13(printf(" **"));
10703 	    }
10704 	    debug13(printf("\n"));
10705 	    genomicpos--;
10706 	    querypos--;
10707 	  }
10708 	  while (genomicpos >= end_genomicpos &&
10709 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10710 	    donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
10711 	    debug13(printf("5', watson, sense anti %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10712 	    if (donor_prob > 0.5) {
10713 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,genomicpos - 1);
10714 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,donor_prob);
10715 	      debug13(printf(" **"));
10716 	    }
10717 	    debug13(printf("\n"));
10718 	    genomicpos--;
10719 	    querypos--;
10720 	  }
10721 	  debug13(printf("\n"));
10722 
10723 	} else {
10724 	  /* splicetype5 = splicetype5_mm = ANTIDONOR; */
10725 
10726 	  start_genomicpos = (start > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - start;
10727 	  middle_genomicpos = (middle > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - middle;
10728 	  end_genomicpos = (end > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - end;
10729 
10730 	  /* assert(start_genomicpos <= end_genomicpos); */
10731 	  genomicpos = start_genomicpos;
10732 	  while (genomicpos <= middle_genomicpos &&
10733 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10734 	    donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
10735 	    debug13(printf("5', crick, sense forward %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10736 	    if (donor_prob > 0.5) {
10737 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,(chrhigh - chroffset) - genomicpos);
10738 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,donor_prob);
10739 	      debug13(printf(" **"));
10740 	    }
10741 	    debug13(printf("\n"));
10742 	    genomicpos++;
10743 	    querypos--;
10744 	  }
10745 	  while (genomicpos <= end_genomicpos &&
10746 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10747 	    donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
10748 	    debug13(printf("5', crick, sense forward %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10749 	    if (donor_prob > 0.5) {
10750 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,(chrhigh - chroffset) - genomicpos);
10751 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,donor_prob);
10752 	      debug13(printf(" **"));
10753 	    }
10754 	    debug13(printf("\n"));
10755 	    genomicpos++;
10756 	    querypos--;
10757 	  }
10758 	  debug13(printf("\n"));
10759 	}
10760 
10761       } else if (cdna_direction < 0) {
10762 	if (watsonp) {
10763 	  /* splicetype5 = splicetype5_mm = ANTIACCEPTOR; */
10764 
10765 	  start_genomicpos = start + 1;
10766 	  middle_genomicpos = middle + 1;
10767 	  end_genomicpos = end + 1;
10768 
10769 	  /* assert(start_genomicpos >= end_genomicpos); */
10770 	  genomicpos = start_genomicpos;
10771 	  while (genomicpos >= middle_genomicpos &&
10772 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10773 	    acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
10774 	    debug13(printf("5', watson, sense forward %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10775 	    if (acceptor_prob > 0.5) {
10776 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,genomicpos - 1);
10777 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,acceptor_prob);
10778 	      debug13(printf(" **"));
10779 	    }
10780 	    debug13(printf("\n"));
10781 	    genomicpos--;
10782 	    querypos--;
10783 	  }
10784 	  while (genomicpos >= end_genomicpos &&
10785 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10786 	    acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
10787 	    debug13(printf("5', watson, sense forward %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10788 	    if (acceptor_prob > 0.5) {
10789 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,genomicpos - 1);
10790 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,acceptor_prob);
10791 	      debug13(printf(" **"));
10792 	    }
10793 	    debug13(printf("\n"));
10794 	    genomicpos--;
10795 	    querypos--;
10796 	  }
10797 	  debug13(printf("\n"));
10798 
10799 	} else {
10800 	  /* splicetype5 = splicetype5_mm = ACCEPTOR; */
10801 
10802 	  start_genomicpos = (start > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - start;
10803 	  middle_genomicpos = (middle > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - middle;
10804 	  end_genomicpos = (end > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - end;
10805 
10806 	  /* assert(start_genomicpos <= end_genomicpos); */
10807 	  genomicpos = start_genomicpos;
10808 	  while (genomicpos <= middle_genomicpos &&
10809 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10810 	    acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
10811 	    debug13(printf("5', crick, sense anti %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10812 	    if (acceptor_prob > 0.5) {
10813 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,(chrhigh - chroffset) - genomicpos);
10814 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,acceptor_prob);
10815 	      debug13(printf(" **"));
10816 	    }
10817 	    debug13(printf("\n"));
10818 	    genomicpos++;
10819 	    querypos--;
10820 	  }
10821 	  while (genomicpos <= end_genomicpos &&
10822 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10823 	    acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
10824 	    debug13(printf("5', crick, sense anti %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10825 	    if (acceptor_prob > 0.5) {
10826 	      *splice_positions_5 = Univcoordlist_push(*splice_positions_5,(chrhigh - chroffset) - genomicpos);
10827 	      *spliceprobs5 = Doublelist_push(*spliceprobs5,acceptor_prob);
10828 	      debug13(printf(" **"));
10829 	    }
10830 	    debug13(printf("\n"));
10831 	    genomicpos++;
10832 	    querypos--;
10833 	  }
10834 	  debug13(printf("\n"));
10835 	}
10836 
10837       } else {
10838 	fprintf(stderr,"Not expecting cdna_direction to be 0\n");
10839 	abort();
10840       }
10841     }
10842   }
10843 
10844   /* 3' intron end */
10845   if (pairs != NULL) {
10846     p = pairs;
10847     while (p != NULL && ((Pair_T) p->first)->gapp == true) {
10848       p = List_next(p);
10849     }
10850 
10851     if (p != NULL) {
10852       pair = (Pair_T) List_head(p);
10853       querypos = pair->querypos;
10854       start = middle = end = pair->genomepos;
10855       debug13(printf("Initializing start and end to be %u\n",start));
10856 
10857       i = 0;
10858       while (i < END_SPLICESITE_SEARCH) {
10859 	if ((p = List_next(p)) == NULL) {
10860 	  break;
10861 	} else if (pair->gapp == true) {
10862 	  break;
10863 	} else {
10864 	  end = pair->genomepos;
10865 	  debug13(printf("Resetting end to be %u\n",end));
10866 	}
10867 	pair = (Pair_T) List_head(p);
10868 	i++;
10869       }
10870 
10871       start = middle - 5;
10872       querypos -= 5;
10873 
10874       /* Find distance from end to intron, if any */
10875       exondist = 0;
10876       while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
10877 	     exondist < END_MIN_EXONLENGTH) {
10878 	p = List_next(p);
10879 	exondist++;
10880       }
10881       debug13(printf("exondist is %d\n",exondist));
10882 
10883       if (cdna_direction > 0) {
10884 	if (watsonp) {
10885 	  /* splicetype3 = splicetype3_mm = ACCEPTOR; */
10886 
10887 	  start_genomicpos = start;
10888 	  middle_genomicpos = middle;
10889 	  end_genomicpos = end;
10890 
10891 	  /* assert(start_genomicpos <= end_genomicpos); */
10892 	  genomicpos = start_genomicpos;
10893 	  while (genomicpos <= middle_genomicpos &&
10894 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10895 	    acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
10896 	    debug13(printf("3', watson, sense forward %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10897 	    if (acceptor_prob > 0.5) {
10898 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,genomicpos);
10899 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,acceptor_prob);
10900 	      debug13(printf(" **"));
10901 	    }
10902 	    debug13(printf("\n"));
10903 	    genomicpos++;
10904 	    querypos++;
10905 	  }
10906 	  while (genomicpos <= end_genomicpos &&
10907 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10908 	    acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
10909 	    debug13(printf("3', watson, sense forward %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10910 	    if (acceptor_prob > 0.5) {
10911 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,genomicpos);
10912 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,acceptor_prob);
10913 	      debug13(printf(" **"));
10914 	    }
10915 	    debug13(printf("\n"));
10916 	    genomicpos++;
10917 	    querypos++;
10918 	  }
10919 	  debug13(printf("\n"));
10920 
10921 	} else {
10922 	  /* splicetype3 = splicetype3_mm = ANTIACCEPTOR; */
10923 
10924 	  start_genomicpos = (start > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - start + 1;
10925 	  middle_genomicpos = (middle > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - middle + 1;
10926 	  end_genomicpos = (end > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - end + 1;
10927 
10928 	  /* assert(start_genomicpos >= end_genomicpos); */
10929 	  genomicpos = start_genomicpos;
10930 	  while (genomicpos >= middle_genomicpos &&
10931 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10932 	    acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
10933 	    debug13(printf("3', crick, sense anti %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10934 	    if (acceptor_prob > 0.5) {
10935 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,(chrhigh - chroffset) - genomicpos + 1);
10936 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,acceptor_prob);
10937 	      debug13(printf(" **"));
10938 	    }
10939 	    debug13(printf("\n"));
10940 	    genomicpos--;
10941 	    querypos++;
10942 	  }
10943 	  while (genomicpos >= end_genomicpos &&
10944 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
10945 	    acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
10946 	    debug13(printf("3', crick, sense anti %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,acceptor_prob));
10947 	    if (acceptor_prob > 0.5) {
10948 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,(chrhigh - chroffset) - genomicpos + 1);
10949 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,acceptor_prob);
10950 	      debug13(printf(" **"));
10951 	    }
10952 	    debug13(printf("\n"));
10953 	    genomicpos--;
10954 	    querypos++;
10955 	  }
10956 	  debug13(printf("\n"));
10957 	}
10958 
10959       } else if (cdna_direction < 0) {
10960 	if (watsonp) {
10961 	  /* splicetype3 = splicetype3_mm = ANTIDONOR; */
10962 
10963 	  start_genomicpos = start;
10964 	  middle_genomicpos = middle;
10965 	  end_genomicpos = end;
10966 
10967 	  /* assert(start_genomicpos <= end_genomicpos); */
10968 	  genomicpos = start_genomicpos;
10969 	  while (genomicpos <= middle_genomicpos &&
10970 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10971 	    donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
10972 	    debug13(printf("3', watson, sense anti %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10973 	    if (donor_prob > 0.5) {
10974 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,genomicpos);
10975 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,donor_prob);
10976 	      debug13(printf(" **"));
10977 	    }
10978 	    debug13(printf("\n"));
10979 	    genomicpos++;
10980 	    querypos++;
10981 	  }
10982 	  while (genomicpos <= end_genomicpos &&
10983 		 genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
10984 	    donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
10985 	    debug13(printf("3', watson, sense anti %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,donor_prob));
10986 	    if (donor_prob > 0.5) {
10987 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,genomicpos);
10988 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,donor_prob);
10989 	      debug13(printf(" **"));
10990 	    }
10991 	    debug13(printf("\n"));
10992 	    genomicpos++;
10993 	    querypos++;
10994 	  }
10995 	  debug13(printf("\n"));
10996 
10997 	} else {
10998 	  /* splicetype3 = splicetype3_mm = DONOR; */
10999 
11000 	  start_genomicpos = (start > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - start + 1;
11001 	  middle_genomicpos = (middle > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - middle + 1;
11002 	  end_genomicpos = (end > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - end + 1;
11003 
11004 	  /* assert(start_genomicpos >= end_genomicpos); */
11005 	  genomicpos = start_genomicpos;
11006 	  while (genomicpos >= middle_genomicpos &&
11007 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
11008 	    donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
11009 	    debug13(printf("3', crick, sense forward %d %u %u %f mm",querypos,chroffset+genomicpos,genomicpos,donor_prob));
11010 	    if (donor_prob > 0.5) {
11011 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,(chrhigh - chroffset) - genomicpos + 1);
11012 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,donor_prob);
11013 	      debug13(printf(" **"));
11014 	    }
11015 	    debug13(printf("\n"));
11016 	    genomicpos--;
11017 	    querypos++;
11018 	  }
11019 	  while (genomicpos >= end_genomicpos &&
11020 		 genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
11021 	    donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
11022 	    debug13(printf("3', crick, sense forward %d %u %u %f",querypos,chroffset+genomicpos,genomicpos,donor_prob));
11023 	    if (donor_prob > 0.5) {
11024 	      *splice_positions_3 = Univcoordlist_push(*splice_positions_3,(chrhigh - chroffset) - genomicpos + 1);
11025 	      *spliceprobs3 = Doublelist_push(*spliceprobs3,donor_prob);
11026 	      debug13(printf(" **"));
11027 	    }
11028 	    debug13(printf("\n"));
11029 	    genomicpos--;
11030 	    querypos++;
11031 	  }
11032 	  debug13(printf("\n"));
11033 	}
11034 
11035       } else {
11036 	fprintf(stderr,"Not expecting cdna_direction to be 0\n");
11037 	abort();
11038       }
11039     }
11040   }
11041 
11042   return;
11043 }
11044 #endif
11045 
11046 
11047 #ifndef GSNAP
11048 /* Modified from make_microexon_pairs_double in dynprog_single.c */
11049 static List_T
add_microexon_pairs(List_T pairs,int querydpM,Chrpos_T genomedpM,int lengthM,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool,int dynprogindex)11050 add_microexon_pairs (List_T pairs, int querydpM, Chrpos_T genomedpM, int lengthM,
11051 		     char *queryseq_ptr, char *queryuc_ptr,
11052 		     Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
11053 		     int genestrand, Pairpool_T pairpool, int dynprogindex) {
11054   char c1, c1_uc, c2, c2_alt;
11055   int i;
11056 
11057   /* Microexon */
11058   for (i = 0; i < lengthM; i++) {
11059     c1 = queryseq_ptr[querydpM+i];
11060     c1_uc = queryuc_ptr[querydpM+i];
11061 
11062     c2 = get_genomic_nt(&c2_alt,genomedpM+i,chroffset,chrhigh,watsonp);
11063 #ifdef EXTRACT_GENOMICSEG
11064     assert(c2 == genomicseg[genomedpM+i]);
11065 #endif
11066 
11067     if (c1_uc == c2 || c1_uc == c2_alt) {
11068       pairs = Pairpool_push(pairs,pairpool,querydpM+i,genomedpM+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
11069 			    dynprogindex);
11070     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
11071       pairs = Pairpool_push(pairs,pairpool,querydpM+i,genomedpM+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
11072 			    dynprogindex);
11073     } else {
11074       pairs = Pairpool_push(pairs,pairpool,querydpM+i,genomedpM+i,c1,MISMATCH_COMP,c2,c2_alt,
11075 			    dynprogindex);
11076     }
11077   }
11078 
11079   return pairs;
11080 }
11081 #endif
11082 
11083 
11084 #ifndef GSNAP
11085 static List_T
traverse_dual_break(List_T pairs,List_T * path,Pair_T leftpair,Pair_T rightpair,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,Pairpool_T pairpool,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,int * dynprogindex)11086 traverse_dual_break (List_T pairs, List_T *path, Pair_T leftpair, Pair_T rightpair,
11087 		     Univcoord_T chroffset, Univcoord_T chrhigh,
11088 #ifdef PMAP
11089 		     char *queryaaseq_ptr,
11090 #endif
11091 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
11092 		     int cdna_direction, bool watsonp, int genestrand, Pairpool_T pairpool, int maxpeelback,
11093 		     Oligoindex_array_T oligoindices_minor,
11094 		     Diagpool_T diagpool, Cellpool_T cellpool, int *dynprogindex) {
11095   List_T gappairs, peeled_pairs = NULL, peeled_path = NULL, q, r;
11096   int querydp5, querydp3;
11097   Chrpos_T genomedp5, genomedp3;
11098   Univcoord_T best_splicepos5, best_splicepos3, best_splicepos5_with_canonical, best_splicepos3_with_canonical,
11099     splicepos5, splicepos3;
11100   bool protectedp;
11101   int n_peeled_indels;
11102   Pair_T firstpair, lastpair;
11103   Chrpos_T chrstart, chrend;
11104 
11105   Univcoordlist_T splice_positions_5, splice_positions_3, a, b;
11106   Doublelist_T spliceprobs5, spliceprobs3, x, y;
11107   Intlist_T hits = NULL, p;
11108   int best_middlelength, best_middlelength_with_canonical, middlelength;
11109   int best_candidate, best_candidate_with_canonical, candidate;
11110   Chrpos_T splicesitepos;
11111   double bestprob = 0.0, bestprob_with_canonical = 0.0, prob1, prob2, prob3, prob4;
11112   char intron1, intron2, intron3, intron4;
11113   char c1_alt, c2_alt, c3_alt, c4_alt;
11114   int nloops;
11115 
11116   debug14(printf("\nTRAVERSE_DUAL_BREAK\n"));
11117 
11118   /* First, try to find a microexon */
11119   find_dual_break_spliceends(*path,pairs,&spliceprobs5,&spliceprobs3,&splice_positions_5,&splice_positions_3,
11120 			     cdna_direction,watsonp,chroffset,chrhigh);
11121 
11122   if (cdna_direction > 0) {
11123     intron1 = 'G'; intron2 = 'T'; intron3 = 'A'; intron4 = 'G';
11124   } else if (cdna_direction < 0) {
11125     intron1 = 'C'; intron2 = 'T'; intron3 = 'A'; intron4 = 'C';
11126   }
11127 
11128 
11129   debug(printf("Number of 5' splice positions: %d\n",Univcoordlist_length(splice_positions_5)));
11130   debug(printf("Number of 3' splice positions: %d\n",Univcoordlist_length(splice_positions_3)));
11131   nloops = Univcoordlist_length(splice_positions_5) * Univcoordlist_length(splice_positions_3);
11132 
11133   if (nloops < MAX_MICROEXON_LOOPS) {
11134     for (a = splice_positions_5, x = spliceprobs5; a != NULL;
11135 	 a = Univcoordlist_next(a), x = Doublelist_next(x)) {
11136       prob1 = Doublelist_head(x);
11137       splicepos5 = Univcoordlist_head(a);
11138 
11139       q = *path;
11140       while (q != NULL && ((Pair_T) q->first)->genomepos > splicepos5) {
11141 	q = List_next(q);
11142       }
11143       if (q == NULL) {
11144 	leftpair = (Pair_T) NULL;
11145       } else {
11146 	leftpair = (Pair_T) q->first;
11147 	querydp5 = leftpair->querypos + 1;
11148 	genomedp5 = leftpair->genomepos + 1;
11149       }
11150 
11151       for (b = splice_positions_3, y = spliceprobs3; b != NULL;
11152 	   b = Univcoordlist_next(b), y = Doublelist_next(y)) {
11153 	prob4 = Doublelist_head(y);
11154 	splicepos3 = Univcoordlist_head(b);
11155 
11156 	r = pairs;
11157 	while (r != NULL && ((Pair_T) r->first)->genomepos < splicepos3) {
11158 	  r = List_next(r);
11159 	}
11160 	if (r == NULL) {
11161 	  rightpair = (Pair_T) NULL;
11162 	} else {
11163 	  rightpair = (Pair_T) r->first;
11164 	  querydp3 = rightpair->querypos - 1;
11165 	  genomedp3 = rightpair->genomepos - 1;
11166 	}
11167 
11168 	if (leftpair != NULL && rightpair != NULL &&
11169 	    (middlelength = querydp3 - querydp5 + 1) > MIN_MICROEXON_LENGTH) {
11170 	  debug(printf("middlelength %d = %d - %d + 1\n",middlelength,querydp3,querydp5));
11171 	  hits = BoyerMoore_nt(&(queryuc_ptr[querydp5]),/*querylen*/middlelength,
11172 			       /*textleft*/genomedp5,/*textlen*/genomedp3 - genomedp5 + 1,
11173 			       chroffset,chrhigh,watsonp);
11174 	  for (p = hits; p != NULL; p = Intlist_next(p)) {
11175 	    candidate = genomedp5 + Intlist_head(p);
11176 
11177 	    /* Not handling known splice sites yet */
11178 	    if (watsonp == true) {
11179 	      if (cdna_direction > 0) {
11180 		splicesitepos = chroffset + (candidate-1) + 1;
11181 		prob2 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
11182 		splicesitepos = chroffset + candidate+middlelength;
11183 		prob3 = Maxent_hr_donor_prob(splicesitepos,chroffset);
11184 	      } else {
11185 		splicesitepos = chroffset + (candidate-1) + 1;
11186 		prob2 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
11187 		splicesitepos = chroffset + candidate+middlelength;
11188 		prob3 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
11189 	      }
11190 	    } else {
11191 	      if (cdna_direction > 0) {
11192 		splicesitepos = chrhigh - (candidate-1);
11193 		prob2 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
11194 		splicesitepos = chrhigh - (candidate+middlelength) + 1;
11195 		prob3 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
11196 	      } else {
11197 		splicesitepos = chrhigh - (candidate-1);
11198 		prob2 = Maxent_hr_donor_prob(splicesitepos,chroffset);
11199 		splicesitepos = chrhigh - (candidate+middlelength) + 1;
11200 		prob3 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
11201 	      }
11202 	    }
11203 
11204 	    debug13(printf("end probabilities: prob1 = %f, prob4 = %f, microexon probabilities: prob2 = %f, prob3 = %f\n",prob1,prob4,prob2,prob3));
11205 	    if (prob1 + prob2 + prob3 + prob4 > bestprob) {
11206 	      best_splicepos5 = splicepos5;
11207 	      best_candidate = candidate;
11208 	      best_middlelength = middlelength;
11209 	      best_splicepos3 = splicepos3;
11210 	      bestprob = prob1 + prob2 + prob3 + prob4;
11211 	    }
11212 
11213 	    debug(printf("candidate: at %u\n",candidate));
11214 	    debug(printf("intron3 %c\n",get_genomic_nt(&c3_alt,candidate-2,chroffset,chrhigh,watsonp)));
11215 	    debug(printf("intron4 %c\n",get_genomic_nt(&c4_alt,candidate-1,chroffset,chrhigh,watsonp)));
11216 	    debug(printf("intron1 %c\n",get_genomic_nt(&c1_alt,candidate+middlelength,chroffset,chrhigh,watsonp)));
11217 	    debug(printf("intron2 %c\n",get_genomic_nt(&c2_alt,candidate+middlelength+1,chroffset,chrhigh,watsonp)));
11218 
11219 	    if (/*genomicuc[candidate - 2]*/ get_genomic_nt(&c3_alt,candidate-2,chroffset,chrhigh,watsonp) == intron3 &&
11220 		/*genomicuc[candidate - 1]*/ get_genomic_nt(&c4_alt,candidate-1,chroffset,chrhigh,watsonp)  == intron4 &&
11221 		/*genomicuc[candidate + middlelength]*/ get_genomic_nt(&c1_alt,candidate+middlelength,chroffset,chrhigh,watsonp) == intron1 &&
11222 		/*genomicuc[candidate + middlelength + 1]*/ get_genomic_nt(&c2_alt,candidate+middlelength+1,chroffset,chrhigh,watsonp) == intron2) {
11223 	      debug(printf("  Canonical microexon at %d >>> %d..%d >>> %d\n",genomedp5,candidate,candidate+middlelength,genomedp3));
11224 	      if (prob1 + prob2 + prob3 + prob4 > bestprob_with_canonical) {
11225 		best_splicepos5_with_canonical = splicepos5;
11226 		best_candidate_with_canonical = candidate;
11227 		best_middlelength_with_canonical = middlelength;
11228 		best_splicepos3_with_canonical = splicepos3;
11229 		bestprob_with_canonical = prob1 + prob2 + prob3 + prob4;
11230 	      }
11231 	    }
11232 	  }
11233 
11234 	  Intlist_free(&hits);
11235 	}
11236       }
11237     }
11238   }
11239 
11240   debug13(printf("best prob is %f\n",bestprob));
11241   if (bestprob > 3.0) {
11242     while ((*path) != NULL && ((Pair_T) (*path)->first)->genomepos > best_splicepos5) {
11243       *path = List_next(*path);
11244     }
11245     while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < best_splicepos3) {
11246       pairs = List_next(pairs);
11247     }
11248     leftpair = (Pair_T) (*path)->first;
11249     querydp5 = leftpair->querypos + 1;
11250 
11251     debug13(printf("Making microexon pairs with splicepos5 %u, candidate %u, middlelength %d, splicepos3 %u\n",
11252 		   best_splicepos5,best_candidate,best_middlelength,best_splicepos3));
11253     *path = add_microexon_pairs(*path,/*querydpM*/querydp5,/*genomedpM*/best_candidate,
11254 				/*lengthM*/best_middlelength,queryseq_ptr,queryuc_ptr,
11255 				chroffset,chrhigh,watsonp,genestrand,pairpool,*dynprogindex);
11256     *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
11257 
11258     Univcoordlist_free(&splice_positions_5);
11259     Univcoordlist_free(&splice_positions_3);
11260     Doublelist_free(&spliceprobs3);
11261     Doublelist_free(&spliceprobs5);
11262 
11263     return pairs;
11264 
11265   } else if (bestprob_with_canonical > 0.0) {
11266     while ((*path) != NULL && ((Pair_T) (*path)->first)->genomepos > best_splicepos5_with_canonical) {
11267       *path = List_next(*path);
11268     }
11269     while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < best_splicepos3_with_canonical) {
11270       pairs = List_next(pairs);
11271     }
11272     leftpair = (Pair_T) (*path)->first;
11273     querydp5 = leftpair->querypos + 1;
11274 
11275     debug13(printf("Making microexon pairs with splicepos5 %u, candidate %u, middlelength %d, splicepos3 %u\n",
11276 		   best_splicepos5_with_canonical,best_candidate_with_canonical,best_middlelength_with_canonical,best_splicepos3_with_canonical));
11277     *path = add_microexon_pairs(*path,/*querydpM*/querydp5,/*genomedpM*/best_candidate_with_canonical,
11278 				/*lengthM*/best_middlelength_with_canonical,queryseq_ptr,queryuc_ptr,
11279 				chroffset,chrhigh,watsonp,genestrand,pairpool,*dynprogindex);
11280     *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
11281 
11282     Univcoordlist_free(&splice_positions_5);
11283     Univcoordlist_free(&splice_positions_3);
11284     Doublelist_free(&spliceprobs3);
11285     Doublelist_free(&spliceprobs5);
11286 
11287     return pairs;
11288 
11289   } else {
11290     Univcoordlist_free(&splice_positions_5);
11291     Univcoordlist_free(&splice_positions_3);
11292     Doublelist_free(&spliceprobs3);
11293     Doublelist_free(&spliceprobs5);
11294   }
11295 
11296 
11297 
11298   /* Try to solve without a microexon */
11299   if (*path == NULL) {
11300     leftpair = (Pair_T) NULL;
11301   } else {
11302     leftpair = (Pair_T) (*path)->first;
11303   }
11304 
11305   if (pairs == NULL) {
11306     rightpair = (Pair_T) NULL;
11307   } else {
11308     rightpair = (Pair_T) pairs->first;
11309   }
11310 
11311   if (leftpair != NULL && rightpair != NULL) {
11312     querydp5 = leftpair->querypos + 1;
11313     genomedp5 = leftpair->genomepos + 1;
11314     /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
11315     /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
11316     querydp3 = rightpair->querypos - 1;
11317     genomedp3 = rightpair->genomepos - 1;
11318   } else if (leftpair == NULL) {
11319     querydp5 = 0;
11320     genomedp5 = rightpair->genomepos - rightpair->querypos - 100;
11321     querydp3 = rightpair->querypos - 1;
11322     genomedp3 = rightpair->genomepos - 1;
11323   } else if (rightpair == NULL) {
11324     querydp5 = leftpair->querypos + 1;
11325     genomedp5 = leftpair->genomepos + 1;
11326     /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
11327     /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
11328     querydp3 = querylength - 1;
11329     genomedp3 = leftpair->genomepos + (querylength - leftpair->querypos) + 100;
11330   }
11331 
11332   /* Previously skipped this, but need to do at least a little
11333      peelback to avoid gaps at either end */
11334   protectedp = false;
11335   pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
11336 			 maxpeelback,/*stop_at_indels_p*/true);
11337   *path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,*path,&querydp5,&genomedp5,
11338 			maxpeelback,/*stop_at_indels_p*/true);
11339 #ifdef PMAP
11340   querydp3 /= 3;
11341   querydp5 /= 3;
11342 #endif
11343 
11344   debug14(printf("genome %d..%d, query %d..%d\n",genomedp5,genomedp3,querydp5,querydp3));
11345 #ifdef EXTRACT_GENOMICSEG
11346   debug14(printf("genome %.*s\n",genomedp3-genomedp5+1,&(genomicseg_ptr[genomedp5])));
11347 #endif
11348 #ifdef PMAP
11349   debug14(printf("query %.*s\n",querydp3-querydp5+1,&(queryaaseq_ptr[querydp5])));
11350 #else
11351   debug14(printf("query %.*s\n",querydp3-querydp5+1,&(queryseq_ptr[querydp5])));
11352 #endif
11353 
11354   if (watsonp) {
11355     chrstart = genomedp5;
11356     chrend = genomedp3;
11357   } else {
11358     chrstart = (chrhigh - chroffset) - genomedp3;
11359     chrend = (chrhigh - chroffset) - genomedp5;
11360   }
11361 
11362   /* chrend can be equal to chrstart - 1 if peelback fails on both ends */
11363   assert(chrend + 1 >= chrstart);
11364   debug14(printf("Starting stage2 with chrstart %u, chrend %u, watsonp %d\n",
11365 		 chrstart,chrend,watsonp));
11366   gappairs = Stage2_compute_one(
11367 #ifdef PMAP
11368 				  &(queryaaseq_ptr[querydp5]),&(queryaaseq_ptr[querydp5]),
11369 				  /*querylength*/querydp3-querydp5+1,/*query_offset*/querydp5*3,
11370 #else
11371 				  &(queryseq_ptr[querydp5]),&(queryuc_ptr[querydp5]),
11372 				  /*querylength*/querydp3-querydp5+1,/*query_offset*/querydp5,
11373 #endif
11374 				  chrstart,chrend,chroffset,chrhigh,/*plusp*/watsonp,genestrand,
11375 
11376 				  oligoindices_minor,pairpool,diagpool,cellpool,
11377 				  /*localp should be false*/true,/*skip_repetitive_p*/false,
11378 				  /*favor_right_p*/false,/*debug_graphic_p*/false);
11379 
11380   debug14(printf("Internal stage2 result:\n"));
11381   debug14(Pair_dump_list(gappairs,true));
11382 
11383   if (gappairs == NULL) {
11384     pairs = Pairpool_transfer(pairs,peeled_pairs);
11385     *path = Pairpool_transfer(*path,peeled_path);
11386     if (*path != NULL && pairs != NULL) {
11387       /* Do not put a gap at the end of the alignment */
11388       pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/UNKNOWNJUMP,/*genomejump*/UNKNOWNJUMP,
11389 				      /*leftpair*/(*path)->first,/*rightpair*/pairs->first,/*knownp*/false);
11390     }
11391 
11392   } else {
11393     lastpair = (Pair_T) gappairs->first;
11394     firstpair = (Pair_T) List_last_value(gappairs);
11395     debug14(printf("gappairs goes from %d to %d\n",firstpair->querypos,lastpair->querypos));
11396     if (1 || (firstpair->querypos == querydp5 && lastpair->querypos == querydp3)) {
11397       /* Note: We have to take this branch, otherwise we get unexpected comp errors */
11398       /* fprintf(stderr,"%d..%d .. %d..%d\n",querydp5,firstpair->querypos,lastpair->querypos,querydp3); */
11399       debug14(printf("  => entire query sequence bridged or not, but taking it regardless\n"));
11400       pairs = Pairpool_transfer(pairs,gappairs);
11401 
11402     } else {
11403       debug14(printf("  => entire query sequence not bridged, so abort\n"));
11404       pairs = Pairpool_transfer(pairs,peeled_pairs);
11405       *path = Pairpool_transfer(*path,peeled_path);
11406       if (*path != NULL && pairs != NULL) {
11407 	/* Do not put a gap at the end of the alignment */
11408 	pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/UNKNOWNJUMP,/*genomejump*/UNKNOWNJUMP,
11409 					/*leftpair*/(*path)->first,/*rightpair*/pairs->first,/*knownp*/false);
11410       }
11411     }
11412   }
11413 
11414   return pairs;
11415 }
11416 #endif
11417 
11418 
11419 /************************************************************************
11420  *   End of traversal functions
11421  ************************************************************************/
11422 
11423 static List_T
build_dual_breaks(bool * dual_break_p,int * dynprogindex_minor,int * dynprogindex_major,List_T path,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,double defect_rate,bool finalp,bool simplep)11424 build_dual_breaks (bool *dual_break_p, int *dynprogindex_minor, int *dynprogindex_major, List_T path,
11425 		   Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
11426 #ifdef PMAP
11427 		   char *queryaaseq_ptr,
11428 #endif
11429 		   char *queryseq_ptr, char *queryuc_ptr, int querylength,
11430 		   int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
11431 		   Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
11432 		   Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3, int maxpeelback,
11433 #ifndef GSNAP
11434 		   Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
11435 #endif
11436 		   double defect_rate, bool finalp, bool simplep) {
11437 
11438   List_T pairs = NULL;
11439   Pair_T pair, leftpair, rightpair;
11440   bool filledp, shiftp;
11441 
11442   debug(printf("Entered build_dual_breaks\n"));
11443   *dual_break_p = false;
11444   debug(Pair_dump_list(path,true));
11445 
11446   while (path != NULL) {
11447     /* pairptr = path; */
11448     /* path = Pairpool_pop(path,&pair); */
11449     pair = (Pair_T) path->first;
11450 
11451     /* Cannot rely on previous procedures to assign pair->comp value */
11452     if (pair->gapp == false /*|| pair->comp != DUALBREAK_COMP*/) {
11453 #ifdef WASTE
11454       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11455 #else
11456       pairs = List_transfer_one(pairs,&path);
11457 #endif
11458     } else if (path->rest == NULL || pairs == NULL) {
11459       debug(printf("Observed a gap at the end of the alignment, case 1\n"));
11460       path = Pairpool_pop(path,&pair);
11461 
11462     } else {
11463       /* pairptr = path; -- save */
11464       path = Pairpool_pop(path,&pair);
11465 
11466       leftpair = path->first;
11467       rightpair = pairs->first;
11468       if (leftpair->querypos < 0 || rightpair->querypos < 0) {
11469 	debug(printf("Observed a gap at the end of the alignment, case 2\n"));
11470       } else {
11471 	debug(printf("Observed a gap at %d..%d with queryjump = %d, genomejump = %d\n",
11472 		     leftpair->querypos,rightpair->querypos,pair->queryjump,pair->genomejump));
11473 
11474 	if (0 && finalp == true) {
11475 	  /* If genomejump is too large, this causes problem with allocation in Dynprog_T objects */
11476 	  debug(printf("  Final: solve as a single gap\n"));
11477 	  pairs = traverse_single_gap(&filledp,&(*dynprogindex_minor),pairs,&path,leftpair,rightpair,
11478 				      chroffset,chrhigh,
11479 				      queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
11480 				      jump_late_p,pairpool,dynprogM,last_genomedp5,last_genomedp3,
11481 				      maxpeelback,defect_rate,/*forcep*/true,/*finalp*/false);
11482 
11483 	} else if (pair->genomejump - pair->queryjump < SINGLESLEN &&
11484 		   pair->queryjump - pair->genomejump < SINGLESLEN) {
11485 	  debug(printf("  Can be solved as a single gap\n"));
11486 	  pairs = traverse_single_gap(&filledp,&(*dynprogindex_minor),pairs,&path,leftpair,rightpair,
11487 				      chroffset,chrhigh,
11488 				      queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
11489 				      jump_late_p,pairpool,dynprogM,last_genomedp5,last_genomedp3,
11490 				      maxpeelback,defect_rate,/*forcep*/true,/*finalp*/false);
11491 
11492 #ifdef GSNAP
11493 	} else {
11494 #else
11495 	} else if (pair->queryjump < MIN_STAGE2_FOR_DUALBREAK) {
11496 #endif
11497 	  debug(printf("  Too small for a dual break\n"));
11498 	  pairs = traverse_genome_gap(&filledp,&shiftp,&(*dynprogindex_minor),&(*dynprogindex_major),
11499 				      pairs,&path,leftpair,rightpair,chrnum,chroffset,chrhigh,
11500 				      queryseq_ptr,queryuc_ptr,querylength,
11501 				      cdna_direction,watsonp,genestrand,jump_late_p,
11502 				      pairpool,dynprogL,dynprogM,dynprogR,last_genomedp5,last_genomedp3,
11503 				      maxpeelback,defect_rate,/*finalp*/false,simplep);
11504 
11505 #ifndef GSNAP
11506 	} else {
11507 	  debug(printf("  Solving as a dual break\n"));
11508 	  *dual_break_p = true;
11509 	  pairs = traverse_dual_break(pairs,&path,leftpair,rightpair,chroffset,chrhigh,
11510 #ifdef PMAP
11511 				      queryaaseq_ptr,
11512 #endif
11513 				      queryseq_ptr,queryuc_ptr,querylength,cdna_direction,watsonp,genestrand,
11514 				      pairpool,maxpeelback,oligoindices_minor,
11515 				      diagpool,cellpool,&(*dynprogindex_major));
11516 #endif
11517 	}
11518       }
11519     }
11520   }
11521 
11522   debug(printf("After build_dual_breaks:\n"));
11523   debug(Pair_dump_list(pairs,true));
11524 
11525   return pairs;
11526 }
11527 
11528 
11529 
11530 /* Note: querypos is actually indexsize nt to the left of the last nt match.
11531 
11532 	||||||||********   X  X	 XX X	X
11533 	       ^	 <- queryjump->	 ^
11534 	    leftquerypos		 rightquerypos
11535 
11536 		<-     querydpspan     ->
11537 */
11538 static List_T
build_path_end3(bool * knownsplicep,int * ambig_end_length_3,Splicetype_T * ambig_splicetype_3,double * ambig_prob_3,bool * chop_exon_p,int * dynprogindex_minor,List_T path,Univcoord_T chroffset,Univcoord_T chrhigh,int querylength,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,char * queryseq_ptr,char * queryuc_ptr,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,double defect_rate,Pairpool_T pairpool,Dynprog_T dynprogL,bool extendp,Endalign_T endalign,bool forcep)11539 build_path_end3 (bool *knownsplicep, int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, double *ambig_prob_3,
11540 		 bool *chop_exon_p, int *dynprogindex_minor,
11541 		 List_T path, Univcoord_T chroffset, Univcoord_T chrhigh, int querylength,
11542 		 Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
11543 		 char *queryseq_ptr, char *queryuc_ptr,
11544 		 int cdna_direction, bool watsonp, int genestrand, bool jump_late_p, int maxpeelback,
11545 		 double defect_rate, Pairpool_T pairpool, Dynprog_T dynprogL,
11546 		 bool extendp, Endalign_T endalign, bool forcep) {
11547   List_T gappairs;
11548   Pair_T leftpair;
11549   /* int genomejump */
11550   int queryjump, rightquerypos;
11551   int finalscore;
11552 
11553 #if 0
11554   if (*ambig_end_length_3 > 0) {
11555     debug(printf("ambig_end_length_3 is %d, so returning path\n",*ambig_end_length_3));
11556     return path;
11557   } else {
11558     path = clean_path_end3_gap_indels(path);
11559   }
11560 #else
11561   /* Always want to clean indels at end */
11562   path = clean_path_end3_gap_indels(path);
11563   if (*ambig_end_length_3 > 0) {
11564     debug(printf("ambig_end_length_3 is %d, so returning path\n",*ambig_end_length_3));
11565     return path;
11566   }
11567 #endif
11568 
11569 
11570   *knownsplicep = false;
11571   if (path == NULL) {
11572     *ambig_end_length_3 = 0;
11573     *ambig_prob_3 = 0.0;
11574     return (List_T) NULL;
11575   } else {
11576     leftpair = path->first;
11577   }
11578   debug(printf("Stage 3 (dir %d): 3' end: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d\n",
11579 	       cdna_direction,leftpair->querypos,querylength,leftpair->genomepos));
11580   if (leftpair->querypos < 0) {
11581     *ambig_end_length_3 = 0;
11582     *ambig_prob_3 = 0.0;
11583     return (List_T) NULL;
11584     /* abort(); */
11585   }
11586 
11587   queryjump = querylength - leftpair->querypos - 1;
11588   /* genomejump = genomiclength - leftpair->genomepos - 1; */
11589   /* if (leftpair->cdna == ' ') queryjump++; -- For old dynamic programming */
11590   /* if (leftpair->genome == ' ') genomejump++; */
11591 
11592   /* Note difference with 5' case.  We use queryjump+1 here instead of queryjump and genomejump */
11593   /* Do use nullgap here.  Truncating back to entire exon can slow algorithm down significantly. */
11594   if (/* 0 && */ queryjump+1 > nullgap) {
11595     rightquerypos = leftpair->querypos + nullgap + 1;
11596     debug(printf("Since queryjump+1 %d > nullgap %d, setting rightquerypos %d = %d + %d + 1\n",
11597 		 queryjump+1,nullgap,rightquerypos,leftpair->querypos,nullgap));
11598   } else {
11599     rightquerypos = querylength;
11600   }
11601 
11602   if (extendp == true) {
11603     debug(printf("Running extend_ending3\n"));
11604     *chop_exon_p = false;
11605     gappairs = extend_ending3(&(*knownsplicep),&(*dynprogindex_minor),&finalscore,
11606 			      &(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
11607 			      &path,leftpair,rightquerypos,querylength,
11608 			      chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
11609 			      queryseq_ptr,queryuc_ptr,
11610 			      cdna_direction,watsonp,genestrand,jump_late_p,pairpool,dynprogL,maxpeelback,
11611 			      defect_rate,endalign,forcep);
11612   } else {
11613     /* Looks like we aren't calling this anymore */
11614     abort();
11615     debug(printf("Running distalmedial_ending3\n"));
11616     gappairs = distalmedial_ending3(&(*knownsplicep),&(*chop_exon_p),&(*dynprogindex_minor),
11617 				    &finalscore,&(*ambig_end_length_3),&(*ambig_prob_3),
11618 				    &path,leftpair,rightquerypos,chroffset,chrhigh,
11619 				    queryseq_ptr,queryuc_ptr,
11620 				    watsonp,genestrand,jump_late_p,pairpool,dynprogL,
11621 				    maxpeelback,defect_rate);
11622   }
11623 
11624   debug(printf("Gappairs from build_path_end3:\n"));
11625   debug(Pair_dump_list(gappairs,true));
11626 
11627   path = Pairpool_transfer(path,gappairs);
11628 
11629   debug(printf("Final result of build_path_end3:\n"));
11630   debug(Pair_dump_list(path,true));
11631   debug(printf("\n"));
11632 
11633   debug(printf("ambig_end_length_3 is %d\n",*ambig_end_length_3));
11634 
11635   return path;
11636 }
11637 
11638 
11639 /* Schematic:
11640 
11641        <- queryjump ->
11642        X  X  XX X   X ********||||||||
11643       ^		      ^
11644    leftquerypos	      rightquerypos
11645 
11646        <-    querydpspan    ->
11647 
11648 */
11649 static List_T
build_pairs_end5(bool * knownsplicep,int * ambig_end_length_5,Splicetype_T * ambig_splicetype_5,double * ambig_prob_5,bool * chop_exon_p,int * dynprogindex_minor,List_T pairs,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,char * queryseq_ptr,char * queryuc_ptr,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,double defect_rate,Pairpool_T pairpool,Dynprog_T dynprogR,bool extendp,Endalign_T endalign,bool forcep)11650 build_pairs_end5 (bool *knownsplicep, int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
11651 		  bool *chop_exon_p, int *dynprogindex_minor, List_T pairs,
11652 		  Univcoord_T chroffset, Univcoord_T chrhigh,
11653 		  Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
11654 		  char *queryseq_ptr, char *queryuc_ptr,
11655 		  int cdna_direction, bool watsonp, int genestrand, bool jump_late_p, int maxpeelback,
11656 		  double defect_rate, Pairpool_T pairpool, Dynprog_T dynprogR,
11657 		  bool extendp, Endalign_T endalign, bool forcep) {
11658   List_T gappairs;
11659   Pair_T rightpair;
11660   int queryjump, leftquerypos;
11661   int finalscore;
11662   /* int genomejump */
11663 
11664 #if 0
11665   if (*ambig_end_length_5 > 0) {
11666     debug(printf("ambig_end_length_5 is %d, so returning pairs\n",*ambig_end_length_5));
11667     return pairs;
11668   } else {
11669     pairs = clean_pairs_end5_gap_indels(pairs);
11670   }
11671 #else
11672   /* Always want to clean indels at end */
11673   pairs = clean_pairs_end5_gap_indels(pairs);
11674   if (*ambig_end_length_5 > 0) {
11675     debug(printf("ambig_end_length_5 is %d, so returning pairs\n",*ambig_end_length_5));
11676     return pairs;
11677   }
11678 #endif
11679 
11680 
11681   *knownsplicep = false;
11682   if (pairs == NULL) {
11683     *ambig_end_length_5 = 0;
11684     *ambig_prob_5 = 0.0;
11685     return (List_T) NULL;
11686   } else {
11687     rightpair = pairs->first;
11688   }
11689   debug(printf("Stage 3 (dir %d): 5' end: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d\n",
11690 	       cdna_direction,-1,rightpair->querypos,-1));
11691   if (rightpair->querypos < 0) {
11692     *ambig_end_length_5 = 0;
11693     *ambig_prob_5 = 0.0;
11694     return (List_T) NULL;
11695     /* abort(); */
11696   }
11697 
11698   queryjump = rightpair->querypos; /* - leftquerypos (-1) - 1 */
11699   /* genomejump = rightpair->genomepos; */  /* - leftgenomepos (-1) - 1 */
11700 
11701   /* Note difference with 3' case.  We use queryjump here instead of queryjump+1 */
11702   /* Do use nullgap here.  Truncating back to entire exon can slow algorithm significantly. */
11703   if (/*0 && */ queryjump > nullgap) {
11704     leftquerypos = rightpair->querypos - nullgap - 1;
11705   } else {
11706     leftquerypos = -1;
11707   }
11708 
11709   if (extendp == true) {
11710     debug(printf("Running extend_ending5\n"));
11711     *chop_exon_p = false;
11712     gappairs = extend_ending5(&(*knownsplicep),&(*dynprogindex_minor),
11713 			      &finalscore,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
11714 			      &pairs,leftquerypos,rightpair,
11715 			      chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
11716 			      queryseq_ptr,queryuc_ptr,
11717 			      cdna_direction,watsonp,genestrand,jump_late_p,pairpool,dynprogR,
11718 			      maxpeelback,defect_rate,endalign,forcep);
11719   } else {
11720     /* Looks like we aren't calling this anymore */
11721     abort();
11722     debug(printf("Running distalmedial_ending5\n"));
11723     gappairs = distalmedial_ending5(&(*knownsplicep),&(*chop_exon_p),&(*dynprogindex_minor),
11724 				    &finalscore,&(*ambig_end_length_5),&(*ambig_prob_5),
11725 				    &pairs,leftquerypos,rightpair,chroffset,chrhigh,
11726 				    queryseq_ptr,queryuc_ptr,
11727 				    watsonp,genestrand,jump_late_p,pairpool,dynprogR,
11728 				    maxpeelback,defect_rate);
11729   }
11730 
11731   debug(printf("Gappairs from build_pairs_end5:\n"));
11732   debug(Pair_dump_list(gappairs,true));
11733 
11734   pairs = Pairpool_transfer(pairs,gappairs);
11735 
11736   debug(printf("Final result of build_pairs_end5:\n"));
11737   debug(Pair_dump_list(pairs,true));
11738   debug(printf("\n"));
11739 
11740   debug(printf("ambig_end_length_5 is %d\n",*ambig_end_length_5));
11741 
11742   return pairs;
11743 }
11744 
11745 
11746 /* maxsize can be either 3 or nullgap */
11747 static List_T
build_pairs_singles(int * dynprogindex,List_T path,int maxsize,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,double defect_rate,Pairpool_T pairpool,Dynprog_T dynprogM,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3,bool forcep,bool finalp)11748 build_pairs_singles (int *dynprogindex, List_T path, int maxsize,
11749 		     Univcoord_T chroffset, Univcoord_T chrhigh,
11750 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
11751 		     bool watsonp, int genestrand, bool jump_late_p, int maxpeelback, double defect_rate,
11752 		     Pairpool_T pairpool, Dynprog_T dynprogM,
11753 		     Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3, bool forcep, bool finalp) {
11754   List_T pairs = NULL, pairptr;
11755   Pair_T pair, leftpair, rightpair;
11756   bool filledp;
11757 
11758   debug(printf("\n** Starting build_pairs_singles with maxsize %d\n",maxsize));
11759 
11760   /* Remove gaps at beginning */
11761   while (path != NULL && ((Pair_T) path->first)->gapp == true) {
11762     path = Pairpool_pop(path,&pair);
11763   }
11764 
11765   while (path != NULL && path->rest != NULL) {
11766     /* pairptr = path; */
11767     /* path = Pairpool_pop(path,&pair); */
11768     pair = (Pair_T) path->first;
11769     if (pair->gapp == false) {
11770 #ifdef WASTE
11771       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11772 #else
11773       pairs = List_transfer_one(pairs,&path);
11774 #endif
11775 
11776     } else if (pair->queryjump > maxsize) {
11777       /* Large gap.  Do nothing */
11778 #ifdef WASTE
11779       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11780 #else
11781       pairs = List_transfer_one(pairs,&path);
11782 #endif
11783 
11784     } else if (pair->queryjump > pair->genomejump + EXTRAQUERYGAP) {
11785       /* cDNA insertion.  Do nothing */
11786 #ifdef WASTE
11787       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11788 #else
11789       pairs = List_transfer_one(pairs,&path);
11790 #endif
11791 
11792     } else if (pair->genomejump > pair->queryjump + SINGLESLEN) {
11793       /* Intron.  Do nothing */
11794 #ifdef WASTE
11795       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11796 #else
11797       pairs = List_transfer_one(pairs,&path);
11798 #endif
11799 
11800     } else if (path->rest == NULL || pairs == NULL) {
11801       fprintf(stderr,"Single gap at beginning or end of alignment\n");
11802       abort();
11803 
11804     } else {
11805       /* Guarantees: queryjump <= nullgap && genomejump < queryjump - EXTRAQUERYGAP &&
11806 	 genomejump <= queryjump + MININTRONLEN, meaning that score matrix is nearly square */
11807       pairptr = path;		/* save */
11808       path = Pairpool_pop(path,&pair);
11809 
11810       leftpair = path->first;
11811       rightpair = pairs->first;
11812 
11813       debug(printf("Stage 3: Traversing single gap small: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d, rightgenomepos = %d, queryjump %d, genomejump %d\n",
11814 		   leftpair->querypos,rightpair->querypos,leftpair->genomepos,rightpair->genomepos,pair->queryjump,pair->genomejump));
11815       pairs = traverse_single_gap(&filledp,&(*dynprogindex),pairs,&path,leftpair,rightpair,
11816 				  chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
11817 				  jump_late_p,pairpool,dynprogM,last_genomedp5,last_genomedp3,
11818 				  maxpeelback,defect_rate,forcep,finalp);
11819       /* (old comment:) forcep needs to be true here to avoid subsequent anomalies in building dualintrons, e.g., XM_376610.2_mRNA on 7:127885572..127888991 */
11820       if (filledp == true) {
11821 	/* Discard the gap */
11822 	debug(printf("Discarding gap ");
11823 	      Pair_dump_one(pair,true);
11824 	      printf("\n"));
11825       } else {
11826 	/* Replace the gap */
11827 	debug(printf("Replacing gap ");
11828 	      Pair_dump_one(pair,true);
11829 	      printf("\n"));
11830 #ifdef WASTE
11831 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
11832 #else
11833 	pairs = List_push_existing(pairs,pairptr);
11834 #endif
11835 
11836       }
11837     }
11838   }
11839 
11840   /* Handle last entry if not a gap */
11841   if (path != NULL && ((Pair_T) path->first)->gapp == false) {
11842     pair = (Pair_T) path->first;
11843     pairs = List_transfer_one(pairs,&path);
11844   }
11845 
11846   return pairs;
11847 }
11848 
11849 
11850 static List_T
build_pairs_dualintrons(int * dynprogindex,List_T path,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,char * queryseq_ptr,char * queryuc_ptr,int querylength,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,double defect_rate,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3)11851 build_pairs_dualintrons (int *dynprogindex, List_T path,
11852 			 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
11853 			 char *queryseq_ptr, char *queryuc_ptr, int querylength,
11854 			 int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
11855 			 int maxpeelback, double defect_rate,
11856 			 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogR,
11857 			 Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3) {
11858   List_T pairs = NULL, midexon_pairs = NULL, pairptr;
11859   Pair_T pair, leftpair, midleftpair, midpair, midrightpair, rightpair;
11860   int midquerypos;
11861   Chrpos_T midgenomepos;
11862   bool left_end_intron_p = false, right_end_intron_p, exonp;
11863 
11864   debug(printf("\n** Starting build_pairs_dualintrons\n"));
11865   debug(Pair_dump_list(path,true));
11866 
11867   /* Remove gaps at beginning */
11868   while (path != NULL && ((Pair_T) path->first)->gapp == true) {
11869     path = Pairpool_pop(path,&pair);
11870   }
11871 
11872   while (path != NULL && path->rest != NULL) {
11873     /* pairptr = path; */
11874     /* path = Pairpool_pop(path,&pair); */
11875     pair = (Pair_T) path->first;
11876 
11877     if (pair->gapp == false) {
11878 #ifdef WASTE
11879       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11880 #else
11881       pairs = List_transfer_one(pairs,&path);
11882 #endif
11883 
11884     } else if (pair->queryjump > nullgap) {
11885       /* Large gap.  Do nothing */
11886 #ifdef WASTE
11887       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11888 #else
11889       pairs = List_transfer_one(pairs,&path);
11890 #endif
11891 
11892     } else if (pair->queryjump > pair->genomejump + EXTRAQUERYGAP) {
11893       /* cDNA insertion.  Do nothing */
11894 #ifdef WASTE
11895       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11896 #else
11897       pairs = List_transfer_one(pairs,&path);
11898 #endif
11899 
11900     } else if (pair->genomejump <= pair->queryjump + MININTRONLEN) {
11901       /* Single gap.  Do nothing */
11902 #ifdef WASTE
11903       pairs = Pairpool_push_existing(pairs,pairpool,pair);
11904 #else
11905       pairs = List_transfer_one(pairs,&path);
11906 #endif
11907 
11908     } else {
11909       pairptr = path;		/* save */
11910       path = Pairpool_pop(path,&pair);
11911 
11912       midpair = path->first;
11913       if (midpair->shortexonp == false) {
11914 	/* Long exon; do nothing */
11915 	debug(printf("I see a long exon at %d...do nothing\n",midpair->querypos));
11916 #ifdef WASTE
11917 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
11918 #else
11919 	pairs = List_push_existing(pairs,pairptr);
11920 #endif
11921 
11922       } else {
11923 	/* Short exon */
11924 	debug(printf("I see a short exon at %d...crossing\n",midpair->querypos));
11925 	right_end_intron_p = pair->end_intron_p;
11926 #ifdef WASTE
11927 	path = Pairpool_pop(path,&midpair);
11928 	midexon_pairs = Pairpool_push_existing(NULL,pairpool,midpair);
11929 #else
11930 	midpair = path->first;
11931 	midexon_pairs = List_transfer_one(NULL,&path);
11932 #endif
11933 	midrightpair = midpair;
11934 
11935 	exonp = true;
11936 	while (path != NULL && exonp) {
11937 #ifdef WASTE
11938 	  path = Pairpool_pop(path,&midpair);
11939 #else
11940 	  midpair = path->first;
11941 #endif
11942 	  if (midpair->gapp == true) {
11943 	    left_end_intron_p = midpair->end_intron_p;
11944 	    exonp = false;
11945 #ifdef WASTE
11946 #else
11947 	    path = path->rest;
11948 #endif
11949 	  } else {
11950 #ifdef WASTE
11951 	    midexon_pairs = Pairpool_push_existing(midexon_pairs,pairpool,midpair);
11952 #else
11953 	    midexon_pairs = List_transfer_one(midexon_pairs,&path);
11954 #endif
11955 	  }
11956 	}
11957 	debug(printf("Finished crossing a short exon\n"));
11958 
11959 	if (path == NULL) {
11960 	  /* Short exon is the first one.  Process it. */
11961 	  debug(printf("path is NULL so Pairpool_push_existing\n"));
11962 #ifdef WASTE
11963 	  pairs = Pairpool_push_existing(pairs,pairpool,pair); /* initial gap */
11964 #else
11965 	  pairs = List_push_existing(pairs,pairptr);
11966 #endif
11967 	  pairs = Pairpool_transfer(pairs,List_reverse(midexon_pairs));
11968 
11969 	} else {
11970 	  /* Perform dual intron gap */
11971 	  debug(printf("path is not NULL, so doing dual intron gap\n"));
11972 	  midleftpair = midexon_pairs->first;
11973 	  midgenomepos = (midleftpair->genomepos + midrightpair->genomepos)/2;
11974 	  midquerypos = midrightpair->querypos - (midrightpair->genomepos - midgenomepos);
11975 	  leftpair = path->first;
11976 	  rightpair = pairs->first;
11977 	  if (midquerypos <= leftpair->querypos || midquerypos >= rightpair->querypos) {
11978 	    /* Skip */
11979 
11980 	  } else {
11981 	    debug(printf("Stage 3 (dir %d): Traversing dual intron gap: leftquerypos = %d, midquerypos = %d, rightquerypos = %d, leftgenomepos = %d, midgenomepos = %d, rightgenomepos = %d\n",
11982 			 cdna_direction,leftpair->querypos,midquerypos,rightpair->querypos,
11983 			 leftpair->genomepos,midgenomepos,rightpair->genomepos));
11984 
11985 	    pairs = traverse_dual_genome_gap(&(*dynprogindex),pairs,&path,leftpair,rightpair,
11986 					     left_end_intron_p,right_end_intron_p,
11987 					     chrnum,chroffset,chrhigh,midquerypos,midgenomepos,
11988 					     queryseq_ptr,queryuc_ptr,querylength,cdna_direction,watsonp,genestrand,
11989 					     jump_late_p,pairpool,dynprogL,dynprogR,last_genomedp5,last_genomedp3,
11990 					     maxpeelback,defect_rate,/*finalp*/false);
11991 	  }
11992 	}
11993       }
11994     }
11995   }
11996 
11997   /* Handle last entry if not a gap */
11998   if (path != NULL && ((Pair_T) path->first)->gapp == false) {
11999     pair = (Pair_T) path->first;
12000     pairs = List_transfer_one(pairs,&path);
12001   }
12002 
12003   return pairs;
12004 }
12005 
12006 
/* Walks path and builds the pairs list, trying to resolve each gap
   pair on the way.  Non-gap pairs are transferred from path to pairs
   unchanged.  A gap pair is classified by its queryjump and
   genomejump:

     queryjump > nullgap, genomejump < 16      => traverse_cdna_gap
     queryjump > nullgap, otherwise            => traverse_dual_break
                                                  (non-GSNAP builds only)
     finalp false and
       queryjump > genomejump + EXTRAQUERYGAP  => traverse_cdna_gap
     genomejump > queryjump + minintronlen     => traverse_genome_gap
     genomejump > queryjump + SINGLESLEN       => gap kept as is
     anything else                             => traverse_single_gap

   minintronlen is MININTRONLEN_FINAL when finalp is true, and
   MININTRONLEN otherwise.

   For the cdna, genome and single cases, the gap cell is popped from
   path before the traversal.  If the traversal reports filledp, the
   gap is dropped; otherwise the saved gap cell is pushed back onto
   pairs.

   *shiftp and *incompletep are set to false before the main loop and
   are then handed to traverse_genome_gap and traverse_cdna_gap,
   respectively.  The dynprogindex pointers, last_genomedp5 and
   last_genomedp3 are only passed through to the traversal routines.

   Returns pairs.  Presumably this is in the reverse order of path,
   since cells are moved one at a time from the head of path to the
   head of pairs -- verify against List_transfer_one. */
static List_T
build_pairs_introns (bool *shiftp, bool *incompletep,
		     int *dynprogindex_minor, int *dynprogindex_major, List_T path,
		     Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
#ifdef PMAP
		     char *queryaaseq_ptr,
#endif
		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
		     int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
		     int maxpeelback, double defect_rate,
		     Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
#ifndef GSNAP
		     Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
#endif
		     Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3, bool finalp, bool simplep) {
  List_T pairs = NULL, pairptr;
  Pair_T pair, leftpair, rightpair;
  bool filledp;
  int minintronlen;

  debug(printf("\n** Starting build_pairs_introns\n"));
  debug(Pair_dump_list(path,true));

  /* Threshold separating gaps sent to traverse_genome_gap from shorter ones */
  if (finalp == true) {
    minintronlen = MININTRONLEN_FINAL;
  } else {
    minintronlen = MININTRONLEN;
  }

  /* Remove gaps at beginning */
  while (path != NULL && ((Pair_T) path->first)->gapp == true) {
    path = Pairpool_pop(path,&pair);
  }

  *shiftp = *incompletep = false;
  /* Stops with one cell remaining, so a popped gap always has a left neighbor in path */
  while (path != NULL && path->rest != NULL) {
    /* pairptr = path; */
    /* path = Pairpool_pop(path,&pair); */
    pair = (Pair_T) path->first;
    if (pair->gapp == false) {
#ifdef WASTE
      pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
      pairs = List_transfer_one(pairs,&path);
#endif

    } else if (pair->queryjump > nullgap) {
      if (pair->genomejump < 16) {
	/* Not enough genome material to run stage 2 */
	pairptr = path;		/* save */
	path = Pairpool_pop(path,&pair);

	leftpair = path->first;
	rightpair = pairs->first;
	debug(printf("Stage 3 (dir %d): Traversing cDNA gap: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d, rightgenomepos = %d, queryjump %d, genomejump %d\n",
		     cdna_direction,leftpair->querypos,rightpair->querypos,leftpair->genomepos,rightpair->genomepos,pair->queryjump,pair->genomejump));
	pairs = traverse_cdna_gap(&filledp,&(*incompletep),&(*dynprogindex_minor),&(*dynprogindex_major),
				  pairs,&path,leftpair,rightpair,
				  chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
				  jump_late_p,pairpool,dynprogL,dynprogM,dynprogR,
				  last_genomedp5,last_genomedp3,maxpeelback,defect_rate,/*finalp*/true);

	if (filledp == true) {
	  /* Discard gap */
	  debug(printf("Discarding gap ");
		Pair_dump_one(pair,true);
		printf("\n"));
	} else {
	  /* Replace the gap */
	  debug(printf("Replacing gap ");
		Pair_dump_one(pair,true);
		printf("\n"));
#ifdef WASTE
	  pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
	  pairs = List_push_existing(pairs,pairptr);
#endif
	}

      } else {
	/* NOTE(review): the gap is not popped in this branch, so
	   leftpair below is the head of path, which at this point is
	   still the gap cell itself.  Also, when GSNAP is defined this
	   branch compiles to nothing and leaves path unchanged.
	   Confirm that traverse_dual_break expects this, and that this
	   case cannot be reached in GSNAP builds. */
#ifndef GSNAP
	/* Solve as dual break.  Should have already been done by build_dual_breaks */
	/* pairptr = path; */		/* save */
	/* path = Pairpool_pop(path,&pair); */
	leftpair = path->first;
	rightpair = pairs->first;
	pairs = traverse_dual_break(pairs,&path,leftpair,rightpair,chroffset,chrhigh,
#ifdef PMAP
				    queryaaseq_ptr,
#endif
				    queryseq_ptr,queryuc_ptr,querylength,cdna_direction,watsonp,genestrand,
				    pairpool,maxpeelback,oligoindices_minor,
				    diagpool,cellpool,&(*dynprogindex_major));
#endif
      }

    } else if (finalp == false && pair->queryjump > pair->genomejump + EXTRAQUERYGAP) {
      /* If finalp is true, then will need to solve as singles */
      pairptr = path;		/* save */
      path = Pairpool_pop(path,&pair);

      leftpair = path->first;
      rightpair = pairs->first;
      debug(printf("Stage 3 (dir %d): Traversing cDNA gap: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d, rightgenomepos = %d, queryjump %d, genomejump %d\n",
		   cdna_direction,leftpair->querypos,rightpair->querypos,leftpair->genomepos,rightpair->genomepos,pair->queryjump,pair->genomejump));
      pairs = traverse_cdna_gap(&filledp,&(*incompletep),&(*dynprogindex_minor),&(*dynprogindex_major),
				pairs,&path,leftpair,rightpair,
				chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
				jump_late_p,pairpool,dynprogL,dynprogM,dynprogR,
				last_genomedp5,last_genomedp3,maxpeelback,defect_rate,/*finalp*/true);

      if (filledp == true) {
	/* Discard gap */
	debug(printf("Discarding gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
      } else {
	/* Replace the gap */
	debug(printf("Replacing gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
#ifdef WASTE
	pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
	pairs = List_push_existing(pairs,pairptr);
#endif
      }

    } else if (pair->genomejump > pair->queryjump + minintronlen) {
      /* Previously was 2*MININTRONLEN, and comment said needed space for two introns */
      /* We will make the score matrices nearly square */
      pairptr = path;		/* save */
      path = Pairpool_pop(path,&pair);

      leftpair = path->first;
      rightpair = pairs->first;
      debug(printf("Stage 3 (dir %d): Traversing paired gap: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d, rightgenomepos = %d, queryjump %d, genomejump %d\n",
		   cdna_direction,leftpair->querypos,rightpair->querypos,leftpair->genomepos,rightpair->genomepos,pair->queryjump,pair->genomejump));
      /* fprintf(stderr,"donor prob %f, acceptor prob %f\n",pair->donor_prob,pair->acceptor_prob); */
      pairs = traverse_genome_gap(&filledp,&(*shiftp),&(*dynprogindex_minor),&(*dynprogindex_major),
				  pairs,&path,leftpair,rightpair,chrnum,chroffset,chrhigh,
				  queryseq_ptr,queryuc_ptr,querylength,
				  cdna_direction,watsonp,genestrand,jump_late_p,
				  pairpool,dynprogL,dynprogM,dynprogR,last_genomedp5,last_genomedp3,
				  maxpeelback,defect_rate,finalp,simplep);
      /* Previously had forcep == true, because previously thought that adding large gap is not a good solution */

      if (filledp == true) {
	/* Discard the gap */
	debug(printf("Discarding gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
      } else {
	/* Replace the gap */
	debug(printf("Replacing gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
#ifdef WASTE
	pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
	pairs = List_push_existing(pairs,pairptr);
#endif
      }

    } else if (pair->genomejump > pair->queryjump + SINGLESLEN) {
      /* Intron length shorter than MININTRONLEN_FINAL.  Just replace the gap */
      debug(printf("Short intron; not candidate for final calculation.  Replacing gap ");
	    Pair_dump_one(pair,true);
	    printf("\n"));
#ifdef WASTE
      pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
      pairs = List_transfer_one(pairs,&path);
#endif

    } else {
      /* Single gap; force fill */
      /* NOTE(review): the comment above says "force fill", but forcep
	 is passed as false in the call below -- confirm which one is
	 intended. */
      pairptr = path;		/* save */
      path = Pairpool_pop(path,&pair);

      leftpair = path->first;
      rightpair = pairs->first;
      debug(printf("Stage 3 (dir %d): Traversing single gap: leftquerypos = %d, rightquerypos = %d, leftgenomepos = %d, rightgenomepos = %d.  queryjump = %d, genomejump = %d\n",
		   cdna_direction,leftpair->querypos,rightpair->querypos,leftpair->genomepos,rightpair->genomepos,
		   pair->queryjump,pair->genomejump));
      pairs = traverse_single_gap(&filledp,&(*dynprogindex_minor),pairs,&path,leftpair,rightpair,
				  chroffset,chrhigh,
				  queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
				  jump_late_p,pairpool,dynprogM,last_genomedp5,last_genomedp3,
				  maxpeelback,defect_rate,/*forcep*/false,finalp);

      if (filledp == true) {
	/* Discard the gap */
	debug(printf("Discarding gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
      } else {
	/* Replace the gap */
	debug(printf("Replacing gap ");
	      Pair_dump_one(pair,true);
	      printf("\n"));
#ifdef WASTE
	pairs = Pairpool_push_existing(pairs,pairpool,pair);
#else
	pairs = List_push_existing(pairs,pairptr);
#endif
      }
    }
  }

  /* Handle last entry if not a gap */
  /* A gap left at the end of path is not transferred, so it is dropped from the result */
  if (path != NULL && ((Pair_T) path->first)->gapp == false) {
    pair = (Pair_T) path->first;
    pairs = List_transfer_one(pairs,&path);
  }

  debug(printf("\n** Finishing build_pairs_introns\n"));
  return pairs;
}
12226 
12227 
12228 static int
score_alignment(int * nmatches,int * nmismatches,int * nindels,int * indel_alignment_score,int * nsemicanonical,int * nnoncanonical,List_T pairs,int cdna_direction)12229 score_alignment (int *nmatches, int *nmismatches, int *nindels,
12230 #ifdef COMPLEX_DIRECTION
12231 		 int *indel_alignment_score,
12232 #endif
12233 		 int *nsemicanonical, int *nnoncanonical, List_T pairs, int cdna_direction) {
12234   int ncanonical;		/* Do not return this; use score_introns instead */
12235   int nunknowns, qopens, qindels, topens, tindels;
12236   double min_splice_prob;
12237 
12238   Pair_fracidentity(&(*nmatches),&nunknowns,&(*nmismatches),&qopens,&qindels,&topens,&tindels,
12239 		    &ncanonical,&(*nsemicanonical),&(*nnoncanonical),&min_splice_prob,
12240 		    pairs,cdna_direction);
12241   debug11(printf("%d matches, %d nmismatches, %d+%d qgaps, %d+%d tgaps => alignment_score is %d\n",
12242 		 *nmatches,*nmismatches,qopens,qindels,topens,tindels,
12243 		 MATCH*(*nmatches) + MISMATCH*(*nmismatches) + QOPEN*(qopens + qindels) + TOPEN*(topens + tindels)));
12244 
12245   debug(printf("%d matches, %d nmismatches, %d+%d qgaps, %d+%d tgaps => alignment_score is %d\n",
12246 	       *nmatches,*nmismatches,qopens,qindels,topens,tindels,
12247 	       MATCH*(*nmatches) + MISMATCH*(*nmismatches) + QOPEN*(qopens + qindels) + TOPEN*(topens + tindels)));
12248 
12249 #ifdef COMPLEX_DIRECTION
12250   *indel_alignment_score = QOPEN*(qopens + qindels) + TOPEN*(topens + tindels);
12251 #endif
12252 
12253   *nindels = qindels + tindels;
12254   return MATCH*(*nmatches) + MISMATCH*(*nmismatches) + QOPEN*(qopens + qindels) + TOPEN*(topens + tindels);
12255 }
12256 
12257 
12258 static List_T
score_introns(double * max_intron_score,double * avg_donor_score,double * avg_acceptor_score,int * nknown,int * ncanonical,int * nbadintrons,List_T path,int cdna_direction,bool watsonp,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Pairpool_T pairpool)12259 score_introns (double *max_intron_score, double *avg_donor_score, double *avg_acceptor_score,
12260 	       int *nknown, int *ncanonical, int *nbadintrons, List_T path, int cdna_direction, bool watsonp,
12261 	       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh
12262 #ifdef WASTE
12263 	       , Pairpool_T pairpool
12264 #endif
12265 	       ) {
12266   List_T pairs = NULL, pairptr, p;
12267   Pair_T pair, leftpair, rightpair;
12268   Univcoord_T splicesitepos;
12269   int minintronlen;
12270   double donor_score, acceptor_score;
12271   int nintrons = 0;
12272   int total_matches, total_denominator;
12273   int max_neighborhood_score, neighborhood_score, neighborhood_length;
12274 #if 0
12275   char gbuffer1[MAXENT_MAXLENGTH];
12276 #endif
12277 
12278   debug11(printf("\n** Starting score_introns with cdna_direction %d\n",cdna_direction));
12279   debug11(Pair_dump_list(path,true));
12280 
12281   minintronlen = MININTRONLEN_FINAL;
12282 
12283   *max_intron_score = *avg_donor_score = *avg_acceptor_score = 0.0;
12284   *nknown = *ncanonical = *nbadintrons = 0;
12285 
12286   total_matches = total_denominator = 0;
12287   for (p = path; p != NULL; p = p->rest) {
12288     pair = (Pair_T) p->first;
12289     if (pair->gapp == true) {
12290       /* Skip */
12291     } else {
12292       if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
12293 	total_matches++;
12294       }
12295       total_denominator++;
12296     }
12297   }
12298 
12299 
12300   while (path != NULL) {
12301     /* pairptr = path; */
12302     /* path = Pairpool_pop(path,&pair); */
12303     pair = (Pair_T) path->first;
12304 
12305     if (pair->gapp == false) {
12306 #ifdef WASTE
12307       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12308 #else
12309       pairs = List_transfer_one(pairs,&path);
12310 #endif
12311 
12312     } else if (pair->queryjump > nullgap) {
12313       debug11(printf("pair->queryjump %d > nullgap %d\n",pair->queryjump,nullgap));
12314 #ifdef WASTE
12315       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12316 #else
12317       pairs = List_transfer_one(pairs,&path);
12318 #endif
12319 
12320     } else if (pair->queryjump > pair->genomejump + EXTRAQUERYGAP) {
12321       debug11(printf("pair->queryjump %d > pair->genomejump %d + EXTRAQUERYGAP %d\n",
12322 		     pair->queryjump,pair->genomejump,EXTRAQUERYGAP));
12323 #ifdef WASTE
12324       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12325 #else
12326       pairs = List_transfer_one(pairs,&path);
12327 #endif
12328 
12329     } else if (pair->genomejump > pair->queryjump + minintronlen) {
12330       debug11(printf("pair->genomejump %d > pair->queryjump %d + minintronlen %d\n",
12331 		     pair->genomejump,pair->queryjump,minintronlen));
12332       pairptr = path;	/* save */
12333       path = Pairpool_pop(path,&pair);
12334 
12335       /* Look at right neighborhood */
12336       max_neighborhood_score = neighborhood_score = 0;
12337       neighborhood_length = 0;
12338       for (p = pairs; p != NULL && neighborhood_length < 25 && ((Pair_T) (p->first))->gapp == false; p = p->rest) {
12339 	rightpair = p->first;
12340 	if (rightpair->comp == MATCH_COMP || rightpair->comp == DYNPROG_MATCH_COMP || rightpair->comp == AMBIGUOUS_COMP) {
12341 	  neighborhood_score += 1;
12342 	} else {
12343 	  neighborhood_score -= 3;
12344 	}
12345 	if (neighborhood_score > max_neighborhood_score) {
12346 	  max_neighborhood_score = neighborhood_score;
12347 	}
12348 	neighborhood_length += 1;
12349       }
12350 
12351       debug11(printf("right neighborhood: max_neighborhood_score %d, neighborhood_length %d\n",
12352 		     max_neighborhood_score,neighborhood_length));
12353       if (max_neighborhood_score >= 6 ||
12354 	  (neighborhood_length < 10 && max_neighborhood_score > neighborhood_length - 1)) {
12355 	/* Alignment in right neighborhood okay.  Look at left neighborhood */
12356 	max_neighborhood_score = neighborhood_score = 0;
12357 	neighborhood_length = 0;
12358 	for (p = path; p != NULL && neighborhood_length < 25 && ((Pair_T) (p->first))->gapp == false; p = p->rest) {
12359 	  leftpair = p->first;
12360 	  if (leftpair->comp == MATCH_COMP || leftpair->comp == DYNPROG_MATCH_COMP || leftpair->comp == AMBIGUOUS_COMP) {
12361 	    neighborhood_score += 1;
12362 	  } else {
12363 	    neighborhood_score -= 3;
12364 	  }
12365 	  if (neighborhood_score > max_neighborhood_score) {
12366 	    max_neighborhood_score = neighborhood_score;
12367 	  }
12368 	  neighborhood_length += 1;
12369 	}
12370 
12371 	debug11(printf("left neighborhood: max_neighborhood_score %d, neighborhood_length %d\n",
12372 		       max_neighborhood_score,neighborhood_length));
12373 	if (max_neighborhood_score >= 6 ||
12374 	    (neighborhood_length < 10 && max_neighborhood_score > neighborhood_length - 1)) {
12375 	  /* Alignment in left neighborhood okay */
12376 	  leftpair = path->first;
12377 	  rightpair = pairs->first;
12378 
12379 	  debug11(printf("pair->comp = %c\n",pair->comp));
12380 
12381 	  if (cdna_direction == +1) {
12382 	    if (watsonp) {
12383 	      splicesitepos = leftpair->genomepos + 1;
12384 	      debug11(printf("1. looking up splicesites_iit for donor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12385 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12386 									splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
12387 		debug11(printf(" => known\n"));
12388 		pair->knowngapp = true;
12389 		donor_score = 1.0;
12390 	      } else {
12391 		donor_score = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
12392 	      }
12393 
12394 	      splicesitepos = rightpair->genomepos;
12395 	      debug11(printf("2. looking up splicesites_iit for acceptor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12396 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12397 									splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
12398 		debug11(printf(" => known\n"));
12399 		pair->knowngapp = true;
12400 		acceptor_score = 1.0;
12401 	      } else {
12402 		acceptor_score = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
12403 	      }
12404 
12405 	    } else {
12406 	      splicesitepos = (chrhigh - chroffset) - leftpair->genomepos;
12407 	      debug11(printf("3. looking up splicesites_iit for donor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12408 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12409 									splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
12410 		debug11(printf(" => known\n"));
12411 		pair->knowngapp = true;
12412 		donor_score = 1.0;
12413 	      } else {
12414 		donor_score = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
12415 	      }
12416 
12417 	      splicesitepos = (chrhigh - chroffset) - rightpair->genomepos + 1;
12418 	      debug11(printf("4. looking up splicesites_iit for acceptor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12419 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12420 									splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
12421 		debug11(printf(" => known\n"));
12422 		pair->knowngapp = true;
12423 		acceptor_score = 1.0;
12424 	      } else {
12425 		acceptor_score = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
12426 	      }
12427 	    }
12428 	    debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
12429 			   leftpair->genomepos,donor_score,watsonp,cdna_direction,pair->comp));
12430 	    debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
12431 			   rightpair->genomepos,acceptor_score,watsonp,cdna_direction,pair->comp));
12432 	    nintrons += 1;
12433 	    if (pair->knowngapp == true) {
12434 	      *nknown += 1;
12435 	      *ncanonical += 1;
12436 	    } else if (pair->comp == FWD_CANONICAL_INTRON_COMP) {
12437 	      *ncanonical += 1;
12438 	    } else if (donor_score < 0.9 && acceptor_score < 0.9) {
12439 	      *nbadintrons = 1;
12440 	    }
12441 	    *avg_donor_score += donor_score;
12442 	    *avg_acceptor_score += acceptor_score;
12443 	    if (donor_score + acceptor_score > *max_intron_score) {
12444 	      *max_intron_score = donor_score + acceptor_score;
12445 	    }
12446 
12447 	  } else if (cdna_direction == -1) {
12448 
12449 #if 0
12450 	    make_complement_buffered(gbuffer1,&(genomicuc_ptr[leftpair->genomepos - ACCEPTOR_MODEL_RIGHT_MARGIN]),
12451 				     ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1);
12452 	    acceptor_score = Maxent_acceptor_prob(gbuffer1);
12453 	    make_complement_buffered(gbuffer1,&(genomicuc_ptr[rightpair->genomepos - DONOR_MODEL_RIGHT_MARGIN - 1]),
12454 				     DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1);
12455 	    donor_score = Maxent_donor_prob(gbuffer1);
12456 #endif
12457 
12458 	    if (watsonp) {
12459 	      splicesitepos = leftpair->genomepos + 1;
12460 	      debug11(printf("5. looking up splicesites_iit for acceptor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12461 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12462 									splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/-1)) {
12463 		debug11(printf(" => known\n"));
12464 		pair->knowngapp = true;
12465 		acceptor_score = 1.0;
12466 	      } else {
12467 		acceptor_score = Maxent_hr_antiacceptor_prob(chroffset + splicesitepos,chroffset);
12468 	      }
12469 
12470 
12471 	      splicesitepos = rightpair->genomepos;
12472 	      debug11(printf("6. looking up splicesites_iit for donor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12473 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12474 									splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/-1)) {
12475 		debug11(printf(" => known\n"));
12476 		pair->knowngapp = true;
12477 		donor_score = 1.0;
12478 	      } else {
12479 		donor_score = Maxent_hr_antidonor_prob(chroffset + splicesitepos,chroffset);
12480 	      }
12481 
12482 	    } else {
12483 	      splicesitepos = (chrhigh - chroffset) - leftpair->genomepos;
12484 	      debug11(printf("7. looking up splicesites_iit for acceptor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12485 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12486 									splicesitepos,splicesitepos+1U,acceptor_typeint,/*sign*/+1)) {
12487 		debug11(printf(" => known\n"));
12488 		pair->knowngapp = true;
12489 		acceptor_score = 1.0;
12490 	      } else {
12491 		acceptor_score = Maxent_hr_acceptor_prob(chroffset + splicesitepos,chroffset);
12492 	      }
12493 
12494 	      splicesitepos = (chrhigh - chroffset) - rightpair->genomepos + 1;
12495 	      debug11(printf("8. looking up splicesites_iit for donor at #%d:%u..%u\n",chrnum,splicesitepos,splicesitepos+1));
12496 	      if (splicesites_iit && IIT_exists_with_divno_typed_signed(splicesites_iit,splicesites_divint_crosstable[chrnum],
12497 									splicesitepos,splicesitepos+1U,donor_typeint,/*sign*/+1)) {
12498 		debug11(printf(" => known\n"));
12499 		pair->knowngapp = true;
12500 		donor_score = 1.0;
12501 	      } else {
12502 		donor_score = Maxent_hr_donor_prob(chroffset + splicesitepos,chroffset);
12503 	      }
12504 	    }
12505 	    debug11(printf("donor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
12506 			   leftpair->genomepos,donor_score,watsonp,cdna_direction,pair->comp));
12507 	    debug11(printf("acceptor score at %u is %f, watson %d, cdna_direction %d, comp %c\n",
12508 			   rightpair->genomepos,acceptor_score,watsonp,cdna_direction,pair->comp));
12509 	    nintrons += 1;
12510 	    if (pair->knowngapp == true) {
12511 	      *nknown += 1;
12512 	      *ncanonical += 1;
12513 	    } else if (pair->comp == REV_CANONICAL_INTRON_COMP) {
12514 	      *ncanonical += 1;
12515 	    } else if (donor_score < 0.9 && acceptor_score < 0.9) {
12516 	      *nbadintrons += 1;
12517 	    }
12518 	    *avg_donor_score += donor_score;
12519 	    *avg_acceptor_score += acceptor_score;
12520 	    if (donor_score + acceptor_score > *max_intron_score) {
12521 	      *max_intron_score = donor_score + acceptor_score;
12522 	    }
12523 
12524 	  }
12525 	  debug11(printf("\n"));
12526 	}
12527       }
12528 
12529 #ifdef WASTE
12530       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12531 #else
12532       pairs = List_push_existing(pairs,pairptr);
12533 #endif
12534 
12535     } else if (pair->genomejump > pair->queryjump + SINGLESLEN) {
12536       /* Intron length shorter than MININTRONLEN_FINAL.  Just replace the gap */
12537       debug11(printf("pair->genomejump %d > pair->queryjump %d + SINGLESLEN %d\n",
12538 		     pair->genomejump,pair->queryjump,SINGLESLEN));
12539 
12540 #ifdef WASTE
12541       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12542 #else
12543       pairs = List_transfer_one(pairs,&path);
12544 #endif
12545 
12546     } else {
12547       /* Single gap; force fill */
12548       debug11(printf("pair->queryjump %d, pair->genomejump %d => single gap\n",
12549 		     pair->queryjump,pair->genomejump));
12550 #ifdef WASTE
12551       pairs = Pairpool_push_existing(pairs,pairpool,pair);
12552 #else
12553       pairs = List_transfer_one(pairs,&path);
12554 #endif
12555     }
12556   }
12557 
12558   /* Want average scores */
12559   if (nintrons > 0) {
12560     *avg_donor_score /= (double) nintrons;
12561     *avg_acceptor_score /= (double) nintrons;
12562   }
12563 
12564   debug11(printf("max_intron_score = %f, avg_donor_score = %f, avg_acceptor_score = %f\n",
12565 		 *max_intron_score,*avg_donor_score,*avg_acceptor_score));
12566   return pairs;
12567 }
12568 
12569 
12570 static int
end_compare(List_T x,List_T y,int cdna_direction,bool watsonp,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,bool pairsp)12571 end_compare (List_T x, List_T y, int cdna_direction, bool watsonp,
12572 	     Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
12573 	     bool pairsp) {
12574   List_T pairs1, pairs2, path1, path2;
12575   double max_intron_score;
12576 
12577   int nknown_1, ncanonical_1, nsemicanonical_1, nnoncanonical_1, nbadintrons_1;
12578   int nknown_2, ncanonical_2, nsemicanonical_2, nnoncanonical_2, nbadintrons_2;
12579   double avg_donor_score_1, avg_acceptor_score_1;
12580   double avg_donor_score_2, avg_acceptor_score_2;
12581   int alignment_score_1, alignment_score_2;
12582   int nmatches_1, nmismatches_1, nmatches_2, nmismatches_2, nindels_1, nindels_2;
12583 #ifdef COMPLEX_DIRECTION
12584   int indel_alignment_score_1, indel_alignment_score_2;
12585 #endif
12586 
12587 
12588   if (pairsp == true) {
12589     pairs1 = x;
12590     pairs2 = y;
12591 
12592     path1 = List_reverse(pairs1);
12593     debug11(printf("Calling score_introns for end_compare on path1\n"));
12594     pairs1 = score_introns(&max_intron_score,&avg_donor_score_1,&avg_acceptor_score_1,&nknown_1,&ncanonical_1,
12595 			   &nbadintrons_1,path1,cdna_direction,watsonp,chrnum,chroffset,chrhigh);
12596     alignment_score_1 = score_alignment(&nmatches_1,&nmismatches_1,&nindels_1,
12597 #ifdef COMPLEX_DIRECTION
12598 					&indel_alignment_score_1,
12599 #endif
12600 					&nsemicanonical_1,&nnoncanonical_1,pairs1,cdna_direction);
12601 
12602     path2 = List_reverse(pairs2);
12603     debug11(printf("Calling score_introns for end_compare on path2\n"));
12604     pairs2 = score_introns(&max_intron_score,&avg_donor_score_2,&avg_acceptor_score_2,&nknown_2,&ncanonical_2,
12605 			   &nbadintrons_2,path2,cdna_direction,watsonp,chrnum,chroffset,chrhigh);
12606     alignment_score_2 = score_alignment(&nmatches_2,&nmismatches_2,&nindels_2,
12607 #ifdef COMPLEX_DIRECTION
12608 					&indel_alignment_score_2,
12609 #endif
12610 					&nsemicanonical_2,&nnoncanonical_2,pairs2,cdna_direction);
12611 
12612   } else {
12613     path1 = x;
12614     path2 = y;
12615 
12616     debug11(printf("Calling score_introns for end_compare on path1\n"));
12617     pairs1 = score_introns(&max_intron_score,&avg_donor_score_1,&avg_acceptor_score_1,&nknown_1,&ncanonical_1,
12618 			   &nbadintrons_1,path1,cdna_direction,watsonp,chrnum,chroffset,chrhigh);
12619     alignment_score_1 = score_alignment(&nmatches_1,&nmismatches_1,&nindels_1,
12620 #ifdef COMPLEX_DIRECTION
12621 					&indel_alignment_score_1,
12622 #endif
12623 					&nsemicanonical_1,&nnoncanonical_1,pairs1,cdna_direction);
12624 
12625     path1 = List_reverse(pairs1);
12626     debug11(printf("Calling score_introns for end_compare on path2\n"));
12627     pairs2 = score_introns(&max_intron_score,&avg_donor_score_2,&avg_acceptor_score_2,&nknown_2,&ncanonical_2,
12628 			   &nbadintrons_2,path2,cdna_direction,watsonp,chrnum,chroffset,chrhigh);
12629     alignment_score_2 = score_alignment(&nmatches_2,&nmismatches_2,&nindels_2,
12630 #ifdef COMPLEX_DIRECTION
12631 					&indel_alignment_score_2,
12632 #endif
12633 					&nsemicanonical_2,&nnoncanonical_2,pairs2,cdna_direction);
12634     path2 = List_reverse(pairs2);
12635   }
12636 
12637   if (avg_donor_score_1 > 0.9 && avg_acceptor_score_1 > 0.9 &&
12638       (avg_donor_score_2 < 0.5 || avg_acceptor_score_2 < 0.5)) {
12639     debug21(printf("intronscores orig %f,%f > intronscores new %f,%f, so original wins\n",
12640 		   avg_donor_score_1,avg_acceptor_score_1,avg_donor_score_2,avg_acceptor_score_2));
12641     /* intronscores reveal a clear sensedir */
12642     return -1;
12643 
12644   } else if (avg_donor_score_2 > 0.9 && avg_acceptor_score_2 > 0.9 &&
12645 	     (avg_donor_score_1 < 0.5 || avg_acceptor_score_1 < 0.5)) {
12646     debug21(printf("intronscores new %f,%f > intronscores orig %f,%f, so new one wins\n",
12647 		   avg_donor_score_2,avg_acceptor_score_2,avg_donor_score_1,avg_acceptor_score_1));
12648     /* intronscores reveal a clear sensedir */
12649     return +1;
12650 
12651   } else if (alignment_score_1 > alignment_score_2 + SCORE_SIGDIFF) {
12652     debug21(printf("alignment_score_1 %d >> alignment_score_2 %d, so original wins\n",
12653 		   alignment_score_1,alignment_score_2));
12654     return -1;
12655 
12656   } else if (alignment_score_2 > alignment_score_1 + SCORE_SIGDIFF) {
12657     debug21(printf("alignment_score_2 %d << alignment_score_1 %d, so new one wins\n",
12658 		   alignment_score_2,alignment_score_1));
12659     return +1;
12660 
12661   } else if (nnoncanonical_1 < nnoncanonical_2) {
12662     debug21(printf("nnoncanonical_1 %d < nnoncanonical_2 %d, so original wins\n",
12663 		   nnoncanonical_1,nnoncanonical_2));
12664     return -1;
12665 
12666   } else if (nnoncanonical_2 < nnoncanonical_1) {
12667     debug21(printf("nnoncanonical_2 %d < nnoncanonical_1 %d, so new one wins\n",
12668 		   nnoncanonical_2,nnoncanonical_1));
12669     return +1;
12670 
12671   } else if (avg_donor_score_1 + avg_acceptor_score_1 > avg_donor_score_2 + avg_acceptor_score_2 + PROB_SIGDIFF) {
12672     debug21(printf("intronscores orig %f+%f > intronscores new %f+%f, so original wins\n",
12673 		   avg_donor_score_1,avg_acceptor_score_1,avg_donor_score_2,avg_acceptor_score_2));
12674     /* intronscores reveal a preferred sensedir */
12675     return -1;
12676 
12677   } else if (avg_donor_score_2 + avg_acceptor_score_2 > avg_donor_score_1 + avg_acceptor_score_1 + PROB_SIGDIFF) {
12678     debug21(printf("intronscores new %f+%f > intronscores orig %f+%f, so new one wins\n",
12679 		   avg_donor_score_2,avg_acceptor_score_2,avg_donor_score_1,avg_acceptor_score_1));
12680     /* intronscores reveal a preferred sensedir */
12681     return +1;
12682 
12683   } else if (alignment_score_1 > alignment_score_2) {
12684     debug21(printf("alignment_score_1 %d > alignment_score_2 %d, so original wins\n",
12685 		   alignment_score_1,alignment_score_2));
12686     return -1;
12687 
12688   } else if (alignment_score_2 > alignment_score_1) {
12689     debug21(printf("alignment_score_2 %d < alignment_score_1 %d, so new one wins\n",
12690 		   alignment_score_2,alignment_score_1));
12691     return +1;
12692 
12693   } else {
12694     debug21(printf("scores all equal\n"));
12695     return 0;
12696   }
12697 }
12698 
12699 
12700 #if 0
12701 static List_T
12702 filter_goodness_hmm (bool *filterp, List_T pairs, double defect_rate) {
12703   Pair_T pair;
12704   List_T path, p;
12705   float prev_vprob_good = 0.0, prev_vprob_bad = 0.0, vprob_good, vprob_bad;
12706   float good_incr_prob, bad_incr_prob;
12707   float emission_prob;
12708   State_T state;
12709 
12710   if (defect_rate == 0.0) {
12711     defect_rate = 0.001;
12712   }
12713 
12714   debug5(printf("Beginning filter_goodness_hmm with defect rate %f\n",defect_rate));
12715 
12716   for (p = pairs; p != NULL; p = List_next(p)) {
12717     pair = (Pair_T) List_head(p);
12718     debug5(printf("hmm querypos %d (%c %c %c): ",pair->querypos,pair->genome,pair->comp,pair->cdna));
12719 
12720     /* state: GOOD */
12721     if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
12722       emission_prob = 1.0 - defect_rate;
12723     } else {
12724       emission_prob = defect_rate;
12725     }
12726 
12727 #ifdef COMPUTE_LOG
12728     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.99); /* Prob(prev good state -> good state) */
12729     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.10);  /* Prob(prev bad state -> good state) */
12730 #else
12731     good_incr_prob = fasterlog(emission_prob) + LOG_99; /* Prob(prev good state -> good state) */
12732     bad_incr_prob = fasterlog(emission_prob) + LOG_10;  /* Prob(prev bad state -> good state) */
12733 #endif
12734 
12735     debug5(printf("state GOOD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
12736     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
12737       vprob_good = prev_vprob_good + good_incr_prob;
12738       pair->vstate_good = GOOD;
12739       debug5(printf(" =>GOOD.  "));
12740     } else {
12741       vprob_good = prev_vprob_bad + bad_incr_prob;
12742       pair->vstate_good = BAD;
12743       debug5(printf(" =>BAD.   "));
12744     }
12745 
12746     /* state: BAD */
12747     if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
12748       emission_prob = 0.25;
12749     } else {
12750       emission_prob = 0.75;
12751     }
12752 
12753 #ifdef COMPUTE_LOG
12754     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.01);  /* Prob(prev good state -> bad state) */
12755     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.90);  /* Prob(prev bad state -> bad state) */
12756 #else
12757     good_incr_prob = fasterlog(emission_prob) + LOG_01; /* Prob(prev good state -> bad state) */
12758     bad_incr_prob = fasterlog(emission_prob) + LOG_90; /* Prob(prev bad state -> bad state) */
12759 #endif
12760 
12761     debug5(printf("state BAD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
12762     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
12763       vprob_bad = prev_vprob_good + good_incr_prob;
12764       pair->vstate_bad = GOOD;
12765       debug5(printf(" =>GOOD.\n"));
12766     } else {
12767       vprob_bad = prev_vprob_bad + bad_incr_prob;
12768       pair->vstate_bad = BAD;
12769       debug5(printf(" =>BAD.\n"));
12770     }
12771 
12772     prev_vprob_good = vprob_good;
12773     prev_vprob_bad = vprob_bad;
12774   }
12775 
12776   if (prev_vprob_good > prev_vprob_bad) {
12777     state = GOOD;
12778   } else {
12779     state = BAD;
12780   }
12781 
12782   path = List_reverse(pairs);
12783   pairs = (List_T) NULL;
12784 
12785   *filterp = false;
12786   while (path != NULL) {
12787     pair = path->first;
12788     pair->state = state;
12789 
12790 #ifdef DEBUG5
12791     Pair_dump_one(pair,/*zerobasedp*/false);
12792     printf("\n");
12793 #endif
12794 
12795     if (state == GOOD) {
12796       pairs = List_transfer_one(pairs,&path);
12797       state = pair->vstate_good;
12798     } else {
12799       *filterp = true;
12800       path = path->rest;
12801       state = pair->vstate_bad;
12802     }
12803   }
12804 
12805   return pairs;
12806 }
12807 #endif
12808 
12809 
12810 #if 0
12811 static List_T
12812 filter_indels_hmm (bool *filterp, List_T pairs) {
12813   Pair_T pair;
12814   List_T path, p;
12815   float prev_vprob_good = 0.0, prev_vprob_bad = 0.0, vprob_good, vprob_bad;
12816   float good_incr_prob, bad_incr_prob;
12817   float emission_prob;
12818   State_T state;
12819 
12820   debug5(printf("Beginning filter_indels_hmm\n"));
12821 
12822   for (p = pairs; p != NULL; p = List_next(p)) {
12823     pair = (Pair_T) List_head(p);
12824     debug5(printf("indels querypos %d (%c %c %c): ",pair->querypos,pair->genome,pair->comp,pair->cdna));
12825 
12826     /* state: GOOD */
12827     /* These emission probs should add to 1.0 */
12828     if (pair->comp != INDEL_COMP) {
12829       emission_prob = 0.9999;	/* Prob(good state -> match/mismatch) */
12830     } else if (pair->comp != SHORTGAP_COMP) {
12831       emission_prob = 0.9999;	/* Prob(good state -> match/mismatch) */
12832     } else {
12833       emission_prob = 0.0001;	/* Prob(good state -> indel) */
12834     }
12835 
12836     /* These transition probs should complement those for state BAD */
12837 #ifdef COMPUTE_LOG
12838     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.9999);  /* Prob(prev good state -> good state) */
12839     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.25);   /* Prob(prev bad state -> good state) */
12840 #else
12841     good_incr_prob = fasterlog(emission_prob) + LOG_9999;  /* Prob(prev good state -> good state) */
12842     bad_incr_prob = fasterlog(emission_prob) + LOG_25;   /* Prob(prev bad state -> good state) */
12843 #endif
12844 
12845     debug5(printf("state GOOD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
12846     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
12847       vprob_good = prev_vprob_good + good_incr_prob;
12848       pair->vstate_good = GOOD;
12849       debug5(printf(" =>GOOD.  "));
12850     } else {
12851       vprob_good = prev_vprob_bad + bad_incr_prob;
12852       pair->vstate_good = BAD;
12853       debug5(printf(" =>BAD.   "));
12854     }
12855 
12856     /* state: BAD */
12857     /* These emission probs should add to 1.0 */
12858     if (pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP) {
12859       emission_prob = 0.5; 	/* Prob(bad state -> match/mismatch) */
12860     } else {
12861       emission_prob = 0.5;	/* Prob(bad state -> indel) */
12862     }
12863 
12864 #ifdef COMPUTE_LOG
12865     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.0001);   /* Prob(prev good state -> bad state) */
12866     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.75);  /* Prob(prev bad state -> bad state) */
12867 #else
12868     good_incr_prob = fasterlog(emission_prob) + LOG_0001;   /* Prob(prev good state -> bad state) */
12869     bad_incr_prob = fasterlog(emission_prob) + LOG_75;  /* Prob(prev bad state -> bad state) */
12870 #endif
12871 
12872     debug5(printf("state BAD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
12873     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
12874       vprob_bad = prev_vprob_good + good_incr_prob;
12875       pair->vstate_bad = GOOD;
12876       debug5(printf(" =>GOOD.\n"));
12877     } else {
12878       vprob_bad = prev_vprob_bad + bad_incr_prob;
12879       pair->vstate_bad = BAD;
12880       debug5(printf(" =>BAD.\n"));
12881     }
12882 
12883     prev_vprob_good = vprob_good;
12884     prev_vprob_bad = vprob_bad;
12885   }
12886 
12887   if (prev_vprob_good > prev_vprob_bad) {
12888     state = GOOD;
12889   } else {
12890     state = BAD;
12891   }
12892 
12893   path = List_reverse(pairs);
12894   pairs = (List_T) NULL;
12895 
12896   *filterp = false;
12897   while (path != NULL) {
12898     pair = path->first;
12899     pair->state = state;
12900 
12901 #ifdef DEBUG5
12902     Pair_dump_one(pair,/*zerobasedp*/false);
12903     printf("\n");
12904 #endif
12905 
12906     if (state == GOOD) {
12907       pairs = List_transfer_one(pairs,&path);
12908       state = pair->vstate_good;
12909     } else {
12910       *filterp = true;
12911       path = path->rest;
12912       state = pair->vstate_bad;
12913     }
12914   }
12915 
12916   return pairs;
12917 }
12918 #endif
12919 
12920 
12921 
12922 bool
Stage3_short_alignment_p(struct Pair_T * pairarray,int npairs,int querylength)12923 Stage3_short_alignment_p (struct Pair_T *pairarray, int npairs, int querylength) {
12924   int querystart, queryend;
12925 
12926   if (npairs == 0) {
12927     return true;
12928   } else {
12929     querystart = pairarray[0].querypos;
12930     queryend = pairarray[npairs-1].querypos;
12931     if (queryend - querystart + 1 < querylength/3) {
12932       return true;
12933     } else {
12934       return false;
12935     }
12936   }
12937 }
12938 
12939 
12940 #if 0
12941 /* Uses hmm */
12942 /* Modified from Substring_bad_stretch_p */
12943 bool
12944 Stage3_bad_stretch_p (struct Pair_T *pairarray, int npairs, int pos5, int pos3) {
12945   Pair_T pair;
12946   int i;
12947   double vprob_good, vprob_bad, prev_vprob_good, prev_vprob_bad, good_incr_prob, bad_incr_prob;
12948   bool indelp = false, mismatchp, matchp;
12949 
12950   /* Initialize priors */
12951 #ifdef COMPUTE_LOG
12952   prev_vprob_good = log(0.99);
12953   prev_vprob_bad = log(0.01);
12954 #else
12955   prev_vprob_good = LOG_99;
12956   prev_vprob_bad = LOG_01;
12957 #endif
12958 
12959   for (i = 0; i < npairs; i++) {
12960     pair = &(pairarray[i]);
12961     if (pair->querypos < pos5) {
12962       /* Skip */
12963       matchp = mismatchp = false;
12964     } else if (pair->querypos >= pos3) {
12965       /* Skip */
12966       matchp = mismatchp = false;
12967     } else if (pair->gapp == true) {
12968       /* Skip */
12969       matchp = mismatchp = false;
12970       indelp = false;
12971     } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
12972       if (indelp == true) {
12973 	/* Skip, because we count each indel just once  */
12974 	matchp = mismatchp = false;
12975       } else {
12976 	/* Count each gap as a mismatch */
12977 	mismatchp = true;
12978 	matchp = false;
12979 	indelp = true;
12980       }
12981     } else if (pair->comp == MISMATCH_COMP) {
12982       mismatchp = true;
12983       matchp = false;
12984       indelp = false;
12985     } else {
12986       mismatchp = false;
12987       matchp = true;
12988       indelp = false;
12989     }
12990 
12991     if (mismatchp == true) {
12992       debug9(printf("querypos %d (mismatch): ",pair->querypos));
12993 
12994       /* state: GOOD */
12995 #ifdef COMPUTE_LOG
12996       good_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.999);
12997       bad_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.001);
12998 #else
12999       good_incr_prob = LOG_01_999;
13000       bad_incr_prob = LOG_01_001;
13001 #endif
13002 
13003       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13004 	vprob_good = prev_vprob_good + good_incr_prob;
13005       } else {
13006 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13007 	return true;
13008       }
13009 
13010       /* state: BAD */
13011 #ifdef COMPUTE_LOG
13012       good_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.001);
13013       bad_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.999);
13014 #else
13015       good_incr_prob = LOG_75_001;
13016       bad_incr_prob = LOG_75_999;
13017 #endif
13018       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13019 	vprob_bad = prev_vprob_good + good_incr_prob;
13020       } else {
13021 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13022       }
13023 
13024       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13025       prev_vprob_good = vprob_good;
13026       prev_vprob_bad = vprob_bad;
13027       debug9(printf("\n"));
13028 
13029     } else if (matchp == true) {
13030       debug9(printf("querypos %d (match): ",pair->querypos));
13031 
13032       /* state: GOOD */
13033 #ifdef COMPUTE_LOG
13034       good_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.999);
13035       bad_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.001);
13036 #else
13037       good_incr_prob = LOG_99_999;
13038       bad_incr_prob = LOG_99_001;
13039 #endif
13040 
13041       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13042 	vprob_good = prev_vprob_good + good_incr_prob;
13043       } else {
13044 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13045 	return true;
13046       }
13047 
13048       /* state: BAD */
13049 #ifdef COMPUTE_LOG
13050       good_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.001);
13051       bad_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.999);
13052 #else
13053       good_incr_prob = LOG_25_001;
13054       bad_incr_prob = LOG_25_999;
13055 #endif
13056       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13057 	vprob_bad = prev_vprob_good + good_incr_prob;
13058       } else {
13059 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13060       }
13061 
13062       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13063       prev_vprob_good = vprob_good;
13064       prev_vprob_bad = vprob_bad;
13065       debug9(printf("\n"));
13066     }
13067 
13068   }
13069 
13070   return false;
13071 }
13072 #else
13073 
13074 /* Uses a window */
13075 bool
Stage3_bad_stretch_p(struct Pair_T * pairarray,int npairs,int pos5,int pos3)13076 Stage3_bad_stretch_p (struct Pair_T *pairarray, int npairs, int pos5, int pos3) {
13077   int *nindels;
13078   struct Pair_T *ptr;
13079   Pair_T pair;
13080   int i;
13081 
13082   nindels = (int *) CALLOCA(npairs,sizeof(int));
13083 
13084   i = 0;
13085   ptr = pairarray;
13086   while (i < npairs) {
13087     pair = ptr++;
13088     i++;
13089 
13090     if (pair->querypos < pos5) {
13091       /* Skip */
13092     } else if (pair->querypos >= pos3) {
13093       /* Skip */
13094     } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
13095       if (pair->genome == ' ') {
13096 	nindels[i] = 1;
13097 	while (i < npairs && pair->genome == ' ') {
13098 	  pair = ptr++;
13099 	  i++;
13100 	}
13101 	ptr--;
13102 	i--;
13103 
13104       } else {
13105 	nindels[i] = 1;
13106 	while (i < npairs && pair->cdna == ' ') {
13107 	  pair = ptr++;
13108 	  i++;
13109 	}
13110 	ptr--;
13111 	i--;
13112 
13113       }
13114     }
13115   }
13116 
13117   /* Compute cumulative count of indel openings */
13118   for (i = 1; i < npairs; i++) {
13119     nindels[i] += nindels[i-1];
13120   }
13121 
13122   /* Look for more than 3 indel openings in a span of 25 pairs */
13123   for (i = 0; i < npairs - 25; i++) {
13124     if (nindels[i+25] - nindels[i] > 3) {
13125       FREEA(nindels);
13126       return true;
13127     }
13128   }
13129 
13130   FREEA(nindels);
13131 
13132   return false;
13133 }
13134 
13135 #endif
13136 
13137 
13138 
13139 int
Stage3_good_part(struct Pair_T * pairarray,int npairs,int pos5,int pos3)13140 Stage3_good_part (struct Pair_T *pairarray, int npairs, int pos5, int pos3) {
13141   Pair_T pair;
13142   int i;
13143   double vprob_good, vprob_bad, prev_vprob_good, prev_vprob_bad, good_incr_prob, bad_incr_prob;
13144   bool indelp = false, mismatchp, matchp, stopp;
13145   int ngoodleft, ngoodright;
13146 
13147   /* Initialize priors */
13148 #ifdef COMPUTE_LOG
13149   prev_vprob_good = log(0.99);
13150   prev_vprob_bad = log(0.01);
13151 #else
13152   prev_vprob_good = LOG_99;
13153   prev_vprob_bad = LOG_01;
13154 #endif
13155 
13156   ngoodleft = 0;
13157   stopp = false;
13158   for (i = 0; i < npairs && stopp == false; i++) {
13159     pair = &(pairarray[i]);
13160     if (pair->querypos < pos5) {
13161       /* Skip */
13162       matchp = mismatchp = false;
13163     } else if (pair->querypos >= pos3) {
13164       /* Skip */
13165       matchp = mismatchp = false;
13166     } else if (pair->gapp == true) {
13167       /* Skip */
13168       matchp = mismatchp = false;
13169       indelp = false;
13170     } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
13171       if (indelp == true) {
13172 	/* Skip, because we count each indel just once  */
13173 	matchp = mismatchp = false;
13174       } else {
13175 	/* Count each gap as a mismatch */
13176 	mismatchp = true;
13177 	matchp = false;
13178 	indelp = true;
13179       }
13180     } else if (pair->comp == MISMATCH_COMP) {
13181       mismatchp = true;
13182       matchp = false;
13183       indelp = false;
13184     } else {
13185       mismatchp = false;
13186       matchp = true;
13187       indelp = false;
13188     }
13189 
13190     if (mismatchp == true) {
13191       debug9(printf("querypos %d (mismatch): ",pair->querypos));
13192 
13193       /* state: GOOD */
13194 #ifdef COMPUTE_LOG
13195       good_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.999);
13196       bad_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.001);
13197 #else
13198       good_incr_prob = LOG_01_999;
13199       bad_incr_prob = LOG_01_001;
13200 #endif
13201 
13202       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13203 	vprob_good = prev_vprob_good + good_incr_prob;
13204 	ngoodleft++;
13205       } else {
13206 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13207 	stopp = true;
13208       }
13209 
13210       /* state: BAD */
13211 #ifdef COMPUTE_LOG
13212       good_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.001);
13213       bad_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.999);
13214 #else
13215       good_incr_prob = LOG_75_001;
13216       bad_incr_prob = LOG_75_999;
13217 #endif
13218       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13219 	vprob_bad = prev_vprob_good + good_incr_prob;
13220       } else {
13221 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13222       }
13223 
13224       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13225       prev_vprob_good = vprob_good;
13226       prev_vprob_bad = vprob_bad;
13227       debug9(printf("\n"));
13228 
13229     } else if (matchp == true) {
13230       debug9(printf("querypos %d (match): ",pair->querypos));
13231 
13232       /* state: GOOD */
13233 #ifdef COMPUTE_LOG
13234       good_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.999);
13235       bad_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.001);
13236 #else
13237       good_incr_prob = LOG_99_999;
13238       bad_incr_prob = LOG_99_001;
13239 #endif
13240 
13241       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13242 	vprob_good = prev_vprob_good + good_incr_prob;
13243 	ngoodleft++;
13244       } else {
13245 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13246 	stopp = true;
13247       }
13248 
13249       /* state: BAD */
13250 #ifdef COMPUTE_LOG
13251       good_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.001);
13252       bad_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.999);
13253 #else
13254       good_incr_prob = LOG_25_001;
13255       bad_incr_prob = LOG_25_999;
13256 #endif
13257       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13258 	vprob_bad = prev_vprob_good + good_incr_prob;
13259       } else {
13260 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13261       }
13262 
13263       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13264       prev_vprob_good = vprob_good;
13265       prev_vprob_bad = vprob_bad;
13266       debug9(printf("\n"));
13267     }
13268 
13269   }
13270 
13271 
13272   /* Initialize priors */
13273 #ifdef COMPUTE_LOG
13274   prev_vprob_good = log(0.99);
13275   prev_vprob_bad = log(0.01);
13276 #else
13277   prev_vprob_good = LOG_99;
13278   prev_vprob_bad = LOG_01;
13279 #endif
13280 
13281   ngoodright = 0;
13282   stopp = false;
13283   for (i = npairs - 1; i >= 0 && stopp == false; i--) {
13284     pair = &(pairarray[i]);
13285     if (pair->querypos < pos5) {
13286       /* Skip */
13287       matchp = mismatchp = false;
13288     } else if (pair->querypos >= pos3) {
13289       /* Skip */
13290       matchp = mismatchp = false;
13291     } else if (pair->gapp == true) {
13292       /* Skip */
13293       matchp = mismatchp = false;
13294       indelp = false;
13295     } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
13296       if (indelp == true) {
13297 	/* Skip, because we count each indel just once  */
13298 	matchp = mismatchp = false;
13299       } else {
13300 	/* Count each gap as a mismatch */
13301 	mismatchp = true;
13302 	matchp = false;
13303 	indelp = true;
13304       }
13305     } else if (pair->comp == MISMATCH_COMP) {
13306       mismatchp = true;
13307       matchp = false;
13308       indelp = false;
13309     } else {
13310       mismatchp = false;
13311       matchp = true;
13312       indelp = false;
13313     }
13314 
13315     if (mismatchp == true) {
13316       debug9(printf("querypos %d (mismatch): ",pair->querypos));
13317 
13318       /* state: GOOD */
13319 #ifdef COMPUTE_LOG
13320       good_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.999);
13321       bad_incr_prob = log(/*emission_prob*/0.001) + log(/*transition_prob*/0.001);
13322 #else
13323       good_incr_prob = LOG_01_999;
13324       bad_incr_prob = LOG_01_001;
13325 #endif
13326 
13327       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13328 	vprob_good = prev_vprob_good + good_incr_prob;
13329 	ngoodright++;
13330       } else {
13331 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13332 	stopp = true;
13333       }
13334 
13335       /* state: BAD */
13336 #ifdef COMPUTE_LOG
13337       good_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.001);
13338       bad_incr_prob = log(/*emission_prob*/0.75) + log(/*transition_prob*/0.999);
13339 #else
13340       good_incr_prob = LOG_75_001;
13341       bad_incr_prob = LOG_75_999;
13342 #endif
13343       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13344 	vprob_bad = prev_vprob_good + good_incr_prob;
13345       } else {
13346 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13347       }
13348 
13349       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13350       prev_vprob_good = vprob_good;
13351       prev_vprob_bad = vprob_bad;
13352       debug9(printf("\n"));
13353 
13354     } else if (matchp == true) {
13355       debug9(printf("querypos %d (match): ",pair->querypos));
13356 
13357       /* state: GOOD */
13358 #ifdef COMPUTE_LOG
13359       good_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.999);
13360       bad_incr_prob = log(/*emission_prob*/0.999) + log(/*transition_prob*/0.001);
13361 #else
13362       good_incr_prob = LOG_99_999;
13363       bad_incr_prob = LOG_99_001;
13364 #endif
13365 
13366       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13367 	vprob_good = prev_vprob_good + good_incr_prob;
13368 	ngoodright++;
13369       } else {
13370 	/* vprob_good = prev_vprob_bad + bad_incr_prob; */
13371 	stopp = true;
13372       }
13373 
13374       /* state: BAD */
13375 #ifdef COMPUTE_LOG
13376       good_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.001);
13377       bad_incr_prob = log(/*emission_prob*/0.25) + log(/*transition_prob*/0.999);
13378 #else
13379       good_incr_prob = LOG_25_001;
13380       bad_incr_prob = LOG_25_999;
13381 #endif
13382       if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13383 	vprob_bad = prev_vprob_good + good_incr_prob;
13384       } else {
13385 	vprob_bad = prev_vprob_bad + bad_incr_prob;
13386       }
13387 
13388       debug9(printf("vprob_good %f, vprob_bad %f",vprob_good,vprob_bad));
13389       prev_vprob_good = vprob_good;
13390       prev_vprob_bad = vprob_bad;
13391       debug9(printf("\n"));
13392     }
13393 
13394   }
13395 
13396   debug9(printf("ngoodleft = %d, ngoodright = %d\n",ngoodleft,ngoodright));
13397 
13398   if (ngoodleft > ngoodright) {
13399     return ngoodleft;
13400   } else {
13401     return ngoodright;
13402   }
13403 }
13404 
13405 
13406 
13407 
13408 #if 0
13409 static int
13410 score_nconsecutive (List_T pairs) {
13411   int score = 0, nconsecutive;
13412   Pair_T pair;
13413   List_T path, p;
13414   float prev_vprob_good = 0.0, prev_vprob_bad = 0.0, vprob_good, vprob_bad;
13415   float defect_rate = 0.001, good_incr_prob, bad_incr_prob;
13416   float emission_prob;
13417   State_T state;
13418 
13419 
13420   debug(printf("Beginning score_nconsecutive\n"));
13421 
13422   for (p = pairs; p != NULL; p = List_next(p)) {
13423     pair = (Pair_T) List_head(p);
13424     debug5(printf("hmm querypos %d (%c %c %c): ",pair->querypos,pair->genome,pair->comp,pair->cdna));
13425 
13426     /* state: GOOD */
13427     if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
13428       emission_prob = 1.0 - defect_rate;
13429     } else {
13430       emission_prob = defect_rate;
13431     }
13432 
13433 #ifdef COMPUTE_LOG
13434     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.99); /* Prob(prev good state -> good state) */
13435     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.10);  /* Prob(prev bad state -> good state) */
13436 #else
13437     good_incr_prob = fasterlog(emission_prob) + LOG_99; /* Prob(prev good state -> good state) */
13438     bad_incr_prob = fasterlog(emission_prob) + LOG_10;  /* Prob(prev bad state -> good state) */
13439 #endif
13440 
13441     debug5(printf("state GOOD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
13442     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13443       vprob_good = prev_vprob_good + good_incr_prob;
13444       pair->vstate_good = GOOD;
13445       debug5(printf(" =>GOOD.  "));
13446     } else {
13447       vprob_good = prev_vprob_bad + bad_incr_prob;
13448       pair->vstate_good = BAD;
13449       debug5(printf(" =>BAD.   "));
13450     }
13451 
13452     /* state: BAD */
13453     if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
13454       emission_prob = 0.25;
13455     } else {
13456       emission_prob = 0.75;
13457     }
13458 
13459 #ifdef COMPUTE_LOG
13460     good_incr_prob = log(emission_prob) + log(/*transition_prob*/0.01);  /* Prob(prev good state -> bad state) */
13461     bad_incr_prob = log(emission_prob) + log(/*transition_prob*/0.90);  /* Prob(prev bad state -> bad state) */
13462 #else
13463     good_incr_prob = fasterlog(emission_prob) + LOG_01;  /* Prob(prev good state -> bad state) */
13464     bad_incr_prob = fasterlog(emission_prob) + LOG_90;  /* Prob(prev bad state -> bad state) */
13465 #endif
13466 
13467     debug5(printf("state BAD: %f+%f %f+%f ",prev_vprob_good,good_incr_prob,prev_vprob_bad,bad_incr_prob));
13468     if (prev_vprob_good + good_incr_prob > prev_vprob_bad + bad_incr_prob) {
13469       vprob_bad = prev_vprob_good + good_incr_prob;
13470       pair->vstate_bad = GOOD;
13471       debug5(printf(" =>GOOD.\n"));
13472     } else {
13473       vprob_bad = prev_vprob_bad + bad_incr_prob;
13474       pair->vstate_bad = BAD;
13475       debug5(printf(" =>BAD.\n"));
13476     }
13477 
13478     prev_vprob_good = vprob_good;
13479     prev_vprob_bad = vprob_bad;
13480   }
13481 
13482   if (prev_vprob_good > prev_vprob_bad) {
13483     state = GOOD;
13484   } else {
13485     state = BAD;
13486   }
13487 
13488   nconsecutive = 0;
13489   path = List_reverse(pairs);
13490 
13491   for (p = path; p != NULL; p = p->rest) {
13492     pair = p->first;
13493     debug5(printf("hmm querypos %d (%c %c %c): ",pair->querypos,pair->genome,pair->comp,pair->cdna));
13494     if (state == GOOD) {
13495       nconsecutive++;
13496       debug5(printf("state is GOOD, nconsecutive %d\n",nconsecutive));
13497       state = pair->vstate_good;
13498     } else {
13499       if (nconsecutive > score) {
13500 	score = nconsecutive;
13501       }
13502       nconsecutive = 0;
13503       debug5(printf("state is BAD, score %d\n",score));
13504       state = pair->vstate_bad;
13505     }
13506   }
13507 
13508   if (nconsecutive > score) {
13509     score = nconsecutive;
13510   }
13511   debug5(printf("At end, score %d\n",score));
13512 
13513   pairs = List_reverse(path);
13514 
13515   return score;
13516 }
13517 #endif
13518 
13519 
13520 
13521 static List_T
path_compute_dir(double * defect_rate,List_T pairs,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3)13522 path_compute_dir (double *defect_rate, List_T pairs,
13523 		  int cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
13524 #ifdef PMAP
13525 		  char *queryaaseq_ptr,
13526 #endif
13527 		  char *queryseq_ptr, char *queryuc_ptr, int querylength,
13528 		  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
13529 		  int maxpeelback,
13530 #ifndef GSNAP
13531 		  Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
13532 #endif
13533 		  Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
13534 		  Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3) {
13535   List_T path = NULL;
13536   int dynprogindex_minor = DYNPROGINDEX_MINOR, dynprogindex_major = DYNPROGINDEX_MAJOR;
13537   int iter0, iter1, iter2;
13538   bool shiftp, incompletep;
13539   bool shortp, badp, deletep, dual_break_p;
13540   int matches, unknowns, mismatches, qopens, qindels, topens, tindels,
13541     ncanonical, nsemicanonical, nnoncanonical;
13542   double min_splice_prob;
13543 
13544   bool indelp, trim5p, trim3p;
13545 
13546 
13547   iter0 = 0;
13548   dual_break_p = true;
13549 
13550   while (dual_break_p == true && iter0 < MAXITER_CYCLES) {
13551     /* path = List_reverse(pairs); */
13552     /* Need to insert gapholders after Pairpool_join_end5 and Pairpool_join_end3 */
13553     path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13554 			     pairpool,/*finalp*/false);
13555 
13556 #ifdef PMAP
13557 #if 0
13558     /* Pass 1b: undefine nucleotides around gaps.  path --> path */
13559     pairs = undefine_nucleotides(queryseq_ptr,querylength,path,pairpool,/*width*/6);
13560     path = List_reverse(pairs);
13561 #endif
13562 #endif
13563 
13564     /* Pass 2A: solve straight gaps (small).  path --> pairs (for defect rate) */
13565     debug(printf("\n*** Pass 2A (dir %d): Solve straight gaps (small).  Iteration0 %d\n",
13566 		 cdna_direction,iter0));
13567     debug(Pair_dump_list(path,true));
13568     pairs = build_pairs_singles(&dynprogindex_minor,path,/*maxsize*/3,
13569 				chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
13570 				jump_late_p,maxpeelback,/*defect_rate*/0.0,pairpool,dynprogM,
13571 				last_genomedp5,last_genomedp3,/*forcep*/false,/*finalp*/false);
13572 #ifdef DEBUG8
13573     if (stage3debug == POST_SINGLES) {
13574       path = List_reverse(pairs);
13575       return path;
13576     }
13577 #endif
13578 
13579     if (homopolymerp == true) {
13580       /* Pass 2B: fix adjacent indels */
13581       /* >>pairs */
13582 #if 0
13583       /* gapholders shouldn't be necessary before fix_adjacent_indels,
13584 	 but is necessary afterward for build_pairs_singles */
13585       path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13586 			       pairpool,/*finalp*/false);
13587       pairs = List_reverse(path);
13588 #endif
13589 
13590       debug(printf("\n*** Pass 2B (dir %d): Fix adjacent indels.  Iteration0 %d\n",
13591 		   cdna_direction,iter0));
13592       path = fix_adjacent_indels(pairs);
13593       pairs = List_reverse(path);
13594       path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13595 			       pairpool,/*finalp*/false);
13596 
13597 
13598       /* Pass 2C: solve straight gaps again.  path --> pairs (for defect rate) */
13599       debug(printf("\n*** Pass 2C (dir %d): Solve straight gaps again (large).  Iteration0 %d\n",
13600 		   cdna_direction,iter0));
13601       pairs = build_pairs_singles(&dynprogindex_minor,path,/*maxsize*/nullgap,
13602 				  chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
13603 				  jump_late_p,maxpeelback,/*defect_rate*/0.0,pairpool,dynprogM,
13604 				  last_genomedp5,last_genomedp3,/*forcep*/false,/*finalp*/false);
13605       /* <<pairs */
13606     }
13607 
13608     /* Compute defect rate here */
13609     Pair_fracidentity(&matches,&unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
13610 		      &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
13611 		      pairs,/*cdna_direction*/0);
13612     if (matches + mismatches == 0) {
13613       *defect_rate = 0.0;
13614     } else {
13615       *defect_rate = (double) mismatches/(double) (matches + mismatches);
13616     }
13617     debug(printf("defect_rate = %f (%d matches, %d mismatches)\n",*defect_rate,matches,mismatches));
13618 
13619 
13620     /* Pass 3: Smoothing */
13621     debug(printf("*** Pass 3 (dir %d): Smooth\n",cdna_direction));
13622 
13623     /* Smoothing by probability */
13624     path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13625 			     pairpool,/*finalp*/false);
13626     pairs = assign_intron_probs(path,cdna_direction,watsonp,chrnum,chroffset,chrhigh,pairpool);
13627     Smooth_reset(pairs);
13628     pairs = Smooth_pairs_by_intronprobs(&badp,pairs,pairpool);
13629 
13630 #if 0
13631     /* Smoothing by netgap.  Can crash or stall, and generally doesn't do anything except for very low-identity alignments. */
13632     debug(printf("\n*** Pass 1 (dir %d): Initial smoothing by net gap.  Iteration1 %d\n",
13633 		 cdna_direction,iter1));
13634     pairs = Smooth_pairs_by_netgap(&smoothp,pairs,pairpool);
13635 #endif
13636 
13637     /* Smoothing by size: This can undo the short exons found by traverse_dual_genome, so we use protectedp in traverse_dual_genome  */
13638     debug(printf("*** Pass 3a (dir %d): Smoothing by size.  Iteration0 %d\n",
13639 		 cdna_direction,iter0));
13640     path = List_reverse(pairs);
13641     pairs = remove_indel_gaps(path);
13642     pairs = Smooth_pairs_by_size(&shortp,&deletep,pairs,pairpool,/*stage2_indexsize*/6);
13643     debug(printf("  => Result of Pass 3a (smoothing): shortp is %d, deletep is %d\n",shortp,deletep));
13644     debug(Pair_dump_list(pairs,/*zerobasedp*/true));
13645 
13646 #ifdef DEBUG8
13647     if (stage3debug == POST_SMOOTHING) {
13648       path = List_reverse(pairs);
13649       return path;
13650     }
13651 #endif
13652 
13653 
13654     /* Pass 5: introns */
13655     /* >>pairs */
13656     debug(printf("\n*** Pass 5 (dir %d): Smooth and solve dual introns iteratively.  Iteration0 %d\n",
13657 		 cdna_direction,iter0));
13658     iter1 = 0;
13659     badp = true;
13660     while (badp == true && iter1 < MAXITER_SMOOTH_BY_SIZE) {
13661       /* Pass 5a: single introns */
13662       debug(printf("*** Pass 5a (dir %d): Solve introns.  Iteration0 %d, iteration1 %d\n",
13663 		   cdna_direction,iter0,iter1));
13664 
13665       iter2 = 0;
13666       shiftp = true;
13667       while ((shiftp == true || incompletep == true) && iter2++ < MAXITER_INTRONS) {
13668 	path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13669 				 pairpool,/*finalp*/false);
13670 	pairs = build_pairs_introns(&shiftp,&incompletep,
13671 				    &dynprogindex_minor,&dynprogindex_major,path,
13672 				    chrnum,chroffset,chrhigh,
13673 #ifdef PMAP
13674 				    queryaaseq_ptr,
13675 #endif
13676 				    queryseq_ptr,queryuc_ptr,querylength,
13677 				    cdna_direction,watsonp,genestrand,jump_late_p,
13678 				    maxpeelback,*defect_rate,pairpool,dynprogL,dynprogM,dynprogR,
13679 #ifndef GSNAP
13680 				    oligoindices_minor,diagpool,cellpool,
13681 #endif
13682 				    last_genomedp5,last_genomedp3,/*finalp*/false,/*simplep*/true);
13683 	debug(printf("  => Result of Pass 5 (introns):\n"));
13684 	debug(Pair_dump_list(pairs,/*zerobasedp*/true));
13685       }
13686 
13687 #ifdef DEBUG8
13688       if (stage3debug == POST_INTRONS) {
13689 	path = List_reverse(pairs);
13690 	return path;
13691       }
13692 #endif
13693 
13694       /* Re-evaluate any small exons inserted by build_dual_breaks */
13695       path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13696 			       pairpool,/*finalp*/false);
13697       pairs = assign_intron_probs(path,cdna_direction,watsonp,chrnum,chroffset,chrhigh,pairpool);
13698       Smooth_reset(pairs);
13699       pairs = Smooth_pairs_by_intronprobs(&badp,pairs,pairpool);
13700 
13701       debug(printf("*** Pass 5b (dir %d): Solve dual introns.  Iteration0 %d, Iteration1 %d\n",
13702 		   cdna_direction,iter0,iter1));
13703       if (badp == false) {
13704 	debug(printf("  no badp, so do nothing\n"));
13705       } else {
13706 	debug(printf("  badp is true, so running build_pairs_dualintrons\n"));
13707 	path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13708 				 pairpool,/*finalp*/false);
13709 	/* XX */
13710 	/* pairs = assign_gap_types(path,cdna_direction,watsonp,queryseq_ptr,
13711 	   chrnum,chroffset,chrhigh,pairpool); */
13712 
13713 	/* Pass 3b: dual introns.  pairs --> pairs */
13714 	pairs = build_pairs_dualintrons(&dynprogindex_major,path,chrnum,chroffset,chrhigh,
13715 					queryseq_ptr,queryuc_ptr,querylength,
13716 					cdna_direction,watsonp,genestrand,jump_late_p,
13717 					maxpeelback,*defect_rate,pairpool,dynprogL,dynprogR,
13718 					last_genomedp5,last_genomedp3);
13719 	debug(printf("  => Result of Pass 5b (dual introns):\n"));
13720 	debug(Pair_dump_list(pairs,/*zerobasedp*/true));
13721 
13722 	/* Trimming needed for path_compute_end5 and path_compute_end3 to add new start and end exons */
13723 	pairs = trim_end5_exons(&indelp,&trim5p,/*ambig_end_length*/0,pairs,dynprogR,chroffset,chrhigh,
13724 				queryseq_ptr,queryuc_ptr,querylength,
13725 				cdna_direction,watsonp,genestrand,jump_late_p,pairpool,*defect_rate);
13726 	path = List_reverse(pairs);
13727 	path = trim_end3_exons(&indelp,&trim3p,/*ambig_end_length_3*/0,path,dynprogL,chroffset,chrhigh,
13728 			       queryseq_ptr,queryuc_ptr,querylength,
13729 			       cdna_direction,watsonp,genestrand,jump_late_p,pairpool,*defect_rate);
13730 	pairs = List_reverse(path);
13731       }
13732 
13733 #ifdef DEBUG8
13734       if (stage3debug == POST_DUAL_INTRONS) {
13735 	path = List_reverse(pairs);
13736 	return path;
13737       }
13738 #endif
13739 
13740       iter1++;
13741       debug(printf("At end of inner loop: iter1 %d, badp %d\n",iter1,badp));
13742     }
13743 
13744 #ifdef DEBUG8
13745     if (stage3debug == POST_CYCLES) {
13746       path = List_reverse(pairs);
13747       return path;
13748     }
13749 #endif
13750 
13751 
13752     /* Pass 99: Fix dual breaks */
13753     /* >>pairs */
13754     debug(printf("\n*** Pass 99 (dir %d): Fix dual breaks.  Iteration0 %d\n",cdna_direction,iter0));
13755     /* pairs = remove_indel_gaps(path); */
13756     path = List_reverse(pairs);
13757 
13758     pairs = build_dual_breaks(&dual_break_p,&dynprogindex_minor,&dynprogindex_major,path,
13759 			      chrnum,chroffset,chrhigh,
13760 #ifdef PMAP
13761 			      queryaaseq_ptr,
13762 #endif
13763 			      queryseq_ptr,queryuc_ptr,querylength,
13764 			      cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
13765 			      dynprogL,dynprogM,dynprogR,last_genomedp5,last_genomedp3,maxpeelback,
13766 #ifndef GSNAP
13767 			      oligoindices_minor,diagpool,cellpool,
13768 #endif
13769 			      *defect_rate,/*finalp*/false,/*simplep*/true);
13770 
13771 
13772 #if 0
13773     path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13774 			     pairpool,/*finalp*/false);
13775     debug(Pair_dump_list(path,/*zerobasedp*/true));
13776     pairs = List_reverse(path);
13777     debug14(printf("Result of build_dual_breaks\n"));
13778     debug14(Pair_dump_list(pairs,true));
13779     debug(printf("Result of build_dual_breaks\n"));
13780     debug(Pair_dump_list(pairs,true));
13781 #endif
13782 
13783 #ifdef GSNAP
13784     /* Too expensive to loop */
13785     dual_break_p = false;
13786     /* filterp = false; */
13787 #endif
13788     iter0++;
13789     debug(printf("At end of outer loop: dual_break_p %d\n",dual_break_p));
13790   }
13791 
13792   path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13793 			   pairpool,/*finalp*/false);
13794 
13795   return path;
13796 }
13797 
13798 
13799 #ifdef STRICT
13800 /* Written for pairs, but works for path also */
13801 static List_T
remove_dual_breaks(List_T pairs)13802 remove_dual_breaks (List_T pairs) {
13803   int npairs5, npairs3;
13804   List_T path;
13805   bool donep;
13806 
13807 
13808   donep = false;
13809   while (donep == false) {
13810     if (pairs == NULL) {
13811       debug(printf("pairs is NULL\n"));
13812       donep = true;
13813     } else if (dual_break_p(pairs) == false) {
13814       debug(printf("dual break not found\n"));
13815       donep = true;
13816     } else {
13817       dual_break_count_npairs(&npairs5,&npairs3,pairs);
13818       debug(printf("npairs5 %d, npairs3 %d, total npairs %d\n",
13819 		   npairs5,npairs3,List_length(pairs)));
13820       if (npairs5 < npairs3) {
13821 	/* Remove from 5' end */
13822 	debug(printf("trimming dual break from 5' end, %d pairs\n",npairs5));
13823 	pairs = trim_npairs(pairs,npairs5 + 1 /*for the gap*/);
13824       } else {
13825 	/* Remove from 3' end */
13826 	path = List_reverse(pairs);
13827 	path = trim_npairs(path,npairs3 + 1 /*for the gap*/);
13828 	pairs = List_reverse(path);
13829       }
13830       debug(printf("Now alignment length is %d\n",List_length(pairs)));
13831     }
13832   }
13833 
13834   return pairs;
13835 }
13836 #endif
13837 
13838 
13839 /* Written for pairs, but works for path also */
13840 static List_T
remove_negative_breaks(List_T pairs)13841 remove_negative_breaks (List_T pairs) {
13842   int npairs5, npairs3;
13843   List_T path;
13844   bool donep;
13845 
13846 
13847   donep = false;
13848   while (donep == false) {
13849     if (pairs == NULL) {
13850       debug(printf("pairs is NULL\n"));
13851       donep = true;
13852     } else if (negative_break_p(pairs) == false) {
13853       debug(printf("negative break not found\n"));
13854       donep = true;
13855     } else {
13856       negative_break_count_npairs(&npairs5,&npairs3,pairs);
13857       debug(printf("npairs5 %d, npairs3 %d, total npairs %d\n",
13858 		   npairs5,npairs3,List_length(pairs)));
13859       if (npairs5 < npairs3) {
13860 	/* Remove from 5' end */
13861 	debug(printf("trimming negative break from 5' end, %d pairs\n",npairs5));
13862 	pairs = trim_npairs(pairs,npairs5 + 1 /*for the gap*/);
13863       } else {
13864 	/* Remove from 3' end */
13865 	path = List_reverse(pairs);
13866 	path = trim_npairs(path,npairs3 + 1 /*for the gap*/);
13867 	pairs = List_reverse(path);
13868       }
13869       debug(printf("Now alignment length is %d\n",List_length(pairs)));
13870     }
13871   }
13872 
13873   return pairs;
13874 }
13875 
13876 
13877 static List_T
path_compute_end5(int * ambig_end_length_5,Splicetype_T * ambig_splicetype_5,double * ambig_prob_5,double defect_rate,List_T pairs,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,char * queryseq_ptr,char * queryuc_ptr,int querylength,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,int maxpeelback,Pairpool_T pairpool,Dynprog_T dynprogR)13878 path_compute_end5 (int *ambig_end_length_5, Splicetype_T *ambig_splicetype_5, double *ambig_prob_5,
13879 		   double defect_rate, List_T pairs, int cdna_direction,
13880 		   bool watsonp, int genestrand, bool jump_late_p,
13881 		   char *queryseq_ptr, char *queryuc_ptr, int querylength,
13882 		   Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
13883 		   Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
13884 		   int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogR) {
13885   List_T path = NULL;
13886   int iter1;
13887   int dynprogindex_minor = DYNPROGINDEX_MINOR;
13888   int nmatches, nunknowns, nmismatches, qopens, qindels, topens, tindels,
13889     ncanonical, nsemicanonical, nnoncanonical;
13890   double min_splice_prob;
13891   bool knownsplice5p, chop_exon_p;
13892   bool trim5p, indelp;
13893 
13894 #if 0
13895   int distance5, totaljump5;
13896   int npairs5;
13897 
13898   Pair_T start;
13899   int querypos;
13900   Chrpos_T chrstart, chrend;
13901   List_T all_stage2_starts;
13902 #endif
13903 
13904 
13905   *ambig_end_length_5 = 0;
13906   *ambig_prob_5 = 0.0;
13907 
13908   /* Pass 7: Remove dual or negative breaks */
13909   /* >>pairs */
13910   debug(printf("\n*** Pass 7a (dir %d): Remove dual breaks at ends.  Initially alignment length is %d\n",
13911 	       cdna_direction,List_length(path)));
13912 #ifdef STRICT
13913   pairs = remove_dual_breaks(pairs);
13914 #else
13915   pairs = remove_negative_breaks(pairs);
13916 #endif
13917   debug(printf("Final alignment length is %d\n",List_length(pairs)));
13918   /* <<pairs */
13919 
13920 
13921   /* Extend to query end, so we get an accurate count of matches and mismatches */
13922   /* This is the first extension */
13923   /* >>pairs */
13924   debug(printf("\n*** Pass 8 (dir %d): Extend to 5' end and determine distalmedial\n",cdna_direction));
13925   pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
13926 			   &chop_exon_p,&dynprogindex_minor,pairs,
13927 			   chroffset,chrhigh,
13928 			   knownsplice_limit_low,knownsplice_limit_high,
13929 			   queryseq_ptr,queryuc_ptr,
13930 			   cdna_direction,watsonp,genestrand,jump_late_p,
13931 			   maxpeelback,defect_rate,pairpool,dynprogR,
13932 			   /*extendp*/true,/*endalign*/QUERYEND_GAP,/*forcep*/false);
13933 
13934   /* Necessary to insert gaps and assign gap types (fills in cDNA
13935      insertions, so they don't get trimmed), in case an insertion was
13936      introduced at ends */
13937   path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
13938 			   pairpool,/*finalp*/false);
13939   pairs = assign_gap_types(path,cdna_direction,watsonp,queryseq_ptr,
13940 			   chrnum,chroffset,chrhigh,pairpool);
13941 
13942   Pair_fracidentity(&nmatches,&nunknowns,&nmismatches,&qopens,&qindels,&topens,&tindels,
13943 		    &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
13944 		    pairs,cdna_direction);
13945   if (ncanonical > 0 && nnoncanonical > 0) {
13946     /* Pass 10: Remove noncanonical end exons: pairs -> pairs */
13947     debug(printf("\n*** Pass 10 (dir %d): Remove noncanonical end exons\n",cdna_direction));
13948 
13949     if (maximize_coverage_p == true) {
13950       trim5p = false;
13951     } else if (knownsplice5p == true) {
13952       /* Don't trim at known splice sites */
13953       trim5p = false;
13954     } else {
13955       trim5p = true;
13956     }
13957     debug(printf("trim5p = %d\n",trim5p));
13958 
13959     /* Using iter1 to avoid the possibility of an infinite loop */
13960     iter1 = 0;
13961     while (iter1 < 5 && trim5p == true) {
13962       pairs = trim_end5_exons(&indelp,&trim5p,*ambig_end_length_5,pairs,dynprogR,chroffset,chrhigh,
13963 			      queryseq_ptr,queryuc_ptr,querylength,
13964 			      cdna_direction,watsonp,genestrand,jump_late_p,pairpool,defect_rate);
13965       if (indelp == true) {
13966 	pairs = trim_end5_indels(pairs,*ambig_end_length_5,dynprogR,chroffset,chrhigh,
13967 				 queryseq_ptr,queryuc_ptr,
13968 				 watsonp,genestrand,jump_late_p,pairpool,defect_rate);
13969       }
13970       if (trim5p == true) {
13971 	pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
13972 				 &chop_exon_p,&dynprogindex_minor,pairs,
13973 				 chroffset,chrhigh,
13974 				 knownsplice_limit_low,knownsplice_limit_high,
13975 				 queryseq_ptr,queryuc_ptr,
13976 				 cdna_direction,watsonp,genestrand,jump_late_p,
13977 				 maxpeelback,defect_rate,pairpool,dynprogR,/*extendp*/true,
13978 				 /*endalign*/BEST_LOCAL,/*forcep*/false);
13979 	debug3(printf("AFTER 5' REBUILD\n"));
13980 	debug3(Pair_dump_list(pairs,true));
13981       }
13982 
13983       /* Stop trimming at known splice sites */
13984       if (knownsplice5p == true) {
13985 	trim5p = false;
13986       }
13987       iter1++;
13988     }
13989 
13990 #if 0
13991     pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
13992 			     &chop_exon_p,&dynprogindex_minor,pairs,
13993 			     chroffset,chrhigh,
13994 			     knownsplice_limit_low,knownsplice_limit_high,
13995 			     queryseq_ptr,queryuc_ptr,
13996 			     cdna_direction,watsonp,genestrand,jump_late_p,
13997 			     maxpeelback,defect_rate,pairpool,dynprogR,/*extendp*/true,
13998 			     /*endalign*/QUERYEND_INDELS,/*forcep*/false);
13999 #endif
14000   }
14001 
14002 
14003 #if 0
14004   debug(printf("Pass 11 (dir %d): Final extension, end5\n",cdna_direction));
14005   /* This is the second extension */
14006   /* Perform final extension without gaps so we can compare fwd and rev properly */
14007   pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
14008 			   &chop_exon_p,&dynprogindex_minor,pairs,
14009 			   chroffset,chrhigh,
14010 			   knownsplice_limit_low,knownsplice_limit_high,
14011 			   queryseq_ptr,queryuc_ptr,
14012 			   cdna_direction,watsonp,genestrand,jump_late_p,
14013 			   maxpeelback,defect_rate,pairpool,dynprogR,
14014 			   /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/false);
14015 #endif
14016 
14017   debug(Pair_dump_list(pairs,true));
14018   debug(printf("End of path_compute_end5\n"));
14019 
14020   return pairs;
14021 }
14022 
14023 
14024 static List_T
path_compute_end3(int * ambig_end_length_3,Splicetype_T * ambig_splicetype_3,double * ambig_prob_3,double defect_rate,List_T path,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int querylength,char * queryseq_ptr,char * queryuc_ptr,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,int maxpeelback,Pairpool_T pairpool,Dynprog_T dynprogL)14025 path_compute_end3 (int *ambig_end_length_3, Splicetype_T *ambig_splicetype_3, double *ambig_prob_3,
14026 		   double defect_rate, List_T path, int cdna_direction,
14027 		   bool watsonp, int genestrand, bool jump_late_p, int querylength,
14028 		   char *queryseq_ptr, char *queryuc_ptr,
14029 		   Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
14030 		   Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
14031 		   int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogL) {
14032   List_T pairs = NULL;
14033   int iter1;
14034   int dynprogindex_minor = DYNPROGINDEX_MINOR;
14035   int nmatches, nunknowns, nmismatches, qopens, qindels, topens, tindels,
14036     ncanonical, nsemicanonical, nnoncanonical;
14037   double min_splice_prob;
14038   bool knownsplice3p, chop_exon_p;
14039   bool trim3p, indelp;
14040 
14041 #if 0
14042   int distance3, totaljump3;
14043   int npairs3;
14044 
14045   Pair_T end;
14046   int querypos;
14047   Chrpos_T chrstart, chrend;
14048   List_T all_stage2_ends;
14049 #endif
14050 
14051 
14052   *ambig_end_length_3 = 0;
14053   *ambig_prob_3 = 0.0;
14054 
14055   /* Pass 7: Remove dual or negative breaks */
14056   /* >>path */
14057   debug(printf("\n*** Pass 7b (dir %d): Remove dual breaks at ends.  Initially alignment length is %d\n",
14058 	       cdna_direction,List_length(path)));
14059 #ifdef STRICT
14060   path = remove_dual_breaks(path);
14061 #else
14062   path = remove_negative_breaks(path);
14063 #endif
14064   debug(printf("Final alignment length is %d\n",List_length(path)));
14065   /* <<path */
14066 
14067 
14068   /* Extend to ends */
14069   /* This is the first extension */
14070   /* >>path */
14071   debug(printf("\n*** Pass 8 (dir %d): Extend to 3' end and determine distalmedial\n",cdna_direction));
14072   path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
14073 			 &chop_exon_p,&dynprogindex_minor,path,
14074 			 chroffset,chrhigh,querylength,
14075 			 knownsplice_limit_low,knownsplice_limit_high,
14076 			 queryseq_ptr,queryuc_ptr,
14077 			 cdna_direction,watsonp,genestrand,jump_late_p,
14078 			 maxpeelback,defect_rate,pairpool,dynprogL,
14079 			 /*extendp*/true,/*endalign*/QUERYEND_GAP,/*forcep*/false);
14080 
14081   /* Necessary to insert gaps and assign gap types (fills in cDNA
14082      insertions, so they don't get trimmed), in case an insertion was
14083      introduced at ends */
14084   pairs = List_reverse(path);
14085   path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
14086 			   pairpool,/*finalp*/false);
14087   pairs = assign_gap_types(path,cdna_direction,watsonp,queryseq_ptr,
14088 			   chrnum,chroffset,chrhigh,pairpool);
14089   path = List_reverse(pairs);
14090 
14091   Pair_fracidentity(&nmatches,&nunknowns,&nmismatches,&qopens,&qindels,&topens,&tindels,
14092 		    &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
14093 		    pairs,cdna_direction);
14094   if (ncanonical > 0 && nnoncanonical > 0) {
14095     /* Pass 10: Remove noncanonical end exons: pairs -> pairs */
14096     debug(printf("\n*** Pass 10 (dir %d): Remove noncanonical end exons\n",cdna_direction));
14097 
14098     if (maximize_coverage_p == true) {
14099       trim3p = false;
14100     } else if (knownsplice3p == true) {
14101       trim3p = false;
14102     } else {
14103       trim3p = true;
14104     }
14105     debug(printf("trim3p = %d\n",trim3p));
14106 
14107     /* Using iter1 to avoid the possibility of an infinite loop */
14108     iter1 = 0;
14109     while (iter1 < 5 && trim3p == true) {
14110       path = trim_end3_exons(&indelp,&trim3p,*ambig_end_length_3,path,dynprogL,chroffset,chrhigh,
14111 			     queryseq_ptr,queryuc_ptr,querylength,
14112 			     cdna_direction,watsonp,genestrand,jump_late_p,pairpool,defect_rate);
14113       if (indelp == true) {
14114 	path = trim_end3_indels(path,*ambig_end_length_3,dynprogL,chroffset,chrhigh,
14115 				queryseq_ptr,queryuc_ptr,querylength,
14116 				watsonp,genestrand,jump_late_p,pairpool,defect_rate);
14117       }
14118       if (trim3p == true) {
14119 	path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
14120 			       &chop_exon_p,&dynprogindex_minor,path,
14121 			       chroffset,chrhigh,querylength,
14122 			       knownsplice_limit_low,knownsplice_limit_high,
14123 			       queryseq_ptr,queryuc_ptr,
14124 			       cdna_direction,watsonp,genestrand,jump_late_p,
14125 			       maxpeelback,defect_rate,pairpool,dynprogL,/*extendp*/true,
14126 			       /*endalign*/BEST_LOCAL,/*forcep*/false);
14127 	debug3(printf("AFTER 3' REBUILD\n"));
14128 	debug3(Pair_dump_list(path,true));
14129       }
14130 
14131       if (knownsplice3p == true) {
14132 	trim3p = false;
14133       }
14134       iter1++;
14135     }
14136 
14137 #if 0
14138     path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
14139 			   &chop_exon_p,&dynprogindex_minor,path,
14140 			   chroffset,chrhigh,querylength,
14141 			   knownsplice_limit_low,knownsplice_limit_high,
14142 			   queryseq_ptr,queryuc_ptr,
14143 			   cdna_direction,watsonp,genestrand,jump_late_p,
14144 			   maxpeelback,defect_rate,pairpool,dynprogL,/*extendp*/true,
14145 			   /*endalign*/QUERYEND_NOGAPS,/*forcep*/false);
14146 #endif
14147   }
14148 
14149 
14150 #if 0
14151   debug(printf("Pass 11 (dir %d): Final extension, end3\n",cdna_direction));
14152   /* This is the second extension */
14153   /* Perform final extension without gaps so we can compare fwd and rev properly */
14154   path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
14155 			 &chop_exon_p,&dynprogindex_minor,path,
14156 			 chroffset,chrhigh,querylength,
14157 			 knownsplice_limit_low,knownsplice_limit_high,
14158 			 queryseq_ptr,queryuc_ptr,
14159 			 cdna_direction,watsonp,genestrand,jump_late_p,
14160 			 maxpeelback,defect_rate,pairpool,dynprogL,
14161 			 /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/false);
14162 #endif
14163 
14164   debug(Pair_dump_list(path,true));
14165   debug(printf("End of path_compute_end3\n"));
14166   return path;
14167 }
14168 
14169 
14170 static List_T
path_compute_final(double defect_rate,List_T pairs,int cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int querylength,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Chrpos_T * last_genomedp5,Chrpos_T * last_genomedp3)14171 path_compute_final (double defect_rate, List_T pairs, int cdna_direction, bool watsonp, int genestrand,
14172 		    bool jump_late_p, int querylength,
14173 #ifdef PMAP
14174 		    char *queryaaseq_ptr,
14175 #endif
14176 		    char *queryseq_ptr, char *queryuc_ptr,
14177 		    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
14178 		    int maxpeelback,
14179 #ifndef GSNAP
14180 		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
14181 #endif
14182 		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
14183 		    Chrpos_T *last_genomedp5, Chrpos_T *last_genomedp3) {
14184   List_T path = NULL;
14185   int dynprogindex_minor = DYNPROGINDEX_MINOR, dynprogindex_major = DYNPROGINDEX_MAJOR;
14186   bool dual_break_p = true;
14187   bool shiftp, incompletep;
14188 
14189   /* List_T all_stage2_starts, all_stage2_ends; */
14190   /* Univcoord_T position; */
14191 
14192 
14193   debug(printf("Entering path_compute_final\n"));
14194 
14195 #if 0
14196   if (circularp[chrnum] == true) {
14197     overall_end_distance = overall_end_distance_circular;
14198   } else {
14199     overall_end_distance = overall_end_distance_linear;
14200   }
14201 #endif
14202 
14203   path = List_reverse(pairs);
14204   pairs = build_pairs_singles(&dynprogindex_minor,path,/*maxsize*/nullgap,
14205 			      chroffset,chrhigh,queryseq_ptr,queryuc_ptr,querylength,watsonp,genestrand,
14206 			      jump_late_p,maxpeelback,defect_rate,pairpool,dynprogM,
14207 			      last_genomedp5,last_genomedp3,/*forcep*/true,/*finalp*/true);
14208 
14209 #if 1
14210   /* Okay to use finalp == true, as long as Dynprog_genome_gap is called with finalp == false */
14211   debug(printf("\n*** Pass 999 (dir %d): Final pass to find canonical introns\n",cdna_direction));
14212   debug(Pair_dump_list(pairs,true));
14213   path = List_reverse(pairs);	/* ? insert_gapholders() */
14214   pairs = build_pairs_introns(&shiftp,&incompletep,
14215 			      &dynprogindex_minor,&dynprogindex_major,path,
14216 			      chrnum,chroffset,chrhigh,
14217 #ifdef PMAP
14218 			      queryaaseq_ptr,
14219 #endif
14220 			      queryseq_ptr,queryuc_ptr,querylength,
14221 			      cdna_direction,watsonp,genestrand,jump_late_p,
14222 			      maxpeelback,defect_rate,pairpool,dynprogL,dynprogM,dynprogR,
14223 #ifndef GSNAP
14224 			      oligoindices_minor,diagpool,cellpool,
14225 #endif
14226 			      last_genomedp5,last_genomedp3,/*finalp*/true,/*simplep*/true);
14227 #endif
14228 
14229   path = List_reverse(pairs);
14230   pairs = build_dual_breaks(&dual_break_p,&dynprogindex_minor,&dynprogindex_major,path,
14231 			    chrnum,chroffset,chrhigh,
14232 #ifdef PMAP
14233 			    queryaaseq_ptr,
14234 #endif
14235 			    queryseq_ptr,queryuc_ptr,querylength,
14236 			    cdna_direction,watsonp,genestrand,jump_late_p,pairpool,
14237 			    dynprogL,dynprogM,dynprogR,last_genomedp5,last_genomedp3,maxpeelback,
14238 #ifndef GSNAP
14239 			    oligoindices_minor,diagpool,cellpool,
14240 #endif
14241 			    defect_rate,/*finalp*/true,/*simplep*/true);
14242 
14243   path = insert_gapholders(pairs,queryseq_ptr,queryuc_ptr,chroffset,chrhigh,watsonp,genestrand,
14244 			   pairpool,/*finalp*/true);
14245   pairs = assign_gap_types(path,cdna_direction,watsonp,queryseq_ptr,
14246 			   chrnum,chroffset,chrhigh,pairpool);
14247 
14248   return pairs;
14249 }
14250 
14251 
14252 
14253 #ifdef GSNAP
14254 /* This file no longer gets included by GSNAP */
14255 /* I believe this function never gets called with SENSE_NULL */
14256 static List_T
trim_novel_spliceends(int * new_sensedir,List_T pairs,int * ambig_end_length_5,int * ambig_end_length_3,Splicetype_T * ambig_splicetype_5,Splicetype_T * ambig_splicetype_3,double * ambig_prob_5,double * ambig_prob_3,int orig_sensedir,bool watsonp,int querylength,Univcoord_T chroffset,Univcoord_T chrhigh,bool knownsplice5p,bool knownsplice3p)14257 trim_novel_spliceends (int *new_sensedir, List_T pairs,
14258 		       int *ambig_end_length_5, int *ambig_end_length_3,
14259 		       Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
14260 		       double *ambig_prob_5, double *ambig_prob_3,
14261 		       int orig_sensedir, bool watsonp, int querylength,
14262 		       Univcoord_T chroffset, Univcoord_T chrhigh,
14263 		       bool knownsplice5p, bool knownsplice3p) {
14264   List_T path, p;
14265   int i;
14266 
14267   Pair_T pair;
14268   int trim5, trim3, exondist;
14269   Univcoord_T genomicpos, start_genomicpos, middle_genomicpos, end_genomicpos;
14270   Univcoord_T splice_genomepos_5, splice_genomepos_3, splice_genomepos_5_mm, splice_genomepos_3_mm;
14271   Univcoord_T start, middle, end; /* start to middle has mismatches, while middle to end has none */
14272   double donor_prob, acceptor_prob;
14273   double max_prob_5 = 0.0, max_prob_3 = 0.0,
14274     max_prob_sense_forward_5 = 0.0, max_prob_sense_anti_5 = 0.0,
14275     max_prob_sense_forward_3 = 0.0, max_prob_sense_anti_3 = 0.0;
14276   double max_prob_5_mm = 0.0, max_prob_3_mm = 0.0,
14277     max_prob_sense_forward_5_mm = 0.0, max_prob_sense_anti_5_mm = 0.0,
14278     max_prob_sense_forward_3_mm = 0.0, max_prob_sense_anti_3_mm = 0.0;
14279   Splicetype_T splicetype5, splicetype3, splicetype5_mm, splicetype3_mm;
14280   /* int splice_cdna_direction_5, splice_cdna_direction_3; */
14281   int splice_sensedir_5, splice_sensedir_3;
14282   /* int splice_cdna_direction_5_mm, splice_cdna_direction_3_mm; */
14283   int splice_sensedir_5_mm, splice_sensedir_3_mm;
14284   int nmismatches, *scorei;
14285   bool mismatchp;
14286 
14287   debug13(printf("\nEntered gmap_trim_novel_spliceends with orig_sensedir %d, ambig_end_lengths %d and %d\n",
14288 		 orig_sensedir,*ambig_end_length_5,*ambig_end_length_3));
14289   *new_sensedir = SENSE_NULL;
14290 
14291   Pair_trim_distances(&trim5,&trim3,pairs);
14292   debug13(printf("Trim distances are %d and %d\n",trim5,trim3));
14293   if (trim5 > trim3) {
14294     scorei = (int *) MALLOC((trim5 + 1) * sizeof(int));
14295   } else {
14296     scorei = (int *) MALLOC((trim3 + 1) * sizeof(int));
14297   }
14298 
14299 
14300   path = List_reverse(pairs);
14301   if (path != NULL && knownsplice3p == false && *ambig_end_length_3 == 0
14302       /* && exon_length_3(path) >= END_SPLICESITE_EXON_LENGTH */) {
14303     /* See if there is a good splice site at the 3' end */
14304     /* debug13(Pair_dump_list(path,true)); */
14305 
14306     pair = (Pair_T) List_head(p = path);
14307     start = middle = end = pair->genomepos;
14308     debug13(printf("Initializing start and end to be %u\n",start));
14309 
14310     if (pair->querypos != querylength - 1) {
14311       mismatchp = true;
14312     } else {
14313       mismatchp = false;
14314     }
14315 
14316     i = 0;
14317     nmismatches = 0;
14318     while (i < trim3) {
14319       if ((p = List_next(p)) == NULL) {
14320 	break;
14321       } else if (pair->gapp == true) {
14322 	break;
14323       } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
14324 	middle = pair->genomepos;
14325 	scorei[i] = nmismatches;
14326 	debug13(printf("Resetting middle to be %u\n",middle));
14327       } else {
14328 	middle = pair->genomepos;
14329 	scorei[i] = ++nmismatches;
14330 	mismatchp = true;
14331 	debug13(printf("Resetting middle to be %u\n",middle));
14332       }
14333       pair = (Pair_T) List_head(p);
14334       i++;
14335     }
14336     scorei[i] = ++nmismatches;
14337 
14338     while (i < trim3 + END_SPLICESITE_SEARCH) {
14339       if ((p = List_next(p)) == NULL) {
14340 	break;
14341       } else if (pair->gapp == true) {
14342 	break;
14343       } else {
14344 	end = pair->genomepos;
14345 	debug13(printf("Resetting end to be %u\n",end));
14346       }
14347       pair = (Pair_T) List_head(p);
14348       i++;
14349     }
14350 
14351     /* Find distance from end to intron, if any */
14352     exondist = 0;
14353     while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
14354 	   exondist < END_MIN_EXONLENGTH) {
14355       p = List_next(p);
14356       exondist++;
14357     }
14358     debug13(printf("exondist is %d\n",exondist));
14359 
14360     if (mismatchp == false) {
14361       /* Allow perfect overhangs into intron */
14362       /* Note: pairs may not extend all the way to the end, which is why we look at end pair to initialize mismatchp */
14363       debug13(printf("Allowing perfect overhang into potential intron\n"));
14364 
14365     } else if (orig_sensedir == SENSE_FORWARD) {
14366       if (watsonp) {
14367 	splicetype3 = splicetype3_mm = DONOR;
14368 
14369 	start_genomicpos = start + 1;
14370 	middle_genomicpos = middle + 1;
14371 	end_genomicpos = end + 1;
14372 
14373 	/* assert(start_genomicpos >= end_genomicpos); */
14374 	genomicpos = start_genomicpos;
14375 	i = 0;
14376 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14377 	  debug13(printf("3', watson, sense anti %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14378 	  genomicpos--;
14379 	  i++;
14380 	}
14381 	while (genomicpos >= middle_genomicpos &&
14382 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14383 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
14384 	  debug13(printf("3', watson, sense anti %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,nmismatches - scorei[i]));
14385 	  if (donor_prob > max_prob_3_mm) {
14386 	    max_prob_3_mm = donor_prob;
14387 	    splice_genomepos_3_mm = genomicpos - 1;
14388 	  }
14389 	  genomicpos--;
14390 	  debug13(i++);
14391 	}
14392 	while (genomicpos >= end_genomicpos &&
14393 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14394 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
14395 	  debug13(printf("3', watson, sense anti %u %u %f\n",chroffset+genomicpos,genomicpos,donor_prob));
14396 	  if (donor_prob > max_prob_3) {
14397 	    max_prob_3 = donor_prob;
14398 	    splice_genomepos_3 = genomicpos - 1;
14399 	  }
14400 	  genomicpos--;
14401 	}
14402 	debug13(printf("\n"));
14403 
14404       } else {
14405 	splicetype3 = splicetype3_mm = ANTIDONOR;
14406 
14407 	start_genomicpos = (start > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - start;
14408 	middle_genomicpos = (middle > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - middle;
14409 	end_genomicpos = (end > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - end;
14410 
14411 	/* assert(start_genomicpos <= end_genomicpos); */
14412 	genomicpos = start_genomicpos;
14413 	i = 0;
14414 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14415 	  debug13(printf("3', crick, sense forward %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14416 	  genomicpos++;
14417 	  i++;
14418 	}
14419 	while (genomicpos <= middle_genomicpos &&
14420 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14421 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
14422 	  debug13(printf("3', crick, sense forward %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,nmismatches - scorei[i]));
14423 	  if (donor_prob > max_prob_3_mm) {
14424 	    max_prob_3_mm = donor_prob;
14425 	    splice_genomepos_3_mm = (chrhigh - chroffset) - genomicpos;
14426 	  }
14427 	  genomicpos++;
14428 	  debug13(i++);
14429 	}
14430 	while (genomicpos <= end_genomicpos &&
14431 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14432 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
14433 	  debug13(printf("3', crick, sense forward %u %u %f\n",chroffset+genomicpos,genomicpos,donor_prob));
14434 	  if (donor_prob > max_prob_3) {
14435 	    max_prob_3 = donor_prob;
14436 	    splice_genomepos_3 = (chrhigh - chroffset) - genomicpos;
14437 	  }
14438 	  genomicpos++;
14439 	}
14440 	debug13(printf("\n"));
14441       }
14442 
14443     } else if (orig_sensedir == SENSE_ANTI) {
14444       if (watsonp) {
14445 	splicetype3 = splicetype3_mm = ANTIACCEPTOR;
14446 
14447 	start_genomicpos = start + 1;
14448 	middle_genomicpos = middle + 1;
14449 	end_genomicpos = end + 1;
14450 
14451 	/* assert(start_genomicpos >= end_genomicpos); */
14452 	genomicpos = start_genomicpos;
14453 	i = 0;
14454 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14455 	  debug13(printf("3', watson, sense forward %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14456 	  genomicpos--;
14457 	  i++;
14458 	}
14459 	while (genomicpos >= middle_genomicpos &&
14460 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14461 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
14462 	  debug13(printf("3', watson, sense forward %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,acceptor_prob,nmismatches - scorei[i]));
14463 	  if (acceptor_prob > max_prob_3_mm) {
14464 	    max_prob_3_mm = acceptor_prob;
14465 	    splice_genomepos_3_mm = genomicpos - 1;
14466 	  }
14467 	  genomicpos--;
14468 	  debug13(i++);
14469 	}
14470 	while (genomicpos >= end_genomicpos &&
14471 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14472 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
14473 	  debug13(printf("3', watson, sense forward %u %u %f\n",chroffset+genomicpos,genomicpos,acceptor_prob));
14474 	  if (acceptor_prob > max_prob_3) {
14475 	    max_prob_3 = acceptor_prob;
14476 	    splice_genomepos_3 = genomicpos - 1;
14477 	  }
14478 	  genomicpos--;
14479 	}
14480 	debug13(printf("\n"));
14481 
14482       } else {
14483 	splicetype3 = splicetype3_mm = ACCEPTOR;
14484 
14485 	start_genomicpos = (start > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - start;
14486 	middle_genomicpos = (middle > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - middle;
14487 	end_genomicpos = (end > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - end;
14488 
14489 	/* assert(start_genomicpos <= end_genomicpos); */
14490 	genomicpos = start_genomicpos;
14491 	i = 0;
14492 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14493 	  debug13(printf("3', crick, sense anti %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14494 	  genomicpos++;
14495 	  i++;
14496 	}
14497 	while (genomicpos <= middle_genomicpos &&
14498 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14499 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
14500 	  debug13(printf("3', crick, sense anti %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,acceptor_prob,nmismatches - scorei[i]));
14501 	  if (acceptor_prob > max_prob_3_mm) {
14502 	    max_prob_3_mm = acceptor_prob;
14503 	    splice_genomepos_3_mm = (chrhigh - chroffset) - genomicpos;
14504 	  }
14505 	  genomicpos++;
14506 	  debug13(i++);
14507 	}
14508 	while (genomicpos <= end_genomicpos &&
14509 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14510 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
14511 	  debug13(printf("3', crick, sense anti %u %u %f\n",chroffset+genomicpos,genomicpos,acceptor_prob));
14512 	  if (acceptor_prob > max_prob_3) {
14513 	    max_prob_3 = acceptor_prob;
14514 	    splice_genomepos_3 = (chrhigh - chroffset) - genomicpos;
14515 	  }
14516 	  genomicpos++;
14517 	}
14518 	debug13(printf("\n"));
14519       }
14520 
14521     } else {
14522       if (watsonp) {
14523 	start_genomicpos = start + 1;
14524 	middle_genomicpos = middle + 1;
14525 	end_genomicpos = end + 1;
14526 
14527 	/* assert(start_genomicpos >= end_genomicpos); */
14528 	genomicpos = start_genomicpos;
14529 	i = 0;
14530 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14531 	  debug13(printf("3', watson, sense null %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14532 	  genomicpos--;
14533 	  i++;
14534 	}
14535 	while (genomicpos >= middle_genomicpos &&
14536 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14537 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
14538 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
14539 	  debug13(printf("3', watson, sense null %u %u %f %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob,nmismatches - scorei[i]));
14540 	  if (donor_prob > max_prob_sense_forward_3_mm) {
14541 	    max_prob_sense_forward_3_mm = donor_prob;
14542 	    if (donor_prob > max_prob_3_mm) {
14543 	      max_prob_3_mm = donor_prob;
14544 	      splice_genomepos_3_mm = genomicpos - 1;
14545 	      /* splice_cdna_direction_3_mm = +1; */
14546 	      splice_sensedir_3_mm = SENSE_FORWARD;
14547 	      splicetype3_mm = DONOR;
14548 	    }
14549 	  }
14550 	  if (acceptor_prob > max_prob_sense_anti_3_mm) {
14551 	    max_prob_sense_anti_3_mm = acceptor_prob;
14552 	    if (acceptor_prob > max_prob_3_mm) {
14553 	      max_prob_3_mm = acceptor_prob;
14554 	      splice_genomepos_3_mm = genomicpos - 1;
14555 	      /* splice_cdna_direction_3_mm = -1; */
14556 	      splice_sensedir_3_mm = SENSE_ANTI;
14557 	      splicetype3_mm = ANTIACCEPTOR;
14558 	    }
14559 	  }
14560 	  genomicpos--;
14561 	  debug13(i++);
14562 	}
14563 	while (genomicpos >= end_genomicpos &&
14564 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14565 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 1 */
14566 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 5 */
14567 	  debug13(printf("3', watson, sense null %u %u %f %f\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob));
14568 	  if (donor_prob > max_prob_sense_forward_3) {
14569 	    max_prob_sense_forward_3 = donor_prob;
14570 	    if (donor_prob > max_prob_3) {
14571 	      max_prob_3 = donor_prob;
14572 	      splice_genomepos_3 = genomicpos - 1;
14573 	      /* splice_cdna_direction_3 = +1; */
14574 	      splice_sensedir_3 = SENSE_FORWARD;
14575 	      splicetype3 = DONOR;
14576 	    }
14577 	  }
14578 	  if (acceptor_prob > max_prob_sense_anti_3) {
14579 	    max_prob_sense_anti_3 = acceptor_prob;
14580 	    if (acceptor_prob > max_prob_3) {
14581 	      max_prob_3 = acceptor_prob;
14582 	      splice_genomepos_3 = genomicpos - 1;
14583 	      /* splice_cdna_direction_3 = -1; */
14584 	      splice_sensedir_3 = SENSE_ANTI;
14585 	      splicetype3 = ANTIACCEPTOR;
14586 	    }
14587 	  }
14588 	  genomicpos--;
14589 	}
14590 	debug13(printf("\n"));
14591 
14592       } else {
14593 	start_genomicpos = (start > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - start;
14594 	middle_genomicpos = (middle > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - middle;
14595 	end_genomicpos = (end > chrhigh - chroffset) ? 0 : (chrhigh - chroffset) - end;
14596 
14597 	/* assert(start_genomicpos <= end_genomicpos); */
14598 	genomicpos = start_genomicpos;
14599 	i = 0;
14600 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14601 	  debug13(printf("3', crick, sense null %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14602 	  genomicpos++;
14603 	  i++;
14604 	}
14605 	while (genomicpos <= middle_genomicpos &&
14606 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14607 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
14608 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
14609 	  debug13(printf("3', crick, sense null %u %u %f %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob,nmismatches - scorei[i]));
14610 	  if (donor_prob > max_prob_sense_forward_3_mm) {
14611 	    max_prob_sense_forward_3_mm = donor_prob;
14612 	    if (donor_prob > max_prob_3_mm) {
14613 	      max_prob_3_mm = donor_prob;
14614 	      splice_genomepos_3_mm = (chrhigh - chroffset) - genomicpos;
14615 	      /* splice_cdna_direction_3_mm = +1; */
14616 	      splice_sensedir_3_mm = SENSE_FORWARD;
14617 	      splicetype3_mm = ANTIDONOR;
14618 	    }
14619 	  }
14620 	  if (acceptor_prob > max_prob_sense_anti_3_mm) {
14621 	    max_prob_sense_anti_3_mm = acceptor_prob;
14622 	    if (acceptor_prob > max_prob_3_mm) {
14623 	      max_prob_3_mm = acceptor_prob;
14624 	      splice_genomepos_3_mm = (chrhigh - chroffset) - genomicpos;
14625 	      /* splice_cdna_direction_3_mm = -1; */
14626 	      splice_sensedir_3_mm = SENSE_ANTI;
14627 	      splicetype3_mm = ACCEPTOR;
14628 	    }
14629 	  }
14630 	  genomicpos++;
14631 	  debug13(i++);
14632 	}
14633 	while (genomicpos <= end_genomicpos &&
14634 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14635 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 3 */
14636 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 7 */
14637 	  debug13(printf("3', crick, sense null %u %u %f %f\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob));
14638 	  if (donor_prob > max_prob_sense_forward_3) {
14639 	    max_prob_sense_forward_3 = donor_prob;
14640 	    if (donor_prob > max_prob_3) {
14641 	      max_prob_3 = donor_prob;
14642 	      splice_genomepos_3 = (chrhigh - chroffset) - genomicpos;
14643 	      /* splice_cdna_direction_3 = +1; */
14644 	      splice_sensedir_3 = SENSE_FORWARD;
14645 	      splicetype3 = ANTIDONOR;
14646 	    }
14647 	  }
14648 	  if (acceptor_prob > max_prob_sense_anti_3) {
14649 	    max_prob_sense_anti_3 = acceptor_prob;
14650 	    if (acceptor_prob > max_prob_3) {
14651 	      max_prob_3 = acceptor_prob;
14652 	      splice_genomepos_3 = (chrhigh - chroffset) - genomicpos;
14653 	      /* splice_cdna_direction_3 = -1; */
14654 	      splice_sensedir_3 = SENSE_ANTI;
14655 	      splicetype3 = ACCEPTOR;
14656 	    }
14657 	  }
14658 	  genomicpos++;
14659 	}
14660 	debug13(printf("\n"));
14661       }
14662     }
14663 
14664     if (orig_sensedir != SENSE_NULL) {
14665       if (max_prob_3 > END_SPLICESITE_PROB_MATCH) {
14666 	debug13(printf("Found good splice %s on 3' end at %u with probability %f\n",
14667 		       Splicetype_string(splicetype3),splice_genomepos_3,max_prob_3));
14668 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
14669 	  path = Pairpool_pop(path,&pair);
14670 	}
14671 	/* path = clean_path_end3(path); -- Gives wrong end */
14672 	if (path != NULL) {
14673 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
14674 	  *ambig_splicetype_3 = splicetype3;
14675 	  *ambig_prob_3 = max_prob_3;
14676 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
14677 	}
14678 
14679       } else if (max_prob_3_mm > END_SPLICESITE_PROB_MISMATCH) {
14680 	debug13(printf("Found good mismatch splice %s on 3' end at %u with probability %f\n",
14681 		       Splicetype_string(splicetype3_mm),splice_genomepos_3_mm,max_prob_3_mm));
14682 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3_mm) {
14683 	  path = Pairpool_pop(path,&pair);
14684 	}
14685 	/* path = clean_path_end3(path); -- Gives wrong end */
14686 	if (path != NULL) {
14687 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
14688 	  *ambig_splicetype_3 = splicetype3_mm;
14689 	  *ambig_prob_3 = max_prob_3_mm;
14690 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
14691 	}
14692       }
14693     }
14694   }
14695 
14696   /* 5' end */
14697 
14698   pairs = List_reverse(path);
14699   if (pairs != NULL && knownsplice5p == false && *ambig_end_length_5 == 0
14700       /* && exon_length_5(pairs) >= END_SPLICESITE_EXON_LENGTH */) {
14701     /* See if there is a good splice site at the 5' end */
14702     /* debug13(Pair_dump_list(pairs,true)); */
14703 
14704     pair = (Pair_T) List_head(p = pairs);
14705     start = middle = end = pair->genomepos;
14706     debug13(printf("Initializing start and end to be %u\n",start));
14707 
14708     if (pair->querypos != 0) {
14709       mismatchp = true;
14710     } else {
14711       mismatchp = false;
14712     }
14713 
14714     i = 0;
14715     nmismatches = 0;
14716     while (i < trim5) {
14717       if ((p = List_next(p)) == NULL) {
14718 	break;
14719       } else if (pair->gapp == true) {
14720 	break;
14721       } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
14722 	middle = pair->genomepos;
14723 	scorei[i] = nmismatches;
14724 	debug13(printf("Resetting middle to be %u\n",middle));
14725       } else {
14726 	middle = pair->genomepos;
14727 	scorei[i] = ++nmismatches;
14728 	mismatchp = true;
14729 	debug13(printf("Resetting middle to be %u\n",middle));
14730       }
14731       pair = (Pair_T) List_head(p);
14732       i++;
14733     }
14734     scorei[i] = nmismatches;
14735 
14736     while (i < trim5 + END_SPLICESITE_SEARCH) {
14737       if ((p = List_next(p)) == NULL) {
14738 	break;
14739       } else if (pair->gapp == true) {
14740 	break;
14741       } else {
14742 	end = pair->genomepos;
14743 	debug13(printf("Resetting end to be %u\n",end));
14744       }
14745       pair = (Pair_T) List_head(p);
14746       i++;
14747     }
14748 
14749     /* Find distance from end to intron, if any */
14750     exondist = 0;
14751     while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
14752 	   exondist < END_MIN_EXONLENGTH) {
14753       p = List_next(p);
14754       exondist++;
14755     }
14756     debug13(printf("exondist is %d\n",exondist));
14757 
14758     if (mismatchp == false) {
14759       /* Allow perfect overhangs into intron */
14760       /* Note: pairs may not extend all the way to the end, which is why we look at end pair to initialize mismatchp */
14761       debug13(printf("Allowing perfect overhang into potential intron\n"));
14762 
14763     } else if (orig_sensedir == SENSE_FORWARD) {
14764       if (watsonp) {
14765 	splicetype5 = splicetype5_mm = ACCEPTOR;
14766 
14767 	start_genomicpos = start;
14768 	middle_genomicpos = middle;
14769 	end_genomicpos = end;
14770 
14771 	/* assert(start_genomicpos <= end_genomicpos); */
14772 	genomicpos = start_genomicpos;
14773 	i = 0;
14774 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14775 	  debug13(printf("5', watson, sense forward %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14776 	  genomicpos++;
14777 	  i++;
14778 	}
14779 	while (genomicpos <= middle_genomicpos &&
14780 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14781 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
14782 	  debug13(printf("5', watson, sense forward %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,acceptor_prob,nmismatches - scorei[i]));
14783 	  if (acceptor_prob > max_prob_5_mm) {
14784 	    max_prob_5_mm = acceptor_prob;
14785 	    splice_genomepos_5_mm = genomicpos;
14786 	  }
14787 	  genomicpos++;
14788 	  debug13(i++);
14789 	}
14790 	while (genomicpos <= end_genomicpos &&
14791 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14792 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
14793 	  debug13(printf("5', watson, sense forward %u %u %f\n",chroffset+genomicpos,genomicpos,acceptor_prob));
14794 	  if (acceptor_prob > max_prob_5) {
14795 	    max_prob_5 = acceptor_prob;
14796 	    splice_genomepos_5 = genomicpos;
14797 	  }
14798 	  genomicpos++;
14799 	}
14800 	debug13(printf("\n"));
14801 
14802       } else {
14803 	splicetype5 = splicetype5_mm = ANTIACCEPTOR;
14804 
14805 	start_genomicpos = (start > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - start + 1;
14806 	middle_genomicpos = (middle > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - middle + 1;
14807 	end_genomicpos = (end > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - end + 1;
14808 
14809 	/* assert(start_genomicpos >= end_genomicpos); */
14810 	genomicpos = start_genomicpos;
14811 	i = 0;
14812 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14813 	  debug13(printf("5', crick, sense anti %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14814 	  genomicpos--;
14815 	  i++;
14816 	}
14817 	while (genomicpos >= middle_genomicpos &&
14818 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14819 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
14820 	  debug13(printf("5', crick, sense anti %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,acceptor_prob,nmismatches - scorei[i]));
14821 	  if (acceptor_prob > max_prob_5_mm) {
14822 	    max_prob_5_mm = acceptor_prob;
14823 	    splice_genomepos_5_mm = (chrhigh - chroffset) - genomicpos + 1;
14824 	  }
14825 	  genomicpos--;
14826 	  debug13(i++);
14827 	}
14828 	while (genomicpos >= end_genomicpos &&
14829 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14830 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
14831 	  debug13(printf("5', crick, sense anti %u %u %f\n",chroffset+genomicpos,genomicpos,acceptor_prob));
14832 	  if (acceptor_prob > max_prob_5) {
14833 	    max_prob_5 = acceptor_prob;
14834 	    splice_genomepos_5 = (chrhigh - chroffset) - genomicpos + 1;
14835 	  }
14836 	  genomicpos--;
14837 	}
14838 	debug13(printf("\n"));
14839       }
14840 
14841     } else if (orig_sensedir == SENSE_ANTI) {
14842       if (watsonp) {
14843 	splicetype5 = splicetype5_mm = ANTIDONOR;
14844 
14845 	start_genomicpos = start;
14846 	middle_genomicpos = middle;
14847 	end_genomicpos = end;
14848 
14849 	/* assert(start_genomicpos <= end_genomicpos); */
14850 	genomicpos = start_genomicpos;
14851 	i = 0;
14852 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14853 	  debug13(printf("5', watson, sense anti %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14854 	  genomicpos++;
14855 	  i++;
14856 	}
14857 	while (genomicpos <= middle_genomicpos &&
14858 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14859 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
14860 	  debug13(printf("5', watson, sense anti %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,nmismatches - scorei[i]));
14861 	  if (donor_prob > max_prob_5_mm) {
14862 	    max_prob_5_mm = donor_prob;
14863 	    splice_genomepos_5_mm = genomicpos;
14864 	  }
14865 	  genomicpos++;
14866 	  debug13(i++);
14867 	}
14868 	while (genomicpos <= end_genomicpos &&
14869 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14870 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
14871 	  debug13(printf("5', watson, sense anti %u %u %f\n",chroffset+genomicpos,genomicpos,donor_prob));
14872 	  if (donor_prob > max_prob_5) {
14873 	    max_prob_5 = donor_prob;
14874 	    splice_genomepos_5 = genomicpos;
14875 	  }
14876 	  genomicpos++;
14877 	}
14878 	debug13(printf("\n"));
14879 
14880       } else {
14881 	splicetype5 = splicetype5_mm = DONOR;
14882 
14883 	start_genomicpos = (start > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - start + 1;
14884 	middle_genomicpos = (middle > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - middle + 1;
14885 	end_genomicpos = (end > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - end + 1;
14886 
14887 	/* assert(start_genomicpos >= end_genomicpos); */
14888 	genomicpos = start_genomicpos;
14889 	i = 0;
14890 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14891 	  debug13(printf("5', crick, sense forward %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14892 	  genomicpos--;
14893 	  i++;
14894 	}
14895 	while (genomicpos >= middle_genomicpos &&
14896 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14897 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
14898 	  debug13(printf("5', crick, sense forward %u %u %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,nmismatches - scorei[i]));
14899 	  if (donor_prob > max_prob_5_mm) {
14900 	    max_prob_5_mm = donor_prob;
14901 	    splice_genomepos_5_mm = (chrhigh - chroffset) - genomicpos + 1;
14902 	  }
14903 	  genomicpos--;
14904 	  debug13(i++);
14905 	}
14906 	while (genomicpos >= end_genomicpos &&
14907 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
14908 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
14909 	  debug13(printf("5', crick, sense forward %u %u %f\n",chroffset+genomicpos,genomicpos,donor_prob));
14910 	  if (donor_prob > max_prob_5) {
14911 	    max_prob_5 = donor_prob;
14912 	    splice_genomepos_5 = (chrhigh - chroffset) - genomicpos + 1;
14913 	  }
14914 	  genomicpos--;
14915 	}
14916 	debug13(printf("\n"));
14917       }
14918 
14919     } else {
14920       if (watsonp) {
14921 	start_genomicpos = start;
14922 	middle_genomicpos = middle;
14923 	end_genomicpos = end;
14924 
14925 	/* assert(start_genomicpos <= end_genomicpos); */
14926 	genomicpos = start_genomicpos;
14927 	i = 0;
14928 	while (genomicpos <= middle_genomicpos && scorei[i] < nmismatches - 2) {
14929 	  debug13(printf("5', watson, sense null %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
14930 	  genomicpos++;
14931 	  debug13(i++);
14932 	}
14933 	while (genomicpos <= middle_genomicpos &&
14934 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14935 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
14936 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
14937 	  debug13(printf("5', watson, sense null %u %u %f %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob,nmismatches - scorei[i]));
14938 	  if (acceptor_prob > max_prob_sense_forward_5_mm) {
14939 	    max_prob_sense_forward_5_mm = acceptor_prob;
14940 	    if (acceptor_prob > max_prob_5_mm) {
14941 	      max_prob_5_mm = acceptor_prob;
14942 	      splice_genomepos_5_mm = genomicpos;
14943 	      /* splice_cdna_direction_5_mm = +1; */
14944 	      splice_sensedir_5_mm = SENSE_FORWARD;
14945 	      splicetype5_mm = ACCEPTOR;
14946 	    }
14947 	  }
14948 	  if (donor_prob > max_prob_sense_anti_5_mm) {
14949 	    max_prob_sense_anti_5_mm = donor_prob;
14950 	    if (donor_prob > max_prob_5_mm) {
14951 	      max_prob_5_mm = donor_prob;
14952 	      splice_genomepos_5_mm = genomicpos;
14953 	      /* splice_cdna_direction_5_mm = -1; */
14954 	      splice_sensedir_5_mm = SENSE_ANTI;
14955 	      splicetype5_mm = ANTIDONOR;
14956 	    }
14957 	  }
14958 	  genomicpos++;
14959 	  debug13(i++);
14960 	}
14961 	while (genomicpos <= end_genomicpos &&
14962 	       genomicpos <= end_genomicpos + exondist - END_MIN_EXONLENGTH) {
14963 	  acceptor_prob = Maxent_hr_acceptor_prob(chroffset+genomicpos,chroffset); /* Case 2 */
14964 	  donor_prob = Maxent_hr_antidonor_prob(chroffset+genomicpos,chroffset); /* Case 6 */
14965 	  debug13(printf("5', watson, sense null %u %u %f %f\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob));
14966 	  if (acceptor_prob > max_prob_sense_forward_5) {
14967 	    max_prob_sense_forward_5 = acceptor_prob;
14968 	    if (acceptor_prob > max_prob_5) {
14969 	      max_prob_5 = acceptor_prob;
14970 	      splice_genomepos_5 = genomicpos;
14971 	      /* splice_cdna_direction_5 = +1; */
14972 	      splice_sensedir_5 = SENSE_FORWARD;
14973 	      splicetype5 = ACCEPTOR;
14974 	    }
14975 	  }
14976 	  if (donor_prob > max_prob_sense_anti_5) {
14977 	    max_prob_sense_anti_5 = donor_prob;
14978 	    if (donor_prob > max_prob_5) {
14979 	      max_prob_5 = donor_prob;
14980 	      splice_genomepos_5 = genomicpos;
14981 	      /* splice_cdna_direction_5 = -1; */
14982 	      splice_sensedir_5 = SENSE_ANTI;
14983 	      splicetype5 = ANTIDONOR;
14984 	    }
14985 	  }
14986 	  genomicpos++;
14987 	}
14988 	debug13(printf("\n"));
14989 
14990       } else {
14991 	start_genomicpos = (start > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - start + 1;
14992 	middle_genomicpos = (middle > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - middle + 1;
14993 	end_genomicpos = (end > chrhigh - chroffset) ? 1 : (chrhigh - chroffset) - end + 1;
14994 
14995 	/* assert(start_genomicpos >= end_genomicpos); */
14996 	genomicpos = start_genomicpos;
14997 	i = 0;
14998 	while (genomicpos >= middle_genomicpos && scorei[i] < nmismatches - 2) {
14999 	  debug13(printf("5', crick, sense null %u %u score %d\n",chroffset+genomicpos,genomicpos,nmismatches - scorei[i]));
15000 	  genomicpos--;
15001 	  i++;
15002 	}
15003 	while (genomicpos >= middle_genomicpos &&
15004 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
15005 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
15006 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
15007 	  debug13(printf("5', crick, sense null %u %u %f %f mm %d\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob,nmismatches - scorei[i]));
15008 	  if (acceptor_prob > max_prob_sense_forward_5_mm) {
15009 	    max_prob_sense_forward_5_mm = acceptor_prob;
15010 	    if (acceptor_prob > max_prob_5_mm) {
15011 	      max_prob_5_mm = acceptor_prob;
15012 	      splice_genomepos_5_mm = (chrhigh - chroffset) - genomicpos + 1;
15013 	      /* splice_cdna_direction_5_mm = +1; */
15014 	      splice_sensedir_5_mm = SENSE_FORWARD;
15015 	      splicetype5_mm = ANTIACCEPTOR;
15016 	    }
15017 	  }
15018 	  if (donor_prob > max_prob_sense_anti_5_mm) {
15019 	    max_prob_sense_anti_5_mm = donor_prob;
15020 	    if (donor_prob > max_prob_5_mm) {
15021 	      max_prob_5_mm = donor_prob;
15022 	      splice_genomepos_5_mm = (chrhigh - chroffset) - genomicpos + 1;
15023 	      /* splice_cdna_direction_5_mm = -1; */
15024 	      splice_sensedir_5_mm = SENSE_ANTI;
15025 	      splicetype5_mm = DONOR;
15026 	    }
15027 	  }
15028 	  genomicpos--;
15029 	  debug13(i++);
15030 	}
15031 	while (genomicpos >= end_genomicpos &&
15032 	       genomicpos >= end_genomicpos - exondist + END_MIN_EXONLENGTH) {
15033 	  acceptor_prob = Maxent_hr_antiacceptor_prob(chroffset+genomicpos,chroffset); /* Case 4 */
15034 	  donor_prob = Maxent_hr_donor_prob(chroffset+genomicpos,chroffset); /* Case 8 */
15035 	  debug13(printf("5', crick, sense null %u %u %f %f\n",chroffset+genomicpos,genomicpos,donor_prob,acceptor_prob));
15036 	  if (acceptor_prob > max_prob_sense_forward_5) {
15037 	    max_prob_sense_forward_5 = acceptor_prob;
15038 	    if (acceptor_prob > max_prob_5) {
15039 	      max_prob_5 = acceptor_prob;
15040 	      splice_genomepos_5 = (chrhigh - chroffset) - genomicpos + 1;
15041 	      /* splice_cdna_direction_5 = +1; */
15042 	      splice_sensedir_5 = SENSE_FORWARD;
15043 	      splicetype5 = ANTIACCEPTOR;
15044 	    }
15045 	  }
15046 	  if (donor_prob > max_prob_sense_anti_5) {
15047 	    max_prob_sense_anti_5 = donor_prob;
15048 	    if (donor_prob > max_prob_5) {
15049 	      max_prob_5 = donor_prob;
15050 	      splice_genomepos_5 = (chrhigh - chroffset) - genomicpos + 1;
15051 	      /* splice_cdna_direction_5 = -1; */
15052 	      splice_sensedir_5 = SENSE_ANTI;
15053 	      splicetype5 = DONOR;
15054 	    }
15055 	  }
15056 	  genomicpos--;
15057 	}
15058 	debug13(printf("\n"));
15059       }
15060     }
15061 
15062     if (orig_sensedir != SENSE_NULL) {
15063       if (max_prob_5 > END_SPLICESITE_PROB_MATCH) {
15064 	debug13(printf("Found good splice %s on 5' end at %u with probability %f\n",
15065 		       Splicetype_string(splicetype5),splice_genomepos_5,max_prob_5));
15066 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
15067 	  pairs = Pairpool_pop(pairs,&pair);
15068 	}
15069 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15070 	if (pairs != NULL) {
15071 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15072 	  *ambig_splicetype_5 = splicetype5;
15073 	  *ambig_prob_5 = max_prob_5;
15074 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15075 	}
15076       } else if (max_prob_5_mm > END_SPLICESITE_PROB_MISMATCH) {
15077 	debug13(printf("Found good mismatch splice %s on 5' end at %u with probability %f\n",
15078 		       Splicetype_string(splicetype5_mm),splice_genomepos_5_mm,max_prob_5_mm));
15079 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5_mm) {
15080 	  pairs = Pairpool_pop(pairs,&pair);
15081 	}
15082 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15083 	if (pairs != NULL) {
15084 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15085 	  *ambig_splicetype_5 = splicetype5_mm;
15086 	  *ambig_prob_5 = max_prob_5_mm;
15087 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15088 	}
15089       }
15090     }
15091   }
15092 
15093   if (orig_sensedir == SENSE_NULL) {
15094     if (max_prob_3 >= END_SPLICESITE_PROB_MATCH || max_prob_5 >= END_SPLICESITE_PROB_MATCH) {
15095       if (max_prob_3 >= END_SPLICESITE_PROB_MATCH && max_prob_5 >= END_SPLICESITE_PROB_MATCH
15096 	  && max_prob_sense_forward_3 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_anti_3 < END_SPLICESITE_PROB_MATCH
15097 	  && max_prob_sense_forward_5 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_anti_5 < END_SPLICESITE_PROB_MATCH) {
15098 	/* Forward sense wins on both sides */
15099 
15100 	debug13(printf("Found good splice %s on 5' end at %u with probability %f\n",
15101 		       Splicetype_string(splicetype5),splice_genomepos_5,max_prob_5));
15102 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
15103 	  pairs = Pairpool_pop(pairs,&pair);
15104 	}
15105 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15106 	if (pairs != NULL) {
15107 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15108 	  *ambig_splicetype_5 = splicetype5;
15109 	  *ambig_prob_5 = max_prob_5;
15110 	  /* *cdna_direction = splice_cdna_direction_5; */
15111 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15112 	}
15113 
15114 	debug13(printf("Found good splice %s on 3' end at %u with probability %f\n",
15115 		       Splicetype_string(splicetype3),splice_genomepos_3,max_prob_3));
15116 	path = List_reverse(pairs);
15117 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
15118 	  path = Pairpool_pop(path,&pair);
15119 	}
15120 	/* path = clean_path_end3(path); -- gives wrong end */
15121 	if (path != NULL) {
15122 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
15123 	  *ambig_splicetype_3 = splicetype3;
15124 	  *ambig_prob_3 = max_prob_3;
15125 	  /* *cdna_direction = splice_cdna_direction_3; */
15126 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
15127 	}
15128 	*new_sensedir = SENSE_FORWARD;
15129 	pairs = List_reverse(path);
15130 
15131       } else if (max_prob_3 >= END_SPLICESITE_PROB_MATCH && max_prob_5 >= END_SPLICESITE_PROB_MATCH
15132 		 && max_prob_sense_anti_3 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_forward_3 < END_SPLICESITE_PROB_MATCH
15133 		 && max_prob_sense_anti_5 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_forward_5 < END_SPLICESITE_PROB_MATCH) {
15134 
15135 	/* Anti sense wins on both sides */
15136 	debug13(printf("Found good splice %s on 5' end at %u with probability %f\n",
15137 		       Splicetype_string(splicetype5),splice_genomepos_5,max_prob_5));
15138 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
15139 	  pairs = Pairpool_pop(pairs,&pair);
15140 	}
15141 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15142 	if (pairs != NULL) {
15143 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15144 	  *ambig_splicetype_5 = splicetype5;
15145 	  *ambig_prob_5 = max_prob_5;
15146 	  /* *cdna_direction = splice_cdna_direction_5; */
15147 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15148 	}
15149 
15150 	debug13(printf("Found good splice %s on 3' end at %u with probability %f\n",
15151 		       Splicetype_string(splicetype3),splice_genomepos_3,max_prob_3));
15152 	path = List_reverse(pairs);
15153 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
15154 	  path = Pairpool_pop(path,&pair);
15155 	}
15156 	/* path = clean_path_end3(path); -- gives wrong end */
15157 	if (path != NULL) {
15158 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
15159 	  *ambig_splicetype_3 = splicetype3;
15160 	  *ambig_prob_3 = max_prob_3;
15161 	  /* *cdna_direction = splice_cdna_direction_3; */
15162 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
15163 	}
15164 	*new_sensedir = SENSE_ANTI;
15165 	pairs = List_reverse(path);
15166 
15167       } else if (max_prob_3 > max_prob_5) {
15168 	/* Consider just 3' end */
15169 	debug13(printf("Found good splice %s on 3' end at %u with probability %f\n",
15170 		       Splicetype_string(splicetype3),splice_genomepos_3,max_prob_3));
15171 	path = List_reverse(pairs);
15172 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3) {
15173 	  path = Pairpool_pop(path,&pair);
15174 	}
15175 	/* path = clean_path_end3(path); -- gives wrong end */
15176 	if (path != NULL) {
15177 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
15178 	  *ambig_splicetype_3 = splicetype3;
15179 	  *ambig_prob_3 = max_prob_3;
15180 	  /* *cdna_direction = splice_cdna_direction_3; */
15181 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
15182 	  if (max_prob_sense_forward_3 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_anti_3 < END_SPLICESITE_PROB_MATCH
15183 	      && max_prob_sense_anti_5 < END_SPLICESITE_PROB_MATCH) {
15184 	    *new_sensedir = splice_sensedir_3;
15185 	  } else if (max_prob_sense_anti_3 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_forward_3 < END_SPLICESITE_PROB_MATCH
15186 		     && max_prob_sense_forward_5 < END_SPLICESITE_PROB_MATCH) {
15187 	    *new_sensedir = splice_sensedir_3;
15188 	  } else {
15189 	    /* Not enough evidence to set sensedir */
15190 	  }
15191 	}
15192 	pairs = List_reverse(path);
15193 
15194       } else {
15195 	/* Consider just 5' end */
15196 	debug13(printf("Found good splice %s on 5' end at %u with probability %f\n",
15197 		       Splicetype_string(splicetype5),splice_genomepos_5,max_prob_5));
15198 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5) {
15199 	  pairs = Pairpool_pop(pairs,&pair);
15200 	}
15201 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15202 	if (pairs != NULL) {
15203 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15204 	  *ambig_splicetype_5 = splicetype5;
15205 	  *ambig_prob_5 = max_prob_5;
15206 	  /* *cdna_direction = splice_cdna_direction_5; */
15207 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15208 	  if (max_prob_sense_forward_5 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_anti_5 < END_SPLICESITE_PROB_MATCH
15209 	      && max_prob_sense_anti_3 < END_SPLICESITE_PROB_MATCH) {
15210 	    *new_sensedir = splice_sensedir_5;
15211 	  } else if (max_prob_sense_anti_5 >= END_SPLICESITE_PROB_MATCH && max_prob_sense_forward_5 < END_SPLICESITE_PROB_MATCH
15212 		     && max_prob_sense_forward_3 < END_SPLICESITE_PROB_MATCH) {
15213 	    *new_sensedir = splice_sensedir_5;
15214 	  } else {
15215 	    /* Not enough evidence to set sensedir */
15216 	  }
15217 	}
15218       }
15219 
15220     } else if (max_prob_3_mm >= END_SPLICESITE_PROB_MISMATCH || max_prob_5_mm >= END_SPLICESITE_PROB_MISMATCH) {
15221       if (max_prob_3_mm > max_prob_5_mm) {
15222 	debug13(printf("Found good mismatch splice %s on 3' end at %u with probability %f\n",
15223 		       Splicetype_string(splicetype3_mm),splice_genomepos_3_mm,max_prob_3_mm));
15224 	path = List_reverse(pairs);
15225 	while (path != NULL && ((Pair_T) path->first)->genomepos > splice_genomepos_3_mm) {
15226 	  path = Pairpool_pop(path,&pair);
15227 	}
15228 	/* path = clean_path_end3(path); -- gives wrong end */
15229 	if (path != NULL) {
15230 	  *ambig_end_length_3 = (querylength - 1) - ((Pair_T) path->first)->querypos;
15231 	  *ambig_splicetype_3 = splicetype3_mm;
15232 	  *ambig_prob_3 = max_prob_3_mm;
15233 	  /* *cdna_direction = splice_cdna_direction_3_mm; */
15234 	  debug13(printf("Set ambig_end_length_3 to be %d\n",*ambig_end_length_3));
15235 	  if (max_prob_sense_forward_3_mm >= END_SPLICESITE_PROB_MISMATCH && max_prob_sense_anti_3_mm < END_SPLICESITE_PROB_MISMATCH
15236 	      && max_prob_sense_anti_5_mm < END_SPLICESITE_PROB_MISMATCH) {
15237 	    *new_sensedir = splice_sensedir_3_mm;
15238 	  } else if (max_prob_sense_anti_3_mm >= END_SPLICESITE_PROB_MISMATCH && max_prob_sense_forward_3_mm < END_SPLICESITE_PROB_MISMATCH
15239 		     && max_prob_sense_forward_5_mm < END_SPLICESITE_PROB_MISMATCH) {
15240 	    *new_sensedir = splice_sensedir_3_mm;
15241 	  } else {
15242 	    /* Not enough evidence to set sensedir */
15243 	  }
15244 	}
15245 	pairs = List_reverse(path);
15246       } else {
15247 	debug13(printf("Found good mismatch splice %s on 5' end at %u with probability %f\n",
15248 		       Splicetype_string(splicetype5_mm),splice_genomepos_5_mm,max_prob_5_mm));
15249 	while (pairs != NULL && ((Pair_T) pairs->first)->genomepos < splice_genomepos_5_mm) {
15250 	  pairs = Pairpool_pop(pairs,&pair);
15251 	}
15252 	/* pairs = clean_pairs_end5(pairs); -- gives wrong end */
15253 	if (pairs != NULL) {
15254 	  *ambig_end_length_5 = ((Pair_T) pairs->first)->querypos;
15255 	  *ambig_splicetype_5 = splicetype5_mm;
15256 	  *ambig_prob_5 = max_prob_5_mm;
15257 	  /* *cdna_direction = splice_cdna_direction_5_mm; */
15258 	  debug13(printf("Set ambig_end_length_5 to be %d\n",*ambig_end_length_5));
15259 	  if (max_prob_sense_forward_5_mm >= END_SPLICESITE_PROB_MISMATCH && max_prob_sense_anti_5_mm < END_SPLICESITE_PROB_MISMATCH
15260 	      && max_prob_sense_anti_3_mm < END_SPLICESITE_PROB_MISMATCH) {
15261 	    *new_sensedir = splice_sensedir_5_mm;
15262 	  } else if (max_prob_sense_anti_5_mm >= END_SPLICESITE_PROB_MISMATCH && max_prob_sense_forward_5_mm < END_SPLICESITE_PROB_MISMATCH
15263 		     && max_prob_sense_forward_3_mm < END_SPLICESITE_PROB_MISMATCH) {
15264 	    *new_sensedir = splice_sensedir_5_mm;
15265 	  } else {
15266 	    /* Not enough evidence to set sensedir */
15267 	  }
15268 	}
15269       }
15270     }
15271   }
15272 
15273   FREE(scorei);
15274   return pairs;
15275 }
15276 #endif
15277 
15278 
15279 
15280 #if 0
15281 /* Still somewhat buggy */
15282 static List_T
15283 trim_novel_spliceends_new (List_T pairs,
15284 			   int *ambig_end_length_5, int *ambig_end_length_3,
15285 			   Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
15286 			   double *ambig_prob_5, double *ambig_prob_3,
15287 			   int *sensedir, bool watsonp, int querylength,
15288 			   Univcoord_T chroffset, Univcoord_T chrhigh,
15289 			   bool knownsplice5p, bool knownsplice3p) {
15290   List_T path, p;
15291   int i;
15292 
15293   Pair_T pair, prev;
15294   int trim5, trim3, exondist5, exondist3;
15295 
15296  /* start to middle has mismatches, while middle to end has none */
15297   Univcoord_T start5, middle5, end5, start3, middle3, end3;
15298   Univcoord_T genomicstart5, genomicend3;
15299   bool solve5p, solve3p, mismatchp;
15300 
15301 
15302   debug13(printf("\nEntered gmap_trim_novel_spliceends with sensedir %d, ambig_end_lengths %d and %d\n",
15303 		 *sensedir,*ambig_end_length_5,*ambig_end_length_3));
15304 
15305   debug13(Pair_dump_list(pairs,true));
15306 
15307   Pair_trim_distances(&trim5,&trim3,pairs);
15308   debug13(printf("Trim distances (where we would trim theoretically) are %d and %d\n",trim5,trim3));
15309 
15310   if (pairs != NULL && knownsplice5p == false && *ambig_end_length_5 == 0
15311       /* && exon_length_5(pairs) >= END_SPLICESITE_EXON_LENGTH */) {
15312     /* See if there is a good splice site at the 5' end */
15313 
15314     pair = (Pair_T) List_head(p = pairs);
15315     middle5 = (Univcoord_T) pair->genomepos - 1;
15316 
15317     if (pair->querypos != 0) {
15318       mismatchp = true;
15319     } else {
15320       mismatchp = false;
15321     }
15322 
15323     i = 0;
15324     while (i < trim5) {
15325       if ((p = List_next(p)) == NULL) {
15326 	break;
15327       } else if (pair->gapp == true) {
15328 	break;
15329       } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
15330 	middle5 = pair->genomepos - 1;
15331 	debug13(printf("Resetting middle to be %u\n",middle5));
15332       } else {
15333 	middle5 = pair->genomepos - 1;
15334 	mismatchp = true;
15335 	debug13(printf("Resetting middle to be %u\n",middle5));
15336       }
15337       prev = pair;
15338       pair = (Pair_T) List_head(p);
15339       i++;
15340     }
15341 
15342     end5 = middle5;
15343     while (i < trim5 + END_SPLICESITE_SEARCH) {
15344       if ((p = List_next(p)) == NULL) {
15345 	break;
15346       } else if (pair->gapp == true) {
15347 	break;
15348       } else {
15349 	end5 = pair->genomepos - 1;
15350 	debug13(printf("Resetting end to be %u\n",end5));
15351       }
15352       prev = pair;
15353       pair = (Pair_T) List_head(p);
15354       i++;
15355     }
15356 
15357     /* Determine genomicstart5 after indels hve been skipped */
15358     if (pair->gapp == true) {
15359       pair = prev;
15360     }
15361     if (watsonp) {
15362       debug13(printf("Plus: Setting genomicstart5 to be genomepos %u - querypos %d\n",
15363 		     pair->genomepos,pair->querypos));
15364       genomicstart5 = pair->genomepos - pair->querypos;
15365     } else {
15366       debug13(printf("Minus: Setting genomicstart5 to be genomepos %u - querypos %d - 1\n",
15367 		     pair->genomepos,pair->querypos));
15368       genomicstart5 = pair->genomepos - pair->querypos - 1;
15369     }
15370     if ((start5 = middle5 - END_SPLICESITE_SEARCH) < genomicstart5) {
15371       start5 = genomicstart5;
15372     }
15373 
15374 
15375     /* Find distance from end to intron, if any */
15376     exondist5 = 0;
15377     while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
15378 	   exondist5 < END_MIN_EXONLENGTH) {
15379       p = List_next(p);
15380       exondist5++;
15381     }
15382     debug13(printf("exondist5 is %d\n",exondist5));
15383   }
15384 
15385   if (mismatchp == false) {
15386     solve5p = false;
15387     genomicstart5 = 0;
15388 
15389   } else if (watsonp) {
15390     solve5p = true;
15391     genomicstart5 = chroffset + genomicstart5;
15392 
15393     start5 = chroffset + start5;
15394     middle5 = chroffset + middle5;
15395     end5 = chroffset + end5;
15396     debug13(printf("\n2 Set end points for 5' trim to be %u..%u..%u, plusp %d\n",
15397 		   start5 - chroffset,middle5 - chroffset,end5 - chroffset,watsonp));
15398 
15399   } else {
15400     solve5p = true;
15401     genomicstart5 = chrhigh - genomicstart5;
15402 
15403     start5 = chrhigh - start5;
15404     middle5 = chrhigh - middle5;
15405     end5 = chrhigh - end5;
15406     debug13(printf("\n2 Set end points for 5' trim to be %u..%u..%u, plusp %d\n",
15407 		   start5 - chroffset,middle5 - chroffset,end5 - chroffset,watsonp));
15408   }
15409 
15410 
15411   path = List_reverse(pairs);
15412   if (path != NULL && knownsplice3p == false && *ambig_end_length_3 == 0
15413       /* && exon_length_3(path) >= END_SPLICESITE_EXON_LENGTH */) {
15414     /* See if there is a good splice site at the 3' end */
15415     /* debug13(Pair_dump_list(path,true)); */
15416 
15417     pair = (Pair_T) List_head(p = path);
15418     middle3 = (Univcoord_T) pair->genomepos + 1;
15419 
15420     if (pair->querypos != querylength - 1) {
15421       mismatchp = true;
15422     } else {
15423       mismatchp = false;
15424     }
15425 
15426     i = 0;
15427     while (i < trim3) {
15428       if ((p = List_next(p)) == NULL) {
15429 	break;
15430       } else if (pair->gapp == true) {
15431 	break;
15432       } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
15433 	middle3 = pair->genomepos + 1;
15434 	debug13(printf("Resetting middle to be %u\n",middle3));
15435       } else {
15436 	middle3 = pair->genomepos + 1;
15437 	mismatchp = true;
15438 	debug13(printf("Resetting middle to be %u\n",middle3));
15439       }
15440       prev = pair;
15441       pair = (Pair_T) List_head(p);
15442       i++;
15443     }
15444 
15445     end3 = middle3;
15446     while (i < trim3 + END_SPLICESITE_SEARCH) {
15447       if ((p = List_next(p)) == NULL) {
15448 	break;
15449       } else if (pair->gapp == true) {
15450 	break;
15451       } else {
15452 	end3 = pair->genomepos + 1;
15453 	debug13(printf("Resetting end to be %u\n",end3));
15454       }
15455       prev = pair;
15456       pair = (Pair_T) List_head(p);
15457       i++;
15458     }
15459 
15460     /* Determine genomicend3 after indels hve been skipped */
15461     if (pair->gapp == true) {
15462       pair = prev;
15463     }
15464     if (watsonp) {
15465       debug13(printf("Plus: Setting genomicend3 to be genomepos %u + (querylength %d - querypos %d)\n",
15466 		     pair->genomepos,querylength,pair->querypos));
15467       genomicend3 = pair->genomepos + (querylength - pair->querypos);
15468     } else {
15469       debug13(printf("Minus: Setting genomicend3 to be genomepos %u + (querylength %d - 1 - querypos %d)\n",
15470 		     pair->genomepos,querylength,pair->querypos));
15471       genomicend3 = pair->genomepos + (querylength - 1 - pair->querypos);
15472     }
15473     if ((start3 = middle3 + END_SPLICESITE_SEARCH) > genomicend3) {
15474       start3 = genomicend3;
15475     }
15476 
15477 
15478     /* Find distance from end to intron, if any */
15479     exondist3 = 0;
15480     while (p != NULL && ((Pair_T) List_head(p))->gapp == false &&
15481 	   exondist3 < END_MIN_EXONLENGTH) {
15482       p = List_next(p);
15483       exondist3++;
15484     }
15485     debug13(printf("exondist3 is %d\n",exondist3));
15486   }
15487 
15488   if (mismatchp == false) {
15489     solve3p = false;
15490     genomicend3 = 0;
15491 
15492   } else if (watsonp) {
15493     solve3p = true;
15494     genomicend3 = chroffset + genomicend3;
15495 
15496     start3 = chroffset + start3;
15497     middle3 = chroffset + middle3;
15498     end3 = chroffset + end3;
15499 
15500   } else {
15501     solve3p = true;
15502     genomicend3 = chrhigh - genomicend3;
15503     start3 = chrhigh - start3;
15504     middle3 = chrhigh - middle3;
15505     end3 = chrhigh - end3;
15506   }
15507   debug13(printf("\n2 Set end points for 3' trim to be %u..%u..%u, plusp %d\n",
15508 		 start3 - chroffset,middle3 - chroffset,end3 - chroffset,watsonp));
15509 
15510   Splice_trim_novel_spliceends(&(*ambig_end_length_5),&(*ambig_end_length_3),
15511 			       &(*ambig_splicetype_5),&(*ambig_splicetype_3),
15512 			       &(*ambig_prob_5),&(*ambig_prob_3),/*orig_sensedir*/*sensedir,
15513 			       start5,middle5,end5,solve5p,start3,middle3,end3,solve3p,
15514 			       genomicstart5,genomicend3,chroffset,/*plusp*/watsonp);
15515 
15516   while (path != NULL && ((Pair_T) path->first)->querypos > (querylength - 1) - *ambig_end_length_3) {
15517     path = Pairpool_pop(path,&pair);
15518   }
15519 
15520   pairs = List_reverse(path);
15521   while (pairs != NULL && ((Pair_T) pairs->first)->querypos < *ambig_end_length_5) {
15522     pairs = Pairpool_pop(pairs,&pair);
15523   }
15524 
15525   debug13(printf("Returning ambig_end_length_5 %d and ambig_end_length_3 %d, probs %f and %f\n",
15526 		 *ambig_end_length_5,*ambig_end_length_3,*ambig_prob_5,*ambig_prob_3));
15527   debug13(Pair_dump_list(pairs,true));
15528 
15529   return pairs;
15530 }
15531 #endif
15532 
15533 
15534 
15535 static List_T
path_trim(double defect_rate,int * ambig_end_length_5,int * ambig_end_length_3,Splicetype_T * ambig_splicetype_5,Splicetype_T * ambig_splicetype_3,double * ambig_prob_5,double * ambig_prob_3,List_T pairs,int * cdna_direction,bool watsonp,int genestrand,bool jump_late_p,int querylength,int orig_sensedir,char * queryseq_ptr,char * queryuc_ptr,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,int maxpeelback,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogR)15536 path_trim (double defect_rate, int *ambig_end_length_5, int *ambig_end_length_3,
15537 	   Splicetype_T *ambig_splicetype_5, Splicetype_T *ambig_splicetype_3,
15538 	   double *ambig_prob_5, double *ambig_prob_3,
15539 	   List_T pairs, int *cdna_direction, bool watsonp, int genestrand, bool jump_late_p,
15540 	   int querylength,
15541 #ifdef GSNAP
15542 	   int orig_sensedir,
15543 #endif
15544 	   char *queryseq_ptr, char *queryuc_ptr,
15545 	   Univcoord_T chroffset, Univcoord_T chrhigh,
15546 	   Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
15547 	   int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogR) {
15548   List_T path = NULL;
15549   int dynprogindex_minor = DYNPROGINDEX_MINOR;
15550   bool chop_exon_p;
15551   bool knownsplice5p = false, knownsplice3p = false;
15552   bool trimp, trim5p, trim3p, trim5p_ignore, trim3p_ignore, indelp;
15553   int iter = 0;
15554 #ifdef GSNAP
15555   int new_sensedir;
15556 #endif
15557 
15558 #ifdef GSNAP
15559   debug(printf("Entering path_trim with cdna_direction %d and sensedir %d\n",*cdna_direction,orig_sensedir));
15560   debug3(printf("Entering path_trim with cdna_direction %d and sensedir %d\n",*cdna_direction,orig_sensedir));
15561 #else
15562   debug(printf("Entering path_trim with cdna_direction %d\n",*cdna_direction));
15563   debug3(printf("Entering path_trim with cdna_direction %d\n",*cdna_direction));
15564 #endif
15565 
15566   debug3(Pair_dump_list(pairs,true));
15567 
15568 #ifdef GSNAP
15569   if (novelsplicingp == true) {
15570     pairs = trim_novel_spliceends(&new_sensedir,pairs,&(*ambig_end_length_5),&(*ambig_end_length_3),
15571 				  &(*ambig_splicetype_5),&(*ambig_splicetype_3),
15572 				  &(*ambig_prob_5),&(*ambig_prob_3),
15573 				  orig_sensedir,watsonp,querylength,
15574 				  chroffset,chrhigh,knownsplice5p,knownsplice3p);
15575   }
15576 #endif
15577   debug13(printf("After trim_novel_spliceends\n"));
15578   debug13(Pair_dump_list(pairs,true));
15579 
15580 
15581   if (pairs == NULL) {
15582     return (List_T) NULL;
15583   } else if (maximize_coverage_p == true) {
15584     /* Don't trim ends */
15585   } else {
15586     knownsplice5p = knownsplice3p = false;
15587     debug3(printf("Before Pair_trim_ends\n"));
15588     debug3(Pair_dump_list(pairs,true));
15589     debug3(printf("\n"));
15590 
15591     /* Done anyway within loop below */
15592     /* pairs = Pair_trim_ends(&trim5p,&trim3p,pairs); */
15593     trimp = trim5p = trim3p = true;
15594 
15595     debug3(printf("After Pair_trim_ends: trim5p %d, trim3p %d\n",trim5p,trim3p));
15596     debug3(Pair_dump_list(pairs,true));
15597     debug3(printf("\n"));
15598 
15599     while (iter++ < 3 && trimp == true) {
15600       trimp = false;
15601       /* Revised: Using QUERYEND_NOGAPS combined with Pair_trim_ends */
15602       /* Old: Extend with BEST_LOCAL to get right local (not global) answer,
15603 	 and with maxpeelback == 0 to ensure we perform no peelback */
15604       if (trim5p == true) {
15605 	debug3(printf("Extending at 5'\n"));
15606 	/* This is the third and final extension */
15607 	pairs = build_pairs_end5(&knownsplice5p,&(*ambig_end_length_5),&(*ambig_splicetype_5),&(*ambig_prob_5),
15608 				 &chop_exon_p,&dynprogindex_minor,pairs,
15609 				 chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
15610 				 queryseq_ptr,queryuc_ptr,
15611 				 *cdna_direction,watsonp,genestrand,jump_late_p,
15612 				 maxpeelback,defect_rate,pairpool,dynprogR,
15613 				 /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/false);
15614 	pairs = trim_end5_exons(&indelp,&trim5p,*ambig_end_length_5,pairs,dynprogR,chroffset,chrhigh,
15615 				queryseq_ptr,queryuc_ptr,querylength,*cdna_direction,watsonp,genestrand,
15616 				jump_late_p,pairpool,defect_rate);
15617 	if (indelp == true) {
15618 	  pairs = trim_end5_indels(pairs,*ambig_end_length_5,dynprogR,chroffset,chrhigh,
15619 				   queryseq_ptr,queryuc_ptr,
15620 				   watsonp,genestrand,jump_late_p,pairpool,defect_rate);
15621 	}
15622 	if (trim5p == true) {
15623 	  trimp = true;
15624 	}
15625       }
15626 
15627       if (trim3p == true) {
15628 	debug3(printf("Extending at 3'\n"));
15629 	/* This is the third and final extension */
15630 	path = List_reverse(pairs);
15631 	path = build_path_end3(&knownsplice3p,&(*ambig_end_length_3),&(*ambig_splicetype_3),&(*ambig_prob_3),
15632 			       &chop_exon_p,&dynprogindex_minor,path,
15633 			       chroffset,chrhigh,querylength,
15634 			       knownsplice_limit_low,knownsplice_limit_high,
15635 			       queryseq_ptr,queryuc_ptr,
15636 			       *cdna_direction,watsonp,genestrand,jump_late_p,
15637 			       maxpeelback,defect_rate,pairpool,dynprogL,
15638 			       /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/false);
15639 	path = trim_end3_exons(&indelp,&trim3p,*ambig_end_length_3,path,dynprogL,chroffset,chrhigh,
15640 			       queryseq_ptr,queryuc_ptr,querylength,
15641 			       *cdna_direction,watsonp,genestrand,jump_late_p,pairpool,defect_rate);
15642 	if (indelp == true) {
15643 	  path = trim_end3_indels(path,*ambig_end_length_3,dynprogL,chroffset,chrhigh,
15644 				  queryseq_ptr,queryuc_ptr,querylength,
15645 				  watsonp,genestrand,jump_late_p,pairpool,defect_rate);
15646 	}
15647 	if (trim3p == true) {
15648 	  trimp = true;
15649 	}
15650 	pairs = List_reverse(path);
15651       }
15652 
15653       /* Important to end the alignment with Pair_trim_ends, or else trimming will be faulty */
15654       /* Also, doing trimming within each loop yields better results in a small number of cases */
15655       pairs = Pair_trim_ends(&trim5p_ignore,&trim3p_ignore,pairs,*ambig_end_length_5,*ambig_end_length_3);
15656     }
15657 
15658     debug3(printf("After trim ends:\n"));
15659     debug3(Pair_dump_list(pairs,true));
15660   }
15661 
15662   if (watsonp) {
15663     path = List_reverse(pairs);
15664     path = clip_path_end3_chromosomal_bounds(path,chroffset,chrhigh);
15665     path = clean_path_end3_gap_indels(path);
15666     pairs = List_reverse(path);
15667   } else {
15668     pairs = clip_pairs_end5_chromosomal_bounds(pairs);
15669     pairs = clean_pairs_end5_gap_indels(pairs);
15670   }
15671 
15672   /* Cannot put trim_novel_spliceends here, which can generate an infinite loop in calling procedures */
15673 
15674   debug3(printf("Final result of path_trim: chroffset = %u, cdna_direction %d\n",
15675 		chroffset,*cdna_direction));
15676   debug3(Pair_dump_list(pairs,true));
15677   debug3(printf("\n"));
15678 
15679   return pairs;
15680 }
15681 
15682 
15683 /* Using alloca for last_genomedp5 and last_genomedp3 can cause stack overflow */
15684 Stage3middle_T
Stage3_compute_middle(List_T stage2pairs,List_T all_stage2_starts,List_T all_stage2_ends,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,int sense_try)15685 Stage3_compute_middle (List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends,
15686 #ifdef PMAP
15687 		       char *queryaaseq_ptr,
15688 #endif
15689 		       char *queryseq_ptr, char *queryuc_ptr, int querylength,
15690 		       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
15691 		       bool watsonp, int genestrand, bool jump_late_p, int maxpeelback,
15692 #ifndef GSNAP
15693 		       Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
15694 #endif
15695 		       Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
15696 		       int sense_try) {
15697   int goodness_fwd, goodness_rev;
15698   Chrpos_T *last_genomedp5_fwd = NULL, *last_genomedp3_fwd = NULL, *last_genomedp5_rev = NULL, *last_genomedp3_rev = NULL;
15699   List_T pairs_fwd, pairs_rev, path_fwd, path_rev;
15700   double defect_rate_fwd, defect_rate_rev;
15701 
15702 
15703   /* stage2pairs = Stage2_middle(stage2); */
15704 #if defined(DEBUG0) || defined(DEBUG11)
15705   if (watsonp == true) {
15706     printf("Stage 3: *** Starting stage 3 middle at chrnum #%d, chrstart %u, chrend %u, sense try %d)\n",
15707 	   chrnum,((Pair_T) stage2pairs->first)->genomepos,
15708 	   ((Pair_T) List_last_value(stage2pairs))->genomepos,sense_try);
15709   } else {
15710     printf("Stage 3: *** Starting stage 3 middle at chrnum #%d, chrstart %u, chrend %u, sense try %d)\n",
15711 	   chrnum,(chrhigh - chroffset) - ((Pair_T) stage2pairs->first)->genomepos,
15712 	   (chrhigh - chroffset) - ((Pair_T) List_last_value(stage2pairs))->genomepos,sense_try);
15713   }
15714 #endif
15715 
15716 #ifdef DEBUG
15717   if (watsonp == true) {
15718     printf("Stage 3: *** Stage3_compute_middle: Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, sense try %d)\n",
15719 	   chrnum,((Pair_T) stage2pairs->first)->genomepos,
15720 	   ((Pair_T) List_last_value(stage2pairs))->genomepos,sense_try);
15721   } else {
15722     printf("Stage 3: *** Stage3_compute_middle: Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, sense try %d)\n",
15723 	   chrnum,(chrhigh - chroffset) - ((Pair_T) stage2pairs->first)->genomepos,
15724 	   (chrhigh - chroffset) - ((Pair_T) List_last_value(stage2pairs))->genomepos,sense_try);
15725   }
15726 
15727   printf("stage2pairs\n");
15728   Pair_dump_list(stage2pairs,true);
15729 #endif
15730 
15731 
15732 #ifdef PMAP
15733   pairs_fwd = stage2pairs;
15734   pairs_rev = (List_T) NULL;
15735   /* do_final_p = true; */
15736 #else
15737   if (splicingp == false) {
15738     pairs_fwd = stage2pairs;
15739     pairs_rev = (List_T) NULL;
15740   } else if (sense_try > 0) {
15741     pairs_fwd = stage2pairs;
15742     pairs_rev = (List_T) NULL;
15743   } else if (sense_try < 0) {
15744     pairs_fwd = (List_T) NULL;
15745     pairs_rev = stage2pairs;
15746   } else {
15747     /* sense_try == 0: Should try both even if no introns (cf, AA011563) */
15748     pairs_fwd = stage2pairs;
15749     pairs_rev = Pairpool_copy(stage2pairs,pairpool);
15750   }
15751 #endif
15752 
15753 
15754   /* 1.  Middle */
15755   if (pairs_fwd == NULL) {
15756     path_fwd = (List_T) NULL;
15757   } else {
15758     last_genomedp5_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15759     last_genomedp3_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15760     debug(printf("*** Solve path_fwd\n"));
15761     path_fwd = path_compute_dir(&defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
15762 				watsonp,genestrand,jump_late_p,
15763 #ifdef PMAP
15764 				queryaaseq_ptr,
15765 #endif
15766 				queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
15767 				maxpeelback,
15768 #ifndef GSNAP
15769 				oligoindices_minor,diagpool,cellpool,
15770 #endif
15771 				pairpool,dynprogL,dynprogM,dynprogR,
15772 				last_genomedp5_fwd,last_genomedp3_fwd/*,clean:ends_p:true*/);
15773     FREE(last_genomedp3_fwd);
15774     FREE(last_genomedp5_fwd);
15775   }
15776 
15777   if (pairs_rev == NULL) {
15778     path_rev = (List_T) NULL;
15779   } else {
15780     last_genomedp5_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15781     last_genomedp3_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15782     debug(printf("*** Solve path_rev\n"));
15783     path_rev = path_compute_dir(&defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
15784 				watsonp,genestrand,jump_late_p,
15785 #ifdef PMAP
15786 				queryaaseq_ptr,
15787 #endif
15788 				queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
15789 				maxpeelback,
15790 #ifndef GSNAP
15791 				oligoindices_minor,diagpool,cellpool,
15792 #endif
15793 				pairpool,dynprogL,dynprogM,dynprogR,
15794 				last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:true*/);
15795     FREE(last_genomedp5_rev);
15796     FREE(last_genomedp3_rev);
15797   }
15798 
15799   pairs_fwd = assign_gap_types(path_fwd,/*cdna_direction*/+1,watsonp,queryseq_ptr,
15800 			       chrnum,chroffset,chrhigh,pairpool);
15801   pairs_rev = assign_gap_types(path_rev,/*cdna_direction*/-1,watsonp,queryseq_ptr,
15802 			       chrnum,chroffset,chrhigh,pairpool);
15803 
15804   goodness_fwd = Pair_goodness_simple(pairs_fwd);
15805   goodness_rev = Pair_goodness_simple(pairs_rev);
15806   debug0(printf("goodness %d fwd, %d rev\n",goodness_fwd,goodness_rev));
15807   if (goodness_fwd >= goodness_rev) {
15808     return Stage3middle_new(goodness_fwd,defect_rate_fwd,defect_rate_rev,pairs_fwd,pairs_rev,
15809 			    chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
15810 			    all_stage2_starts,all_stage2_ends);
15811   } else {
15812     return Stage3middle_new(goodness_rev,defect_rate_fwd,defect_rate_rev,pairs_fwd,pairs_rev,
15813 			    chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
15814 			    all_stage2_starts,all_stage2_ends);
15815   }
15816 }
15817 
15818 
15819 /* Using alloca for last_genomedp5 and last_genomedp3 can cause stack overflow */
15820 struct Pair_T *
Stage3_compute_ends(int * cdna_direction,int * sensedir,List_T * finalpairs1,int * npairs1,int * goodness1,int * matches1,int * nmatches_posttrim_1,int * max_match_length_1,int * ambig_end_length_5_1,int * ambig_end_length_3_1,Splicetype_T * ambig_splicetype_5_1,Splicetype_T * ambig_splicetype_3_1,double * ambig_prob_5_1,double * ambig_prob_3_1,int * unknowns1,int * mismatches1,int * qopens1,int * qindels1,int * topens1,int * tindels1,int * ncanonical1,int * nsemicanonical1,int * nnoncanonical1,double * avg_splice_score_1,struct Pair_T ** pairarray2,List_T * finalpairs2,int * npairs2,int * goodness2,int * matches2,int * nmatches_posttrim_2,int * max_match_length_2,int * ambig_end_length_5_2,int * ambig_end_length_3_2,Splicetype_T * ambig_splicetype_5_2,Splicetype_T * ambig_splicetype_3_2,double * ambig_prob_5_2,double * ambig_prob_3_2,int * unknowns2,int * mismatches2,int * qopens2,int * qindels2,int * topens2,int * tindels2,int * ncanonical2,int * nsemicanonical2,int * nnoncanonical2,double * avg_splice_score_2,Stage3middle_T stage3middle,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,int skiplength,int query_subseq_offset,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,int maxpeelback,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,int sense_filter,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool)15821 Stage3_compute_ends (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *npairs1, int *goodness1,
15822 		     int *matches1, int *nmatches_posttrim_1, int *max_match_length_1,
15823 		     int *ambig_end_length_5_1, int *ambig_end_length_3_1,
15824 		     Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1,
15825 		     double *ambig_prob_5_1, double *ambig_prob_3_1,
15826 		     int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1,
15827 		     int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1,
15828 #ifdef GSNAP
15829 		     struct Pair_T **pairarray2, List_T *finalpairs2, int *npairs2, int *goodness2,
15830 		     int *matches2, int *nmatches_posttrim_2, int *max_match_length_2,
15831 		     int *ambig_end_length_5_2, int *ambig_end_length_3_2,
15832 		     Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2,
15833 		     double *ambig_prob_5_2, double *ambig_prob_3_2,
15834 		     int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2,
15835 		     int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2,
15836 #endif
15837 
15838 		     Stage3middle_T stage3middle,
15839 #ifdef PMAP
15840 		     char *queryaaseq_ptr,
15841 #endif
15842 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
15843 		     int skiplength, int query_subseq_offset,
15844 		     Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
15845 		     int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
15846 		     int sense_filter, Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool) {
15847   struct Pair_T *pairarray1;
15848   List_T all_stage2_starts, all_stage2_ends, p;
15849   double defect_rate_fwd, defect_rate_rev, defect_rate_temp, defect_rate;
15850 
15851   Chrnum_T chrnum;
15852   Univcoord_T chroffset, chrhigh;
15853   /* Chrpos_T chrlength; */
15854 
15855   bool watsonp, jump_late_p;
15856   int genestrand;
15857 
15858   Univcoord_T position;
15859   Chrpos_T chrstart, chrend, overall_end_distance;
15860   Pair_T start, end;
15861   int querypos;
15862 
15863   Chrpos_T *last_genomedp5_fwd = NULL, *last_genomedp3_fwd = NULL, *last_genomedp5_rev = NULL, *last_genomedp3_rev = NULL;
15864   List_T pairs_pretrim, pairs_fwd, pairs_rev, best_pairs, temp_pairs, path_fwd, path_rev, best_path, temp_path;
15865   List_T copy;
15866   List_T joined_ends, joined_starts;
15867   int nknown_fwd, ncanonical_fwd, nsemicanonical_fwd, nnoncanonical_fwd,
15868     nknown_rev, ncanonical_rev, nsemicanonical_rev, nnoncanonical_rev;
15869   int nbadintrons_fwd, nbadintrons_rev;
15870   double min_splice_prob_1;
15871   double max_intron_score_fwd = 0.0, max_intron_score_rev = 0.0,
15872     avg_donor_score_fwd = 0.0, avg_acceptor_score_fwd = 0.0,
15873     avg_donor_score_rev = 0.0, avg_acceptor_score_rev = 0.0;
15874   int nmatches_fwd, nmismatches_fwd, nmatches_rev, nmismatches_rev, nindels_fwd, nindels_rev;
15875   int fwd_ambig_end_length_5 = 0, fwd_ambig_end_length_3 = 0, rev_ambig_end_length_5 = 0, rev_ambig_end_length_3 = 0, temp_ambig_end_length;
15876   Splicetype_T fwd_ambig_splicetype_5, fwd_ambig_splicetype_3, rev_ambig_splicetype_5, rev_ambig_splicetype_3, temp_ambig_splicetype;
15877   double fwd_ambig_prob_5, fwd_ambig_prob_3, rev_ambig_prob_5, rev_ambig_prob_3, temp_ambig_prob;
15878 #ifdef GSNAP
15879   List_T pairs_fwd_copy, pairs_rev_copy;
15880   double min_splice_prob_2;
15881 #endif
15882 
15883 
15884   last_genomedp5_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15885   last_genomedp3_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15886   last_genomedp5_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15887   last_genomedp3_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
15888 
15889   chrnum = stage3middle->chrnum;
15890   chroffset = stage3middle->chroffset;
15891   chrhigh = stage3middle->chrhigh;
15892   /* chrlength = stage3middle->chrlength; */
15893 
15894   watsonp = stage3middle->watsonp;
15895   genestrand = stage3middle->genestrand;
15896   jump_late_p = watsonp ? false : true;
15897 
15898   defect_rate_fwd = stage3middle->defect_rate_fwd;
15899   defect_rate_rev = stage3middle->defect_rate_rev;
15900 
15901   pairs_fwd = stage3middle->pairs_fwd;
15902   pairs_rev = stage3middle->pairs_rev;
15903   path_fwd = List_reverse(pairs_fwd);
15904   path_rev = List_reverse(pairs_rev);
15905 
15906   if (path_fwd != NULL && path_rev != NULL) {
15907     debug11(printf("Calling score_introns for path_fwd\n"));
15908     pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
15909 			      &nknown_fwd,&ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
15910 			      chrnum,chroffset,chrhigh
15911 #ifdef WASTE
15912 			      ,pairpool
15913 #endif
15914 			      );
15915 
15916     debug11(printf("Calling score_introns for path_rev\n"));
15917     pairs_rev = score_introns(&max_intron_score_rev,&avg_donor_score_rev,&avg_acceptor_score_rev,
15918 			      &nknown_rev,&ncanonical_rev,&nbadintrons_rev,path_rev,/*cdna_direction*/-1,watsonp,
15919 			      chrnum,chroffset,chrhigh
15920 #ifdef WASTE
15921 			      ,pairpool
15922 #endif
15923 			      );
15924 
15925     if ((*cdna_direction = initial_cdna_direction(pairs_fwd,pairs_rev,
15926 						  avg_donor_score_fwd,avg_acceptor_score_fwd,
15927 						  avg_donor_score_rev,avg_acceptor_score_rev)) > 0) {
15928       debug11(printf("Initial cdna direction is %d\n",*cdna_direction));
15929       path_fwd = List_reverse(pairs_fwd);
15930       path_rev = (List_T) NULL;
15931 
15932     } else if (*cdna_direction < 0) {
15933       debug11(printf("Initial cdna direction is %d\n",*cdna_direction));
15934       path_fwd = (List_T) NULL;
15935       path_rev = List_reverse(pairs_rev);
15936 
15937     } else {
15938       debug11(printf("Initial cdna direction is %d\n",*cdna_direction));
15939       path_fwd = List_reverse(pairs_fwd);
15940       path_rev = List_reverse(pairs_rev);
15941     }
15942   }
15943 
15944 
15945 
15946   if (circularp[chrnum] == true) {
15947     overall_end_distance = overall_end_distance_circular;
15948   } else {
15949     overall_end_distance = overall_end_distance_linear;
15950   }
15951 
15952   /* 2.  3' and 5' ends (possibly multiple) */
15953   if (path_fwd == NULL) {
15954     pairs_fwd = (List_T) NULL;
15955   } else {
15956     /* 3' end */
15957     end = (Pair_T) List_head(path_fwd);
15958     if ((querypos = end->querypos) >= querylength - 10) {
15959       all_stage2_ends = (List_T) NULL;
15960 
15961     } else {
15962       if (watsonp == true) {
15963 	chrstart = end->genomepos;
15964 
15965 	/* We actually don't care if alignment goes past chrhigh,
15966 	   since that can be fixed later by trimming.  But we do care
15967 	   about going past the genome bounds */
15968 	if ((position = chroffset + chrstart + overall_end_distance) > genome_totallength) {
15969 	  chrend = (Chrpos_T) (genome_totallength - chroffset);
15970 	} else {
15971 	  chrend = (Chrpos_T) (position - chroffset);
15972 	}
15973 
15974       } else {
15975 	chrend = (chrhigh - chroffset) - end->genomepos;
15976 
15977 	/* This check is necessary, because we cannot have negative values for chrstart */
15978 	if (chrend < overall_end_distance) {
15979 	  chrstart = 0;
15980 	} else {
15981 	  chrstart = chrend - overall_end_distance;
15982 	}
15983       }
15984 
15985       debug(printf("QUERYPOS %d, CHRSTART %u, CHREND %u\n",querypos,chrstart,chrend));
15986       all_stage2_ends = Stage2_compute_ends(&(queryseq_ptr[querypos]),&(queryuc_ptr[querypos]),
15987 					    /*querylength*/querylength - querypos,/*query_offset*/querypos,
15988 					    chrstart,chrend,
15989 					    chroffset,chrhigh,/*plusp*/watsonp,genestrand,
15990 					    oligoindices_minor,pairpool,diagpool,cellpool,/*localp*/false,
15991 					    /*skip_repetitive_p*/false,/*favor_right_p*/false,/*max_nalignments*/1,
15992 					    /*debug_graphic_p*/false);
15993 #ifdef DEBUG
15994       printf("fwd stage2ends\n");
15995       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
15996 	Pair_dump_list((List_T) List_head(p),true);
15997       }
15998 #endif
15999     }
16000 
16001     if (all_stage2_ends == NULL) {
16002       best_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,&fwd_ambig_prob_3,
16003 				    defect_rate_fwd,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
16004 				    jump_late_p,querylength,
16005 				    queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16006 				    knownsplice_limit_low,knownsplice_limit_high,
16007 				    maxpeelback,pairpool,dynprogL);
16008     } else {
16009       best_path = Pairpool_remove_gapholders(path_fwd); /* Pairpool_join cannot handle gapholders */
16010       joined_ends = (List_T) NULL;
16011       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
16012 #ifdef PMAP
16013         copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16014 #else
16015 	if (path_rev == NULL) {
16016 	  /* Won't need ends anymore */
16017 	  copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16018 	} else {
16019 	  copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
16020 	}
16021 #endif
16022 	joined_ends = List_push(joined_ends,(void *) copy);
16023       }
16024 
16025       for (p = joined_ends; p != NULL; p = List_next(p)) {
16026         copy = (List_T) List_head(p);
16027 	debug(printf("*** Solve path_fwd joined end\n"));
16028 	path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/+1,
16029 				    watsonp,genestrand,jump_late_p,
16030 #ifdef PMAP
16031 				    queryaaseq_ptr,
16032 #endif
16033 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16034 				    maxpeelback,
16035 #ifndef GSNAP
16036 				    oligoindices_minor,diagpool,cellpool,
16037 #endif
16038 				    pairpool,dynprogL,dynprogM,dynprogR,
16039 				    last_genomedp5_fwd,last_genomedp3_fwd/*,clean_ends_p:false*/);
16040 
16041 	temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
16042 				      defect_rate_temp,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
16043 				      jump_late_p,querylength,
16044 				      queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16045 				      knownsplice_limit_low,knownsplice_limit_high,
16046 				      maxpeelback,pairpool,dynprogL);
16047 
16048 	if (temp_path != NULL && end_compare(best_path,temp_path,/*cdna_direction*/+1,watsonp,
16049 					     chrnum,chroffset,chrhigh,/*pairsp*/false) > 0) {
16050 	  best_path = temp_path;
16051 	  fwd_ambig_end_length_3 = temp_ambig_end_length;
16052 	  fwd_ambig_splicetype_3 = temp_ambig_splicetype;
16053 	  fwd_ambig_prob_3 = temp_ambig_prob;
16054 	  defect_rate_fwd = defect_rate_temp;
16055 	  debug21(printf("New best path:\n"));
16056 	  debug21(Pair_dump_list(best_path,true));
16057 	}
16058       }
16059 
16060       List_free(&joined_ends);
16061       List_free(&all_stage2_ends);
16062     }
16063 
16064     /* 5' end */
16065     pairs_fwd = List_reverse(best_path);
16066 
16067     start = (Pair_T) List_head(pairs_fwd);
16068     if ((querypos = start->querypos) <= 10) {
16069       all_stage2_starts = (List_T) NULL;
16070 
16071     } else {
16072       if (watsonp == true) {
16073 	chrend = start->genomepos;
16074 	/* This check is necessary, because we cannot have negative values for chrstart */
16075 	if (chrend < overall_end_distance) {
16076 	  chrstart = 0;
16077 	} else {
16078 	  chrstart = chrend - overall_end_distance;
16079 	}
16080 
16081       } else {
16082 	chrstart = (chrhigh - chroffset) - start->genomepos;
16083 
16084 	/* We actually don't care if alignment goes past chrhigh,
16085 	   since that can be fixed later by trimming.  But we do care
16086 	   about going past the genome bounds */
16087 	if ((position = chroffset + chrstart + overall_end_distance) > genome_totallength) {
16088 	  chrend = (Chrpos_T) (genome_totallength - chroffset);
16089 	} else {
16090 	  chrend = (Chrpos_T) (position - chroffset);
16091 	}
16092       }
16093       debug(printf("QUERYPOS %d, CHRSTART %u, CHREND %u\n",querypos,chrstart,chrend));
16094       all_stage2_starts = Stage2_compute_starts(&(queryseq_ptr[0]),&(queryuc_ptr[0]),querypos,/*query_offset*/0,
16095 						chrstart,chrend,
16096 						chroffset,chrhigh,/*plusp*/watsonp,genestrand,
16097 						oligoindices_minor,pairpool,diagpool,cellpool,/*localp*/false,
16098 						/*skip_repetitive_p*/false,/*favor_right_p*/false,/*max_nalignments*/1,
16099 						/*debug_graphic_p*/false);
16100 #ifdef DEBUG
16101       printf("fwd stage2starts\n");
16102       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
16103 	Pair_dump_list((List_T) List_head(p),true);
16104       }
16105 #endif
16106     }
16107 
16108     if (all_stage2_starts == NULL) {
16109       best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,&fwd_ambig_prob_5,
16110 				     defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
16111 				     watsonp,genestrand,jump_late_p,
16112 				     queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16113 				     knownsplice_limit_low,knownsplice_limit_high,
16114 				     maxpeelback,pairpool,dynprogR);
16115     } else {
16116       best_pairs = Pairpool_remove_gapholders(pairs_fwd); /* Pairpool_join cannot handle gapholders */
16117       joined_starts = (List_T) NULL;
16118       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
16119 #ifdef PMAP
16120         copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16121 #else
16122 	if (path_rev == NULL) {
16123 	  /* Won't need ends anymore */
16124 	  copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16125 	} else {
16126 	  copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
16127 	}
16128 #endif
16129 	joined_starts = List_push(joined_starts,(void *) copy);
16130       }
16131 
16132       for (p = joined_starts; p != NULL; p = List_next(p)) {
16133 	copy = (List_T) List_head(p);
16134 	debug(printf("*** Solve path_fwd joined start\n"));
16135 	path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/+1,
16136 				    watsonp,genestrand,jump_late_p,
16137 #ifdef PMAP
16138 				    queryaaseq_ptr,
16139 #endif
16140 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16141 				    maxpeelback,
16142 #ifndef GSNAP
16143 				    oligoindices_minor,diagpool,cellpool,
16144 #endif
16145 				    pairpool,dynprogL,dynprogM,dynprogR,
16146 				    last_genomedp5_fwd,last_genomedp3_fwd/*,clean_ends_p:false*/);
16147 
16148 	temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
16149 				       defect_rate_temp,/*pairs*/List_reverse(path_fwd),
16150 				       /*cdna_direction*/+1,watsonp,genestrand,jump_late_p,
16151 				       queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16152 				       knownsplice_limit_low,knownsplice_limit_high,
16153 				       maxpeelback,pairpool,dynprogR);
16154 
16155 	if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/+1,watsonp,
16156 					      chrnum,chroffset,chrhigh,/*pairsp*/true) > 0) {
16157 	  best_pairs = temp_pairs;
16158 	  fwd_ambig_end_length_5 = temp_ambig_end_length;
16159 	  fwd_ambig_splicetype_5 = temp_ambig_splicetype;
16160 	  fwd_ambig_prob_5 = temp_ambig_prob;
16161 	  defect_rate_fwd = defect_rate_temp;
16162 	  debug21(printf("New best pairs:\n"));
16163 	  debug21(Pair_dump_list(best_pairs,true));
16164 	}
16165       }
16166 
16167       List_free(&joined_starts);
16168       List_free(&all_stage2_starts);
16169     }
16170 
16171     pairs_fwd = best_pairs;
16172   }
16173 
16174 
16175 #ifndef PMAP
16176   if (path_rev == NULL) {
16177     pairs_rev = (List_T) NULL;
16178   } else {
16179     /* 3' end */
16180     end = (Pair_T) List_head(path_rev);
16181     if ((querypos = end->querypos) >= querylength - 10) {
16182       all_stage2_ends = (List_T) NULL;
16183 
16184     } else {
16185       if (watsonp == true) {
16186 	chrstart = end->genomepos;
16187 
16188 	/* We actually don't care if alignment goes past chrhigh,
16189 	   since that can be fixed later by trimming.  But we do care
16190 	   about going past the genome bounds */
16191 	if ((position = chroffset + chrstart + overall_end_distance) > genome_totallength) {
16192 	  chrend = (Chrpos_T) (genome_totallength - chroffset);
16193 	} else {
16194 	  chrend = (Chrpos_T) (position - chroffset);
16195 	}
16196 
16197       } else {
16198 	chrend = (chrhigh - chroffset) - end->genomepos;
16199 
16200 	/* This check is necessary, because we cannot have negative values for chrstart */
16201 	if (chrend < overall_end_distance) {
16202 	  chrstart = 0;
16203 	} else {
16204 	  chrstart = chrend - overall_end_distance;
16205 	}
16206       }
16207 
16208       debug(printf("QUERYPOS %d, CHRSTART %u, CHREND %u\n",querypos,chrstart,chrend));
16209       all_stage2_ends = Stage2_compute_ends(&(queryseq_ptr[querypos]),&(queryuc_ptr[querypos]),
16210 					    /*querylength*/querylength - querypos,/*query_offset*/querypos,
16211 					    chrstart,chrend,
16212 					    chroffset,chrhigh,/*plusp*/watsonp,genestrand,
16213 					    oligoindices_minor,pairpool,diagpool,cellpool,/*localp*/false,
16214 					    /*skip_repetitive_p*/false,/*favor_right_p*/false,/*max_nalignments*/1,
16215 					    /*debug_graphic_p*/false);
16216 #ifdef DEBUG
16217       printf("rev stage2ends\n");
16218       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
16219 	Pair_dump_list((List_T) List_head(p),true);
16220       }
16221 #endif
16222     }
16223 
16224     if (all_stage2_ends == NULL) {
16225       best_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,&rev_ambig_prob_3,
16226 				    defect_rate_rev,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
16227 				    jump_late_p,querylength,
16228 				    queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16229 				    knownsplice_limit_low,knownsplice_limit_high,
16230 				    maxpeelback,pairpool,dynprogL);
16231 
16232     } else {
16233       best_path = Pairpool_remove_gapholders(path_rev); /* Pairpool_join cannot handle gapholders */
16234       joined_ends = (List_T) NULL;
16235       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
16236 	copy = Pairpool_join_end3(/*path*/path_rev,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16237 	joined_ends = List_push(joined_ends,(void *) copy);
16238       }
16239 
16240       for (p = joined_ends; p != NULL; p = List_next(p)) {
16241         copy = (List_T) List_head(p);
16242 	debug(printf("*** Solve path_rev joined end\n"));
16243 	path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/-1,
16244 				    watsonp,genestrand,jump_late_p,
16245 #ifdef PMAP
16246 				    queryaaseq_ptr,
16247 #endif
16248 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16249 				    maxpeelback,
16250 #ifndef GSNAP
16251 				    oligoindices_minor,diagpool,cellpool,
16252 #endif
16253 				    pairpool,dynprogL,dynprogM,dynprogR,
16254 				    last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:false*/);
16255 
16256 	temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
16257 				      defect_rate_temp,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
16258 				      jump_late_p,querylength,
16259 				      queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16260 				      knownsplice_limit_low,knownsplice_limit_high,
16261 				      maxpeelback,pairpool,dynprogL);
16262 
16263 	if (temp_path != NULL && end_compare(best_path,temp_path,/*cdna_direction*/-1,watsonp,
16264 					     chrnum,chroffset,chrhigh,/*pairsp*/false) > 0) {
16265 	  best_path = temp_path;
16266 	  rev_ambig_end_length_3 = temp_ambig_end_length;
16267 	  rev_ambig_splicetype_3 = temp_ambig_splicetype;
16268 	  rev_ambig_prob_3 = temp_ambig_prob;
16269 	  defect_rate_rev = defect_rate_temp;
16270 	  debug21(printf("New best path:\n"));
16271 	  debug21(Pair_dump_list(best_path,true));
16272 	}
16273       }
16274 
16275       List_free(&joined_ends);
16276       List_free(&all_stage2_ends);
16277     }
16278 
16279     /* 5' end */
16280     pairs_rev = List_reverse(best_path);
16281 
16282     start = (Pair_T) List_head(pairs_rev);
16283     if ((querypos = start->querypos) <= 10) {
16284       all_stage2_starts = (List_T) NULL;
16285 
16286     } else {
16287       if (watsonp == true) {
16288 	chrend = start->genomepos;
16289 	/* This check is necessary, because we cannot have negative values for chrstart */
16290 	if (chrend < overall_end_distance) {
16291 	  chrstart = 0;
16292 	} else {
16293 	  chrstart = chrend - overall_end_distance;
16294 	}
16295 
16296       } else {
16297 	chrstart = (chrhigh - chroffset) - start->genomepos;
16298 
16299 	/* We actually don't care if alignment goes past chrhigh,
16300 	   since that can be fixed later by trimming.  But we do care
16301 	   about going past the genome bounds */
16302 	if ((position = chroffset + chrstart + overall_end_distance) > genome_totallength) {
16303 	  chrend = (Chrpos_T) (genome_totallength - chroffset);
16304 	} else {
16305 	  chrend = (Chrpos_T) (position - chroffset);
16306 	}
16307       }
16308       debug(printf("QUERYPOS %d, CHRSTART %u, CHREND %u\n",querypos,chrstart,chrend));
16309       all_stage2_starts = Stage2_compute_starts(&(queryseq_ptr[0]),&(queryuc_ptr[0]),querypos,/*query_offset*/0,
16310 						chrstart,chrend,
16311 						chroffset,chrhigh,/*plusp*/watsonp,genestrand,
16312 						oligoindices_minor,pairpool,diagpool,cellpool,/*localp*/false,
16313 						/*skip_repetitive_p*/false,/*favor_right_p*/false,/*max_nalignments*/1,
16314 						/*debug_graphic_p*/false);
16315 #ifdef DEBUG
16316       printf("rev stage2starts\n");
16317       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
16318 	Pair_dump_list((List_T) List_head(p),true);
16319       }
16320 #endif
16321     }
16322 
16323     if (all_stage2_starts == NULL) {
16324       best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,&rev_ambig_prob_5,
16325 				     defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
16326 				     watsonp,genestrand,jump_late_p,
16327 				     queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16328 				     knownsplice_limit_low,knownsplice_limit_high,
16329 				     maxpeelback,pairpool,dynprogR);
16330 
16331     } else {
16332       best_pairs = Pairpool_remove_gapholders(pairs_rev); /* Pairpool_join cannot handle gapholders */
16333       joined_starts = (List_T) NULL;
16334       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
16335 	copy = Pairpool_join_end5(/*pairs*/pairs_rev,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16336 	joined_starts = List_push(joined_starts,(void *) copy);
16337       }
16338 
16339       for (p = joined_starts; p != NULL; p = List_next(p)) {
16340 	copy = (List_T) List_head(p);
16341 	debug(printf("*** Solve path_rev joined start\n"));
16342 	path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/-1,
16343 				    watsonp,genestrand,jump_late_p,
16344 #ifdef PMAP
16345 				    queryaaseq_ptr,
16346 #endif
16347 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16348 				    maxpeelback,
16349 #ifndef GSNAP
16350 				    oligoindices_minor,diagpool,cellpool,
16351 #endif
16352 				    pairpool,dynprogL,dynprogM,dynprogR,
16353 				    last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:false*/);
16354 
16355 	temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
16356 				       defect_rate_temp,/*pairs*/List_reverse(path_rev),
16357 				       /*cdna_direction*/-1,watsonp,genestrand,jump_late_p,
16358 				       queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16359 				       knownsplice_limit_low,knownsplice_limit_high,
16360 				       maxpeelback,pairpool,dynprogR);
16361 	if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/-1,watsonp,
16362 					      chrnum,chroffset,chrhigh,/*pairsp*/true) > 0) {
16363 	  best_pairs = temp_pairs;
16364 	  rev_ambig_end_length_5 = temp_ambig_end_length;
16365 	  rev_ambig_splicetype_5 = temp_ambig_splicetype;
16366 	  rev_ambig_prob_5 = temp_ambig_prob;
16367 	  defect_rate_rev = defect_rate_temp;
16368 	  debug21(printf("New best pairs:\n"));
16369 	  debug21(Pair_dump_list(best_pairs,true));
16370 	}
16371       }
16372 
16373       List_free(&joined_starts);
16374       List_free(&all_stage2_starts);
16375     }
16376 
16377     pairs_rev = best_pairs;
16378   }
16379 #endif
16380 
16381 
16382   pairs_fwd = path_compute_final(defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
16383 				 watsonp,genestrand,jump_late_p,querylength,
16384 #ifdef PMAP
16385 				 queryaaseq_ptr,
16386 #endif
16387 				 queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16388 				 maxpeelback,
16389 #ifndef GSNAP
16390 				 oligoindices_minor,diagpool,cellpool,
16391 #endif
16392 				 pairpool,dynprogL,dynprogM,dynprogR,
16393 				 last_genomedp5_fwd,last_genomedp3_fwd);
16394 
16395   pairs_rev = path_compute_final(defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
16396 				 watsonp,genestrand,jump_late_p,querylength,
16397 #ifdef PMAP
16398 				 queryaaseq_ptr,
16399 #endif
16400 				 queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16401 				 maxpeelback,
16402 #ifndef GSNAP
16403 				 oligoindices_minor,diagpool,cellpool,
16404 #endif
16405 				 pairpool,dynprogL,dynprogM,dynprogR,
16406 				 last_genomedp5_rev,last_genomedp3_rev);
16407 
16408   FREE(last_genomedp3_rev);
16409   FREE(last_genomedp5_rev);
16410   FREE(last_genomedp3_fwd);
16411   FREE(last_genomedp5_fwd);
16412 
16413 
16414   debug(printf("Forward:\n"));
16415   debug(Pair_dump_list(pairs_fwd,true));
16416   debug(printf("\n"));
16417 
16418   debug(printf("Reverse:\n"));
16419   debug(Pair_dump_list(pairs_rev,true));
16420   debug(printf("\n"));
16421 
16422   debug11(printf("Forward:\n"));
16423   debug11(Pair_dump_list(pairs_fwd,true));
16424   debug11(printf("\n"));
16425 
16426   debug11(printf("Reverse:\n"));
16427   debug11(Pair_dump_list(pairs_rev,true));
16428   debug11(printf("\n"));
16429 
16430   debug(printf("Intronscores: %f,%f fwd, %f,%f rev\n",
16431 	       avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
16432   if (pairs_rev == NULL) {
16433     pairs_pretrim = pairs_fwd;
16434     *cdna_direction = +1;
16435     *sensedir = SENSE_FORWARD;
16436 
16437   } else if (pairs_fwd == NULL) {
16438     pairs_pretrim = pairs_rev;
16439     *cdna_direction = -1;
16440     *sensedir = SENSE_ANTI;
16441 
16442   } else {
16443     path_fwd = List_reverse(pairs_fwd);
16444     debug11(printf("Calling score_introns for path_fwd before path_trim\n"));
16445     pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
16446 			      &nknown_fwd,&ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
16447 			      chrnum,chroffset,chrhigh
16448 #ifdef WASTE
16449 			      ,pairpool
16450 #endif
16451 			      );
16452     /* alignment_score_fwd = */ score_alignment(&nmatches_fwd,&nmismatches_fwd,&nindels_fwd,
16453 #ifdef COMPLEX_DIRECTION
16454 						&indel_alignment_score_fwd,
16455 #endif
16456 						&nsemicanonical_fwd,&nnoncanonical_fwd,
16457 						pairs_fwd,/*cdna_direction*/+1);
16458 
16459     path_rev = List_reverse(pairs_rev);
16460     debug11(printf("Calling score_introns for path_rev before path_trim\n"));
16461     pairs_rev = score_introns(&max_intron_score_rev,&avg_donor_score_rev,&avg_acceptor_score_rev,
16462 			      &nknown_rev,&ncanonical_rev,&nbadintrons_rev,path_rev,/*cdna_direction*/-1,watsonp,
16463 			      chrnum,chroffset,chrhigh
16464 #ifdef WASTE
16465 			      ,pairpool
16466 #endif
16467 			      );
16468     /* alignment_score_rev = */ score_alignment(&nmatches_rev,&nmismatches_rev,&nindels_rev,
16469 #ifdef COMPLEX_DIRECTION
16470 						&indel_alignment_score_rev,
16471 #endif
16472 						&nsemicanonical_rev,&nnoncanonical_rev,
16473 						pairs_rev,/*cdna_direction*/-1);
16474 
16475     pairs_pretrim = pick_cdna_direction(&(*cdna_direction),&(*sensedir),pairs_fwd,pairs_rev,
16476 					defect_rate_fwd,defect_rate_rev,
16477 					nknown_fwd,ncanonical_fwd,nsemicanonical_fwd,nnoncanonical_fwd,nbadintrons_fwd,
16478 					nknown_rev,ncanonical_rev,nsemicanonical_rev,nnoncanonical_rev,nbadintrons_rev,
16479 					max_intron_score_fwd,avg_donor_score_fwd,avg_acceptor_score_fwd,
16480 					max_intron_score_rev,avg_donor_score_rev,avg_acceptor_score_rev,
16481 #ifdef COMPLEX_DIRECTION
16482 					nmatches_fwd,nmismatches_fwd,nmatches_rev,nmismatches_rev,nindels_fwd,nindels_rev,
16483 					indel_alignment_score_fwd,indel_alignment_score_rev,
16484 #endif
16485 					sense_filter);
16486   }
16487 
16488 
16489   if (pairs_pretrim == NULL) {
16490 #if 0
16491     *npairs1 = 0;
16492     *goodness1 = 0;
16493     *nmatches_posttrim_1 = 0;
16494     *ambig_end_length_5_1 = *ambig_end_length_3_1 = 0;
16495     *ambig_prob_5_1 = *ambig_prob_3_1 = 0.0;
16496 #endif
16497     return (struct Pair_T *) NULL;
16498   }
16499 
16500   if (splicingp == false) {
16501     *sensedir = SENSE_NULL;
16502   }
16503 
16504 #ifdef GSNAP
16505   if (*cdna_direction == 0) {
16506     /* If both pairarrays are returned, then first one is fwd and second one is rev */
16507     debug11(printf("Initial cdna_direction is 0\n"));
16508     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
16509     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
16510     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
16511     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
16512     *ambig_prob_5_1 = fwd_ambig_prob_5;
16513     *ambig_prob_3_1 = fwd_ambig_prob_3;
16514 
16515     *cdna_direction = +1;
16516 
16517     /* path_trim alters pairs_fwd, so make a copy in case we use it for pairs_pretrim */
16518     pairs_fwd_copy = Pairpool_copy(pairs_fwd,pairpool);
16519     *finalpairs1 = path_trim(defect_rate_fwd,&(*ambig_end_length_5_1),&(*ambig_end_length_3_1),
16520 			     &(*ambig_splicetype_5_1),&(*ambig_splicetype_3_1),
16521 			     &(*ambig_prob_5_1),&(*ambig_prob_3_1),
16522 			     pairs_fwd_copy,&(*cdna_direction),watsonp,genestrand,
16523 			     jump_late_p,querylength,
16524 #ifdef GSNAP
16525 			     /*orig_sensedir*/SENSE_FORWARD,
16526 #endif
16527 			     queryseq_ptr,queryuc_ptr,
16528 			     chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
16529 			     maxpeelback,pairpool,dynprogL,dynprogR);
16530 
16531     *ambig_end_length_5_2 = rev_ambig_end_length_5;
16532     *ambig_end_length_3_2 = rev_ambig_end_length_3;
16533     *ambig_splicetype_5_2 = rev_ambig_splicetype_5;
16534     *ambig_splicetype_3_2 = rev_ambig_splicetype_3;
16535     *ambig_prob_5_2 = rev_ambig_prob_5;
16536     *ambig_prob_3_2 = rev_ambig_prob_3;
16537 
16538     *cdna_direction = -1;
16539 
16540     pairs_rev_copy = Pairpool_copy(pairs_rev,pairpool);
16541     *finalpairs2 = path_trim(defect_rate_rev,&(*ambig_end_length_5_2),&(*ambig_end_length_3_2),
16542 			     &(*ambig_splicetype_5_2),&(*ambig_splicetype_3_2),
16543 			     &(*ambig_prob_5_2),&(*ambig_prob_3_2),
16544 			     pairs_rev_copy,&(*cdna_direction),watsonp,genestrand,
16545 			     jump_late_p,querylength,
16546 #ifdef GSNAP
16547 			     /*orig_sensedir*/SENSE_ANTI,
16548 #endif
16549 			     queryseq_ptr,queryuc_ptr,
16550 			     chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
16551 			     maxpeelback,pairpool,dynprogL,dynprogR);
16552 
16553     if (*finalpairs1 != NULL && *finalpairs2 != NULL) {
16554       debug11(printf("Both directions are non-null, so returning both\n"));
16555       /* Pairarray 1 (cdna_direction +1): */
16556       *nmatches_posttrim_1 = Pair_nmatches_posttrim(&(*max_match_length_1),*finalpairs1,/*pos5*/*ambig_end_length_5_1,
16557 						    /*pos3*/querylength - (*ambig_end_length_3_1));
16558       pairarray1 = make_pairarray(&(*npairs1),&(*finalpairs1),/*cdna_direction*/+1,watsonp,
16559 				  pairpool,queryseq_ptr,chroffset,chrhigh,
16560 				  ngap,query_subseq_offset,skiplength);
16561       *goodness1 = Pair_fracidentity_array(&(*matches1),&(*unknowns1),&(*mismatches1),
16562 					   &(*qopens1),&(*qindels1),&(*topens1),&(*tindels1),
16563 					   &(*ncanonical1),&(*nsemicanonical1),&(*nnoncanonical1),
16564 					   &min_splice_prob_1,pairarray1,*npairs1,/*cdna_direction*/+1);
16565       *avg_splice_score_1 = avg_donor_score_fwd + avg_acceptor_score_fwd;
16566 
16567 
16568       debug0(printf("Result 1 (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
16569 		    *npairs1,*matches1,*mismatches1,*qopens1,*qindels1,*topens1,*tindels1,*avg_splice_score_1));
16570       debug0(Pair_dump_array(pairarray1,*npairs1,/*zerobasedp*/true));
16571 
16572       /* Note avg_donor_score_fwd and so on do not include evaluations
16573 	 of the end splice junctions.  So if cdna_direction == 0,
16574 	 callers should assume that the sensedir is not known */
16575 
16576       if (0 /*&& Pair_identical_p(*finalpairs1,*finalpairs2) == true*/) {
16577 	/* This causes misses in resolve-inside procedures */
16578 	debug0(printf("Result 2 is identical to Result 1, so not returning it\n"));
16579 	*pairarray2 = (struct Pair_T *) NULL;
16580 
16581       } else {
16582 	/* Pairarray 2 (cdna_direction -1): */
16583 	*nmatches_posttrim_2 = Pair_nmatches_posttrim(&(*max_match_length_2),*finalpairs2,/*pos5*/*ambig_end_length_5_2,
16584 						      /*pos3*/querylength - (*ambig_end_length_3_2));
16585 	*pairarray2 = make_pairarray(&(*npairs2),&(*finalpairs2),/*cdna_direction*/-1,watsonp,
16586 				     pairpool,queryseq_ptr,chroffset,chrhigh,
16587 				     ngap,query_subseq_offset,skiplength);
16588 	*goodness2 = Pair_fracidentity_array(&(*matches2),&(*unknowns2),&(*mismatches2),
16589 					     &(*qopens2),&(*qindels2),&(*topens2),&(*tindels2),
16590 					     &(*ncanonical2),&(*nsemicanonical2),&(*nnoncanonical2),
16591 					     &min_splice_prob_2,*pairarray2,*npairs2,/*cdna_direction*/-1);
16592 	*avg_splice_score_2 = avg_donor_score_rev + avg_acceptor_score_rev;
16593 
16594 	debug0(printf("Result 2 (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
16595 		      *npairs2,*matches2,*mismatches2,*qopens2,*qindels2,*topens2,*tindels2,*avg_splice_score_2));
16596 	debug0(Pair_dump_array(*pairarray2,*npairs2,/*zerobasedp*/true));
16597       }
16598 
16599       *cdna_direction = 0;
16600       *sensedir = SENSE_NULL;
16601       return pairarray1;
16602 
16603     } else if (*finalpairs1 != NULL) {
16604       debug11(printf("Only forward direction is non-null, so retrying...\n"));
16605       pairs_pretrim = pairs_fwd;
16606       *cdna_direction = +1;
16607       /* Continue below */
16608 
16609     } else if (*finalpairs2 != NULL) {
16610       debug11(printf("Only reverse direction is non-null, so retrying...\n"));
16611       pairs_pretrim = pairs_rev;
16612       *cdna_direction = -1;
16613       /* Continue below */
16614 
16615     } else {
16616       return (struct Pair_T *) NULL;
16617     }
16618   }
16619 #endif
16620 
16621   if (*cdna_direction > 0) {
16622     debug11(printf("Solving for forward direction\n"));
16623     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
16624     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
16625     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
16626     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
16627     *ambig_prob_5_1 = fwd_ambig_prob_5;
16628     *ambig_prob_3_1 = fwd_ambig_prob_3;
16629     *sensedir = SENSE_FORWARD;
16630     *avg_splice_score_1 = avg_donor_score_fwd + avg_acceptor_score_fwd;
16631     defect_rate = defect_rate_fwd;
16632 
16633   } else if (*cdna_direction < 0) {
16634     debug11(printf("Solving for reverse direction\n"));
16635     *ambig_end_length_5_1 = rev_ambig_end_length_5;
16636     *ambig_end_length_3_1 = rev_ambig_end_length_3;
16637     *ambig_splicetype_5_1 = rev_ambig_splicetype_5;
16638     *ambig_splicetype_3_1 = rev_ambig_splicetype_3;
16639     *ambig_prob_5_1 = rev_ambig_prob_5;
16640     *ambig_prob_3_1 = rev_ambig_prob_3;
16641     *sensedir = SENSE_ANTI;
16642     *avg_splice_score_1 = avg_donor_score_rev + avg_acceptor_score_rev;
16643     defect_rate = defect_rate_rev;
16644 
16645   } else {
16646 #ifdef GSNAP
16647     abort();
16648 #else
16649     debug11(printf("Solving for unknown (forward) direction\n"));
16650     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
16651     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
16652     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
16653     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
16654     *ambig_prob_5_1 = fwd_ambig_prob_5;
16655     *ambig_prob_3_1 = fwd_ambig_prob_3;
16656     *sensedir = SENSE_FORWARD;
16657     *avg_splice_score_1 = 0.0;
16658     defect_rate = defect_rate_fwd;
16659 #endif
16660   }
16661 
16662   /* Okay for path_trim to alter pairs_pretrim */
16663   *finalpairs1 = path_trim(defect_rate,&(*ambig_end_length_5_1),&(*ambig_end_length_3_1),
16664 			   &(*ambig_splicetype_5_1),&(*ambig_splicetype_3_1),
16665 			   &(*ambig_prob_5_1),&(*ambig_prob_3_1),
16666 			   pairs_pretrim,&(*cdna_direction),watsonp,genestrand,
16667 			   jump_late_p,querylength,
16668 #ifdef GSNAP
16669 			   /*orig_sensedir*/*sensedir,
16670 #endif
16671 			   queryseq_ptr,queryuc_ptr,
16672 			   chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
16673 			   maxpeelback,pairpool,dynprogL,dynprogR);
16674 
16675   *nmatches_posttrim_1 = Pair_nmatches_posttrim(&(*max_match_length_1),*finalpairs1,/*pos5*/*ambig_end_length_5_1,
16676 						/*pos3*/querylength - (*ambig_end_length_3_1));
16677   pairarray1 = make_pairarray(&(*npairs1),&(*finalpairs1),*cdna_direction,watsonp,
16678 			      pairpool,queryseq_ptr,chroffset,chrhigh,
16679 			      ngap,query_subseq_offset,skiplength);
16680   *goodness1 = Pair_fracidentity_array(&(*matches1),&(*unknowns1),&(*mismatches1),
16681 				       &(*qopens1),&(*qindels1),&(*topens1),&(*tindels1),
16682 				       &(*ncanonical1),&(*nsemicanonical1),&(*nnoncanonical1),
16683 				       &min_splice_prob_1,pairarray1,*npairs1,*cdna_direction);
16684   /* *avg_splice_score_1 assigned above */
16685 
16686 
16687   debug0(printf("Result (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
16688 		*npairs1,*matches1,*mismatches1,*qopens1,*qindels1,*topens1,*tindels1,*avg_splice_score_1));
16689   debug0(Pair_dump_array(pairarray1,*npairs1,/*zerobasedp*/true));
16690 
16691 #ifdef GSNAP
16692   *pairarray2 = (struct Pair_T *) NULL;
16693   *npairs2 = 0;
16694 #endif
16695 
16696   debug11(printf("Final cdna direction is %d\n",*cdna_direction));
16697   debug11(printf("Final sensedir is %d\n",*sensedir));
16698 
16699   return pairarray1;
16700 }
16701 
16702 
16703 
16704 /* last_genomedp5 and last_genomedp3 are allocated with CALLOC rather than alloca, because using alloca for these querylength-sized arrays can cause stack overflow */
16705 /* Combines Stage3_compute_middle and Stage3_compute_ends */
16706 struct Pair_T *
Stage3_compute_one(int * cdna_direction,int * sensedir,List_T * finalpairs1,int * npairs1,int * goodness1,int * matches1,int * nmatches_posttrim_1,int * max_match_length_1,int * ambig_end_length_5_1,int * ambig_end_length_3_1,Splicetype_T * ambig_splicetype_5_1,Splicetype_T * ambig_splicetype_3_1,double * ambig_prob_5_1,double * ambig_prob_3_1,int * unknowns1,int * mismatches1,int * qopens1,int * qindels1,int * topens1,int * tindels1,int * ncanonical1,int * nsemicanonical1,int * nnoncanonical1,double * avg_splice_score_1,struct Pair_T ** pairarray2,List_T * finalpairs2,int * npairs2,int * goodness2,int * matches2,int * nmatches_posttrim_2,int * max_match_length_2,int * ambig_end_length_5_2,int * ambig_end_length_3_2,Splicetype_T * ambig_splicetype_5_2,Splicetype_T * ambig_splicetype_3_2,double * ambig_prob_5_2,double * ambig_prob_3_2,int * unknowns2,int * mismatches2,int * qopens2,int * qindels2,int * topens2,int * tindels2,int * ncanonical2,int * nsemicanonical2,int * nnoncanonical2,double * avg_splice_score_2,List_T stage2pairs,List_T all_stage2_starts,List_T all_stage2_ends,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,int querylength,int skiplength,int query_subseq_offset,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Univcoord_T knownsplice_limit_low,Univcoord_T knownsplice_limit_high,bool watsonp,int genestrand,bool jump_late_p,int maxpeelback,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,int sense_try,int sense_filter)16707 Stage3_compute_one (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *npairs1, int *goodness1,
16708 		    int *matches1, int *nmatches_posttrim_1, int *max_match_length_1,
16709 		    int *ambig_end_length_5_1, int *ambig_end_length_3_1,
16710 		    Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1,
16711 		    double *ambig_prob_5_1, double *ambig_prob_3_1,
16712 		    int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1,
16713 		    int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1,
16714 #ifdef GSNAP
16715 		    struct Pair_T **pairarray2, List_T *finalpairs2, int *npairs2, int *goodness2,
16716 		    int *matches2, int *nmatches_posttrim_2, int *max_match_length_2,
16717 		    int *ambig_end_length_5_2, int *ambig_end_length_3_2,
16718 		    Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2,
16719 		    double *ambig_prob_5_2, double *ambig_prob_3_2,
16720 		    int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2,
16721 		    int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2,
16722 #endif
16723 
16724 		    List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends,
16725 #ifdef PMAP
16726 		    char *queryaaseq_ptr,
16727 #endif
16728 		    char *queryseq_ptr, char *queryuc_ptr, int querylength,
16729 		    int skiplength, int query_subseq_offset,
16730 		    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
16731 		    Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
16732 		    bool watsonp, int genestrand, bool jump_late_p,
16733 		    int maxpeelback,
16734 #ifndef GSNAP
16735 		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
16736 #endif
16737 		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
16738 		    int sense_try, int sense_filter) {
16739   struct Pair_T *pairarray1;
16740   List_T p;
16741   Chrpos_T *last_genomedp5_fwd = NULL, *last_genomedp3_fwd = NULL, *last_genomedp5_rev = NULL, *last_genomedp3_rev = NULL;
16742   List_T pairs_pretrim, pairs_fwd, pairs_rev, best_pairs, temp_pairs, path_fwd, path_rev, best_path, temp_path;
16743   List_T copy;
16744   List_T joined_ends, joined_starts;
16745   int nknown_fwd, ncanonical_fwd, nsemicanonical_fwd, nnoncanonical_fwd,
16746     nknown_rev, ncanonical_rev, nsemicanonical_rev, nnoncanonical_rev;
16747   int nbadintrons_fwd, nbadintrons_rev;
16748   double min_splice_prob_1;
16749   double max_intron_score_fwd = 0.0, max_intron_score_rev = 0.0,
16750     avg_donor_score_fwd = 0.0, avg_acceptor_score_fwd = 0.0,
16751     avg_donor_score_rev = 0.0, avg_acceptor_score_rev = 0.0;
16752   double defect_rate_fwd, defect_rate_rev, defect_rate_temp, defect_rate;
16753   int nmatches_fwd, nmismatches_fwd, nmatches_rev, nmismatches_rev, nindels_fwd, nindels_rev;
16754   int fwd_ambig_end_length_5 = 0, fwd_ambig_end_length_3 = 0, rev_ambig_end_length_5 = 0, rev_ambig_end_length_3 = 0, temp_ambig_end_length;
16755   Splicetype_T fwd_ambig_splicetype_5, fwd_ambig_splicetype_3, rev_ambig_splicetype_5, rev_ambig_splicetype_3, temp_ambig_splicetype;
16756   double fwd_ambig_prob_5, fwd_ambig_prob_3, rev_ambig_prob_5, rev_ambig_prob_3, temp_ambig_prob;
16757 #ifdef GSNAP
16758   List_T pairs_fwd_copy, pairs_rev_copy;
16759   double min_splice_prob_2;
16760 #endif
16761 
16762 
16763 #ifdef COMPLEX_DIRECTION
16764   int indel_alignment_score_fwd, indel_alignment_score_rev;
16765 #endif
16766 
16767   /* stage2pairs = Stage2_middle(stage2); */
16768 #if defined(DEBUG0) || defined(DEBUG11)
16769   if (watsonp == true) {
16770     printf("Stage 3: *** Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, query_subseq_offset %d, sense try %d)\n",
16771 	   chrnum,((Pair_T) stage2pairs->first)->genomepos,
16772 	   ((Pair_T) List_last_value(stage2pairs))->genomepos,query_subseq_offset,sense_try);
16773   } else {
16774     printf("Stage 3: *** Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, query_subseq_offset %d, sense try %d)\n",
16775 	   chrnum,(chrhigh - chroffset) - ((Pair_T) stage2pairs->first)->genomepos,
16776 	   (chrhigh - chroffset) - ((Pair_T) List_last_value(stage2pairs))->genomepos,query_subseq_offset,sense_try);
16777   }
16778 #endif
16779 
16780 #ifdef DEBUG
16781   if (watsonp == true) {
16782     printf("Stage 3: *** Stage3_compute_one: Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, query_subseq_offset %d, sense try %d)\n",
16783 	   chrnum,((Pair_T) stage2pairs->first)->genomepos,
16784 	   ((Pair_T) List_last_value(stage2pairs))->genomepos,query_subseq_offset,sense_try);
16785   } else {
16786     printf("Stage 3: *** Stage3_compute_one: Starting stage 3 at chrnum #%d, chrstart %u, chrend %u, query_subseq_offset %d, sense try %d)\n",
16787 	   chrnum,(chrhigh - chroffset) - ((Pair_T) stage2pairs->first)->genomepos,
16788 	   (chrhigh - chroffset) - ((Pair_T) List_last_value(stage2pairs))->genomepos,query_subseq_offset,sense_try);
16789   }
16790 
16791   printf("stage2pairs\n");
16792   Pair_dump_list(stage2pairs,true);
16793 
16794   printf("stage2starts\n");
16795   for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
16796     Pair_dump_list((List_T) List_head(p),true);
16797   }
16798 
16799   printf("stage2ends\n");
16800   for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
16801     Pair_dump_list((List_T) List_head(p),true);
16802   }
16803 #endif
16804 
16805 
16806 #ifdef PMAP
16807   pairs_fwd = stage2pairs;
16808   pairs_rev = (List_T) NULL;
16809   /* do_final_p = true; */
16810 #else
16811   if (splicingp == false) {
16812     pairs_fwd = stage2pairs;
16813     pairs_rev = (List_T) NULL;
16814   } else if (sense_try > 0) {
16815     pairs_fwd = stage2pairs;
16816     pairs_rev = (List_T) NULL;
16817   } else if (sense_try < 0) {
16818     pairs_fwd = (List_T) NULL;
16819     pairs_rev = stage2pairs;
16820   } else {
16821     /* sense_try == 0: Should try both even if no introns (cf, AA011563) */
16822     pairs_fwd = stage2pairs;
16823     pairs_rev = Pairpool_copy(stage2pairs,pairpool);
16824   }
16825 #endif
16826 
16827 
16828   /* 1.  Middle */
16829   if (pairs_fwd == NULL) {
16830     path_fwd = (List_T) NULL;
16831   } else {
16832     last_genomedp5_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
16833     last_genomedp3_fwd = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
16834     debug(printf("*** Solve path_fwd\n"));
16835     path_fwd = path_compute_dir(&defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
16836 				watsonp,genestrand,jump_late_p,
16837 #ifdef PMAP
16838 				queryaaseq_ptr,
16839 #endif
16840 				queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16841 				maxpeelback,
16842 #ifndef GSNAP
16843 				oligoindices_minor,diagpool,cellpool,
16844 #endif
16845 				pairpool,dynprogL,dynprogM,dynprogR,
16846 				last_genomedp5_fwd,last_genomedp3_fwd/*,clean_ends_p:true*/);
16847     /* FREE(last_genomedp3_fwd); -- Do not free here, but at end */
16848     /* FREE(last_genomedp5_fwd); -- Do not free here, but at end */
16849   }
16850 
16851   if (pairs_rev == NULL) {
16852     path_rev = (List_T) NULL;
16853   } else {
16854     last_genomedp5_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
16855     last_genomedp3_rev = (Chrpos_T *) CALLOC(querylength,sizeof(Chrpos_T));
16856     debug(printf("*** Solve path_rev\n"));
16857     path_rev = path_compute_dir(&defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
16858 				watsonp,genestrand,jump_late_p,
16859 #ifdef PMAP
16860 				queryaaseq_ptr,
16861 #endif
16862 				queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16863 				maxpeelback,
16864 #ifndef GSNAP
16865 				oligoindices_minor,diagpool,cellpool,
16866 #endif
16867 				pairpool,dynprogL,dynprogM,dynprogR,
16868 				last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:true*/);
16869     /* FREE(last_genomedp5_rev); -- Do not free here, but at end */
16870     /* FREE(last_genomedp3_rev); -- Do not free here, but at end */
16871   }
16872 
16873 
16874 #ifdef GSNAP
16875   if (path_fwd != NULL && path_rev != NULL) {
16876     /* Pick cdna_direction based on initial alignment to avoid unnecessary computation */
16877     pairs_fwd = assign_gap_types(path_fwd,/*cdna_direction*/+1,watsonp,queryseq_ptr,
16878 				 chrnum,chroffset,chrhigh,pairpool);
16879     path_fwd = List_reverse(pairs_fwd);
16880     debug11(printf("Calling score_introns for path_fwd after path_compute_dir\n"));
16881     pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
16882 			      &nknown_fwd,&ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
16883 			      chrnum,chroffset,chrhigh
16884 #ifdef WASTE
16885 			      ,pairpool
16886 #endif
16887 			      );
16888 
16889     pairs_rev = assign_gap_types(path_rev,/*cdna_direction*/-1,watsonp,queryseq_ptr,
16890 				 chrnum,chroffset,chrhigh,pairpool);
16891     path_rev = List_reverse(pairs_rev);
16892     debug11(printf("Calling score_introns for path_rev after path_compute_dir\n"));
16893     pairs_rev = score_introns(&max_intron_score_rev,&avg_donor_score_rev,&avg_acceptor_score_rev,
16894 			      &nknown_rev,&ncanonical_rev,&nbadintrons_rev,path_rev,/*cdna_direction*/-1,watsonp,
16895 			      chrnum,chroffset,chrhigh
16896 #ifdef WASTE
16897 			      ,pairpool
16898 #endif
16899 			      );
16900 
16901     if ((*cdna_direction = initial_cdna_direction(pairs_fwd,pairs_rev,
16902 						  avg_donor_score_fwd,avg_acceptor_score_fwd,
16903 						  avg_donor_score_rev,avg_acceptor_score_rev)) > 0) {
16904       debug(printf("Initial cdna direction is %d\n",*cdna_direction));
16905       path_fwd = List_reverse(pairs_fwd);
16906       path_rev = (List_T) NULL;
16907 
16908     } else if (*cdna_direction < 0) {
16909       debug(printf("Initial cdna direction is %d\n",*cdna_direction));
16910       path_fwd = (List_T) NULL;
16911       path_rev = List_reverse(pairs_rev);
16912 
16913     } else {
16914       debug(printf("Initial cdna direction is %d\n",*cdna_direction));
16915       path_fwd = List_reverse(pairs_fwd);
16916       path_rev = List_reverse(pairs_rev);
16917     }
16918   }
16919 #endif
16920 
16921 
16922   /* 2.  3' and 5' ends (possibly multiple) */
16923   debug(printf("Stage2 has %d starts and %d ends\n",List_length(all_stage2_starts),List_length(all_stage2_ends)));
16924   if (path_fwd == NULL) {
16925     pairs_fwd = (List_T) NULL;
16926   } else {
16927     /* 3' end */
16928     if (all_stage2_ends == NULL) {
16929       best_path = path_compute_end3(&fwd_ambig_end_length_3,&fwd_ambig_splicetype_3,&fwd_ambig_prob_3,
16930 				    defect_rate_fwd,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
16931 				    jump_late_p,querylength,
16932 				    queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16933 				    knownsplice_limit_low,knownsplice_limit_high,
16934 				    maxpeelback,pairpool,dynprogL);
16935     } else {
16936       best_path = Pairpool_remove_gapholders(path_fwd); /* Pairpool_join cannot handle gapholders */
16937       joined_ends = (List_T) NULL;
16938       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
16939 #ifdef PMAP
16940         copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16941 #else
16942 	if (path_rev == NULL) {
16943 	  /* Won't need ends anymore */
16944 	  copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
16945 	} else {
16946 	  copy = Pairpool_join_end3(/*path*/path_fwd,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
16947 	}
16948 #endif
16949 	joined_ends = List_push(joined_ends,(void *) copy);
16950       }
16951 
16952       for (p = joined_ends; p != NULL; p = List_next(p)) {
16953         copy = (List_T) List_head(p);
16954 	debug(printf("*** Solve path_fwd joined end\n"));
16955 	path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/+1,
16956 				    watsonp,genestrand,jump_late_p,
16957 #ifdef PMAP
16958 				    queryaaseq_ptr,
16959 #endif
16960 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16961 				    maxpeelback,
16962 #ifndef GSNAP
16963 				    oligoindices_minor,diagpool,cellpool,
16964 #endif
16965 				    pairpool,dynprogL,dynprogM,dynprogR,
16966 				    last_genomedp5_fwd,last_genomedp3_fwd/*,clean_ends_p:false*/);
16967 
16968 	temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
16969 				      defect_rate_temp,path_fwd,/*cdna_direction*/+1,watsonp,genestrand,
16970 				      jump_late_p,querylength,
16971 				      queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
16972 				      knownsplice_limit_low,knownsplice_limit_high,
16973 				      maxpeelback,pairpool,dynprogL);
16974 
16975 	if (temp_path != NULL && end_compare(best_path,temp_path,/*cdna_direction*/+1,watsonp,
16976 					     chrnum,chroffset,chrhigh,/*pairsp*/false) > 0) {
16977 	  best_path = temp_path;
16978 	  fwd_ambig_end_length_3 = temp_ambig_end_length;
16979 	  fwd_ambig_splicetype_3 = temp_ambig_splicetype;
16980 	  fwd_ambig_prob_3 = temp_ambig_prob;
16981 	  defect_rate_fwd = defect_rate_temp;
16982 	  debug21(printf("New best path:\n"));
16983 	  debug21(Pair_dump_list(best_path,true));
16984 	}
16985       }
16986 
16987       List_free(&joined_ends);
16988     }
16989 
16990     /* 5' end */
16991     pairs_fwd = List_reverse(best_path);
16992     if (all_stage2_starts == NULL) {
16993       best_pairs = path_compute_end5(&fwd_ambig_end_length_5,&fwd_ambig_splicetype_5,&fwd_ambig_prob_5,
16994 				     defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
16995 				     watsonp,genestrand,jump_late_p,
16996 				     queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
16997 				     knownsplice_limit_low,knownsplice_limit_high,
16998 				     maxpeelback,pairpool,dynprogR);
16999     } else {
17000       best_pairs = Pairpool_remove_gapholders(pairs_fwd); /* Pairpool_join cannot handle gapholders */
17001       joined_starts = (List_T) NULL;
17002       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
17003 #ifdef PMAP
17004         copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
17005 #else
17006 	if (path_rev == NULL) {
17007 	  /* Won't need ends anymore */
17008 	  copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
17009 	} else {
17010 	  copy = Pairpool_join_end5(/*pairs*/pairs_fwd,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/true);
17011 	}
17012 #endif
17013 	joined_starts = List_push(joined_starts,(void *) copy);
17014       }
17015 
17016       for (p = joined_starts; p != NULL; p = List_next(p)) {
17017 	copy = (List_T) List_head(p);
17018 	debug(printf("*** Solve path_fwd joined start\n"));
17019 	path_fwd = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/+1,
17020 				    watsonp,genestrand,jump_late_p,
17021 #ifdef PMAP
17022 				    queryaaseq_ptr,
17023 #endif
17024 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17025 				    maxpeelback,
17026 #ifndef GSNAP
17027 				    oligoindices_minor,diagpool,cellpool,
17028 #endif
17029 				    pairpool,dynprogL,dynprogM,dynprogR,
17030 				    last_genomedp5_fwd,last_genomedp3_fwd/*,clean_ends_p:false*/);
17031 
17032 	temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
17033 				       defect_rate_temp,/*pairs*/List_reverse(path_fwd),
17034 				       /*cdna_direction*/+1,watsonp,genestrand,jump_late_p,
17035 				       queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17036 				       knownsplice_limit_low,knownsplice_limit_high,
17037 				       maxpeelback,pairpool,dynprogR);
17038 
17039 	if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/+1,watsonp,
17040 					      chrnum,chroffset,chrhigh,/*pairsp*/true) > 0) {
17041 	  best_pairs = temp_pairs;
17042 	  fwd_ambig_end_length_5 = temp_ambig_end_length;
17043 	  fwd_ambig_splicetype_5 = temp_ambig_splicetype;
17044 	  fwd_ambig_prob_5 = temp_ambig_prob;
17045 	  defect_rate_fwd = defect_rate_temp;
17046 	  debug21(printf("New best pairs:\n"));
17047 	  debug21(Pair_dump_list(best_pairs,true));
17048 	}
17049       }
17050 
17051       List_free(&joined_starts);
17052     }
17053 
17054     pairs_fwd = best_pairs;
17055   }
17056 
17057 
17058 #ifndef PMAP
17059   if (path_rev == NULL) {
17060     pairs_rev = (List_T) NULL;
17061   } else {
17062     /* 3' end */
17063     if (all_stage2_ends == NULL) {
17064       best_path = path_compute_end3(&rev_ambig_end_length_3,&rev_ambig_splicetype_3,&rev_ambig_prob_3,
17065 				    defect_rate_rev,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
17066 				    jump_late_p,querylength,
17067 				    queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
17068 				    knownsplice_limit_low,knownsplice_limit_high,
17069 				    maxpeelback,pairpool,dynprogL);
17070 
17071     } else {
17072       best_path = Pairpool_remove_gapholders(path_rev); /* Pairpool_join cannot handle gapholders */
17073       joined_ends = (List_T) NULL;
17074       for (p = all_stage2_ends; p != NULL; p = List_next(p)) {
17075 	copy = Pairpool_join_end3(/*path*/path_rev,/*end3_pairs*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
17076 	joined_ends = List_push(joined_ends,(void *) copy);
17077       }
17078 
17079       for (p = joined_ends; p != NULL; p = List_next(p)) {
17080         copy = (List_T) List_head(p);
17081 	debug(printf("*** Solve path_rev joined end\n"));
17082 	path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/List_reverse(copy),/*cdna_direction*/-1,
17083 				    watsonp,genestrand,jump_late_p,
17084 #ifdef PMAP
17085 				    queryaaseq_ptr,
17086 #endif
17087 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17088 				    maxpeelback,
17089 #ifndef GSNAP
17090 				    oligoindices_minor,diagpool,cellpool,
17091 #endif
17092 				    pairpool,dynprogL,dynprogM,dynprogR,
17093 				    last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:false*/);
17094 
17095 	temp_path = path_compute_end3(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
17096 				      defect_rate_temp,path_rev,/*cdna_direction*/-1,watsonp,genestrand,
17097 				      jump_late_p,querylength,
17098 				      queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
17099 				      knownsplice_limit_low,knownsplice_limit_high,
17100 				      maxpeelback,pairpool,dynprogL);
17101 
17102 	if (temp_path != NULL && end_compare(best_path,temp_path,/*cdna_direction*/-1,watsonp,
17103 					     chrnum,chroffset,chrhigh,/*pairsp*/false) > 0) {
17104 	  best_path = temp_path;
17105 	  rev_ambig_end_length_3 = temp_ambig_end_length;
17106 	  rev_ambig_splicetype_3 = temp_ambig_splicetype;
17107 	  rev_ambig_prob_3 = temp_ambig_prob;
17108 	  defect_rate_rev = defect_rate_temp;
17109 	  debug21(printf("New best path:\n"));
17110 	  debug21(Pair_dump_list(best_path,true));
17111 	}
17112       }
17113 
17114       List_free(&joined_ends);
17115     }
17116 
17117     /* 5' end */
17118     pairs_rev = List_reverse(best_path);
17119     if (all_stage2_starts == NULL) {
17120       best_pairs = path_compute_end5(&rev_ambig_end_length_5,&rev_ambig_splicetype_5,&rev_ambig_prob_5,
17121 				     defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
17122 				     watsonp,genestrand,jump_late_p,
17123 				     queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17124 				     knownsplice_limit_low,knownsplice_limit_high,
17125 				     maxpeelback,pairpool,dynprogR);
17126 
17127     } else {
17128       best_pairs = Pairpool_remove_gapholders(pairs_rev); /* Pairpool_join cannot handle gapholders */
17129       joined_starts = (List_T) NULL;
17130       for (p = all_stage2_starts; p != NULL; p = List_next(p)) {
17131 	copy = Pairpool_join_end5(/*pairs*/pairs_rev,/*end5_path*/(List_T) List_head(p),pairpool,/*copy_end_p*/false);
17132 	joined_starts = List_push(joined_starts,(void *) copy);
17133       }
17134 
17135       for (p = joined_starts; p != NULL; p = List_next(p)) {
17136 	copy = (List_T) List_head(p);
17137 	debug(printf("*** Solve path_rev joined start\n"));
17138 	path_rev = path_compute_dir(&defect_rate_temp,/*pairs*/copy,/*cdna_direction*/-1,
17139 				    watsonp,genestrand,jump_late_p,
17140 #ifdef PMAP
17141 				    queryaaseq_ptr,
17142 #endif
17143 				    queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17144 				    maxpeelback,
17145 #ifndef GSNAP
17146 				    oligoindices_minor,diagpool,cellpool,
17147 #endif
17148 				    pairpool,dynprogL,dynprogM,dynprogR,
17149 				    last_genomedp5_rev,last_genomedp3_rev/*,clean_ends_p:false*/);
17150 
17151 	temp_pairs = path_compute_end5(&temp_ambig_end_length,&temp_ambig_splicetype,&temp_ambig_prob,
17152 				       defect_rate_temp,/*pairs*/List_reverse(path_rev),
17153 				       /*cdna_direction*/-1,watsonp,genestrand,jump_late_p,
17154 				       queryseq_ptr,queryuc_ptr,querylength,chrnum,chroffset,chrhigh,
17155 				       knownsplice_limit_low,knownsplice_limit_high,
17156 				       maxpeelback,pairpool,dynprogR);
17157 	if (temp_pairs != NULL && end_compare(best_pairs,temp_pairs,/*cdna_direction*/-1,watsonp,
17158 					      chrnum,chroffset,chrhigh,/*pairsp*/true) > 0) {
17159 	  best_pairs = temp_pairs;
17160 	  rev_ambig_end_length_5 = temp_ambig_end_length;
17161 	  rev_ambig_splicetype_5 = temp_ambig_splicetype;
17162 	  rev_ambig_prob_5 = temp_ambig_prob;
17163 	  defect_rate_rev = defect_rate_temp;
17164 	  debug21(printf("New best pairs:\n"));
17165 	  debug21(Pair_dump_list(best_pairs,true));
17166 	}
17167       }
17168 
17169       List_free(&joined_starts);
17170     }
17171 
17172     pairs_rev = best_pairs;
17173   }
17174 #endif
17175 
17176   pairs_fwd = path_compute_final(defect_rate_fwd,pairs_fwd,/*cdna_direction*/+1,
17177 				 watsonp,genestrand,jump_late_p,querylength,
17178 #ifdef PMAP
17179 				 queryaaseq_ptr,
17180 #endif
17181 				 queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
17182 				 maxpeelback,
17183 #ifndef GSNAP
17184 				 oligoindices_minor,diagpool,cellpool,
17185 #endif
17186 				 pairpool,dynprogL,dynprogM,dynprogR,
17187 				 last_genomedp5_fwd,last_genomedp3_fwd);
17188 
17189   pairs_rev = path_compute_final(defect_rate_rev,pairs_rev,/*cdna_direction*/-1,
17190 				 watsonp,genestrand,jump_late_p,querylength,
17191 #ifdef PMAP
17192 				 queryaaseq_ptr,
17193 #endif
17194 				 queryseq_ptr,queryuc_ptr,chrnum,chroffset,chrhigh,
17195 				 maxpeelback,
17196 #ifndef GSNAP
17197 				 oligoindices_minor,diagpool,cellpool,
17198 #endif
17199 				 pairpool,dynprogL,dynprogM,dynprogR,
17200 				 last_genomedp5_rev,last_genomedp3_rev);
17201 
17202   FREE(last_genomedp3_rev);
17203   FREE(last_genomedp5_rev);
17204   FREE(last_genomedp3_fwd);
17205   FREE(last_genomedp5_fwd);
17206 
17207 
17208   debug(printf("Forward:\n"));
17209   debug(Pair_dump_list(pairs_fwd,true));
17210   debug(printf("\n"));
17211 
17212   debug(printf("Reverse:\n"));
17213   debug(Pair_dump_list(pairs_rev,true));
17214   debug(printf("\n"));
17215 
17216   debug11(printf("Forward:\n"));
17217   debug11(Pair_dump_list(pairs_fwd,true));
17218   debug11(printf("\n"));
17219 
17220   debug11(printf("Reverse:\n"));
17221   debug11(Pair_dump_list(pairs_rev,true));
17222   debug11(printf("\n"));
17223 
17224   debug(printf("Intronscores: %f,%f fwd, %f,%f rev\n",
17225 	       avg_donor_score_fwd,avg_acceptor_score_fwd,avg_donor_score_rev,avg_acceptor_score_rev));
17226   if (pairs_rev == NULL) {
17227     pairs_pretrim = pairs_fwd;
17228     *cdna_direction = +1;
17229     *sensedir = SENSE_FORWARD;
17230 
17231   } else if (pairs_fwd == NULL) {
17232     pairs_pretrim = pairs_rev;
17233     *cdna_direction = -1;
17234     *sensedir = SENSE_ANTI;
17235 
17236   } else {
17237     path_fwd = List_reverse(pairs_fwd);
17238     debug11(printf("Calling score_introns for path_fwd before path_trim\n"));
17239     pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
17240 			      &nknown_fwd,&ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
17241 			      chrnum,chroffset,chrhigh
17242 #ifdef WASTE
17243 			      ,pairpool
17244 #endif
17245 			      );
17246     /* alignment_score_fwd = */ score_alignment(&nmatches_fwd,&nmismatches_fwd,&nindels_fwd,
17247 #ifdef COMPLEX_DIRECTION
17248 						&indel_alignment_score_fwd,
17249 #endif
17250 						&nsemicanonical_fwd,&nnoncanonical_fwd,
17251 						pairs_fwd,/*cdna_direction*/+1);
17252 
17253     path_rev = List_reverse(pairs_rev);
17254     debug11(printf("Calling score_introns for path_rev before path_trim\n"));
17255     pairs_rev = score_introns(&max_intron_score_rev,&avg_donor_score_rev,&avg_acceptor_score_rev,
17256 			      &nknown_rev,&ncanonical_rev,&nbadintrons_rev,path_rev,/*cdna_direction*/-1,watsonp,
17257 			      chrnum,chroffset,chrhigh
17258 #ifdef WASTE
17259 			      ,pairpool
17260 #endif
17261 			      );
17262     /* alignment_score_rev = */ score_alignment(&nmatches_rev,&nmismatches_rev,&nindels_rev,
17263 #ifdef COMPLEX_DIRECTION
17264 						&indel_alignment_score_rev,
17265 #endif
17266 						&nsemicanonical_rev,&nnoncanonical_rev,
17267 						pairs_rev,/*cdna_direction*/-1);
17268 
17269     pairs_pretrim = pick_cdna_direction(&(*cdna_direction),&(*sensedir),pairs_fwd,pairs_rev,
17270 					defect_rate_fwd,defect_rate_rev,
17271 					nknown_fwd,ncanonical_fwd,nsemicanonical_fwd,nnoncanonical_fwd,nbadintrons_fwd,
17272 					nknown_rev,ncanonical_rev,nsemicanonical_rev,nnoncanonical_rev,nbadintrons_rev,
17273 					max_intron_score_fwd,avg_donor_score_fwd,avg_acceptor_score_fwd,
17274 					max_intron_score_rev,avg_donor_score_rev,avg_acceptor_score_rev,
17275 #ifdef COMPLEX_DIRECTION
17276 					nmatches_fwd,nmismatches_fwd,nmatches_rev,nmismatches_rev,nindels_fwd,nindels_rev,
17277 					indel_alignment_score_fwd,indel_alignment_score_rev,
17278 #endif
17279 					sense_filter);
17280   }
17281 
17282 
17283   if (pairs_pretrim == NULL) {
17284 #if 0
17285     *npairs1 = 0;
17286     *goodness1 = 0;
17287     *nmatches_posttrim_1 = 0;
17288     *ambig_end_length_5_1 = *ambig_end_length_3_1 = 0;
17289     *ambig_prob_5_1 = *ambig_prob_3_1 = 0.0;
17290 #endif
17291     return (struct Pair_T *) NULL;
17292   }
17293 
17294   if (splicingp == false) {
17295     *sensedir = SENSE_NULL;
17296   }
17297 
17298 #ifdef GSNAP
17299   if (*cdna_direction == 0) {
17300     /* If both pairarrays are returned, then first one is fwd and second one is rev */
17301     debug11(printf("Initial cdna_direction is 0\n"));
17302     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
17303     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
17304     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
17305     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
17306     *ambig_prob_5_1 = fwd_ambig_prob_5;
17307     *ambig_prob_3_1 = fwd_ambig_prob_3;
17308 
17309     *cdna_direction = +1;
17310 
17311     /* path_trim alters pairs_fwd, so make a copy in case we use it for pairs_pretrim */
17312     pairs_fwd_copy = Pairpool_copy(pairs_fwd,pairpool);
17313     *finalpairs1 = path_trim(defect_rate_fwd,&(*ambig_end_length_5_1),&(*ambig_end_length_3_1),
17314 			     &(*ambig_splicetype_5_1),&(*ambig_splicetype_3_1),
17315 			     &(*ambig_prob_5_1),&(*ambig_prob_3_1),
17316 			     pairs_fwd_copy,&(*cdna_direction),watsonp,genestrand,
17317 			     jump_late_p,querylength,
17318 #ifdef GSNAP
17319 			     /*orig_sensedir*/SENSE_FORWARD,
17320 #endif
17321 			     queryseq_ptr,queryuc_ptr,
17322 			     chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
17323 			     maxpeelback,pairpool,dynprogL,dynprogR);
17324 
17325     *ambig_end_length_5_2 = rev_ambig_end_length_5;
17326     *ambig_end_length_3_2 = rev_ambig_end_length_3;
17327     *ambig_splicetype_5_2 = rev_ambig_splicetype_5;
17328     *ambig_splicetype_3_2 = rev_ambig_splicetype_3;
17329     *ambig_prob_5_2 = rev_ambig_prob_5;
17330     *ambig_prob_3_2 = rev_ambig_prob_3;
17331 
17332     *cdna_direction = -1;
17333 
17334     pairs_rev_copy = Pairpool_copy(pairs_rev,pairpool);
17335     *finalpairs2 = path_trim(defect_rate_rev,&(*ambig_end_length_5_2),&(*ambig_end_length_3_2),
17336 			     &(*ambig_splicetype_5_2),&(*ambig_splicetype_3_2),
17337 			     &(*ambig_prob_5_2),&(*ambig_prob_3_2),
17338 			     pairs_rev_copy,&(*cdna_direction),watsonp,genestrand,
17339 			     jump_late_p,querylength,
17340 #ifdef GSNAP
17341 			     /*orig_sensedir*/SENSE_ANTI,
17342 #endif
17343 			     queryseq_ptr,queryuc_ptr,
17344 			     chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
17345 			     maxpeelback,pairpool,dynprogL,dynprogR);
17346 
17347     if (*finalpairs1 != NULL && *finalpairs2 != NULL) {
17348       debug11(printf("Both directions are non-null, so returning both\n"));
17349       /* Pairarray 1 (cdna_direction +1): */
17350       *nmatches_posttrim_1 = Pair_nmatches_posttrim(&(*max_match_length_1),*finalpairs1,/*pos5*/*ambig_end_length_5_1,
17351 						    /*pos3*/querylength - (*ambig_end_length_3_1));
17352       pairarray1 = make_pairarray(&(*npairs1),&(*finalpairs1),/*cdna_direction*/+1,watsonp,
17353 				  pairpool,queryseq_ptr,chroffset,chrhigh,
17354 				  ngap,query_subseq_offset,skiplength);
17355       *goodness1 = Pair_fracidentity_array(&(*matches1),&(*unknowns1),&(*mismatches1),
17356 					   &(*qopens1),&(*qindels1),&(*topens1),&(*tindels1),
17357 					   &(*ncanonical1),&(*nsemicanonical1),&(*nnoncanonical1),
17358 					   &min_splice_prob_1,pairarray1,*npairs1,/*cdna_direction*/+1);
17359       *avg_splice_score_1 = avg_donor_score_fwd + avg_acceptor_score_fwd;
17360 
17361 
17362       debug0(printf("Result 1 (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
17363 		    *npairs1,*matches1,*mismatches1,*qopens1,*qindels1,*topens1,*tindels1,*avg_splice_score_1));
17364       debug0(Pair_dump_array(pairarray1,*npairs1,/*zerobasedp*/true));
17365 
17366       /* Note avg_donor_score_fwd and so on do not include evaluations
17367 	 of the end splice junctions.  So if cdna_direction == 0,
17368 	 callers should assume that the sensedir is not known */
17369 
17370       if (0 /*&& Pair_identical_p(*finalpairs1,*finalpairs2) == true*/) {
17371 	/* This causes misses in resolve-inside procedures */
17372 	debug0(printf("Result 2 is identical to Result 1, so not returning it\n"));
17373 	*pairarray2 = (struct Pair_T *) NULL;
17374 
17375       } else {
17376 	/* Pairarray 2 (cdna_direction -1): */
17377 	*nmatches_posttrim_2 = Pair_nmatches_posttrim(&(*max_match_length_2),*finalpairs2,/*pos5*/*ambig_end_length_5_2,
17378 						      /*pos3*/querylength - (*ambig_end_length_3_2));
17379 	*pairarray2 = make_pairarray(&(*npairs2),&(*finalpairs2),/*cdna_direction*/-1,watsonp,
17380 				     pairpool,queryseq_ptr,chroffset,chrhigh,
17381 				     ngap,query_subseq_offset,skiplength);
17382 	*goodness2 = Pair_fracidentity_array(&(*matches2),&(*unknowns2),&(*mismatches2),
17383 					     &(*qopens2),&(*qindels2),&(*topens2),&(*tindels2),
17384 					     &(*ncanonical2),&(*nsemicanonical2),&(*nnoncanonical2),
17385 					     &min_splice_prob_2,*pairarray2,*npairs2,/*cdna_direction*/-1);
17386 	*avg_splice_score_2 = avg_donor_score_rev + avg_acceptor_score_rev;
17387 
17388 	debug0(printf("Result 2 (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
17389 		      *npairs2,*matches2,*mismatches2,*qopens2,*qindels2,*topens2,*tindels2,*avg_splice_score_2));
17390 	debug0(Pair_dump_array(*pairarray2,*npairs2,/*zerobasedp*/true));
17391       }
17392 
17393       *cdna_direction = 0;
17394       *sensedir = SENSE_NULL;
17395       return pairarray1;
17396 
17397     } else if (*finalpairs1 != NULL) {
17398       debug11(printf("Only forward direction is non-null, so retrying...\n"));
17399       pairs_pretrim = pairs_fwd;
17400       *cdna_direction = +1;
17401       /* Continue below */
17402 
17403     } else if (*finalpairs2 != NULL) {
17404       debug11(printf("Only reverse direction is non-null, so retrying...\n"));
17405       pairs_pretrim = pairs_rev;
17406       *cdna_direction = -1;
17407       /* Continue below */
17408 
17409     } else {
17410       return (struct Pair_T *) NULL;
17411     }
17412   }
17413 #endif
17414 
17415   if (*cdna_direction > 0) {
17416     debug11(printf("Solving for forward direction\n"));
17417     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
17418     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
17419     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
17420     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
17421     *ambig_prob_5_1 = fwd_ambig_prob_5;
17422     *ambig_prob_3_1 = fwd_ambig_prob_3;
17423     *sensedir = SENSE_FORWARD;
17424     *avg_splice_score_1 = avg_donor_score_fwd + avg_acceptor_score_fwd;
17425     defect_rate = defect_rate_fwd;
17426 
17427   } else if (*cdna_direction < 0) {
17428     debug11(printf("Solving for reverse direction\n"));
17429     *ambig_end_length_5_1 = rev_ambig_end_length_5;
17430     *ambig_end_length_3_1 = rev_ambig_end_length_3;
17431     *ambig_splicetype_5_1 = rev_ambig_splicetype_5;
17432     *ambig_splicetype_3_1 = rev_ambig_splicetype_3;
17433     *ambig_prob_5_1 = rev_ambig_prob_5;
17434     *ambig_prob_3_1 = rev_ambig_prob_3;
17435     *sensedir = SENSE_ANTI;
17436     *avg_splice_score_1 = avg_donor_score_rev + avg_acceptor_score_rev;
17437     defect_rate = defect_rate_rev;
17438 
17439   } else {
17440 #ifdef GSNAP
17441     abort();
17442 #else
17443     debug11(printf("Solving for unknown (forward) direction\n"));
17444     *ambig_end_length_5_1 = fwd_ambig_end_length_5;
17445     *ambig_end_length_3_1 = fwd_ambig_end_length_3;
17446     *ambig_splicetype_5_1 = fwd_ambig_splicetype_5;
17447     *ambig_splicetype_3_1 = fwd_ambig_splicetype_3;
17448     *ambig_prob_5_1 = fwd_ambig_prob_5;
17449     *ambig_prob_3_1 = fwd_ambig_prob_3;
17450     *sensedir = SENSE_FORWARD;
17451     *avg_splice_score_1 = 0.0;
17452     defect_rate = defect_rate_fwd;
17453 #endif
17454   }
17455 
17456   /* Okay for path_trim to alter pairs_pretrim */
17457   *finalpairs1 = path_trim(defect_rate,&(*ambig_end_length_5_1),&(*ambig_end_length_3_1),
17458 			   &(*ambig_splicetype_5_1),&(*ambig_splicetype_3_1),
17459 			   &(*ambig_prob_5_1),&(*ambig_prob_3_1),
17460 			   pairs_pretrim,&(*cdna_direction),watsonp,genestrand,
17461 			   jump_late_p,querylength,
17462 #ifdef GSNAP
17463 			   /*orig_sensedir*/*sensedir,
17464 #endif
17465 			   queryseq_ptr,queryuc_ptr,
17466 			   chroffset,chrhigh,knownsplice_limit_low,knownsplice_limit_high,
17467 			   maxpeelback,pairpool,dynprogL,dynprogR);
17468 
17469   *nmatches_posttrim_1 = Pair_nmatches_posttrim(&(*max_match_length_1),*finalpairs1,/*pos5*/*ambig_end_length_5_1,
17470 						/*pos3*/querylength - (*ambig_end_length_3_1));
17471   pairarray1 = make_pairarray(&(*npairs1),&(*finalpairs1),*cdna_direction,watsonp,
17472 			      pairpool,queryseq_ptr,chroffset,chrhigh,
17473 			      ngap,query_subseq_offset,skiplength);
17474   *goodness1 = Pair_fracidentity_array(&(*matches1),&(*unknowns1),&(*mismatches1),
17475 				       &(*qopens1),&(*qindels1),&(*topens1),&(*tindels1),
17476 				       &(*ncanonical1),&(*nsemicanonical1),&(*nnoncanonical1),
17477 				       &min_splice_prob_1,pairarray1,*npairs1,*cdna_direction);
17478   /* *avg_splice_score_1 assigned above */
17479 
17480 
17481   debug0(printf("Result (%d pairs): %d matches, %d mismatches, %d qopens, %d qindels, %d topens, %d tindels, splice score %f\n",
17482 		*npairs1,*matches1,*mismatches1,*qopens1,*qindels1,*topens1,*tindels1,*avg_splice_score_1));
17483   debug0(Pair_dump_array(pairarray1,*npairs1,/*zerobasedp*/true));
17484 
17485 #ifdef GSNAP
17486   *pairarray2 = (struct Pair_T *) NULL;
17487   *npairs2 = 0;
17488 #endif
17489 
17490   debug11(printf("Final cdna direction is %d\n",*cdna_direction));
17491   debug11(printf("Final sensedir is %d\n",*sensedir));
17492 
17493   return pairarray1;
17494 }
17495 
17496 
17497 /************************************************************************
17498  *  Merging
17499  ************************************************************************/
17500 
17501 bool
Stage3_mergeable(Stage3_T firstpart,Stage3_T secondpart,int breakpoint,int queryntlength)17502 Stage3_mergeable (Stage3_T firstpart, Stage3_T secondpart,
17503 		  int breakpoint, int queryntlength) {
17504   Pair_T end1, start2;
17505   bool watsonp, connectablep = false;
17506   Chrpos_T endchrpos1, startchrpos2;
17507   int npairs_left, npairs_right, nstart;
17508   int cdna_direction_1, cdna_direction_2;
17509 
17510 
17511   assert(breakpoint > 0);	/* Caller should check for this */
17512   assert(firstpart->pairs != NULL);
17513   assert(secondpart->pairs != NULL);
17514 
17515   debug20(printf("Stage3_mergeable called with breakpoint %d, watsonp %d and %d, and cdna_directions %d and %d\n",
17516 		 breakpoint,firstpart->watsonp,secondpart->watsonp,firstpart->cdna_direction,secondpart->cdna_direction));
17517   debug10(Stage3_print_ends(firstpart));
17518   debug10(Stage3_print_ends(secondpart));
17519 
17520   if (firstpart->chrnum != secondpart->chrnum) {
17521     debug20(printf("not mergeable: chrnum %d != chrnum %d\n",firstpart->chrnum,secondpart->chrnum));
17522     return false;
17523 
17524   } else if (firstpart->watsonp != secondpart->watsonp) {
17525     debug20(printf("not mergeable: watsonp %d != watsonp %d\n",firstpart->watsonp,secondpart->watsonp));
17526     return false;
17527 
17528 #if 0
17529   } else if (firstpart->sensedir != secondpart->sensedir &&
17530 	     firstpart->sensedir != SENSE_NULL && secondpart->sensedir != SENSE_NULL) {
17531     /* Could be mergeable if an intron is trimmed during the merge */
17532     debug20(printf("not mergeable: sensedir %d != sensedir %d\n",
17533 		   firstpart->sensedir,secondpart->sensedir));
17534     return false;
17535 #endif
17536 
17537   } else {
17538     /* Find end pairs. Ignore cdna directions for now. */
17539     end1 = Pair_end_bound(&cdna_direction_1,firstpart->pairs,breakpoint);
17540     start2 = Pair_start_bound(&cdna_direction_2,secondpart->pairs,breakpoint+1);
17541 
17542     if ((watsonp = firstpart->watsonp) == true) {
17543       endchrpos1 = end1->genomepos;
17544       startchrpos2 = start2->genomepos;
17545 
17546       debug20(printf("? connectable, watson: endchrpos1 %d at querypos %d versus startchrpos2 %d at querypos %d\n",
17547 		     endchrpos1,end1->querypos,startchrpos2,start2->querypos));
17548       if (endchrpos1 < startchrpos2) {
17549 	/* Deletion */
17550 	/* *genomejump = startchrpos2 - endchrpos1 - 1; */
17551 	debug20(printf("endchrpos1 < startchrpos2, so deletion of length %u\n",startchrpos2 - endchrpos1 - 1));
17552 	if (startchrpos2 < endchrpos1 + maxintronlen) {
17553 	  connectablep = true;
17554 	}
17555 
17556       } else if (startchrpos2 + (end1->querypos - start2->querypos) + 100 >= endchrpos1) {
17557 	/* Insertion */
17558 	debug20(printf("startchrpos2 + (%d - %d) + %d >= endchrpos2, so insertion\n",end1->querypos,start2->querypos,20));
17559 	/* *genomejump = 0; */
17560 	connectablep = true;
17561       }
17562 
17563 
17564     } else {
17565       /* These are genomicpos, not really chrpos.  If go to chrpos, need to rewrite logic. */
17566       endchrpos1 = firstpart->chrhigh - end1->genomepos;
17567       startchrpos2 = secondpart->chrhigh - start2->genomepos;
17568 
17569       debug20(printf("? connectable, crick: startchrpos2 %u at querypos %d versus endchrpos1 %u at querypos %d\n",
17570 		     startchrpos2,start2->querypos,endchrpos1,end1->querypos));
17571       if (startchrpos2 < endchrpos1) {
17572 	/* Deletion */
17573 	/* *genomejump = endchrpos1 - startchrpos2 - 1; */
17574 	debug20(printf("startchrpos2 < endchrpos1, so deletion of length %u\n",endchrpos1 - startchrpos2 - 1));
17575 	if (endchrpos1 < startchrpos2 + maxintronlen) {
17576 	  connectablep = true;
17577 	}
17578       } else if (endchrpos1 + (end1->querypos - start2->querypos) + 100 >= startchrpos2) {
17579 	/* Insertion */
17580 	debug20(printf("endchrpos1 + (%d - %d) + %d >= endchrpos1, so insertion\n",end1->querypos,start2->querypos,20));
17581 	/* *genomejump = 0; */
17582 	connectablep = true;
17583       }
17584     }
17585 
17586     if (connectablep == false) {
17587       debug20(printf("result: not mergeable\n\n"));
17588       return false;
17589     } else {
17590       npairs_left = Pairpool_count_bounded(&nstart,firstpart->pairs,0,breakpoint);
17591       npairs_right = Pairpool_count_bounded(&nstart,secondpart->pairs,breakpoint,queryntlength);
17592       debug20(printf("Predicted after splicing: npairs_left %d, npairs_right %d\n",npairs_left,npairs_right));
17593       if (npairs_left < 25 || npairs_right < 25) {
17594 	return false;
17595       } else {
17596 	/* *queryjump = start2->querypos - end1->querypos - 1; */
17597 	debug20(printf("result: mergeable: queryjump = %d - %d - 1 = %d\n\n",
17598 		       start2->querypos,end1->querypos,start2->querypos - end1->querypos - 1));
17599 
17600 	return true;
17601       }
17602     }
17603   }
17604 }
17605 
17606 
17607 bool
Stage3_merge_chimera(T * new_left,T * new_right,T old_left,T old_right,int minpos1,int maxpos1,int minpos2,int maxpos2,Sequence_T queryseq,char * queryseq_ptr,char * queryuc_ptr,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogR,int maxpeelback)17608 Stage3_merge_chimera (T *new_left, T *new_right, T old_left, T old_right,
17609 		      int minpos1, int maxpos1, int minpos2, int maxpos2,
17610 		      Sequence_T queryseq, char *queryseq_ptr, char *queryuc_ptr, Pairpool_T pairpool,
17611 		      Dynprog_T dynprogL, Dynprog_T dynprogR, int maxpeelback) {
17612   List_T left_pairs, right_pairs;
17613   List_T path, peeled_path, pairs, peeled_pairs;
17614   bool knownsplicep, chop_exon_p;
17615   int ambig_end_length_5 = 0, ambig_end_length_3 = 0;	/* Need to be set for build_pairs_end5 and build_path_end3 */
17616   double ambig_prob_5, ambig_prob_3;
17617   int dynprogindex_minor = 0;
17618   Splicetype_T ambig_splicetype;
17619 
17620   Pair_T endpair;
17621   int querydp5, querydp3, n_peeled_indels;
17622   Chrpos_T genomedp5, genomedp3;
17623   bool protectedp;
17624 
17625   debug10(printf("Entered stage3_merge_chimera with minpos1 %d, maxpos1 %d, minpos2 %d, maxpos2 %d\n",
17626 		minpos1,maxpos1,minpos2,maxpos2));
17627 
17628 #ifdef DEBUG10
17629   printf("ORIG LEFT PAIRS:\n");
17630   Pair_dump_list(old_left->pairs,true);
17631   printf("ORIG RIGHT PAIRS:\n");
17632   Pair_dump_list(old_right->pairs,true);
17633 #endif
17634 
17635   left_pairs = Pairpool_copy(old_left->pairs,pairpool);
17636   left_pairs = Pair_clip_bounded_list_5(left_pairs,minpos1,maxpos1);
17637   right_pairs = Pairpool_copy(old_right->pairs,pairpool);
17638   right_pairs = Pair_clip_bounded_list_3(right_pairs,minpos2,maxpos2);
17639 
17640 #ifdef DEBUG10
17641   printf("CLIPPED LEFT PAIRS:\n");
17642   Pair_dump_list(left_pairs,true);
17643   printf("CLIPPED RIGHT PAIRS:\n");
17644   Pair_dump_list(right_pairs,true);
17645 #endif
17646 
17647   if (left_pairs == NULL || right_pairs == NULL) {
17648     *new_left = *new_right = (T) NULL;
17649     return false;
17650 
17651   } else {
17652     *new_left = Stage3_new_from_pairs(left_pairs,old_left->cdna_direction,old_left->watsonp,
17653 				      old_left->genestrand,old_left->sensedir,pairpool,queryseq,
17654 				      /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
17655 				      old_left->chrnum,old_left->chroffset,old_left->chrhigh,old_left->chrlength);
17656     *new_right = Stage3_new_from_pairs(right_pairs,old_right->cdna_direction,old_right->watsonp,
17657 				       old_right->genestrand,old_right->sensedir,pairpool,queryseq,
17658 				       /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
17659 				       old_right->chrnum,old_right->chroffset,old_right->chrhigh,old_right->chrlength);
17660 
17661     path = List_reverse((*new_left)->pairs);
17662 
17663     /* To avoid indels at chimeric join, need to peelback, clean ends, extend with nogaps, and then re-clip*/
17664     endpair = (Pair_T) path->first;
17665     querydp5 = endpair->querypos + 1;
17666     genomedp5 = endpair->genomepos + 1;
17667     protectedp = false;
17668     path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,path,&querydp5,&genomedp5,
17669 			 maxpeelback,/*stop_at_indels_p*/true);
17670     path = clean_path_end3_gap_indels(path);
17671 
17672     /* Have to use forcep == true, because we cannot put back the
17673        pairs removed from peel_leftward and clean_path_end3_gap_indels */
17674     path = build_path_end3(&knownsplicep,&ambig_end_length_3,&ambig_splicetype,&ambig_prob_3,
17675 			   &chop_exon_p,&dynprogindex_minor,path,
17676 			   (*new_left)->chroffset,(*new_left)->chrhigh,/*querylength*/maxpos1+1,
17677 			   /*knownsplice_limit_low*/-1,/*knownsplice_limit_high*/0,
17678 			   queryseq_ptr,queryuc_ptr,
17679 			   (*new_left)->cdna_direction,(*new_left)->watsonp,(*new_left)->genestrand,
17680 			   /*jump_late_p*/(*new_left)->watsonp ? false : true,
17681 			   maxpeelback,/*defect_rate*/0.0,pairpool,dynprogL,
17682 			   /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/true);
17683 
17684     (*new_left)->pairs = List_reverse(path);
17685     (*new_left)->pairs = Pair_clip_bounded_list_5((*new_left)->pairs,minpos1,maxpos1);
17686     debug10(printf("CLEANED LEFT PAIRS:\n"));
17687     debug10(Pair_dump_list((*new_left)->pairs,true));
17688 
17689     /* To avoid indels at chimeric join, need to peelback, clean ends, extend with nogaps, and then re-clip*/
17690     pairs = (*new_right)->pairs;
17691 
17692     endpair = (Pair_T) pairs->first;
17693     querydp3 = endpair->querypos - 1;
17694     genomedp3 = endpair->genomepos - 1;
17695     protectedp = false;
17696     pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
17697 			   maxpeelback,/*stop_at_indels_p*/true);
17698     pairs = clean_pairs_end5_gap_indels(pairs);
17699 
17700     /* Have to use forcep == true, because we cannot put back the
17701        pairs removed from peel_rightward and clean_pairs_end5_gap_indels */
17702     pairs = build_pairs_end5(&knownsplicep,&ambig_end_length_5,&ambig_splicetype,&ambig_prob_5,
17703 			     &chop_exon_p,&dynprogindex_minor,pairs,
17704 			     (*new_right)->chroffset,(*new_right)->chrhigh,
17705 			     /*knownsplice_limit_low*/-1,/*knownsplice_limit_high*/0,
17706 			     queryseq_ptr,queryuc_ptr,
17707 			     (*new_right)->cdna_direction,(*new_right)->watsonp,(*new_right)->genestrand,
17708 			     /*jump_late_p*/(*new_right)->watsonp ? false : true,
17709 			     maxpeelback,/*defect_rate*/0.0,pairpool,dynprogR,
17710 			     /*extendp*/true,/*endalign*/QUERYEND_NOGAPS,/*forcep*/true);
17711     (*new_right)->pairs = Pair_clip_bounded_list_3(pairs,minpos2,maxpos2);
17712     debug10(printf("CLEANED RIGHT PAIRS:\n"));
17713     debug10(Pair_dump_list((*new_right)->pairs,true));
17714 
17715     if ((*new_left)->pairs == NULL || (*new_right)->pairs == NULL) {
17716       Stage3_free(&(*new_left));
17717       Stage3_free(&(*new_right));
17718       *new_left = *new_right = (T) NULL;
17719       return false;
17720     } else {
17721       make_pairarrays_chimera(*new_left,*new_right,queryseq_ptr,pairpool,/*gaplength*/0,ngap);
17722 
17723       (*new_left)->chimera_right_p = true;
17724       (*new_right)->chimera_left_p = true;
17725       return true;
17726     }
17727   }
17728 }
17729 
17730 
17731 
17732 void
Stage3_extend_right(T this,int goal,int querylength,char * queryseq_ptr,char * queryuc_ptr,bool max_extend_p,Pairpool_T pairpool,int genestrand,int maxpeelback)17733 Stage3_extend_right (T this, int goal, int querylength,
17734 		     char *queryseq_ptr, char *queryuc_ptr,
17735 		     bool max_extend_p, Pairpool_T pairpool,
17736 		     int genestrand, int maxpeelback) {
17737   List_T path, peeled_path;
17738   Pair_T leftpair;
17739 
17740   int nconsecutive_mismatches;
17741   int querypos, querydp5;
17742   Chrpos_T genomedp5;
17743   Chrpos_T genomepos;
17744   char c, c_upper, g, g_alt, comp;
17745   bool protectedp;
17746   int n_peeled_indels;
17747 
17748   int ncanonical, nsemicanonical;
17749   double min_splice_prob;
17750 
17751 
17752   debug10(printf("Entered Stage3_extend_right with goal %d\n",goal));
17753   debug10(printf("LEFT BEFORE FILL\n"));
17754   debug10(Pair_dump_list(this->pairs,true));
17755   debug10(printf("END_LEFT BEFORE FILL\n"));
17756 
17757 
17758   path = List_reverse(this->pairs);
17759   leftpair = (Pair_T) path->first;
17760 
17761   debug(printf("\nEXTEND_RIGHT\n"));
17762   querydp5 = leftpair->querypos + 1;
17763   genomedp5 = leftpair->genomepos + 1;
17764   /* if (leftpair->cdna == ' ') querydp5--; -- For old dynamic programming */
17765   /* if (leftpair->genome == ' ') genomedp5--; -- For old dynamic programming */
17766 
17767   protectedp = false;
17768   path = peel_leftward(&n_peeled_indels,&protectedp,&peeled_path,path,&querydp5,&genomedp5,
17769 		       maxpeelback,/*stop_at_indels_p*/true);
17770   if (path == NULL) {
17771     querypos = querydp5 - 1;
17772     genomepos = genomedp5 - 1;
17773   } else {
17774     path = clean_path_end3_gap_indels(path);
17775     leftpair = (Pair_T) path->first;
17776     querypos = leftpair->querypos;
17777     genomepos = leftpair->genomepos;
17778   }
17779 
17780   if (this->watsonp == true) {
17781     /* pos = this->chroffset + genomepos; */
17782     debug10(printf("watsonp on left is true.  pos is %u.  goal querypos is %d\n",genomepos,goal));
17783 
17784     querypos++;
17785     genomepos++;
17786     /* pos++; */
17787     while (querypos < goal /* && pos <= this->chrhigh */) {
17788       c = queryseq_ptr[querypos];
17789       c_upper = queryuc_ptr[querypos];
17790       /* g = Genome_get_char(genome,pos); */
17791       g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/true);
17792       if (g != '*') {
17793 	if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
17794 	  comp = MATCH_COMP;
17795 #ifdef PMAP
17796 	} else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
17797 	  comp = AMBIGUOUS_COMP;
17798 #endif
17799 	} else {
17800 	  comp = MISMATCH_COMP;
17801 	}
17802 	debug10(printf("At querypos %d and pos %u on left, have %c and %c\n",
17803 		       querypos,genomepos,c,g));
17804 	path = Pairpool_push(path,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
17805       }
17806       querypos++;
17807       genomepos++;
17808       /* pos++; */
17809     }
17810 
17811     if (max_extend_p == true) {
17812       debug10(printf("\nGoal achieved.  Now looking for consecutive mismatches\n"));
17813       nconsecutive_mismatches = 0;
17814       while (querypos < querylength /* && pos <= this->chrhigh */ && nconsecutive_mismatches < 3) {
17815 	c = queryseq_ptr[querypos];
17816 	c_upper = queryuc_ptr[querypos];
17817 	/* g = Genome_get_char(genome,pos); */
17818 	g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/true);
17819 	if (g != '*') {
17820 	  if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
17821 	    comp = MATCH_COMP;
17822 	    nconsecutive_mismatches = 0;
17823 #ifdef PMAP
17824 	  } else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
17825 	    comp = AMBIGUOUS_COMP;
17826 	    /* Don't consider as match or mismatch */
17827 #endif
17828 	  } else {
17829 	    comp = MISMATCH_COMP;
17830 	    nconsecutive_mismatches += 1;
17831 	  }
17832 	  debug10(printf("At querypos %d and pos %u on left, have %c and %c\n",
17833 			 querypos,genomepos,c,g));
17834 	  path = Pairpool_push(path,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
17835 	}
17836 	querypos++;
17837 	genomepos++;
17838 	/* pos++; */
17839       }
17840     }
17841 
17842   } else {
17843     /* pos = this->chrhigh - genomepos; */
17844     debug10(printf("watsonp on left is false.  pos is %u.  querypos is %d.  want to go up to goal querypos %d\n",
17845 		   genomepos,querypos,goal));
17846 
17847     querypos++;
17848     genomepos++;
17849     /* pos--; */
17850     while (querypos < goal /* && pos != this->chroffset - 1U */) {
17851       c = queryseq_ptr[querypos];
17852       c_upper = queryuc_ptr[querypos];
17853       /* g = complCode[(int) Genome_get_char(genome,pos)]; */
17854       g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/false);
17855       if (g != '*') {
17856 	if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
17857 	  comp = MATCH_COMP;
17858 #ifdef PMAP
17859 	} else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
17860 	  comp = AMBIGUOUS_COMP;
17861 #endif
17862 	} else {
17863 	  comp = MISMATCH_COMP;
17864 	}
17865 	debug10(printf("At querypos %d and pos %u on left, have %c and %c\n",
17866 		       querypos,genomepos,c,g));
17867 	path = Pairpool_push(path,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
17868       }
17869       querypos++;
17870       genomepos++;
17871       /* pos--; */
17872     }
17873 
17874     if (max_extend_p == true) {
17875       debug10(printf("\nGoal achieved.  Now looking for consecutive mismatches\n"));
17876       nconsecutive_mismatches = 0;
17877       while (querypos < querylength /* && pos != this->chroffset - 1U */ && nconsecutive_mismatches < 3) {
17878 	c = queryseq_ptr[querypos];
17879 	c_upper = queryuc_ptr[querypos];
17880 	/* g = complCode[(int) Genome_get_char(genome,pos)]; */
17881 	g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/false);
17882 	if (g != '*') {
17883 	  if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
17884 	    comp = MATCH_COMP;
17885 	    nconsecutive_mismatches = 0;
17886 #ifdef PMAP
17887 	  } else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
17888 	    comp = AMBIGUOUS_COMP;
17889 	    /* Don't count as either match or mismatch */
17890 #endif
17891 	  } else {
17892 	    comp = MISMATCH_COMP;
17893 	    nconsecutive_mismatches += 1;
17894 	  }
17895 	  debug10(printf("At querypos %d and pos %u on left, have %c and %c\n",
17896 			 querypos,genomepos,c,g));
17897 	  path = Pairpool_push(path,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
17898 	}
17899 	querypos++;
17900 	genomepos++;
17901 	/* pos--; */
17902       }
17903     }
17904   }
17905 
17906   this->pairs = List_reverse(path);
17907 
17908   debug10(printf("LEFT AFTER FILL\n"));
17909   debug10(Pair_dump_list(this->pairs,true));
17910   debug10(printf("END_LEFT AFTER FILL\n"));
17911 
17912   Stage3_free_pairarray(&this);
17913   this->pairarray = make_pairarray(&this->npairs,&this->pairs,this->cdna_direction,
17914 				   this->watsonp,pairpool,queryseq_ptr,
17915 				   this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
17916   this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
17917 					   &this->qopens,&this->qindels,&this->topens,&this->tindels,
17918 					   &ncanonical,&nsemicanonical,&this->noncanonical,
17919 					   &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
17920 
17921   if (this->pairarray == NULL) {
17922     this->pairarray_freeable_p = false;
17923   } else {
17924     this->pairarray_freeable_p = true;
17925   }
17926 
17927   return;
17928 }
17929 
17930 
17931 void
Stage3_extend_left(T this,int goal,char * queryseq_ptr,char * queryuc_ptr,bool max_extend_p,Pairpool_T pairpool,int genestrand,int maxpeelback)17932 Stage3_extend_left (T this, int goal,
17933 		    char *queryseq_ptr, char *queryuc_ptr,
17934 		    bool max_extend_p, Pairpool_T pairpool,
17935 		    int genestrand, int maxpeelback) {
17936   List_T pairs, peeled_pairs;
17937   Pair_T rightpair;
17938 
17939   int nconsecutive_mismatches;
17940   int querypos, querydp3;
17941   Chrpos_T genomedp3;
17942   Chrpos_T genomepos;
17943   char c, c_upper, g, g_alt, comp;
17944   bool protectedp;
17945   int n_peeled_indels;
17946 
17947   int ncanonical, nsemicanonical;
17948   double min_splice_prob;
17949 
17950 
17951   debug10(printf("Entered Stage3_extend_left with goal %d\n",goal));
17952   debug10(printf("RIGHT BEFORE FILL\n"));
17953   debug10(Pair_dump_list(this->pairs,true));
17954   debug10(printf("END_RIGHT BEFORE FILL\n"));
17955 
17956 
17957   /* Do not call insert_gapholders */
17958   pairs = this->pairs;
17959   rightpair = (Pair_T) pairs->first;
17960 
17961   debug(printf("\nEXTEND_LEFT\n"));
17962   querydp3 = rightpair->querypos - 1;
17963   genomedp3 = rightpair->genomepos - 1;
17964 
17965   protectedp = false;
17966   pairs = peel_rightward(&n_peeled_indels,&protectedp,&peeled_pairs,pairs,&querydp3,&genomedp3,
17967 			 maxpeelback,/*stop_at_indels_p*/true);
17968   if (pairs == NULL) {
17969     querypos = querydp3 + 1;
17970     genomepos = genomedp3 + 1;
17971   } else {
17972     pairs = clean_pairs_end5_gap_indels(pairs);
17973     rightpair = (Pair_T) pairs->first;
17974     querypos = rightpair->querypos;
17975     genomepos = rightpair->genomepos;
17976   }
17977 
17978   if (this->watsonp == true) {
17979     /* pos = this->chroffset + genomepos; */
17980     debug10(printf("watsonp on right is true.  pos is %u.  goal querypos is %d\n",genomepos,goal));
17981 
17982     querypos--;
17983     genomepos--;
17984     /* pos--; */
17985     while (querypos >= goal /* && pos != this->chroffset - 1U */) {
17986       c = queryseq_ptr[querypos];
17987       c_upper = queryuc_ptr[querypos];
17988       /* g = Genome_get_char(genome,pos); */
17989       g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/true);
17990       if (g != '*') {
17991 	if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
17992 	  comp = MATCH_COMP;
17993 #ifdef PMAP
17994 	} else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
17995 	  comp = AMBIGUOUS_COMP;
17996 #endif
17997 	} else {
17998 	  comp = MISMATCH_COMP;
17999 	}
18000 	debug10(printf("At querypos %d and pos %u on right, have %c and %c\n",
18001 		       querypos,genomepos,c,g));
18002 	pairs = Pairpool_push(pairs,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
18003       }
18004       querypos--;
18005       genomepos--;
18006       /* pos--; */
18007     }
18008 
18009     if (max_extend_p == true) {
18010       debug10(printf("\nGoal achieved.  Now looking for consecutive mismatches\n"));
18011       nconsecutive_mismatches = 0;
18012       while (querypos >= 0 /* && pos != this->chroffset - 1U */ && nconsecutive_mismatches < 3) {
18013 	c = queryseq_ptr[querypos];
18014 	c_upper = queryuc_ptr[querypos];
18015 	/* g = Genome_get_char(genome,pos); */
18016 	g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/true);
18017 	if (g != '*') {
18018 	  if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
18019 	    comp = MATCH_COMP;
18020 	    nconsecutive_mismatches = 0;
18021 #ifdef PMAP
18022 	  } else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
18023 	    comp = AMBIGUOUS_COMP;
18024 	    /* Don't count as either match or mismatch */
18025 #endif
18026 	  } else {
18027 	    comp = MISMATCH_COMP;
18028 	    nconsecutive_mismatches += 1;
18029 	  }
18030 	  debug10(printf("At querypos %d and pos %u on right, have %c and %c\n",
18031 			 querypos,genomepos,c,g));
18032 	  pairs = Pairpool_push(pairs,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
18033 	}
18034 	querypos--;
18035 	genomepos--;
18036 	/* pos--; */
18037       }
18038     }
18039 
18040   } else {
18041     /* pos = this->chrhigh - genomepos; */
18042     debug10(printf("watsonp on right is false.  pos is %u.  goal querypos is %d\n",genomepos,goal));
18043 
18044     querypos--;
18045     genomepos--;
18046     /* pos++; */
18047     while (querypos >= goal /* && pos <= this->chrhigh */) {
18048       c = queryseq_ptr[querypos];
18049       c_upper = queryuc_ptr[querypos];
18050       /* g = complCode[(int) Genome_get_char(genome,pos)]; */
18051       g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/false);
18052       if (g != '*') {
18053 	if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
18054 	  comp = MATCH_COMP;
18055 #ifdef PMAP
18056 	} else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
18057 	  comp = AMBIGUOUS_COMP;
18058 #endif
18059 	} else {
18060 	  comp = MISMATCH_COMP;
18061 	}
18062 	debug10(printf("At querypos %d and pos %u on right, have %c and %c\n",
18063 		       querypos,genomepos,c,g));
18064 	pairs = Pairpool_push(pairs,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
18065       }
18066       querypos--;
18067       genomepos--;
18068       /* pos++; */
18069     }
18070 
18071     if (max_extend_p == true) {
18072       debug10(printf("\nGoal achieved.  Now looking for consecutive mismatches\n"));
18073       nconsecutive_mismatches = 0;
18074       while (querypos >= 0 /* && pos <= this->chrhigh */ && nconsecutive_mismatches > 3) {
18075 	c = queryseq_ptr[querypos];
18076 	c_upper = queryuc_ptr[querypos];
18077 	/* g = complCode[(int) Genome_get_char(genome,pos)]; */
18078 	g = get_genomic_nt(&g_alt,genomepos,this->chroffset,this->chrhigh,/*watsonp*/false);
18079 	if (g != '*') {
18080 	  if (Dynprog_consistent_p(c_upper,g,g_alt,genestrand) == true) {
18081 	    comp = MATCH_COMP;
18082 	    nconsecutive_mismatches = 0;
18083 #ifdef PMAP
18084 	  } else if (Dynprog_consistent_p(c_upper,g,g_alt) == true) {
18085 	    comp = AMBIGUOUS_COMP;
18086 	    /* Don't count as either match or mismatch */
18087 #endif
18088 	  } else {
18089 	    comp = MISMATCH_COMP;
18090 	    nconsecutive_mismatches += 1;
18091 	  }
18092 	  debug10(printf("At querypos %d and pos %u on right, have %c and %c\n",
18093 			 querypos,genomepos,c,g));
18094 	  pairs = Pairpool_push(pairs,pairpool,querypos,genomepos,c,comp,g,g_alt,/*dynprogindex*/0);
18095 	}
18096 	querypos--;
18097 	genomepos--;
18098 	/* pos++; */
18099       }
18100     }
18101   }
18102 
18103   this->pairs = pairs;
18104 
18105   debug10(printf("RIGHT AFTER FILL\n"));
18106   debug10(Pair_dump_list(this->pairs,true));
18107   debug10(printf("END_RIGHT AFTER FILL\n"));
18108 
18109   Stage3_free_pairarray(&this);
18110   this->pairarray = make_pairarray(&this->npairs,&this->pairs,this->cdna_direction,
18111 				   this->watsonp,pairpool,queryseq_ptr,
18112 				   this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
18113   this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
18114 					   &this->qopens,&this->qindels,&this->topens,&this->tindels,
18115 					   &ncanonical,&nsemicanonical,&this->noncanonical,
18116 					   &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
18117 
18118   if (this->pairarray == NULL) {
18119     this->pairarray_freeable_p = false;
18120   } else {
18121     this->pairarray_freeable_p = true;
18122   }
18123 
18124   return;
18125 }
18126 
18127 
18128 void
Stage3_trim_right(T this,int goal,char * queryseq_ptr,Pairpool_T pairpool)18129 Stage3_trim_right (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool) {
18130   List_T path;
18131   Pair_T pair;
18132 
18133   int ncanonical, nsemicanonical;
18134   double min_splice_prob;
18135 
18136 
18137   debug10(printf("Entered Stage3_trim_right with goal %d\n",goal));
18138   debug10(printf("LEFT BEFORE TRIM\n"));
18139   debug10(Pair_dump_list(this->pairs,true));
18140   debug10(printf("END_LEFT BEFORE TRIM\n"));
18141 
18142   path = List_reverse(this->pairs);
18143 
18144   while (((Pair_T) path->first)->querypos > goal /* && pos <= this->chrhigh */) {
18145     path = Pairpool_pop(path,&pair);
18146   }
18147 
18148   this->pairs = List_reverse(path);
18149 
18150   debug10(printf("LEFT AFTER TRIM\n"));
18151   debug10(Pair_dump_list(this->pairs,true));
18152   debug10(printf("END_LEFT AFTER TRIM\n"));
18153 
18154   Stage3_free_pairarray(&this);
18155   this->pairarray = make_pairarray(&this->npairs,&this->pairs,this->cdna_direction,
18156 				   this->watsonp,pairpool,queryseq_ptr,
18157 				   this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
18158   this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
18159 					   &this->qopens,&this->qindels,&this->topens,&this->tindels,
18160 					   &ncanonical,&nsemicanonical,&this->noncanonical,
18161 					   &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
18162 
18163   if (this->pairarray == NULL) {
18164     this->pairarray_freeable_p = false;
18165   } else {
18166     this->pairarray_freeable_p = true;
18167   }
18168 
18169   return;
18170 }
18171 
18172 
18173 void
Stage3_trim_left(T this,int goal,char * queryseq_ptr,Pairpool_T pairpool)18174 Stage3_trim_left (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool) {
18175 
18176   List_T pairs;
18177   Pair_T pair;
18178 
18179   int ncanonical, nsemicanonical;
18180   double min_splice_prob;
18181 
18182 
18183   debug10(printf("Entered Stage3_trim_left with goal %d\n",goal));
18184   debug10(printf("RIGHT BEFORE TRIM\n"));
18185   debug10(Pair_dump_list(this->pairs,true));
18186   debug10(printf("END_RIGHT BEFORE TRIM\n"));
18187 
18188 
18189   /* Do not call insert_gapholders */
18190   pairs = this->pairs;
18191 
18192   while (((Pair_T) pairs->first)->querypos < goal) {
18193     pairs = Pairpool_pop(pairs,&pair);
18194   }
18195 
18196   this->pairs = pairs;
18197 
18198   debug10(printf("RIGHT AFTER TRIM\n"));
18199   debug10(Pair_dump_list(this->pairs,true));
18200   debug10(printf("END_RIGHT AFTER TRIM\n"));
18201 
18202   Stage3_free_pairarray(&this);
18203   this->pairarray = make_pairarray(&this->npairs,&this->pairs,this->cdna_direction,
18204 				   this->watsonp,pairpool,queryseq_ptr,
18205 				   this->chroffset,this->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
18206   this->goodness = Pair_fracidentity_array(&this->matches,&this->unknowns,&this->mismatches,
18207 					   &this->qopens,&this->qindels,&this->topens,&this->tindels,
18208 					   &ncanonical,&nsemicanonical,&this->noncanonical,
18209 					   &min_splice_prob,this->pairarray,this->npairs,this->cdna_direction);
18210 
18211   if (this->pairarray == NULL) {
18212     this->pairarray_freeable_p = false;
18213   } else {
18214     this->pairarray_freeable_p = true;
18215   }
18216 
18217   return;
18218 }
18219 
18220 
18221 
#if 0
/* Disabled code: adds delta to the genomepos of every pair in this->pairs */
static void
adjust_genomepos (T this, int delta) {
  List_T cell = this->pairs;

  while (cell != NULL) {
    ((Pair_T) List_head(cell))->genomepos += delta;
    cell = List_next(cell);
  }

  return;
}
#endif
18236 
18237 
18238 static bool
merge_local_single(T this_left,T this_right,int minpos1,int maxpos1,int minpos2,int maxpos2,char * queryseq_ptr,char * queryuc_ptr,Pairpool_T pairpool,Dynprog_T dynprogM,int maxpeelback)18239 merge_local_single (T this_left, T this_right,
18240 		    int minpos1, int maxpos1, int minpos2, int maxpos2,
18241 		    char *queryseq_ptr, char *queryuc_ptr,
18242 		    Pairpool_T pairpool, Dynprog_T dynprogM,
18243 		    int maxpeelback) {
18244   bool successp;
18245   Pair_T leftpair, rightpair;
18246   List_T path, orig_left_pairs, orig_right_pairs;
18247   bool watsonp, filledp;
18248 
18249   int ncanonical, nsemicanonical;
18250   double min_splice_prob;
18251 
18252 
18253 #ifdef EXTRACT_GENOMICSEG
18254   char *genomicseg_ptr = NULL;
18255 #endif
18256   int dynprogindex_minor = 0;
18257 
18258 
18259   orig_left_pairs = Pairpool_copy(this_left->pairs,pairpool);
18260   orig_right_pairs = Pairpool_copy(this_right->pairs,pairpool);
18261 
18262   this_left->pairs = Pair_clip_bounded_list_5(this_left->pairs,minpos1,maxpos1);
18263   this_right->pairs = Pair_clip_bounded_list_3(this_right->pairs,minpos2,maxpos2);
18264 
18265   /* Stage3_free_pairarray(&this_left); */
18266   /* Stage3_free_pairarray(&this_right); */
18267 
18268   if (this_left->pairs == NULL && this_right->pairs == NULL) {
18269     this_left->pairs = orig_left_pairs;
18270     this_right->pairs = orig_right_pairs;
18271 
18272 #if 0
18273     this_left->pairarray = (struct Pair_T *) NULL;
18274     this_right->pairarray = (struct Pair_T *) NULL;
18275     this_left->pairarray_freeable_p = false;
18276     this_right->pairarray_freeable_p = false;
18277 #endif
18278     return false;
18279 
18280   } else if ((watsonp = this_left->watsonp) == true) {
18281     debug10(printf("watsonp %d\n",watsonp));
18282 
18283 #if 0
18284     /* Has no effect on plus strand */
18285     Pair_set_genomepos_list(this_left->pairs,chroffset,chrhigh,/*watsonp*/true);
18286     Pair_set_genomepos_list(this_right->pairs,chroffset,chrhigh,/*watsonp*/true);
18287 #endif
18288 
18289     debug10(printf("LEFT\n"));
18290     debug10(Pair_dump_list(this_left->pairs,true));
18291     debug10(printf("END LEFT\n"));
18292 
18293     debug10(printf("RIGHT\n"));
18294     debug10(Pair_dump_list(this_right->pairs,true));
18295     debug10(printf("END RIGHT\n"));
18296 
18297 #ifdef EXTRACT_GENOMICSEG
18298     firstpair = (Pair_T) List_head(this_left->pairs);
18299     lastpair = (Pair_T) List_last_value(this_right->pairs);
18300     firstpos = firstpair->genomepos;
18301     lastpos = lastpair->genomepos;
18302     left = this_left->chroffset + firstpos;
18303     genomicseg = Genome_get_segment(genome,left,genomiclength,/*chromosome_iit*/NULL,/*revcomp*/false);
18304     genomicseg_ptr = genomicuc_ptr = Sequence_fullpointer(genomicseg);
18305 #endif
18306 
18307 #if 0
18308     /* Has no effect on plus strand */
18309     Pair_set_genomepos_list(this_left->pairs,chroffset,chrhigh,/*watsonp*/true);
18310     Pair_set_genomepos_list(this_right->pairs,chroffset,chrhigh,/*watsonp*/true);
18311 #endif
18312 
18313     debug10(printf("LEFT\n"));
18314     debug10(Pair_dump_list(this_left->pairs,true));
18315     debug10(printf("END LEFT\n"));
18316 
18317     debug10(printf("RIGHT\n"));
18318     debug10(Pair_dump_list(this_right->pairs,true));
18319     debug10(printf("END RIGHT\n"));
18320 
18321     path = List_reverse(this_left->pairs);
18322 
18323     leftpair = (Pair_T) path->first;
18324     rightpair = (Pair_T) this_right->pairs->first;
18325 
18326     debug10(printf("Running traverse_single_gap\n"));
18327     if ((this_right->pairs = traverse_single_gap(&filledp,&dynprogindex_minor,this_right->pairs,&path,leftpair,rightpair,
18328 						 this_right->chroffset,this_right->chrhigh,
18329 						 queryseq_ptr,queryuc_ptr,/*querylength*/0,watsonp,this_right->genestrand,
18330 						 /*jump_late_p*/watsonp ? false : true,pairpool,dynprogM,
18331 						 /*last_genomedp5*/NULL,/*last_genomedp3*/NULL,
18332 						 maxpeelback,/*defect_rate*/0,/*forcep*/false,/*finalp*/true)) == NULL) {
18333       debug10(printf(" => failed\n"));
18334       successp = false;
18335     } else if (filledp == false) {
18336       debug10(printf(" => failed\n"));
18337       successp = false;
18338     } else {
18339       debug10(printf(" => succeeded\n"));
18340       successp = true;
18341     }
18342     this_left->pairs = List_reverse(path);
18343 
18344   } else {
18345     debug10(printf("watsonp %d\n",watsonp));
18346 
18347 #if 0
18348     /* Do not change list, just pairarray */
18349     Pair_set_genomepos_list(this_left->pairs,chroffset,chrhigh,/*watsonp*/false);
18350     Pair_set_genomepos_list(this_right->pairs,chroffset,chrhigh,/*watsonp*/false);
18351 #endif
18352 
18353     debug10(printf("LEFT\n"));
18354     debug10(Pair_dump_list(this_left->pairs,true));
18355     debug10(printf("END LEFT\n"));
18356 
18357     debug10(printf("RIGHT\n"));
18358     debug10(Pair_dump_list(this_right->pairs,true));
18359     debug10(printf("END RIGHT\n"));
18360 
18361 #ifdef EXTRACT_GENOMICSEG
18362     firstpair = (Pair_T) List_head(this_left->pairs);
18363     lastpair = (Pair_T) List_last_value(this_right->pairs);
18364     firstpos = firstpair->genomepos;
18365     lastpos = lastpair->genomepos;
18366     left = this_right->chroffset + lastpos;
18367     genomicseg = Genome_get_segment(genome,left,genomiclength,/*chromosome_iit*/NULL,/*revcomp*/true);
18368     genomicseg_ptr = genomicuc_ptr = Sequence_fullpointer(genomicseg);
18369 #endif
18370 
18371 #if 0
18372     /* Do not change list, just pairarray */
18373     Pair_set_genomepos_list(this_left->pairs,chroffset,chrhigh,/*watsonp*/false);
18374     Pair_set_genomepos_list(this_right->pairs,chroffset,chrhigh,/*watsonp*/false);
18375 #endif
18376 
18377     debug10(printf("LEFT\n"));
18378     debug10(Pair_dump_list(this_left->pairs,true));
18379     debug10(printf("END LEFT\n"));
18380 
18381     debug10(printf("RIGHT\n"));
18382     debug10(Pair_dump_list(this_right->pairs,true));
18383     debug10(printf("END RIGHT\n"));
18384 
18385     path = List_reverse(this_left->pairs);
18386 
18387     leftpair = (Pair_T) path->first;
18388     rightpair = (Pair_T) this_right->pairs->first;
18389 
18390     if ((this_right->pairs = traverse_single_gap(&filledp,&dynprogindex_minor,this_right->pairs,&path,leftpair,rightpair,
18391 						 this_right->chroffset,this_right->chrhigh,
18392 						 queryseq_ptr,queryuc_ptr,/*querylength*/0,watsonp,this_right->genestrand,
18393 						 /*jump_late_p*/watsonp ? false : true,pairpool,dynprogM,
18394 						 /*last_genomedp5*/NULL,/*last_genomedp3*/NULL,
18395 						 maxpeelback,/*defect_rate*/0,/*forcep*/false,/*finalp*/true)) == NULL) {
18396       debug10(printf(" => failed\n"));
18397       successp = false;
18398     } else if (filledp == false) {
18399       debug10(printf(" => failed\n"));
18400       successp = false;
18401     } else {
18402       debug10(printf(" => succeeded\n"));
18403       successp = true;
18404     }
18405     this_left->pairs = List_reverse(path);
18406   }
18407 
18408   if (successp == false) {
18409     this_left->pairs = orig_left_pairs;
18410     this_left->pairarray = make_pairarray(&this_left->npairs,&this_left->pairs,this_left->cdna_direction,
18411 					  this_left->watsonp,pairpool,queryseq_ptr,
18412 					  this_left->chroffset,this_left->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
18413     this_left->goodness = Pair_fracidentity_array(&this_left->matches,&this_left->unknowns,&this_left->mismatches,
18414 						  &this_left->qopens,&this_left->qindels,&this_left->topens,&this_left->tindels,
18415 						  &ncanonical,&nsemicanonical,&this_left->noncanonical,
18416 						  &min_splice_prob,this_left->pairarray,this_left->npairs,this_left->cdna_direction);
18417 
18418     this_right->pairs = orig_right_pairs;
18419     this_right->pairarray = make_pairarray(&this_right->npairs,&this_right->pairs,this_right->cdna_direction,
18420 					   this_right->watsonp,pairpool,queryseq_ptr,
18421 					   this_right->chroffset,this_right->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0);
18422     this_right->goodness = Pair_fracidentity_array(&this_right->matches,&this_right->unknowns,&this_right->mismatches,
18423 						   &this_right->qopens,&this_right->qindels,&this_right->topens,&this_right->tindels,
18424 						   &ncanonical,&nsemicanonical,&this_right->noncanonical,
18425 						   &min_splice_prob,this_right->pairarray,this_right->npairs,this_right->cdna_direction);
18426 
18427   } else {
18428     this_left->pairs = List_append(this_left->pairs,this_right->pairs);
18429     this_right->pairs = (List_T) NULL;
18430   }
18431 
18432 
18433   debug10(printf(" => returning successp %d\n",successp));
18434   return successp;
18435 }
18436 
18437 
18438 static List_T
recompute_for_cdna_direction(int * cdna_direction,List_T pairs,int genestrand,bool watsonp,char * queryaaseq_ptr,char * queryseq_ptr,char * queryuc_ptr,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,int maxpeelback)18439 recompute_for_cdna_direction (int *cdna_direction, List_T pairs, int genestrand, bool watsonp,
18440 #ifdef PMAP
18441 			      char *queryaaseq_ptr,
18442 #endif
18443 			      char *queryseq_ptr, char *queryuc_ptr,
18444 			      Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
18445 #ifndef GSNAP
18446 			      Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
18447 #endif
18448 			      Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
18449 			      int maxpeelback) {
18450   List_T pairs_fwd, path_fwd, pairs_rev, path_rev, copy;
18451   double max_intron_score_fwd = 0.0, max_intron_score_rev = 0.0,
18452     avg_donor_score_fwd = 0.0, avg_acceptor_score_fwd = 0.0,
18453     avg_donor_score_rev = 0.0, avg_acceptor_score_rev = 0.0;
18454   int nmatches_fwd, nmismatches_fwd, nindels_fwd,
18455     nknown_fwd, ncanonical_fwd, nsemicanonical_fwd, nnoncanonical_fwd, nbadintrons_fwd,
18456     nmatches_rev, nmismatches_rev, nindels_rev,
18457     nknown_rev, ncanonical_rev, nsemicanonical_rev, nnoncanonical_rev, nbadintrons_rev;
18458   int sensedir;
18459 
18460   double defect_rate_fwd, defect_rate_rev;
18461 
18462   copy = Pairpool_copy(pairs,pairpool);
18463 
18464   /* Compute fwd */
18465   path_fwd = path_compute_dir(&defect_rate_fwd,/*pairs*/copy,/*cdna_direction*/+1,watsonp,
18466 			      genestrand,/*jump_late_p*/watsonp ? false : true,
18467 #ifdef PMAP
18468 			      queryaaseq_ptr,
18469 #endif
18470 			      queryseq_ptr,queryuc_ptr,/*querylength*/0,chrnum,chroffset,chrhigh,
18471 			      maxpeelback,
18472 #ifndef GSNAP
18473 			      oligoindices_minor,diagpool,cellpool,
18474 #endif
18475 			      pairpool,dynprogL,dynprogM,dynprogR,
18476 			      /*last_genomedp5*/NULL,/*last_genomedp3*/NULL/*,clean_ends_p:true*/);
18477   pairs_fwd = score_introns(&max_intron_score_fwd,&avg_donor_score_fwd,&avg_acceptor_score_fwd,
18478 			    &nknown_fwd,&ncanonical_fwd,&nbadintrons_fwd,path_fwd,/*cdna_direction*/+1,watsonp,
18479 			    chrnum,chroffset,chrhigh
18480 #ifdef WASTE
18481 			    ,pairpool
18482 #endif
18483 			    );
18484   /* alignment_score_fwd = */ score_alignment(&nmatches_fwd,&nmismatches_fwd,&nindels_fwd,
18485 #ifdef COMPLEX_DIRECTION
18486 					      &indel_alignment_score_fwd,
18487 #endif
18488 					      &nsemicanonical_fwd,&nnoncanonical_fwd,
18489 					      pairs_fwd,/*cdna_direction*/+1);
18490 
18491 
18492   /* Compute rev */
18493   path_rev = path_compute_dir(&defect_rate_rev,/*pairs*/pairs,/*cdna_direction*/-1,watsonp,
18494 			      genestrand,/*jump_late_p*/watsonp ? false : true,
18495 #ifdef PMAP
18496 			      queryaaseq_ptr,
18497 #endif
18498 			      queryseq_ptr,queryuc_ptr,/*querylength*/0,chrnum,chroffset,chrhigh,
18499 			      maxpeelback,
18500 #ifndef GSNAP
18501 			      oligoindices_minor,diagpool,cellpool,
18502 #endif
18503 			      pairpool,dynprogL,dynprogM,dynprogR,
18504 			      /*last_genomedp5*/NULL,/*last_genomedp3*/NULL/*,clean_ends_p:true*/);
18505   pairs_rev = score_introns(&max_intron_score_rev,&avg_donor_score_rev,&avg_acceptor_score_rev,
18506 			    &nknown_rev,&ncanonical_rev,&nbadintrons_rev,path_rev,/*cdna_direction*/-1,watsonp,
18507 			    chrnum,chroffset,chrhigh
18508 #ifdef WASTE
18509 			    ,pairpool
18510 #endif
18511 			    );
18512   /* alignment_score_rev = */ score_alignment(&nmatches_rev,&nmismatches_rev,&nindels_rev,
18513 #ifdef COMPLEX_DIRECTION
18514 					      &indel_alignment_score_rev,
18515 #endif
18516 					      &nsemicanonical_rev,&nnoncanonical_rev,
18517 					      pairs_rev,/*cdna_direction*/-1);
18518 
18519   pairs = pick_cdna_direction(&(*cdna_direction),&sensedir,pairs_fwd,pairs_rev,
18520 			      defect_rate_fwd,defect_rate_rev,
18521 			      nknown_fwd,ncanonical_fwd,nsemicanonical_fwd,nnoncanonical_fwd,nbadintrons_fwd,
18522 			      nknown_rev,ncanonical_rev,nsemicanonical_rev,nnoncanonical_rev,nbadintrons_rev,
18523 			      max_intron_score_fwd,avg_donor_score_fwd,avg_acceptor_score_fwd,
18524 			      max_intron_score_rev,avg_donor_score_rev,avg_acceptor_score_rev,
18525 #ifdef COMPLEX_DIRECTION
18526 			      nmatches_fwd,nmismatches_fwd,nmatches_rev,nmismatches_rev,nindels_fwd,nindels_rev,
18527 			      indel_alignment_score_fwd,indel_alignment_score_rev,
18528 #endif
18529 			      /*sense_filter*/0);
18530 
18531   /* Don't know if we need to call path_compute_final */
18532 
18533   return pairs;
18534 }
18535 
18536 
/* Stage3_merge_local: builds one new alignment out of two existing ones
   (old_left followed by old_right) and tries to solve the gap between them.

   Steps, as performed below:
     1. The pair lists of old_left and old_right are copied with
        Pairpool_copy and clipped to the query bounds minpos1..maxpos1
        (left) and minpos2..maxpos2 (right).
     2. Both ends at the junction are cleaned (clean_end_chimera,
        Pairpool_clean_join).  If either side is empty afterwards, NULL is
        returned.
     3. Two new Stage3 objects are created from the copies, using the
        metadata (cdna_direction, watsonp, genestrand, sensedir, chr*) of
        the respective old object.
     4. The cdna_directions on both sides of the breakpoint are compared;
        if they have opposite signs, the merged pairs are later passed
        through recompute_for_cdna_direction.
     5. intronlength = genomejump - queryjump at the junction selects the
        branch: intron (>= min_intronlength and splicingp), cDNA gap (< 0),
        or single gap (otherwise).
     6. make_pairarray_merge is called on new_left in every branch.

   Parameters:
     old_left, old_right  - alignments to merge; read here only through
                            their pairs and metadata fields.
     minpos1, maxpos1     - bounds passed to Pair_clip_bounded_list_5 for
                            the left copy; maxpos1 is also the breakpoint
                            given to Pair_end_bound.
     minpos2, maxpos2     - bounds passed to Pair_clip_bounded_list_3 for
                            the right copy; minpos2 is also given to
                            Pair_start_bound.
     queryseq             - passed to Stage3_new_from_pairs and
                            Sequence_subseq_offset.
     queryaaseq_ptr       - (PMAP only) forwarded to
                            recompute_for_cdna_direction.
     queryseq_ptr, queryuc_ptr
                          - forwarded to the gap-solving procedures and to
                            make_pairarray_merge (queryseq_ptr).
     oligoindices_minor, diagpool, cellpool
                          - (non-GSNAP only) forwarded to
                            recompute_for_cdna_direction.
     pairpool, dynprogL, dynprogM, dynprogR, maxpeelback
                          - forwarded to the copy, gap-solving and
                            recomputation procedures.

   Returns new_left, holding the merged pairs and the chosen
   cdna_direction, or NULL when the join leaves a side empty, when
   merge_local_single returns false, or when make_pairarray_merge returns
   false.  new_right is always freed before returning non-NULL.

   min_intronlength, splicingp and ngap are not declared in this function;
   presumably they are file-level settings -- TODO confirm. */
T
Stage3_merge_local (T old_left, T old_right,
		    int minpos1, int maxpos1, int minpos2, int maxpos2,
		    Sequence_T queryseq,
#ifdef PMAP
		    char *queryaaseq_ptr,
#endif
		    char *queryseq_ptr, char *queryuc_ptr,
#ifndef GSNAP
		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
#endif
		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
		    int maxpeelback) {
  T new_left, new_right;
  Pair_T end1, start2, leftpair, rightpair;
  List_T left_pairs, right_pairs, path;
  bool watsonp, filledp, shiftp, incompletep;
  int cdna_direction, cdna_direction_1, cdna_direction_2;
  bool make_dir_consistent_p;

  int intronlength, queryjump, genomejump;

  int dynprogindex_minor = 0, dynprogindex_major = 0;


  debug10(printf("Entered Stage3_merge_local with bounds1 %d..%d and bounds2 %d..%d\n",
		 minpos1,maxpos1,minpos2,maxpos2));

#ifdef DEBUG10
  printf("ORIG LEFT PAIRS:\n");
  Pair_dump_list(old_left->pairs,true);
  printf("ORIG RIGHT PAIRS:\n");
  Pair_dump_list(old_right->pairs,true);
#endif

  /* Work on copies made by Pairpool_copy, and clip each copy to its query bounds */
  left_pairs = Pairpool_copy(old_left->pairs,pairpool);
  left_pairs = Pair_clip_bounded_list_5(left_pairs,minpos1,maxpos1);
  right_pairs = Pairpool_copy(old_right->pairs,pairpool);
  right_pairs = Pair_clip_bounded_list_3(right_pairs,minpos2,maxpos2);

#ifdef DEBUG10
  printf("CLIPPED LEFT PAIRS:\n");
  Pair_dump_list(left_pairs,true);
  printf("CLIPPED RIGHT PAIRS:\n");
  Pair_dump_list(right_pairs,true);
#endif


  /* path is the left list after List_reverse, so that the junction end
     comes first for cleaning.  NOTE(review): left_pairs itself is still
     passed to Stage3_new_from_pairs below.  If List_reverse works in
     place, left_pairs no longer denotes the whole left list at that
     point -- confirm this is intended (new_left->pairs is overwritten
     right after construction). */
  path = clean_end_chimera(List_reverse(left_pairs));
  right_pairs = clean_end_chimera(right_pairs);

  Pairpool_clean_join(&path,&right_pairs);

  if (path == NULL || right_pairs == NULL) {
    /* Do not attach copies of pairs to this_left or this_right */
    return (T) NULL;

  } else {
    /* Each new object takes its metadata from the corresponding old object */
    new_left = Stage3_new_from_pairs(left_pairs,old_left->cdna_direction,old_left->watsonp,
				     old_left->genestrand,old_left->sensedir,pairpool,queryseq,
				     /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
				     old_left->chrnum,old_left->chroffset,old_left->chrhigh,old_left->chrlength);
    new_right = Stage3_new_from_pairs(right_pairs,old_right->cdna_direction,old_right->watsonp,
				      old_right->genestrand,old_right->sensedir,pairpool,queryseq,
				     /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
				      old_right->chrnum,old_right->chroffset,old_right->chrhigh,old_right->chrlength);

    /* Install the cleaned lists; path is reversed back for the left side */
    new_left->pairs = List_reverse(path);
    new_right->pairs = right_pairs;
  }

#ifdef DEBUG10
  printf("JOINED LEFT PAIRS:\n");
  Pair_dump_list(new_left->pairs,true);
  printf("JOINED RIGHT PAIRS:\n");
  Pair_dump_list(new_right->pairs,true);
#endif

  /* Release the pairarrays of both new objects.  make_pairarray_merge is
     called on new_left in every branch further down; presumably it
     rebuilds the pairarray from the merged pairs -- TODO confirm. */
  Stage3_free_pairarray(&new_left);
  Stage3_free_pairarray(&new_right);


  watsonp = new_left->watsonp;
#if 0
  if (watsonp == true) {
    debug10(printf("watsonp true\n"));

    firstpair = (Pair_T) List_head(new_left->pairs);
    lastpair = (Pair_T) List_last_value(new_right->pairs);
    firstpos = firstpair->genomepos;
    lastpos = lastpair->genomepos;
    left = new_left->chroffset + firstpos;

  } else {
    debug10(printf("watsonp false\n"));

    firstpair = (Pair_T) List_head(new_left->pairs);
    lastpair = (Pair_T) List_last_value(new_right->pairs);
    firstpos = firstpair->genomepos;
    lastpos = lastpair->genomepos;
    left = new_right->chroffset + lastpos;
  }
#endif

  /* Determine if need to make cdna_direction consistent */
  end1 = Pair_end_bound(&cdna_direction_1,new_left->pairs,/*breakpoint*/maxpos1);
  start2 = Pair_start_bound(&cdna_direction_2,new_right->pairs,/*breakpoint+1*/minpos2);
  debug10(printf("cdna_directions up to breakpoint are %d and %d\n",cdna_direction_1,cdna_direction_2));
  assert(end1 != NULL);
  assert(start2 != NULL);

  /* Opposite signs on the two sides require recomputation.  In that case
     cdna_direction is not assigned here; each gap branch below obtains it
     from recompute_for_cdna_direction instead. */
  if (cdna_direction_1 > 0 && cdna_direction_2 < 0) {
    make_dir_consistent_p = true;

  } else if (cdna_direction_1 < 0 && cdna_direction_2 > 0) {
    make_dir_consistent_p = true;

  } else {
    make_dir_consistent_p = false;
    /* Take the direction of whichever side has a nonzero one; the left side wins otherwise */
    if (cdna_direction_1 == 0) {
      cdna_direction = cdna_direction_2;
    } else if (cdna_direction_2 == 0) {
      cdna_direction = cdna_direction_1;
    } else {
      cdna_direction = cdna_direction_1;
    }
    debug10(printf("cdna_direction is %d\n",cdna_direction));
  }


  /* Determine if the gap is an intron or not */
  /* end1 and start2 are reassigned here to the pairs adjacent to the junction */
  end1 = (Pair_T) List_last_value(new_left->pairs);
  start2 = (Pair_T) List_head(new_right->pairs);
  queryjump = start2->querypos - end1->querypos - 1;
  genomejump = start2->genomepos - end1->genomepos - 1;
  intronlength = (int) (genomejump - queryjump);

  debug10(printf("intronlength %d = (start2->genomepos %d - end1->genomepos %d - 1) - (start2->querypos %d - end1->querypos %d - 1)\n",
		 intronlength,start2->genomepos,end1->genomepos,start2->querypos,end1->querypos));

  if (intronlength >= min_intronlength && splicingp == true) {
    debug10(printf("intronlength %d >= min_intronlength %d, so an intron\n",
		   intronlength,min_intronlength));
    /* Intron */
    /* path is the left pairs reversed again, with the junction pair first */
    path = List_reverse(new_left->pairs);
    leftpair = (Pair_T) path->first;
    rightpair = (Pair_T) new_right->pairs->first;

    if (make_dir_consistent_p == true) {
      /* Solve intron when re-computing for cdna_direction */
      debug10(printf("intron, but make dir consistent\n"));

      /* Join the two sides around a gapholder and let the recomputation work on the whole list */
      new_right->pairs = Pairpool_push_gapholder(new_right->pairs,pairpool,queryjump,genomejump,
						  /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
      new_left->pairs = List_reverse(path);
      new_left->pairs = List_append(new_left->pairs,new_right->pairs);
      new_right->pairs = (List_T) NULL;

      new_left->pairs =
	recompute_for_cdna_direction(&cdna_direction,new_left->pairs,new_left->genestrand,watsonp,
#ifdef PMAP
				     queryaaseq_ptr,
#endif
				     queryseq_ptr,queryuc_ptr,
				     new_left->chrnum,new_left->chroffset,new_left->chrhigh,
#ifndef GSNAP
				     oligoindices_minor,diagpool,cellpool,
#endif
				     pairpool,dynprogL,dynprogM,dynprogR,maxpeelback);

    } else {
      debug10(printf("traverse_genome_gap with cdna_direction %d...",cdna_direction));
      new_right->pairs = traverse_genome_gap(&filledp,&shiftp,&dynprogindex_minor,&dynprogindex_major,
					      new_right->pairs,&path,leftpair,rightpair,
					      new_left->chrnum,new_left->chroffset,new_left->chrhigh,
					      queryseq_ptr,queryuc_ptr,/*querylength*/0,cdna_direction,
					      watsonp,new_right->genestrand,
					      /*jump_late_p*/watsonp ? false : true,pairpool,
					      dynprogL,dynprogM,dynprogR,/*last_genomedp5*/NULL,/*last_genomedp3*/NULL,
					      maxpeelback,/*defect_rate*/0,/*finalp*/true,/*simplep*/false);
      debug10(printf("done"));

      /* If the gap was not filled, keep a gapholder between the two sides */
      if (filledp == false) {
	new_right->pairs = Pairpool_push_gapholder(new_right->pairs,pairpool,queryjump,genomejump,
						    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
      }

      /* All merged pairs now belong to new_left; new_right is left without pairs */
      new_left->pairs = List_reverse(path);
      new_left->pairs = List_append(new_left->pairs,new_right->pairs);
      new_right->pairs = (List_T) NULL;
    }

    if (make_pairarray_merge(new_left,cdna_direction,new_left->watsonp,pairpool,queryseq_ptr,
			     new_left->chroffset,new_left->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
			     /*new_gap_p*/true) == false) {
      Stage3_free(&new_left);
      Stage3_free(&new_right);
      return (T) NULL;
    }

  } else if (intronlength < 0) { /* Was intronlength < -EXTRAQUERYGAP, but this missed some short insertions */
    /* If traverse_cdna_gap fails, causes seg faults later on */
    /* cDNA gap */
    /* NOTE(review): the debug message below is printed on both paths of
       this branch, not only when make_dir_consistent_p is true. */
    debug10(printf("cDNA gap, but make dir consistent\n"));

    path = List_reverse(new_left->pairs);
    leftpair = (Pair_T) path->first;
    rightpair = (Pair_T) new_right->pairs->first;

    if (make_dir_consistent_p == true) {
      /* Solve cDNA gap when re-computing for cdna_direction */
      new_right->pairs = Pairpool_push_gapholder(new_right->pairs,pairpool,queryjump,genomejump,
						  /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
      new_left->pairs = List_reverse(path);
      new_left->pairs = List_append(new_left->pairs,new_right->pairs);
      new_right->pairs = (List_T) NULL;

      new_left->pairs =
	recompute_for_cdna_direction(&cdna_direction,new_left->pairs,new_left->genestrand,watsonp,
#ifdef PMAP
				     queryaaseq_ptr,
#endif
				     queryseq_ptr,queryuc_ptr,
				     new_left->chrnum,new_left->chroffset,new_left->chrhigh,
#ifndef GSNAP
				     oligoindices_minor,diagpool,cellpool,
#endif
				     pairpool,dynprogL,dynprogM,dynprogR,maxpeelback);

    } else {
      debug10(printf("traverse_cdna_gap..."));
      new_right->pairs = traverse_cdna_gap(&filledp,&incompletep,&dynprogindex_minor,&dynprogindex_major,
					    new_right->pairs,&path,leftpair,rightpair,
					    new_left->chroffset,new_left->chrhigh,
					    queryseq_ptr,queryuc_ptr,/*querylength*/0,watsonp,new_right->genestrand,
					    /*jump_late_p*/watsonp ? false : true,pairpool,
					    dynprogL,dynprogM,dynprogR,/*last_genomedp5*/NULL,/*last_genomedp3*/NULL,
					    maxpeelback,/*defect_rate*/0,/*finalp*/true);
      debug10(printf("done"));

      /* Same fallback as in the intron branch: unfilled gap gets a gapholder */
      if (filledp == false) {
	new_right->pairs = Pairpool_push_gapholder(new_right->pairs,pairpool,queryjump,genomejump,
						    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
      }

      new_left->pairs = List_reverse(path);
      new_left->pairs = List_append(new_left->pairs,new_right->pairs);
      new_right->pairs = (List_T) NULL;
    }

    if (make_pairarray_merge(new_left,cdna_direction,new_left->watsonp,pairpool,queryseq_ptr,
			     new_left->chroffset,new_left->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
			     /*new_gap_p*/true) == false) {
      Stage3_free(&new_left);
      Stage3_free(&new_right);
      return (T) NULL;
    }

  } else {
    /* Single gap */
    debug10(printf("intronlength %d, so a single gap\n",intronlength));
    debug10(printf("Before\n"));
    debug10(Pair_dump_list(new_left->pairs,true));
    debug10(Pair_dump_list(new_right->pairs,true));

    /* Same expressions as above; neither pair list has been reassigned in between */
    end1 = (Pair_T) List_last_value(new_left->pairs);
    start2 = (Pair_T) List_head(new_right->pairs);

    debug10(printf("Running merge_local_single\n"));
    /* The actual junction positions replace maxpos1 and minpos2 in this call */
    if (merge_local_single(new_left,new_right,
			   minpos1,/*maxpos1*/end1->querypos,
			   /*minpos2*/start2->querypos,maxpos2,
			   queryseq_ptr,queryuc_ptr,
			   pairpool,dynprogM,maxpeelback) == false) {
      Stage3_free(&new_left);
      Stage3_free(&new_right);
      return (T) NULL;

    } else if (make_dir_consistent_p == true) {
      debug10(printf("Need to make dir consistent\n"));
      new_left->pairs =
	recompute_for_cdna_direction(&cdna_direction,new_left->pairs,new_left->genestrand,watsonp,
#ifdef PMAP
				     queryaaseq_ptr,
#endif
				     queryseq_ptr,queryuc_ptr,
				     new_left->chrnum,new_left->chroffset,new_left->chrhigh,
#ifndef GSNAP
				     oligoindices_minor,diagpool,cellpool,
#endif
				     pairpool,dynprogL,dynprogM,dynprogR,maxpeelback);
    }

    /* Unlike the two branches above, new_gap_p is false here */
    if (make_pairarray_merge(new_left,cdna_direction,new_left->watsonp,pairpool,queryseq_ptr,
			     new_left->chroffset,new_left->chrhigh,ngap,/*subseq_offset*/0,/*skiplength*/0,
			     /*new_gap_p*/false) == false) {
      Stage3_free(&new_left);
      Stage3_free(&new_right);
      return (T) NULL;
    }

    debug10(printf("After\n"));
    debug10(Pair_dump_list(new_left->pairs,true));
  }

  /* Record the direction chosen or recomputed above on the merged object */
  new_left->cdna_direction = cdna_direction;

  /* The result lives in new_left only, so new_right is released */
  Stage3_free(&new_right);
  return new_left;
}
18847 
18848 
18849 List_T
Stage3_split(T this,Sequence_T queryseq,Pairpool_T pairpool)18850 Stage3_split (T this, Sequence_T queryseq, Pairpool_T pairpool) {
18851   List_T split_objects = NULL;
18852   Stage3_T stage3;
18853   struct Pair_T *pairarray;
18854   int npairs;
18855   int goodness, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
18856     ncanonical, nsemicanonical, nnoncanonical;
18857   double min_splice_prob;
18858 
18859   List_T path, pairs, p;
18860   Pair_T pair;
18861   bool large_intron_p;
18862 
18863   debug(printf("\n** Starting Stage3_split with watsonp %d and cdna_direction %d\n",this->watsonp,this->cdna_direction));
18864 
18865   large_intron_p = false;
18866   for (p = this->pairs; p != NULL; p = p->rest) {
18867     pair = (Pair_T) p->first;
18868     if (pair->gapp == true && pair->genomejump > maxintronlen) {
18869       debug(printf("Found large intron of size %d\n",pair->genomejump));
18870       large_intron_p = true;
18871     }
18872   }
18873 
18874   if (large_intron_p == false) {
18875     return (List_T) NULL;
18876 
18877   } else {
18878     pairs = (List_T) NULL;
18879     path = List_reverse(this->pairs);
18880     while (path != NULL) {
18881       /* pairptr = path; */
18882       /* path = Pairpool_pop(path,&pair); */
18883       pair = (Pair_T) path->first;
18884       if (pair->gapp == false) {
18885 #ifdef WASTE
18886 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
18887 #else
18888 	pairs = List_transfer_one(pairs,&path);
18889 #endif
18890 
18891       } else if (pair->genomejump <= maxintronlen) {
18892 #ifdef WASTE
18893 	pairs = Pairpool_push_existing(pairs,pairpool,pair);
18894 #else
18895 	pairs = List_transfer_one(pairs,&path);
18896 #endif
18897 
18898       } else {
18899 	/* Start a new path */
18900 	/* Pair_dump_list(pairs,true); */
18901 
18902 	pairarray = make_pairarray(&npairs,&pairs,this->cdna_direction,this->watsonp,
18903 				   pairpool,/*queryseq_ptr*/Sequence_fullpointer(queryseq),
18904 				   this->chroffset,this->chrhigh,
18905 				   ngap,/*query_subseq_offset*/Sequence_subseq_offset(queryseq),
18906 				   /*skiplength*/Sequence_skiplength(queryseq));
18907 
18908 	goodness = Pair_fracidentity_array(&matches,&unknowns,&mismatches,
18909 					   &qopens,&qindels,&topens,&tindels,
18910 					   &ncanonical,&nsemicanonical,&nnoncanonical,
18911 					   &min_splice_prob,pairarray,npairs,this->cdna_direction);
18912 
18913 	if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,this->cdna_direction,this->sensedir,
18914 				 matches,unknowns,mismatches,
18915 				 qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
18916 				 this->chrnum,this->chroffset,this->chrhigh,this->chrlength,
18917 				 this->watsonp,this->genestrand,
18918 				 /*querylength*/Sequence_fulllength(queryseq),
18919 				 /*skiplength*/Sequence_skiplength(queryseq),
18920 				 /*trimlength*/Sequence_trimlength(queryseq),
18921 				 this->straintype,this->strain,/*altstrain_iit*/NULL)) != NULL) {
18922 	  split_objects = List_push(split_objects,(void *) stage3);
18923 	}
18924 
18925 	pairs = (List_T) NULL;
18926 	path = path->rest;	/* Discard gap */
18927       }
18928     }
18929 
18930     /* Handle final path */
18931     /* Pair_dump_list(pairs,true); */
18932 
18933     pairarray = make_pairarray(&npairs,&pairs,this->cdna_direction,this->watsonp,
18934 			       pairpool,/*queryseq_ptr*/Sequence_fullpointer(queryseq),
18935 			       this->chroffset,this->chrhigh,
18936 			       ngap,/*query_subseq_offset*/Sequence_subseq_offset(queryseq),
18937 			       /*skiplength*/Sequence_skiplength(queryseq));
18938 
18939     goodness = Pair_fracidentity_array(&matches,&unknowns,&mismatches,
18940 				       &qopens,&qindels,&topens,&tindels,
18941 				       &ncanonical,&nsemicanonical,&nnoncanonical,
18942 				       &min_splice_prob,pairarray,npairs,this->cdna_direction);
18943 
18944     if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,this->cdna_direction,this->sensedir,
18945 			     matches,unknowns,mismatches,
18946 			     qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
18947 			     this->chrnum,this->chroffset,this->chrhigh,this->chrlength,
18948 			     this->watsonp,this->genestrand,
18949 			     /*querylength*/Sequence_fulllength(queryseq),
18950 			     /*skiplength*/Sequence_skiplength(queryseq),
18951 			     /*trimlength*/Sequence_trimlength(queryseq),
18952 			     this->straintype,this->strain,/*altstrain_iit*/NULL)) != NULL) {
18953       split_objects = List_push(split_objects,(void *) stage3);
18954     }
18955   }
18956 
18957   return split_objects;
18958 }
18959 
18960 
18961 #ifndef PMAP
18962 void
Stage3_guess_cdna_direction(T this)18963 Stage3_guess_cdna_direction (T this) {
18964   this->cdna_direction = Pair_guess_cdna_direction_array(&this->sensedir,this->pairarray,this->npairs,
18965 							 /*invertedp*/false,this->chroffset,this->watsonp);
18966   Pair_fix_cdna_direction_array(this->pairarray,this->npairs,this->cdna_direction);
18967   return;
18968 }
18969 #endif
18970