1 static char rcsid[] = "$Id: stage3hr.c 223081 2020-09-13 14:21:03Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "stage3hr.h"
7 #include "stage3hrdef.h"
8 
9 #include <stdlib.h>		/* For qsort */
10 #include <string.h>
11 #include <strings.h>
12 #include <ctype.h>		/* For islower */
13 #include <math.h>		/* For exp() and log10() */
14 #include "assert.h"
15 #include "mem.h"
16 #include "univcoord.h"
17 
18 #include "chrnum.h"
19 #include "complement.h"
20 #include "interval.h"
21 #include "univdiag.h"
22 #include "univdiagdef.h"
23 #include "substring.h"
24 #include "junction.h"
25 #include "genome128_hr.h"
26 #include "mapq.h"
27 #include "cigar.h"
28 #include "comp.h"		/* For Stage3end_run_gmap */
29 #include "maxent_hr.h"
30 #include "fastlog.h"
31 #include "transcript.h"
32 #include "kmer-search.h"
33 
34 
35 
36 /* Scores for alts_status_inside */
37 #define ALTS_RESOLVED_BYLENGTH 0
38 #define ALTS_NOT_AMBIGUOUS 1
39 
40 
41 /* Eliminates distant splices if short splices are found */
42 /* #define DISTANT_SPLICE_SPECIAL 1 */
43 
44 #define CONCORDANT_TEXT "concordant"
45 #define PAIRED_TEXT "paired"
46 #define UNPAIRED_TEXT "unpaired"
47 
48 #ifdef USE_TALLY_RATIO
49 #define TALLY_RATIO 2.0
50 #endif
51 
52 /* #define SUBSUMPTION_SLOP 10 */	/* Should allow for short insert lengths */
53 #define ADJ_NMATCHES_SLOP 2	/* Corresponds to one mismatch, sacrificing for a better splice score */
54 #define NMATCHES_SLOP 6
55 /* #define NMATCHES_TO_TRIMS_SLOP 9 */ /* Looser to allow for different splice options */
56 #define INSERTLENGTH_SLOP 100
57 #define OUTERLENGTH_SLOP 1000
58 #define SPLICE_SCORE_SLOP 0.03
59 
60 /* #define MIN_ALIGNMENT_LEN 20  -- Now taken care of by min-coverage */
61 
62 #define SPLICED_END_PENALTY 1		/* For long spliced ends.  Add to score 1 point per each 8 bp */
63 #define NONSPLICED_END_RESTORE 6		/* For long spliced ends.  Reduce score by 6 points per each 8 bp */
64 #define END_BINSIZE 8
65 
66 #define SCORE_EVENTRIM_SLOP 2
67 #define SCORE_INDELS_EVENTRIM 1 /* Needed to compare genomic positions with and without indels */
68 #define EVENTRIM_BADINTRON_PENALTY 2
69 #define DO_FINAL 1
70 
71 
72 #ifdef CHECK_ASSERTIONS
73 #define CHECK_NMISMATCHES 1
74 #endif
75 
76 
77 #if 0
78 /* This is a bad idea.  Better to use nconcordant as a guide to stopping */
79 #define MAX_HITS 100		/* For evaluating concordance */
80 #endif
81 
82 /* #define USE_ALLOCA_FOR_HITS 1 -- can lead to stack overflow */
83 
84 
85 /* Stage3end_new */
86 #ifdef DEBUG0
87 #define debug0(x) x
88 #else
89 #define debug0(x)
90 #endif
91 
92 /* Stage3end_filter */
93 #ifdef DEBUG1
94 #define debug1(x) x
95 #else
96 #define debug1(x)
97 #endif
98 
99 
100 /* transcript-guided alignment */
101 /* May want to turn on debug2 in transcript.c */
102 #ifdef DEBUG2
103 #define debug2(x) x
104 #else
105 #define debug2(x)
106 #endif
107 
108 /* Stage3end_T comparisons.  Need to modify calls from path-solve.c */
109 #ifdef DEBUG4
110 #define debug4(x) x
111 #else
112 #define debug4(x)
113 #endif
114 
115 /* Stage3end_nmatches_substrings */
116 #ifdef DEBUG7
117 #define debug7(x) x
118 #else
119 #define debug7(x)
120 #endif
121 
122 
123 /* Stage3pair_T comparisons */
124 #ifdef DEBUG8
125 #define debug8(x) x
126 #else
127 #define debug8(x)
128 #endif
129 
130 
131 /* Resolving insides */
132 #ifdef DEBUG9
133 #define debug9(x) x
134 #else
135 #define debug9(x)
136 #endif
137 
138 /* insert length calculation */
139 #ifdef DEBUG10
140 #define debug10(x) x
141 #else
142 #define debug10(x)
143 #endif
144 
145 /* circular chromosomes */
146 #ifdef DEBUG12
147 #define debug12(x) x
148 #else
149 #define debug12(x)
150 #endif
151 
152 /* substring_gmap */
153 #ifdef DEBUG13
154 #define debug13(x) x
155 #else
156 #define debug13(x)
157 #endif
158 
159 /* Stage3_determine_pairtype */
160 #ifdef DEBUG14
161 #define debug14(x) x
162 #else
163 #define debug14(x)
164 #endif
165 
166 /* Stage3pair_overlap */
167 #ifdef DEBUG15
168 #define debug15(x) x
169 #else
170 #define debug15(x)
171 #endif
172 
173 
174 
175 #define MAPQ_MAXIMUM_SCORE 40
176 
/* File-scope settings for this module.  Except where noted, each
   variable below is assigned from the corresponding *_in argument by
   Stage3hr_setup. */

/* Output filtering flags */
177 static bool omit_concordant_uniq_p = false;
178 static bool omit_concordant_mult_p = false;
179 static bool filter_within_trims_p = false;
180 
181 /* static int kmer_search_sizelimit = 100; */
182 
183 static int subopt_levels;
184 
185 static bool want_random_p;
186 static bool transcriptomep;
187 static bool invert_first_p;
188 static bool invert_second_p;
189 static Genome_T genomecomp;
190 static Genome_T genomebits;
191 static Genome_T genomebits_alt;
192 
193 static Univ_IIT_T chromosome_iit;
194 static int nchromosomes;
195 static int circular_typeint;
196 
197 static Genome_T transcriptomebits;
198 static Transcriptome_T transcriptome;
199 static Univ_IIT_T transcript_iit;
/* NOTE(review): remap_transcriptome_p is not assigned in the visible
   part of Stage3hr_setup, so it keeps this initializer unless it is
   set elsewhere -- confirm */
200 static bool remap_transcriptome_p = false;
201 
202 static IIT_T tally_iit;
203 static int *tally_divint_crosstable;
204 static IIT_T runlength_iit;
205 static int *runlength_divint_crosstable;
206 
207 static Chrpos_T pairmax_linear;
208 static Chrpos_T pairmax_circular;
209 
210 static Chrpos_T expected_pairlength;
211 static Chrpos_T pairlength_deviation;
212 
/* Derived in Stage3hr_setup: low is expected_pairlength minus
   pairlength_deviation (0 if the deviation is larger), high is
   expected_pairlength plus pairlength_deviation, and very_high is
   expected_pairlength plus 10 times pairlength_deviation */
213 static Chrpos_T expected_pairlength_low;
214 static Chrpos_T expected_pairlength_high;
215 static Chrpos_T expected_pairlength_very_high;
216 
217 static int localsplicing_penalty;
218 static int indel_penalty_middle;
219 static int antistranded_penalty;
220 static bool favor_multiexon_p;
221 
/* Not passed in: Stage3hr_setup sets this to the constant 8 */
222 static int ambig_end_interval;	/* For penalizing large ambiguous ends
223 				   in GMAP alignments, since such ends
224 				   should have been found */
225 
226 static Univcoord_T genomelength;
227 static bool *circularp;
228 static bool *altlocp;
229 static Univcoord_T *alias_starts;
230 static Univcoord_T *alias_ends;
231 
232 static char *failedinput_root;
233 static Outputtype_T output_type;
234 static bool merge_samechr_p;
235 static bool method_print_p = false;
236 
237 
238 /* Probably not good to use in certain genomic regions, unless we also
239    use known splicesites with distance information. */
240 /* But sometimes need to use to get correct mapping */
/* Not passed in: Stage3hr_setup sets this to false when
   distances_observed_p is true, and to true otherwise */
241 static bool favor_ambiguous_p;
242 
243 
244 void
Stage3hr_setup(bool transcriptomep_in,bool invert_first_p_in,bool invert_second_p_in,Genome_T genomecomp_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Univ_IIT_T chromosome_iit_in,Univcoord_T genomelength_in,int nchromosomes_in,int circular_typeint_in,Genome_T transcriptomebits_in,Transcriptome_T transcriptome_in,Univ_IIT_T transcript_iit_in,IIT_T tally_iit_in,int * tally_divint_crosstable_in,IIT_T runlength_iit_in,int * runlength_divint_crosstable_in,bool distances_observed_p,Chrpos_T pairmax_linear_in,Chrpos_T pairmax_circular_in,Chrpos_T expected_pairlength_in,Chrpos_T pairlength_deviation_in,int localsplicing_penalty_in,int indel_penalty_middle_in,int antistranded_penalty_in,bool favor_multiexon_p_in,int subopt_levels_in,bool * circularp_in,bool * altlocp_in,Univcoord_T * alias_starts_in,Univcoord_T * alias_ends_in,bool filter_within_trims_p_in,bool omit_concordant_uniq_p_in,bool omit_concordant_mult_p_in,char * failedinput_root_in,Outputtype_T output_type_in,bool merge_samechr_p_in,bool method_print_p_in,bool want_random_p_in)245 Stage3hr_setup (bool transcriptomep_in, bool invert_first_p_in, bool invert_second_p_in,
246 		Genome_T genomecomp_in, Genome_T genomebits_in, Genome_T genomebits_alt_in,
247 		Univ_IIT_T chromosome_iit_in, Univcoord_T genomelength_in, int nchromosomes_in, int circular_typeint_in,
248 
249 		Genome_T transcriptomebits_in, Transcriptome_T transcriptome_in, Univ_IIT_T transcript_iit_in,
250 
251 		IIT_T tally_iit_in, int *tally_divint_crosstable_in,
252 		IIT_T runlength_iit_in, int *runlength_divint_crosstable_in,
253 		bool distances_observed_p,
254 		Chrpos_T pairmax_linear_in, Chrpos_T pairmax_circular_in,
255 		Chrpos_T expected_pairlength_in, Chrpos_T pairlength_deviation_in,
256 		int localsplicing_penalty_in, int indel_penalty_middle_in,
257 		int antistranded_penalty_in, bool favor_multiexon_p_in, int subopt_levels_in,
258 		bool *circularp_in, bool *altlocp_in,
259 		Univcoord_T *alias_starts_in, Univcoord_T *alias_ends_in,
260 		bool filter_within_trims_p_in, bool omit_concordant_uniq_p_in, bool omit_concordant_mult_p_in,
261 		char *failedinput_root_in, Outputtype_T output_type_in, bool merge_samechr_p_in,
262 		bool method_print_p_in, bool want_random_p_in) {
263 
264   transcriptomep = transcriptomep_in;
265   invert_first_p = invert_first_p_in;
266   invert_second_p = invert_second_p_in;
267   genomecomp = genomecomp_in;
268   genomebits = genomebits_in;
269   genomebits_alt = genomebits_alt_in;
270 
271   chromosome_iit = chromosome_iit_in;
272   nchromosomes = nchromosomes_in;
273   circular_typeint = circular_typeint_in;
274 
275   transcriptomebits = transcriptomebits_in;
276   transcriptome = transcriptome_in;
277   transcript_iit = transcript_iit_in;
278 
279   tally_iit = tally_iit_in;
280   tally_divint_crosstable = tally_divint_crosstable_in;
281   runlength_iit = runlength_iit_in;
282   runlength_divint_crosstable = runlength_divint_crosstable_in;
283   localsplicing_penalty = localsplicing_penalty_in;
284   indel_penalty_middle = indel_penalty_middle_in;
285   antistranded_penalty = antistranded_penalty_in;
286   favor_multiexon_p = favor_multiexon_p_in;
287 
288   pairmax_linear = pairmax_linear_in;
289   pairmax_circular = pairmax_circular_in;
290   expected_pairlength = expected_pairlength_in;
291   pairlength_deviation = pairlength_deviation_in;
292 
293   if (pairlength_deviation > expected_pairlength) {
294     expected_pairlength_low = 0;
295   } else {
296     expected_pairlength_low = expected_pairlength - pairlength_deviation;
297   }
298   expected_pairlength_high = expected_pairlength + pairlength_deviation;
299   expected_pairlength_very_high = expected_pairlength + 10*pairlength_deviation;
300 
301   if (distances_observed_p == true) {
302     favor_ambiguous_p = false;
303   } else {
304     favor_ambiguous_p = true;
305   }
306 
307 #if 0
308   ambig_end_interval = index1part + (index1interval - 1);
309 #else
310   ambig_end_interval = 8;	/* Since GMAP uses 8-mers */
311 #endif
312 
313   subopt_levels = subopt_levels_in;
314 
315   genomelength = genomelength_in;
316   circularp = circularp_in;
317   altlocp = altlocp_in;
318   alias_starts = alias_starts_in;
319   alias_ends = alias_ends_in;
320 
321   failedinput_root = failedinput_root_in;
322 
323   filter_within_trims_p = filter_within_trims_p_in;
324   omit_concordant_uniq_p = omit_concordant_uniq_p_in;
325   omit_concordant_mult_p = omit_concordant_mult_p_in;
326 
327   output_type = output_type_in;
328   merge_samechr_p = merge_samechr_p_in;
329   method_print_p = method_print_p_in;
330   want_random_p = want_random_p_in;
331 
332   return;
333 }
334 
335 
336 
337 #define T Stage3end_T
338 
339 Hittype_T
Stage3end_hittype(T this)340 Stage3end_hittype (T this) {
341   return this->hittype;
342 }
343 
344 static char *
hittype_string(Hittype_T hittype)345 hittype_string (Hittype_T hittype) {
346   switch (hittype) {
347   case EXACT: return "exact";
348   case SUB: return "sub";
349   case HALFSPLICE_DONOR: return "donor";
350   case HALFSPLICE_ACCEPTOR: return "acceptor";
351   case SPLICE: return "splice";
352   case SAMECHR_SPLICE: return "samechr_splice";
353   case TRANSLOC_SPLICE: return "transloc_splice";
354   case SUBSTRINGS: return "substrings";
355   default: abort();
356   }
357 }
358 
359 char *
Stage3end_hittype_string(T this)360 Stage3end_hittype_string (T this) {
361   return hittype_string(this->hittype);
362 }
363 
364 Method_T
Stage3end_method(T this)365 Stage3end_method (T this) {
366   return this->method;
367 }
368 
369 
370 int
Stage3end_genestrand(T this)371 Stage3end_genestrand (T this) {
372   return this->genestrand;
373 }
374 
375 bool
Stage3end_transcriptomep(T this)376 Stage3end_transcriptomep (T this) {
377   if (this == NULL) {
378     /* Can happen if we call upon a mate in a halfmapping */
379     return false;
380   } else if (this->method == TR) {
381     return true;
382   } else {
383     return false;
384   }
385 }
386 
387 List_T
Stage3end_transcripts(T this)388 Stage3end_transcripts (T this) {
389   return this->transcripts;
390 }
391 
392 void
Stage3end_set_transcripts(T this,List_T transcripts)393 Stage3end_set_transcripts (T this, List_T transcripts) {
394   List_free(&this->transcripts);
395   this->transcripts = transcripts;
396   return;
397 }
398 
399 List_T
Stage3end_transcripts_other(T this)400 Stage3end_transcripts_other (T this) {
401   return this->transcripts_other;
402 }
403 
404 
405 #if 0
406 void
407 Stage3end_transfer_transcripts (T dest, List_T sources) {
408   List_T p, q;
409   T source;
410   Transcript_T transcript;
411 
412   for (p = sources; p != NULL; p = List_next(p)) {
413     source = (T) List_head(p);
414     debug2(printf("Transferring %d transcripts from %s to %s\n",
415 		  List_length(source->transcripts),hittype_string(source->hittype),hittype_string(dest->hittype)));
416     for (q = source->transcripts; q != NULL; q = List_next(q)) {
417       transcript = (Transcript_T) List_head(q);
418       if (Transcript_in_list_p(transcript,dest->transcripts) == true) {
419 	Transcript_free(&transcript);
420       } else {
421 	printf("Pushing onto transcripts %p,",dest->transcripts);
422 	dest->transcripts = List_push(dest->transcripts,(void *) transcript);
423 	printf(" now %p\n",dest->transcripts);
424       }
425     }
426     List_free(&source->transcripts);
427     debug2(Transcript_print_nums(dest->transcripts));
428     debug2(printf("\n"));
429 
430     Stage3end_free(&source);
431   }
432 
433   return;
434 }
435 #endif
436 
437 #if 0
438 static void
439 Stage3end_transfer_transcripts_other (T dest, List_T sources) {
440   List_T p, q;
441   T source;
442   Transcript_T transcript;
443 
444   for (p = sources; p != NULL; p = List_next(p)) {
445     source = (T) List_head(p);
446     for (q = source->transcripts; q != NULL; q = List_next(q)) {
447       transcript = (Transcript_T) List_head(q);
448       if (Transcript_in_list_p(transcript,dest->transcripts_other) == true) {
449 	Transcript_free(&transcript);
450       } else {
451 	printf("Pushing onto transcripts %p,",dest->transcripts);
452 	dest->transcripts_other = List_push(dest->transcripts_other,(void *) transcript);
453 	printf(" now %p\n",dest->transcripts);
454       }
455     }
456     List_free(&source->transcripts);
457     Stage3end_free(&source);
458   }
459 
460   return;
461 }
462 #endif
463 
464 
465 static void
Stage3end_transfer_transcripts_one(T dest,T source)466 Stage3end_transfer_transcripts_one (T dest, T source) {
467   List_T q;
468   Transcript_T transcript;
469 
470 #ifdef DEBUG2
471   printf("Transferring %d transcripts from %s to %s\n",
472 	 List_length(source->transcripts),hittype_string(source->hittype),hittype_string(dest->hittype));
473 
474   printf("Before:\n");
475   printf("Dest: "); Transcript_print_nums(dest->transcripts); printf("\n");
476   printf("Source: "); Transcript_print_nums(source->transcripts); printf("\n");
477 #endif
478 
479   for (q = source->transcripts; q != NULL; q = List_next(q)) {
480     transcript = (Transcript_T) List_head(q);
481     if (Transcript_in_list_p(transcript,dest->transcripts) == true) {
482       Transcript_free(&transcript);
483     } else {
484       dest->transcripts = List_push(dest->transcripts,(void *) transcript);
485     }
486   }
487   List_free(&source->transcripts);
488 
489   for (q = source->transcripts_other; q != NULL; q = List_next(q)) {
490     transcript = (Transcript_T) List_head(q);
491     if (Transcript_in_list_p(transcript,dest->transcripts_other) == true) {
492       Transcript_free(&transcript);
493     } else {
494       dest->transcripts_other = List_push(dest->transcripts_other,(void *) transcript);
495     }
496   }
497   List_free(&source->transcripts_other);
498 
499 #ifdef DEBUG2
500   printf("After:\n");
501   printf("Dest: "); Transcript_print_nums(dest->transcripts); printf("\n");
502   /* Source lists will be empty */
503 #endif
504 
505   return;
506 }
507 
508 static void
Stage3pair_transfer_transcripts_one(Stage3pair_T dest,Stage3pair_T source)509 Stage3pair_transfer_transcripts_one (Stage3pair_T dest, Stage3pair_T source) {
510 
511   Stage3end_transfer_transcripts_one(dest->hit5,source->hit5);
512   Stage3end_transfer_transcripts_one(dest->hit3,source->hit3);
513 
514   return;
515 }
516 
517 
518 bool
Stage3end_distant_splice_p(T this)519 Stage3end_distant_splice_p (T this) {
520   if (this->distant_splice_p == true) {
521     return true;
522   } else {
523     return false;
524   }
525 }
526 
527 
528 Chrnum_T
Stage3end_chrnum(T this)529 Stage3end_chrnum (T this) {
530   if (this == NULL) {
531     /* Can happen if we call upon a mate in a halfmapping */
532     return 0;
533   } else {
534     return this->chrnum;
535   }
536 }
537 
538 Chrnum_T
Stage3end_effective_chrnum(T this)539 Stage3end_effective_chrnum (T this) {
540   if (this == NULL) {
541     /* Can happen if we call upon a mate in a halfmapping */
542     return 0;
543   } else {
544     return this->effective_chrnum;
545   }
546 }
547 
548 Chrnum_T
Stage3end_other_chrnum(T this)549 Stage3end_other_chrnum (T this) {
550   if (this == NULL) {
551     /* Can happen if we call upon a mate in a halfmapping */
552     return 0;
553   } else {
554     return this->other_chrnum;
555   }
556 }
557 
558 Univcoord_T
Stage3end_chroffset(T this)559 Stage3end_chroffset (T this) {
560   return this->chroffset;
561 }
562 
563 Univcoord_T
Stage3end_chrhigh(T this)564 Stage3end_chrhigh (T this) {
565   return this->chrhigh;
566 }
567 
568 Chrpos_T
Stage3end_chrlength(T this)569 Stage3end_chrlength (T this) {
570   if (this == NULL) {
571     /* Can happen if we call upon a mate in a halfmapping */
572     return 0;
573   } else {
574     return this->chrlength;
575   }
576 }
577 
578 Chrpos_T
Stage3end_chrpos_low(T this)579 Stage3end_chrpos_low (T this) {
580   return this->low - this->chroffset;
581 }
582 
583 Chrpos_T
Stage3end_chrpos_high(T this)584 Stage3end_chrpos_high (T this) {
585   return this->high - this->chroffset;
586 }
587 
588 
589 Univcoord_T
Stage3end_genomicstart(T this)590 Stage3end_genomicstart (T this) {
591   return this->genomicstart;
592 }
593 
594 Univcoord_T
Stage3end_genomicend(T this)595 Stage3end_genomicend (T this) {
596   return this->genomicend;
597 }
598 
599 #if 0
600 /* For Goby */
601 int
602 Stage3end_query_alignment_length (T this) {
603   int length = 0;
604   List_T p;
605   Substring_T substring;
606   Junction_T junction;
607 
608   for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
609     substring = (Substring_T) List_head(p);
610     length += Substring_match_length(substring);
611   }
612   for (p = this->junctions_LtoH; p != NULL; p = List_next(p)) {
613     junction = (Junction_T) List_head(p);
614     if (Junction_type(junction) == INS_JUNCTION) {
615       length += Junction_nindels(junction);
616     }
617   }
618 
619   return length;
620 }
621 #endif
622 
623 
624 #if 0
625 Chrpos_T
626 Stage3end_genomic_alignment_length (T this) {
627   Chrpos_T length = 0;
628   List_T p;
629   Substring_T substring;
630   Junction_T junction;
631 
632   for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
633     substring = (Substring_T) List_head(p);
634     length += Substring_genomic_alignment_length(substring);
635   }
636   for (p = this->junctions_LtoH; p != NULL; p = List_next(p)) {
637     junction = (Junction_T) List_head(p);
638     if (Junction_type(junction) == DEL_JUNCTION) {
639       length += (Chrpos_T) Junction_nindels(junction);
640     }
641   }
642 
643   return length;
644 }
645 #endif
646 
647 
648 #if 0
649 static Substring_T
650 find_substring_low (T this) {
651   Substring_T substring_low;
652   List_T substrings_LtoH, p;
653 
654   if (this->plusp == true) {
655     substrings_LtoH = this->substrings_1toN;
656   } else {
657     substrings_LtoH = this->substrings_Nto1;
658   }
659 
660   p = substrings_LtoH;
661   substring_low = (Substring_T) List_head(p);
662   if (Substring_has_alts_p(substring_low) == true) {
663     p = List_next(p);
664     substring_low = (Substring_T) List_head(p);
665   }
666 
667   return substring_low;
668 }
669 #endif
670 
671 
672 #if 0
673 static Substring_T
674 find_substring_high (T this) {
675   Substring_T substring_high;
676   List_T substrings_HtoL, p;
677 
678   if (this->plusp == true) {
679     substrings_HtoL = this->substrings_Nto1;
680   } else {
681     substrings_HtoL = this->substrings_1toN;
682   }
683 
684   p = substrings_HtoL;
685   substring_high = (Substring_T) List_head(p);
686   if (Substring_has_alts_p(substring_high) == true) {
687     p = List_next(p);
688     substring_high = (Substring_T) List_head(p);
689   }
690 
691   return substring_high;
692 }
693 #endif
694 
695 
696 int
Stage3end_mapq_score(T this)697 Stage3end_mapq_score (T this) {
698   return this->mapq_score;
699 }
700 
701 int
Stage3end_absmq_score(T this)702 Stage3end_absmq_score (T this) {
703   return this->absmq_score;
704 }
705 
706 int
Stage3end_nmismatches_bothdiff(T this)707 Stage3end_nmismatches_bothdiff (T this) {
708   return this->nmismatches_bothdiff;
709 }
710 
711 int
Stage3end_nmismatches_refdiff(T this)712 Stage3end_nmismatches_refdiff (T this) {
713   return this->nmismatches_refdiff;
714 }
715 
716 
717 #if 0
718 Endtype_T
719 Stage3end_start_endtype (T this) {
720   Substring_T substring;
721 
722   if (this->plusp == true) {
723     substring = (Substring_T) List_head(this->substrings_1toN);
724   } else {
725     substring = (Substring_T) List_head(this->substrings_Nto1);
726   }
727   return Substring_start_endtype(substring);
728 }
729 #endif
730 
731 #if 0
732 Endtype_T
733 Stage3end_end_endtype (T this) {
734   Substring_T substring;
735 
736   if (this->plusp == true) {
737     substring = (Substring_T) List_head(this->substrings_Nto1);
738   } else {
739     substring = (Substring_T) List_head(this->substrings_1toN);
740   }
741   return Substring_end_endtype(substring);
742 }
743 #endif
744 
745 int
Stage3end_nindels(T this)746 Stage3end_nindels (T this) {
747   return this->nindels;
748 }
749 
750 int
Stage3end_querylength(T this)751 Stage3end_querylength (T this) {
752   return this->querylength;
753 }
754 
755 bool
Stage3end_plusp(T this)756 Stage3end_plusp (T this) {
757   return this->plusp;
758 }
759 
760 bool
Stage3end_paired_usedp(T this)761 Stage3end_paired_usedp (T this) {
762   return this->paired_usedp;
763 }
764 
765 int
Stage3end_max_trim(T this)766 Stage3end_max_trim (T this) {
767   if (this->trim_querystart > this->trim_queryend) {
768     return this->trim_querystart;
769   } else {
770     return this->trim_queryend;
771   }
772 }
773 
774 
775 static int
start_amb_length(T this)776 start_amb_length (T this) {
777   return Substring_start_amb_length((Substring_T) List_head(this->substrings_1toN));
778 }
779 
780 static int
end_amb_length(T this)781 end_amb_length (T this) {
782   return Substring_end_amb_length((Substring_T) List_head(this->substrings_Nto1));
783 }
784 
785 #if 0
786 static int
787 n_amb_ends (T this) {
788   int n = 0;
789 
790   if (start_amb_length(this) > 0) {
791     n++;
792   }
793   if (end_amb_length(this) > 0) {
794     n++;
795   }
796 
797   return n;
798 }
799 #endif
800 
801 
#ifdef DEBUG8
/* Sum of the start and end ambiguous lengths.  Compiled only when
   DEBUG8 is defined. */
static int
amb_length (T this) {
  return start_amb_length(this) + end_amb_length(this);
}
#endif
809 
810 
811 #if 0
812 /* Two types of ambiguity: known amb (mapped to >1 genomic place) and unknown amb (splice site seen) */
813 static bool
814 known_ambiguous_p (T this) {
815   if (Substring_ambiguous_p((Substring_T) List_head(this->substrings_1toN))) {
816     return true;
817   } else if (Substring_ambiguous_p((Substring_T) List_head(this->substrings_Nto1))) {
818     return true;
819   } else {
820     return false;
821   }
822 }
823 #endif
824 
825 
826 /* Includes amb and non-amb */
827 int
Stage3end_total_trim(T this)828 Stage3end_total_trim (T this) {
829   return this->trim_querystart + this->trim_queryend;
830 }
831 
832 
833 int
Stage3end_circularpos(T this)834 Stage3end_circularpos (T this) {
835   return this->circularpos;
836 }
837 
838 
839 Junction_T
Stage3end_junctionD(T this)840 Stage3end_junctionD (T this) {
841   if (this->sensedir == SENSE_ANTI) {
842     return (Junction_T) List_head(this->junctions_Nto1);
843   } else {
844     return (Junction_T) List_head(this->junctions_1toN);
845   }
846 }
847 
848 Junction_T
Stage3end_junctionA(T this)849 Stage3end_junctionA (T this) {
850   if (this->sensedir == SENSE_ANTI) {
851     return (Junction_T) List_head(this->junctions_1toN);
852   } else {
853     return (Junction_T) List_head(this->junctions_Nto1);
854   }
855 }
856 
857 List_T
Stage3end_substrings_LtoH(T this)858 Stage3end_substrings_LtoH (T this) {
859   if (this->plusp == true) {
860     return this->substrings_1toN;
861   } else {
862     return this->substrings_Nto1;
863   }
864 }
865 
866 List_T
Stage3end_junctions_LtoH(T this)867 Stage3end_junctions_LtoH (T this) {
868   if (this->plusp == true) {
869     return this->junctions_1toN;
870   } else {
871     return this->junctions_Nto1;
872   }
873 }
874 
875 
876 /* Called only by samprint currently */
877 Substring_T
Stage3end_substring1(T this)878 Stage3end_substring1 (T this) {
879   return (Substring_T) List_head(this->substrings_1toN);
880 }
881 
882 /* Called only by samprint currently */
883 Substring_T
Stage3end_substringN(T this)884 Stage3end_substringN (T this) {
885   return (Substring_T) List_head(this->substrings_Nto1);
886 }
887 
888 
889 Substring_T
Stage3end_substring_for_concordance(T this,bool first_read_p)890 Stage3end_substring_for_concordance (T this, bool first_read_p) {
891   if (first_read_p == true) {
892     return (Substring_T) List_head(this->substrings_Nto1);
893   } else {
894     return (Substring_T) List_head(this->substrings_1toN);
895   }
896 }
897 
898 Substring_T
Stage3end_substring_other(T this,bool first_read_p)899 Stage3end_substring_other (T this, bool first_read_p) {
900   if (first_read_p == true) {
901     return (Substring_T) List_head(this->substrings_1toN);
902   } else {
903     return (Substring_T) List_head(this->substrings_Nto1);
904   }
905 }
906 
907 
908 bool
Stage3end_donor_concordant_p(T this,bool first_read_p)909 Stage3end_donor_concordant_p (T this, bool first_read_p) {
910   if (this->sensedir != SENSE_ANTI) {
911     if (first_read_p == true) {
912       return false;
913     } else {
914       return true;
915     }
916   } else {
917     if (first_read_p == true) {
918       return true;
919     } else {
920       return false;
921     }
922   }
923 }
924 
925 
926 Substring_T
Stage3end_substring_donor(T this)927 Stage3end_substring_donor (T this) {
928   if (this->sensedir == SENSE_ANTI) {
929     return (Substring_T) List_head(this->substrings_Nto1);
930   } else if (this->sensedir == SENSE_FORWARD) {
931     return (Substring_T) List_head(this->substrings_1toN);
932   } else {
933     fprintf(stderr,"sensedir is SENSE_NULL in Stage3end_substring_donor\n");
934     abort();
935   }
936 }
937 
938 Substring_T
Stage3end_substring_acceptor(T this)939 Stage3end_substring_acceptor (T this) {
940   if (this->sensedir == SENSE_ANTI) {
941     return (Substring_T) List_head(this->substrings_1toN);
942   } else if (this->sensedir == SENSE_FORWARD) {
943     return (Substring_T) List_head(this->substrings_Nto1);
944   } else {
945     fprintf(stderr,"sensedir is SENSE_NULL in Stage3end_substring_acceptor\n");
946     abort();
947   }
948 }
949 
950 /* Now same as Stage3end_substring_donor */
951 Substring_T
Stage3end_substringD(T this)952 Stage3end_substringD (T this) {
953   if (this->sensedir == SENSE_ANTI) {
954     return (Substring_T) List_head(this->substrings_Nto1);
955   } else {
956     return (Substring_T) List_head(this->substrings_1toN);
957   }
958 }
959 
960 /* Now same as Stage3end_substring_acceptor */
961 Substring_T
Stage3end_substringA(T this)962 Stage3end_substringA (T this) {
963   if (this->sensedir == SENSE_ANTI) {
964     return (Substring_T) List_head(this->substrings_1toN);
965   } else {
966     return (Substring_T) List_head(this->substrings_Nto1);
967   }
968 }
969 
970 
971 Substring_T
Stage3end_substringS(T this)972 Stage3end_substringS (T this) {
973   return (Substring_T) List_head(List_next(this->substrings_1toN));
974 }
975 
976 
977 
978 /* Same logic as in print_substrings in samprint.c to get the first substring for CIGAR or MD string */
979 Substring_T
Stage3end_substring_low(T this,int hardclip_low)980 Stage3end_substring_low (T this, int hardclip_low) {
981   List_T p;
982 
983   debug15(printf("Entered Stage3end_substring_low\n"));
984 
985   if (this == NULL) {
986     return (Substring_T) NULL;
987 
988   } else if (this->plusp == true) {
989     p = this->substrings_1toN;  /* substrings_LtoH */
990     if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
991       p = List_next(p);
992     }
993     while (p != NULL && Substring_queryend((Substring_T) List_head(p)) <= hardclip_low) {
994       debug15(printf("Plus: Skipping substring %d..%d against hardclip_low %d\n",
995 		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
996 		     hardclip_low));
997       p = List_next(p);
998     }
999 
1000     if (p == NULL) {
1001       return (Substring_T) NULL;
1002     } else {
1003       debug15(printf("Plus: Returning substring %d..%d against hardclip_low %d\n",
1004 		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1005 		     hardclip_low));
1006       return (Substring_T) List_head(p);
1007     }
1008 
1009   } else {
1010 #ifdef DEBUG15
1011     for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
1012       printf("LtoH: %d..%d\n",
1013 	     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)));
1014     }
1015 #endif
1016 
1017     p = this->substrings_Nto1;  /* substrings_LtoH */
1018     if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
1019       p = List_next(p);
1020     }
1021 
1022     while (p != NULL && Substring_querystart((Substring_T) List_head(p)) >= this->querylength - hardclip_low) {
1023       debug15(printf("Minus: Skipping substring %d..%d against %d = querylength %d - hardclip_low %d\n",
1024 		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1025 		     this->querylength - hardclip_low,this->querylength,hardclip_low));
1026       p = List_next(p);
1027     }
1028 
1029     if (p == NULL) {
1030       return (Substring_T) NULL;
1031     } else {
1032       debug15(printf("Minus: Returning substring %d..%d against %d = querylength %d - hardclip_low %d\n",
1033 		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1034 		     this->querylength - hardclip_low,this->querylength,hardclip_low));
1035       return (Substring_T) List_head(p);
1036     }
1037   }
1038 }
1039 
1040 
#if 0
/* Disabled (compiled out by #if 0). */
/* Modified from Stage3end_substring_low */
/* Walks substrings_HtoL, first skipping a leading substring for which
   Substring_has_alts_p is true, and returns the first substring not
   excluded by HARDCLIP_HIGH; returns NULL if THIS is NULL or every
   substring is skipped. */
Substring_T
Stage3end_substring_high (T this, int hardclip_high) {
  List_T p;

  debug15(printf("Entered Stage3end_substring_high\n"));

  if (this == NULL) {
    return (Substring_T) NULL;

  } else if (this->plusp == true) {
    p = this->substrings_HtoL;
    if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
      p = List_next(p);
    }

    /* Plus: skip substrings whose querystart is at or beyond querylength - hardclip_high */
    while (p != NULL && Substring_querystart((Substring_T) List_head(p)) >= this->querylength - hardclip_high) {
      debug15(printf("Plus: Skipping substring %d..%d against %d = querylength %d - hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     this->querylength - hardclip_high,this->querylength,hardclip_high));
      p = List_next(p);
    }

    if (p == NULL) {
      return (Substring_T) NULL;
    } else {
      debug15(printf("Plus: Returning substring %d..%d against %d = querylength %d - hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     this->querylength - hardclip_high,this->querylength,hardclip_high));
      return (Substring_T) List_head(p);
    }

  } else {
#ifdef DEBUG15
    for (p = this->substrings_HtoL; p != NULL; p = List_next(p)) {
      printf("HtoL: %d..%d\n",
	     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)));
    }
#endif

    p = this->substrings_HtoL;
    if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
      p = List_next(p);
    }

    /* Minus: skip substrings whose queryend is at or below hardclip_high */
    while (p != NULL && Substring_queryend((Substring_T) List_head(p)) <= hardclip_high) {
      debug15(printf("Minus: Skipping substring %d..%d against hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     hardclip_high));
      p = List_next(p);
    }

    if (p == NULL) {
      return (Substring_T) NULL;
    } else {
      debug15(printf("Minus: Returning substring %d..%d against hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     hardclip_high));
      return (Substring_T) List_head(p);
    }
  }
}
#endif
1105 
1106 
1107 
1108 Substring_T
Stage3end_substring_containing(T this,int querypos)1109 Stage3end_substring_containing (T this, int querypos) {
1110   Substring_T substring;
1111   List_T substrings_LtoH, p;
1112 
1113   if (this->plusp == true) {
1114     substrings_LtoH = this->substrings_1toN;
1115   } else {
1116     substrings_LtoH = this->substrings_Nto1;
1117   }
1118 
1119   for (p = substrings_LtoH; p != NULL; p = List_next(p)) {
1120     substring = (Substring_T) List_head(p);
1121     if (Substring_contains_p(substring,querypos) == true) {
1122       return substring;
1123     }
1124   }
1125 
1126   return (Substring_T) NULL;
1127 }
1128 
1129 
1130 double
Stage3end_min_evalue(T this)1131 Stage3end_min_evalue (T this) {
1132   double min_evalue = 1000.0, evalue;
1133   Substring_T substring;
1134   List_T p;
1135 
1136   for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
1137     substring = (Substring_T) List_head(p);
1138     if ((evalue = Substring_evalue(substring)) < min_evalue) {
1139       min_evalue = evalue;
1140     }
1141   }
1142 
1143   return min_evalue;
1144 }
1145 
1146 
1147 double
Stage3end_chimera_prob(T this)1148 Stage3end_chimera_prob (T this) {
1149   List_T p;
1150   Junction_T junction;
1151 
1152   for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
1153     junction = (Junction_T) List_head(p);
1154     if (Junction_type(junction) == CHIMERA_JUNCTION) {
1155       return Junction_prob(junction);
1156     }
1157   }
1158 
1159   return 0.0;
1160 }
1161 
1162 static double
Stage3end_prob(T this)1163 Stage3end_prob (T this) {
1164   double prob = 0.0;
1165   List_T p;
1166   Junction_T junction;
1167 
1168   for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
1169     junction = (Junction_T) List_head(p);
1170     prob += Junction_prob(junction);
1171   }
1172 
1173   return prob;
1174 }
1175 
1176 
1177 /* Should eventually look for substrings adjacent to the chimeric junction */
1178 Univcoord_T
Stage3end_chimera_segmenti_left(T this)1179 Stage3end_chimera_segmenti_left (T this) {
1180   Univcoord_T x_segmenti, x_segmentj;
1181   Substring_T substring_donor, substring_acceptor;
1182 
1183   if (this->sensedir == SENSE_ANTI) {
1184     substring_donor = (Substring_T) List_head(this->substrings_Nto1);
1185     substring_acceptor = (Substring_T) List_head(this->substrings_1toN);
1186   } else {
1187     substring_donor = (Substring_T) List_head(this->substrings_1toN);
1188     substring_acceptor = (Substring_T) List_head(this->substrings_Nto1);
1189   }
1190 
1191   x_segmenti = Substring_left_genomicseg(substring_donor);
1192   x_segmentj = Substring_left_genomicseg(substring_acceptor);
1193   if (x_segmenti < x_segmentj) {
1194     return x_segmenti;
1195   } else {
1196     return x_segmentj;
1197   }
1198 }
1199 
1200 /* Should eventually look for substrings adjacent to the chimeric junction */
1201 Univcoord_T
Stage3end_chimera_segmentj_left(T this)1202 Stage3end_chimera_segmentj_left (T this) {
1203   Univcoord_T x_segmenti, x_segmentj;
1204   Substring_T substring_donor, substring_acceptor;
1205 
1206   if (this->sensedir == SENSE_ANTI) {
1207     substring_donor = (Substring_T) List_head(this->substrings_Nto1);
1208     substring_acceptor = (Substring_T) List_head(this->substrings_1toN);
1209   } else {
1210     substring_donor = (Substring_T) List_head(this->substrings_1toN);
1211     substring_acceptor = (Substring_T) List_head(this->substrings_Nto1);
1212   }
1213 
1214   x_segmenti = Substring_left_genomicseg(substring_donor);
1215   x_segmentj = Substring_left_genomicseg(substring_acceptor);
1216   if (x_segmenti > x_segmentj) {
1217     return x_segmenti;
1218   } else {
1219     return x_segmentj;
1220   }
1221 }
1222 
1223 
1224 int
Stage3end_chimera_segmenti_cmp(const void * a,const void * b)1225 Stage3end_chimera_segmenti_cmp (const void *a, const void *b) {
1226   T x = * (T *) a;
1227   T y = * (T *) b;
1228   Univcoord_T x_segmenti, x_segmentj, y_segmenti, y_segmentj, temp;
1229   Substring_T x_substring_donor, x_substring_acceptor,
1230     y_substring_donor, y_substring_acceptor;
1231 
1232   if (x->sensedir == SENSE_ANTI) {
1233     x_substring_donor = (Substring_T) List_head(x->substrings_Nto1);
1234     x_substring_acceptor = (Substring_T) List_head(x->substrings_1toN);
1235   } else {
1236     x_substring_donor = (Substring_T) List_head(x->substrings_1toN);
1237     x_substring_acceptor = (Substring_T) List_head(x->substrings_Nto1);
1238   }
1239 
1240   if (y->sensedir == SENSE_ANTI) {
1241     y_substring_donor = (Substring_T) List_head(y->substrings_Nto1);
1242     y_substring_acceptor = (Substring_T) List_head(y->substrings_1toN);
1243   } else {
1244     y_substring_donor = (Substring_T) List_head(y->substrings_1toN);
1245     y_substring_acceptor = (Substring_T) List_head(y->substrings_Nto1);
1246   }
1247 
1248   x_segmenti = Substring_left_genomicseg(x_substring_donor);
1249   x_segmentj = Substring_left_genomicseg(x_substring_acceptor);
1250   if (x_segmentj < x_segmenti) {
1251     temp = x_segmentj;
1252     x_segmentj = x_segmenti;
1253     x_segmenti = temp;
1254   }
1255 
1256   y_segmenti = Substring_left_genomicseg(y_substring_donor);
1257   y_segmentj = Substring_left_genomicseg(y_substring_acceptor);
1258   if (y_segmentj < y_segmenti) {
1259     temp = y_segmentj;
1260     y_segmentj = y_segmenti;
1261     y_segmenti = temp;
1262   }
1263 
1264   if (x_segmenti < y_segmenti) {
1265     return -1;
1266   } else if (y_segmenti < x_segmenti) {
1267     return +1;
1268   } else if (x_segmentj > y_segmentj) {
1269     return -1;
1270   } else if (y_segmentj > x_segmentj) {
1271     return +1;
1272   } else {
1273     return 0;
1274   }
1275 }
1276 
1277 
1278 
1279 int
Stage3end_chimera_segmentj_cmp(const void * a,const void * b)1280 Stage3end_chimera_segmentj_cmp (const void *a, const void *b) {
1281   T x = * (T *) a;
1282   T y = * (T *) b;
1283   Univcoord_T x_segmenti, x_segmentj, y_segmenti, y_segmentj, temp;
1284   Substring_T x_substring_donor, x_substring_acceptor,
1285     y_substring_donor, y_substring_acceptor;
1286 
1287   if (x->sensedir == SENSE_ANTI) {
1288     x_substring_donor = (Substring_T) List_head(x->substrings_Nto1);
1289     x_substring_acceptor = (Substring_T) List_head(x->substrings_1toN);
1290   } else {
1291     x_substring_donor = (Substring_T) List_head(x->substrings_1toN);
1292     x_substring_acceptor = (Substring_T) List_head(x->substrings_Nto1);
1293   }
1294 
1295   if (y->sensedir == SENSE_ANTI) {
1296     y_substring_donor = (Substring_T) List_head(y->substrings_Nto1);
1297     y_substring_acceptor = (Substring_T) List_head(y->substrings_1toN);
1298   } else {
1299     y_substring_donor = (Substring_T) List_head(y->substrings_1toN);
1300     y_substring_acceptor = (Substring_T) List_head(y->substrings_Nto1);
1301   }
1302 
1303 
1304   x_segmenti = Substring_left_genomicseg(x_substring_donor);
1305   x_segmentj = Substring_left_genomicseg(x_substring_acceptor);
1306   if (x_segmentj < x_segmenti) {
1307     temp = x_segmentj;
1308     x_segmentj = x_segmenti;
1309     x_segmenti = temp;
1310   }
1311 
1312   y_segmenti = Substring_left_genomicseg(y_substring_donor);
1313   y_segmentj = Substring_left_genomicseg(y_substring_acceptor);
1314   if (y_segmentj < y_segmenti) {
1315     temp = y_segmentj;
1316     y_segmentj = y_segmenti;
1317     y_segmenti = temp;
1318   }
1319 
1320   if (x_segmentj < y_segmentj) {
1321     return -1;
1322   } else if (y_segmentj < x_segmentj) {
1323     return +1;
1324   } else if (x_segmenti > y_segmenti) {
1325     return -1;
1326   } else if (y_segmenti > x_segmenti) {
1327     return +1;
1328   } else {
1329     return 0;
1330   }
1331 }
1332 
1333 
1334 int
Stage3end_sensedir(T this)1335 Stage3end_sensedir (T this) {
1336   if (this == NULL) {
1337     /* Can happen if we call upon a mate in a halfmapping */
1338     return SENSE_NULL;
1339   } else {
1340     return this->sensedir;
1341   }
1342 }
1343 
#if 0
/* Disabled (compiled out by #if 0). */
/* Maps sensedir to +1 for SENSE_FORWARD and -1 for SENSE_ANTI; returns
   SENSE_NULL when THIS is NULL or sensedir is anything else. */
int
Stage3end_cdna_direction (T this) {
  if (this == NULL) {
    return SENSE_NULL;
  } else if (this->sensedir == SENSE_FORWARD) {
    return +1;
  } else if (this->sensedir == SENSE_ANTI) {
    return -1;
  } else {
    return SENSE_NULL;
  }
}
#endif
1358 
#if 0
/* Disabled (compiled out by #if 0). */
/* Returns Substring_ambiguous_p for the head of substrings_1toN. */
bool
Stage3end_start_ambiguous_p (T this) {
  Substring_T substring;

  substring = (Substring_T) List_head(this->substrings_1toN);
  return Substring_ambiguous_p(substring);
}
#endif
1368 
#if 0
/* Disabled (compiled out by #if 0). */
/* Returns Substring_ambiguous_p for the head of substrings_Nto1. */
bool
Stage3end_end_ambiguous_p (T this) {
  Substring_T substring;

  substring = (Substring_T) List_head(this->substrings_Nto1);
  return Substring_ambiguous_p(substring);
}
#endif
1378 
1379 bool
Stage3end_start_has_alts_p(T this)1380 Stage3end_start_has_alts_p (T this) {
1381   Substring_T substring;
1382 
1383   substring = (Substring_T) List_head(this->substrings_1toN);
1384   return Substring_has_alts_p(substring);
1385 }
1386 
1387 bool
Stage3end_end_has_alts_p(T this)1388 Stage3end_end_has_alts_p (T this) {
1389   Substring_T substring;
1390 
1391   substring = (Substring_T) List_head(this->substrings_Nto1);
1392   return Substring_has_alts_p(substring);
1393 }
1394 
1395 
1396 Univcoord_T *
Stage3end_start_alts_coords(T this)1397 Stage3end_start_alts_coords (T this) {
1398   Substring_T substring;
1399 
1400   substring = (Substring_T) List_head(this->substrings_1toN);
1401   if (Substring_has_alts_p(substring) == false) {
1402     return (Univcoord_T *) NULL;
1403   } else {
1404     return Substring_alts_coords(substring);
1405   }
1406 }
1407 
1408 Univcoord_T *
Stage3end_end_alts_coords(T this)1409 Stage3end_end_alts_coords (T this) {
1410   Substring_T substring;
1411 
1412   substring = (Substring_T) List_head(this->substrings_Nto1);
1413   if (Substring_has_alts_p(substring) == false) {
1414     return (Univcoord_T *) NULL;
1415   } else {
1416     return Substring_alts_coords(substring);
1417   }
1418 }
1419 
1420 int
Stage3end_start_alts_ncoords(T this)1421 Stage3end_start_alts_ncoords (T this) {
1422   Substring_T substring;
1423 
1424   substring = (Substring_T) List_head(this->substrings_1toN);
1425   if (Substring_has_alts_p(substring) == false) {
1426     return 0;
1427   } else {
1428     return Substring_alts_ncoords(substring);
1429   }
1430 }
1431 
1432 int
Stage3end_end_alts_ncoords(T this)1433 Stage3end_end_alts_ncoords (T this) {
1434   Substring_T substring;
1435 
1436   substring = (Substring_T) List_head(this->substrings_Nto1);
1437   if (Substring_has_alts_p(substring) == false) {
1438     return 0;
1439   } else {
1440     return Substring_alts_ncoords(substring);
1441   }
1442 }
1443 
1444 
1445 int
Stage3end_substrings_querystart(T this)1446 Stage3end_substrings_querystart (T this) {
1447   Substring_T substring;
1448 
1449   substring = (Substring_T) List_head(this->substrings_1toN);
1450   return Substring_querystart(substring);
1451 }
1452 
1453 int
Stage3end_substrings_queryend(T this)1454 Stage3end_substrings_queryend (T this) {
1455   Substring_T substring;
1456 
1457   substring = (Substring_T) List_head(this->substrings_Nto1);
1458   return Substring_queryend(substring);
1459 }
1460 
1461 
/* Returns the sum of the trim_querystart and trim_queryend fields of THIS. */
int
Stage3end_trimlength (T this) {
  return this->trim_querystart + this->trim_queryend;
}
1466 
1467 
1468 void
Stage3end_count_hits(int * npaths_primary,int * npaths_altloc,List_T hits)1469 Stage3end_count_hits (int *npaths_primary, int *npaths_altloc, List_T hits) {
1470   T hit;
1471 
1472   *npaths_primary = *npaths_altloc = 0;
1473 
1474   while (hits != NULL) {
1475     hit = (T) List_head(hits);
1476     if (altlocp[hit->chrnum] == true) {
1477       *npaths_altloc += 1;
1478     } else {
1479       *npaths_primary += 1;
1480     }
1481     hits = List_next(hits);
1482   }
1483 
1484   return;
1485 }
1486 
#if 0
/* Disabled (compiled out by #if 0). */
/* Returns the sum of Substring_tally over all substrings in substrings_1toN. */
static long int
Stage3end_compute_tally (T this) {
  long int tally = 0L;
  List_T p;
  Substring_T substring;

  for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
    substring = (Substring_T) List_head(p);
    tally += Substring_tally(substring,tally_iit,tally_divint_crosstable);
  }

  return tally;
}
#endif
1502 
#if 0
/* Disabled (compiled out by #if 0). */
/* Returns true if Substring_runlength_p is true for any substring in
   substrings_1toN, and false otherwise. */
static bool
Stage3end_runlength_p (T this) {
  List_T p;
  Substring_T substring;

  for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
    substring = (Substring_T) List_head(p);
    if (Substring_runlength_p(substring,runlength_iit,runlength_divint_crosstable) == true) {
      return true;
    }
  }

  return false;
}
#endif
1519 
1520 
1521 void
Stage3end_free(T * old)1522 Stage3end_free (T *old) {
1523   List_T p;
1524   Substring_T substring;
1525   Junction_T junction;
1526 
1527 
1528   if (*old != NULL) {
1529     debug0(printf("Freeing Stage3end %p from method %s\n",*old,Method_string((*old)->method)));
1530 
1531     if ((*old)->transcripts_other != NULL) {
1532       Transcript_gc(&(*old)->transcripts_other);
1533     }
1534     if ((*old)->transcripts != NULL) {
1535       Transcript_gc(&(*old)->transcripts);
1536     }
1537 
1538     for (p = (*old)->substrings_1toN; p != NULL; p = List_next(p)) {
1539       substring = (Substring_T) List_head(p);
1540       Substring_free(&substring);
1541     }
1542     /* List_free(&(*old)->substrings_1toN); -- allocated by Listpool_push */
1543     /* List_free(&(*old)->substrings_Nto1); -- allocated by Listpool_push */
1544     /* List_free(&(*old)->substrings_LtoH); -- allocated by Listpool_push */
1545     /* List_free(&(*old)->substrings_HtoL); -- allocated by Listpool_push */
1546 
1547     for (p = (*old)->junctions_1toN; p != NULL; p = List_next(p)) {
1548       junction = (Junction_T) List_head(p);
1549       Junction_free(&junction);
1550     }
1551     /* List_free(&(*old)->junctions_1toN); -- allocated by Listpool_push */
1552     /* List_free(&(*old)->junctions_Nto1); -- allocated by Listpool_push */
1553     /* List_free(&(*old)->junctions_LtoH); -- allocated by Listpool_push */
1554     /* List_free(&(*old)->junctions_HtoL); */
1555 
1556     FREE_OUT(*old);
1557   }
1558 
1559   return;
1560 }
1561 
1562 
1563 /* Used for freeing list contents in Concordance_pair_up procedures */
1564 /* Do not free the list itself, though, which was previously freed in
1565    stage1hr.c, and now allocated by Hitlistpool_T */
1566 void
Stage3end_gc(List_T values)1567 Stage3end_gc (List_T values) {
1568   List_T p;
1569   T hit;
1570 
1571   for (p = values; p != NULL; p = p->rest) {
1572     if ((hit = (T) p->first) != NULL) {
1573       Stage3end_free(&hit);
1574     }
1575   }
1576   Hitlist_free(&values);
1577   return;
1578 }
1579 
1580 
1581 
1582 bool
Stage3pair_distant_splice_p(Stage3pair_T this)1583 Stage3pair_distant_splice_p (Stage3pair_T this) {
1584   if (this->hit5 != NULL && this->hit5->distant_splice_p == true) {
1585     return true;
1586   } else if (this->hit3 != NULL && this->hit3->distant_splice_p == true) {
1587     return true;
1588   } else {
1589     return false;
1590   }
1591 }
1592 
1593 
/* Returns the genestrand field of the pair. */
int
Stage3pair_genestrand (Stage3pair_T this) {
  return this->genestrand;
}
1598 
/* Returns the hit5 field (the 5' end) of the pair. */
Stage3end_T
Stage3pair_hit5 (Stage3pair_T this) {
  return this->hit5;
}
1603 
/* Returns the hit3 field (the 3' end) of the pair. */
Stage3end_T
Stage3pair_hit3 (Stage3pair_T this) {
  return this->hit3;
}
1608 
/* Returns the mapq_score field of the pair. */
int
Stage3pair_mapq_score (Stage3pair_T this) {
  return this->mapq_score;
}
1613 
/* Returns the absmq_score field of the pair. */
int
Stage3pair_absmq_score (Stage3pair_T this) {
  return this->absmq_score;
}
1618 
/* Returns the transcripts list of hit5.  Dereferences hit5 without a
   NULL check, so the caller must ensure hit5 is non-NULL. */
List_T
Stage3pair_transcripts5 (Stage3pair_T this) {
  return this->hit5->transcripts;
}
1623 
/* Returns the transcripts list of hit3.  Dereferences hit3 without a
   NULL check, so the caller must ensure hit3 is non-NULL. */
List_T
Stage3pair_transcripts3 (Stage3pair_T this) {
  return this->hit3->transcripts;
}
1628 
/* Returns the insertlength field of the pair (note that the accessor is
   named "pairlength" but reads insertlength). */
Chrpos_T
Stage3pair_pairlength (Stage3pair_T this) {
  return this->insertlength;
}
1633 
/* Returns the pair_relationship field of the pair. */
int
Stage3pair_relationship (Stage3pair_T this) {
  return this->pair_relationship;
}
1638 
1639 int
Stage3pair_total_trim(Stage3pair_T this)1640 Stage3pair_total_trim (Stage3pair_T this) {
1641   return Stage3end_total_trim(this->hit5) + Stage3end_total_trim(this->hit3);
1642 }
1643 
1644 int
Stage3pair_max_trim(Stage3pair_T this)1645 Stage3pair_max_trim (Stage3pair_T this) {
1646   int trim5, trim3;
1647   T hit;
1648 
1649 #if 0
1650   /* Don't want ambiguous ends for purpose of defining concordant terminals */
1651   trim5 = Stage3end_total_trim(this->hit5);
1652   trim3 = Stage3end_total_trim(this->hit3);
1653 #else
1654   hit = this->hit5;
1655   trim5 = hit->trim_querystart + hit->trim_queryend;
1656   hit = this->hit3;
1657   trim3 = hit->trim_querystart + hit->trim_queryend;
1658 #endif
1659 
1660   if (trim5 > trim3) {
1661     return trim5;
1662   } else {
1663     return trim3;
1664   }
1665 }
1666 
1667 int
Stage3pair_nmatches_to_trims(int * nmatches5,int * nmatches3,Stage3pair_T this)1668 Stage3pair_nmatches_to_trims (int *nmatches5, int *nmatches3, Stage3pair_T this) {
1669   *nmatches5 = this->hit5->refalt_nmatches_to_trims;
1670   *nmatches3 = this->hit3->refalt_nmatches_to_trims;
1671   return (*nmatches5) + (*nmatches3);
1672 }
1673 
1674 int
Stage3pair_ref_nmatches_to_trims(int * nmatches5,int * nmatches3,Stage3pair_T this)1675 Stage3pair_ref_nmatches_to_trims (int *nmatches5, int *nmatches3, Stage3pair_T this) {
1676   *nmatches5 = this->hit5->ref_nmatches_to_trims;
1677   *nmatches3 = this->hit3->ref_nmatches_to_trims;
1678   return (*nmatches5) + (*nmatches3);
1679 }
1680 
1681 
1682 bool
Stage3pair_concordantp(List_T hitpairs)1683 Stage3pair_concordantp (List_T hitpairs) {
1684   List_T p;
1685   Stage3pair_T hitpair;
1686 
1687   for (p = hitpairs; p != NULL; p = List_next(p)) {
1688     hitpair = (Stage3pair_T) List_head(p);
1689 #if 0
1690     /* Not necessary, since we are getting the result after GMAP align pair */
1691     if (Stage3_determine_pairtype(hitpair->hit5,hitpair->hit3,hitpair) == CONCORDANT) {
1692       return true;
1693     }
1694 #else
1695     if (hitpair->pairtype == CONCORDANT) {
1696       return true;
1697     }
1698 #endif
1699   }
1700   return false;
1701 }
1702 
1703 void
Stage3pair_count_hits(int * npaths_primary,int * npaths_altloc,List_T hitpairs)1704 Stage3pair_count_hits (int *npaths_primary, int *npaths_altloc, List_T hitpairs) {
1705   Stage3pair_T hitpair;
1706 
1707   *npaths_primary = *npaths_altloc = 0;
1708 
1709   while (hitpairs != NULL) {
1710     hitpair = (Stage3pair_T) List_head(hitpairs);
1711     if (altlocp[hitpair->hit5->chrnum] == true) {
1712       *npaths_altloc += 1;
1713     } else if (altlocp[hitpair->hit3->chrnum] == true) {
1714       *npaths_altloc += 1;
1715     } else {
1716       *npaths_primary += 1;
1717     }
1718     hitpairs = List_next(hitpairs);
1719   }
1720 
1721   return;
1722 }
1723 
1724 List_T
Stage3pair_filter_nonconcordant(List_T hitpairs,Hitlistpool_T hitlistpool)1725 Stage3pair_filter_nonconcordant (List_T hitpairs, Hitlistpool_T hitlistpool) {
1726   List_T filtered = NULL, p;
1727   Stage3pair_T hitpair;
1728 
1729   for (p = hitpairs; p != NULL; p = List_next(p)) {
1730     hitpair = (Stage3pair_T) List_head(p);
1731     if (hitpair->pairtype != CONCORDANT) {
1732       Stage3pair_free(&hitpair);
1733     } else {
1734       filtered = Hitlist_push(filtered,hitlistpool,(void *) hitpair);
1735     }
1736   }
1737   Hitlist_free(&hitpairs);
1738   return filtered;
1739 }
1740 
1741 
1742 /* Returns true if ilengths are valid */
1743 static bool
find_ilengths(int * ilength_low,int * ilength_high,Stage3end_T hit,Univcoord_T common_genomicpos)1744 find_ilengths (int *ilength_low, int *ilength_high, Stage3end_T hit, Univcoord_T common_genomicpos) {
1745   List_T p, q;
1746   Substring_T substring;
1747   Junction_T junction;
1748 
1749 
1750   debug15(printf("Finding ilengths for common_genomicpos %u\n",(Chrpos_T) (common_genomicpos - chroffset)));
1751   if (hit->plusp == true) {
1752 #ifdef DEBUG15
1753     printf("plus.  Checking common genomicpos %llu against\n",common_genomicpos - hit->chroffset);
1754     for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
1755       substring = (Substring_T) List_head(p);
1756       printf("substring %p: %u..%u, trim %d..%d\n",
1757 	     substring,Substring_alignstart_trim(substring) - hit->chroffset,
1758 	     Substring_alignend_trim(substring) - 1U - hit->chroffset,
1759 	     Substring_trim_querystart(substring),Substring_trim_queryend(substring));
1760     }
1761     printf("\n");
1762 #endif
1763     /* Plus: Subtract 1 from alignend */
1764     *ilength_low = 0;
1765     for (p = hit->substrings_1toN, q = hit->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
1766       substring = (Substring_T) List_head(p);
1767       debug15(printf("substring %p: %u..%u, trim %d..%d\n",substring,
1768 		     Substring_alignstart_trim(substring) - hit->chroffset,
1769 		     Substring_alignend_trim(substring) - 1U - hit->chroffset,
1770 		     Substring_trim_querystart(substring),Substring_trim_queryend(substring)));
1771       if (Substring_overlap_point_trimmed_p(substring,common_genomicpos) == false) {
1772 	*ilength_low += Substring_genomic_alignment_length(substring);
1773 	if (q != NULL) {
1774 	  junction = (Junction_T) List_head(q);
1775 	  if (Junction_type(junction) == INS_JUNCTION) {
1776 	    *ilength_low += Junction_nindels(junction);
1777 	  }
1778 	}
1779 
1780       } else {
1781 	*ilength_low += (common_genomicpos - Substring_alignstart_trim(substring) + 1);
1782 	*ilength_high = ((Substring_alignend_trim(substring) - 1) - common_genomicpos + 1);
1783 	p = List_next(p);
1784 	while (p != NULL) {
1785 	  substring = (Substring_T) List_head(p);
1786 	  *ilength_high += Substring_genomic_alignment_length(substring);
1787 	  p = List_next(p);
1788 	}
1789 	while (q != NULL) {
1790 	  junction = (Junction_T) List_head(q);
1791 	  if (Junction_type(junction) == INS_JUNCTION) {
1792 	    *ilength_high += Junction_nindels(junction);
1793 	  }
1794 	  q = List_next(q);
1795 	}
1796 	debug15(printf("Plus: Have ilength_low %d and ilength_high %d\n",*ilength_low,*ilength_high));
1797 	return true;
1798       }
1799     }
1800   } else {
1801 #ifdef DEBUG15
1802     printf("minus.  Checking common genomicpos %llu against\n",common_genomicpos - hit->chroffset);
1803     for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
1804       substring = (Substring_T) List_head(p);
1805       printf("substring %p: %u..%u, trim %d..%d\n",
1806 	     substring,Substring_alignstart_trim(substring) - hit->chroffset,
1807 	     Substring_alignend_trim(substring) - 1U - hit->chroffset,
1808 	     Substring_trim_querystart(substring),Substring_trim_queryend(substring));
1809     }
1810     printf("\n");
1811 #endif
1812     /* Minus: Subtract 1 from alignstart */
1813     *ilength_high = 0;
1814     for (p = hit->substrings_1toN, q = hit->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
1815       substring = (Substring_T) List_head(p);
1816       debug15(printf("substring: %u..%u\n",
1817 		     Substring_alignstart_trim(substring) - 1U - hit->chroffset,
1818 		     Substring_alignend_trim(substring) - hit->chroffset));
1819       if (Substring_overlap_point_trimmed_p(substring,common_genomicpos) == false) {
1820 	*ilength_high += Substring_genomic_alignment_length(substring);
1821 	if (q != NULL) {
1822 	  junction = (Junction_T) List_head(q);
1823 	  if (Junction_type(junction) == INS_JUNCTION) {
1824 	    *ilength_high += Junction_nindels(junction);
1825 	  }
1826 	}
1827 
1828       } else {
1829 	*ilength_high += ((Substring_alignstart_trim(substring) - 1) - common_genomicpos + 1);
1830 	*ilength_low = (common_genomicpos - (Substring_alignend_trim(substring) /*+ 1*/) + 1);
1831 	p = List_next(p);
1832 	while (p != NULL) {
1833 	  substring = (Substring_T) List_head(p);
1834 	  *ilength_low += Substring_genomic_alignment_length(substring);
1835 	  p = List_next(p);
1836 	}
1837 	while (q != NULL) {
1838 	  junction = (Junction_T) List_head(q);
1839 	  if (Junction_type(junction) == INS_JUNCTION) {
1840 	    *ilength_low += Junction_nindels(junction);
1841 	  }
1842 	  q = List_next(q);
1843 	}
1844 	debug15(printf("Minus: Have ilength_low %d and ilength_high %d\n",*ilength_low,*ilength_high));
1845 	return true;
1846       }
1847     }
1848   }
1849 
1850   return false;
1851 }
1852 
1853 
1854 
1855 /* Needed to compute overlap properly.  Based on pair_insert_length below, plus code for handling GMAP. */
1856 static Univcoord_T
pair_common_genomicpos(Stage3end_T hit5,Stage3end_T hit3)1857 pair_common_genomicpos (Stage3end_T hit5, Stage3end_T hit3) {
1858   Univcoord_T common_genomicpos;
1859   Univcoord_T start5, end5, start3, end3;
1860   List_T p, q;
1861   Substring_T substring, substring5, substring3;
1862 
1863   if (hit5->plusp == true && hit3->plusp == true) {
1864     /* plus/plus */
1865     debug15(printf("Computing overlap using substrings plus/plus\n"));
1866 
1867     start5 = hit5->genomicstart + hit5->trim_querystart + start_amb_length(hit5);
1868     end5 = (hit5->genomicend - 1) - hit5->trim_queryend - end_amb_length(hit5);
1869     start3 = hit3->genomicstart + hit3->trim_querystart + start_amb_length(hit3);
1870     end3 = (hit3->genomicend - 1) - hit3->trim_queryend - end_amb_length(hit3);
1871     debug15(printf("hit5 endpoints are %u..%u.  hit3 endpoints are %u..%u\n",
1872 		   start5-hit5->chroffset,end5-hit5->chroffset,start3-hit3->chroffset,end3-hit3->chroffset));
1873 
1874     if (end3 < start5) {
1875       /* Case 1 */
1876       return false;
1877     } else if (end5 < start3) {
1878       /* Case 6 */
1879       return false;
1880     } else if (start3 < start5) {
1881       if (end3 < end5) {
1882 	/* Case 2: Tails overlap.  Go from start5 to end3 */
1883 	debug15(printf("plus/plus case 2a: start5 %u\n",start5 - hit5->chroffset));
1884 	for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
1885 	  substring = (Substring_T) List_head(p);
1886 	  if (Substring_overlap_point_trimmed_p(substring,start5)) {
1887 	    return start5;
1888 	  }
1889 	}
1890 
1891 	/* Case 2: Tails overlap.  Go from start5 to end3 */
1892 	debug15(printf("plus/plus case 2b: end3 %u\n",end3 - hit3->chroffset));
1893 	for (p = hit5->substrings_Nto1; p != NULL; p = List_next(p)) {
1894 	  substring = (Substring_T) List_head(p);
1895 	  if (Substring_overlap_point_trimmed_p(substring,end3)) {
1896 	    return end3;
1897 	  }
1898 	}
1899 	/* Fall through to general algorithm */
1900 
1901       } else {
1902 	/* Case 3: hit3 subsumes hit5 */
1903 	debug15(printf("plus/plus case 3\n"));
1904 	for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
1905 	  substring = (Substring_T) List_head(p);
1906 	  if (Substring_overlap_point_trimmed_p(substring,end5)) {
1907 	    return end5;
1908 	  }
1909 	}
1910 	/* Fall through to general algorithm */
1911       }
1912 
1913     } else {
1914       if (end3 < end5) {
1915 	/* Case 4: hit5 subsumes hit3 */
1916 	debug15(printf("plus/plus case 4\n"));
1917 	for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
1918 	  substring = (Substring_T) List_head(p);
1919 	  if (Substring_overlap_point_trimmed_p(substring,start3)) {
1920 	    return start3;
1921 	  }
1922 	}
1923 	/* Fall through to general algorithm */
1924 
1925       } else {
1926 	/* Case 5: Based on hit3_trimmed_length */
1927 	debug15(printf("plus/plus case 5a\n"));
1928 	for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
1929 	  substring = (Substring_T) List_head(p);
1930 	  if (Substring_overlap_point_trimmed_p(substring,start3)) {
1931 	    return start3;
1932 	  }
1933 	}
1934 
1935 	/* Case 5: Based on hit5_trimmed_length */
1936 	debug15(printf("plus/plus case 5b\n"));
1937 	for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
1938 	  substring = (Substring_T) List_head(p);
1939 	  if (Substring_overlap_point_trimmed_p(substring,end5)) {
1940 	    return end5;
1941 	  }
1942 	}
1943 	/* Fall through to general algorithm */
1944       }
1945     }
1946 
1947     /* General algorithm */
1948     debug15(printf("plus/plus general\n"));
1949     for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
1950       substring3 = (Substring_T) List_head(p);
1951       for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
1952 	substring5 = (Substring_T) List_head(q);
1953 	if ((common_genomicpos = Substring_overlap_segment_trimmed(substring5,substring3)) != 0) {
1954 	  return common_genomicpos;
1955 	}
1956       }
1957     }
1958 
1959     return 0;
1960 
1961   } else if (hit5->plusp == true && hit3->plusp == false) {
1962     /* plus/minus */
1963     debug15(printf("Computing overlap using substrings plus/minus\n"));
1964     return 0;
1965 
1966 #if 0
1967     start5 = hit5->genomicstart + hit5->trim_querystart + start_amb_length(hit5);
1968     end5 = hit5->genomicend - hit5->trim_queryend - end_amb_length(hit5);
1969     start3 = hit3->genomicstart - hit3->trim_querystart - start_amb_length(hit3);
1970     end3 = hit3->genomicend + hit3->trim_queryend + end_amb_length(hit3);
1971 
1972     if (start3 < start5) {
1973       /* Case 1 */
1974       return 0;
1975     } else if (end5 < end3) {
1976       /* Case 6 */
1977       return 0;
1978     } else if (end3 < start5) {
1979       if (start3 < end5) {
1980 	/* Case 2: Tails overlap.  Go from start5 to start3 */
1981 	debug15(printf("plus case 2a: start5 %u\n",start5 - hit5->chroffset));
1982 	if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
1983 	  return start5;
1984 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
1985 	  return start5;
1986 	} else if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
1987 	  return start5;
1988 	}
1989 
1990 	/* Case 2: Tails overlap.  Go from start5 to start3 */
1991 	debug15(printf("plus case 2b: start3 %u\n",start3 - hit3->chroffset));
1992 	if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
1993 	  return start3;
1994 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
1995 	  return start3;
1996 	} else if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
1997 	  return start3;
1998 	}
1999 	/* Fall through to general algorithm */
2000 
2001       } else {
2002 	/* Case 3: hit3 subsumes hit5 */
2003 	debug15(printf("plus case 3\n"));
2004 	if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2005 	  return end5;
2006 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2007 	  return end5;
2008 	} else if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2009 	  return end5;
2010 	}
2011 	/* Fall through to general algorithm */
2012       }
2013 
2014     } else {
2015       if (start3 < end5) {
2016 	/* Case 4: hit5 subsumes hit3 */
2017 	debug15(printf("plus case 4\n"));
2018 	if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2019 	  return end3;
2020 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2021 	  return end3;
2022 	} else if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2023 	  return end3;
2024 	}
2025 	/* Fall through to general algorithm */
2026 
2027       } else {
2028 	/* Case 5: Based on hit3_trimmed_length */
2029 	debug15(printf("plus case 5a\n"));
2030 	if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2031 	  return end3;
2032 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2033 	  return end3;
2034 	} else if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2035 	  return end3;
2036 	}
2037 
2038 	/* Case 5: Based on hit5_trimmed_length */
2039 	debug15(printf("plus case 5b\n"));
2040 	if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2041 	  return end5;
2042 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2043 	  return end5;
2044 	} else if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2045 	  return end5;
2046 	}
2047 	/* Fall through to general algorithm */
2048       }
2049     }
2050 
2051     /* General algorithm */
2052     debug15(printf("plus general: hit3->substring1\n"));
2053     if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring1)) != 0) {
2054       return common_genomicpos;
2055     } else if (hit5->substring2 != NULL &&
2056 	       (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring1)) != 0) {
2057       return common_genomicpos;
2058     } else if (hit5->substring0 != NULL &&
2059 	       (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring1)) != 0) {
2060       return common_genomicpos;
2061     }
2062 
2063     if (hit3->substring2 != NULL) {
2064       debug15(printf("plus general: hit3->substring2\n"));
2065       if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring2)) != 0) {
2066 	return common_genomicpos;
2067       } else if (hit5->substring2 != NULL &&
2068 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring2)) != 0) {
2069 	return common_genomicpos;
2070       } else if (hit5->substring0 != NULL &&
2071 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring2)) != 0) {
2072 	return common_genomicpos;
2073       }
2074     }
2075 
2076     if (hit3->substring0 != NULL) {
2077       debug15(printf("plus general: hit3->substring0\n"));
2078       if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring0)) != 0) {
2079 	return common_genomicpos;
2080       } else if (hit5->substring2 != NULL &&
2081 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring0)) != 0) {
2082 	return common_genomicpos;
2083       } else if (hit5->substring0 != NULL &&
2084 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring0)) != 0) {
2085 	return common_genomicpos;
2086       }
2087     }
2088 
2089     return 0U;
2090 #endif
2091 
2092   } else if (hit5->plusp == false && hit3->plusp == true) {
2093     /* minus/plus */
2094     debug15(printf("Computing overlap using substrings minus/plus\n"));
2095     return 0;
2096 
2097 #if 0
2098     start5 = hit5->genomicstart - hit5->trim_querystart - start_amb_length(hit5);
2099     end5 = hit5->genomicend + hit5->trim_queryend + end_amb_length(hit5);
2100     start3 = hit3->genomicstart + hit3->trim_querystart + start_amb_length(hit3);
2101     end3 = hit3->genomicend - hit3->trim_queryend - end_amb_length(hit3);
2102 
2103     if (end3 < end5) {
2104       /* Case 1 */
2105       return 0;
2106     } else if (start5 < start3) {
2107       /* Case 6 */
2108       return 0;
2109     } else if (start3 < end5) {
2110       if (end3 < start5) {
2111 	/* Case 2: Tails overlap.  Go from end5 to end3 */
2112 	debug15(printf("plus case 2a: end5 %u\n",end5 - hit5->chroffset));
2113 	if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2114 	  return end5;
2115 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2116 	  return end5;
2117 	} else if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2118 	  return end5;
2119 	}
2120 
2121 	/* Case 2: Tails overlap.  Go from end5 to end3 */
2122 	debug15(printf("plus case 2b: end3 %u\n",end3 - hit3->chroffset));
2123 	if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2124 	  return end3;
2125 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2126 	  return end3;
2127 	} else if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2128 	  return end3;
2129 	}
2130 	/* Fall through to general algorithm */
2131 
2132       } else {
2133 	/* Case 3: hit3 subsumes hit5 */
2134 	debug15(printf("plus case 3\n"));
2135 	if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
2136 	  return start5;
2137 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
2138 	  return start5;
2139 	} else if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
2140 	  return start5;
2141 	}
2142 	/* Fall through to general algorithm */
2143       }
2144 
2145     } else {
2146       if (end3 < start5) {
2147 	/* Case 4: hit5 subsumes hit3 */
2148 	debug15(printf("plus case 4\n"));
2149 	if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
2150 	  return start3;
2151 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
2152 	  return start3;
2153 	} else if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
2154 	  return start3;
2155 	}
2156 	/* Fall through to general algorithm */
2157 
2158       } else {
2159 	/* Case 5: Based on hit3_trimmed_length */
2160 	debug15(printf("plus case 5a\n"));
2161 	if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
2162 	  return start3;
2163 	} else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
2164 	  return start3;
2165 	} else if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
2166 	  return start3;
2167 	}
2168 
2169 	/* Case 5: Based on hit5_trimmed_length */
2170 	debug15(printf("plus case 5b\n"));
2171 	if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
2172 	  return start5;
2173 	} else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
2174 	  return start5;
2175 	} else if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
2176 	  return start5;
2177 	}
2178 	/* Fall through to general algorithm */
2179       }
2180     }
2181 
2182     /* General algorithm */
2183     debug15(printf("plus general: hit3->substring1\n"));
2184     if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring1)) != 0) {
2185       return common_genomicpos;
2186     } else if (hit5->substring2 != NULL &&
2187 	       (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring1)) != 0) {
2188       return common_genomicpos;
2189     } else if (hit5->substring0 != NULL &&
2190 	       (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring1)) != 0) {
2191       return common_genomicpos;
2192     }
2193 
2194     if (hit3->substring2 != NULL) {
2195       debug15(printf("plus general: hit3->substring2\n"));
2196       if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring2)) != 0) {
2197 	return common_genomicpos;
2198       } else if (hit5->substring2 != NULL &&
2199 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring2)) != 0) {
2200 	return common_genomicpos;
2201       } else if (hit5->substring0 != NULL &&
2202 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring2)) != 0) {
2203 	return common_genomicpos;
2204       }
2205     }
2206 
2207     if (hit3->substring0 != NULL) {
2208       debug15(printf("plus general: hit3->substring0\n"));
2209       if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring0)) != 0) {
2210 	return common_genomicpos;
2211       } else if (hit5->substring2 != NULL &&
2212 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring0)) != 0) {
2213 	return common_genomicpos;
2214       } else if (hit5->substring0 != NULL &&
2215 		 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring0)) != 0) {
2216 	return common_genomicpos;
2217       }
2218     }
2219 
2220     return 0;
2221 #endif
2222 
2223   } else if (hit5->plusp == false && hit3->plusp == false) {
2224     /* minus/minus */
2225     debug15(printf("Computing overlap using substrings minus/minus\n"));
2226 
2227     start5 = (hit5->genomicstart - 1) - hit5->trim_querystart /*- start_amb_length(hit5)*/;
2228     end5 = hit5->genomicend + hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2229     start3 = (hit3->genomicstart - 1) - hit3->trim_querystart /*- start_amb_length(hit3)*/;
2230     end3 = hit3->genomicend + hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2231     debug15(printf("hit5 endpoints are %u..%u.  hit3 endpoints are %u..%u\n",
2232 		   start5-hit5->chroffset,end5-hit5->chroffset,start3-hit3->chroffset,end3-hit3->chroffset));
2233 
2234     if (end3 > start5) {
2235       /* Case 1 */
2236       return 0;
2237     } else if (end5 > start3) {
2238       /* Case 6 */
2239       return 0;
2240     } else if (start3 > start5) {
2241       if (end3 > end5) {
2242 	/* Case 2: Tails overlap.  Go from start5 to end3 */
2243 	debug15(printf("minus/minus case 2a: start5 %llu (%u)\n",start5,start5 - hit5->chroffset));
2244 	for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2245 	  substring = (Substring_T) List_head(p);
2246 	  if (Substring_overlap_point_trimmed_p(substring,start5)) {
2247 	    return start5;
2248 	  }
2249 	}
2250 
2251 	/* Case 2: Tails overlap.  Go from start5 to end3 */
2252 	debug15(printf("plus case 2b: end3 %u\n",end3 - hit3->chroffset));
2253 	for (p = hit5->substrings_Nto1; p != NULL; p = List_next(p)) {
2254 	  substring = (Substring_T) List_head(p);
2255 	  if (Substring_overlap_point_trimmed_p(substring,end3)) {
2256 	    return end3;
2257 	  }
2258 	}
2259 	/* Fall through to general algorithm */
2260 
2261       } else {
2262 	/* Case 3: hit3 subsumes hit5 */
2263 	debug15(printf("minus/minus case 3: end5 %u\n",end5 - hit5->chroffset));
2264 	for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2265 	  substring = (Substring_T) List_head(p);
2266 	  if (Substring_overlap_point_trimmed_p(substring,end5)) {
2267 	    return end5;
2268 	  }
2269 	}
2270 
2271 	/* Fall through to general algorithm */
2272       }
2273 
2274     } else {
2275       if (end3 > end5) {
2276 	/* Case 4: hit5 subsumes hit3 */
2277 	debug15(printf("minus/minus case 4: start3 %u\n",(Chrpos_T) (start3 - hit3->chroffset)));
2278 	for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
2279 	  substring = (Substring_T) List_head(p);
2280 	  if (Substring_overlap_point_trimmed_p(substring,start3)) {
2281 	    return start3;
2282 	  }
2283 	}
2284 	/* Fall through to general algorithm */
2285 
2286       } else {
2287 	/* Case 5: Based on hit3_trimmed_length */
2288 	debug15(printf("minus case 5a: start3 %u\n",start3 - hit3->chroffset));
2289 	for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
2290 	  substring = (Substring_T) List_head(p);
2291 	  if (Substring_overlap_point_trimmed_p(substring,start3)) {
2292 	    return start3;
2293 	  }
2294 	}
2295 
2296 	/* Case 5: Based on hit5_trimmed_length */
2297 	debug15(printf("minus case 5b: end5 %u\n",end5 - hit5->chroffset));
2298 	for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
2299 	  substring = (Substring_T) List_head(p);
2300 	  if (Substring_overlap_point_trimmed_p(substring,end5)) {
2301 	    return end5;
2302 	  }
2303 	}
2304 	/* Fall through to general algorithm */
2305       }
2306     }
2307 
2308     /* General algorithm */
2309     debug15(printf("minus/minus general\n"));
2310     for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2311       substring3 = (Substring_T) List_head(p);
2312       for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
2313 	substring5 = (Substring_T) List_head(q);
2314 	if ((common_genomicpos = Substring_overlap_segment_trimmed(substring5,substring3)) != 0) {
2315 	  return common_genomicpos;
2316 	}
2317       }
2318     }
2319 
2320     return 0;
2321 
2322   } else {
2323     abort();
2324     return 0;
2325   }
2326 }
2327 
2328 
2329 static bool
test_hardclips(Univcoord_T * common_genomicpos,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2330 test_hardclips (Univcoord_T *common_genomicpos, int hardclip_low, Stage3end_T hit_low,
2331 		int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2332   Substring_T low_substring, high_substring;
2333   int low_querypos, high_querypos;
2334   int low_querylength, high_querylength;
2335   bool plusp;
2336 
2337   low_querylength = hit_low->querylength;
2338   high_querylength = hit_high->querylength;
2339 
2340   debug15(printf("Entering test_hardclips with hardclip_low %d, hardclip_high %d\n",
2341 		 hardclip_low,hardclip_high));
2342   debug15(printf("querylength_low %d, querylength_high %d\n",low_querylength,high_querylength));
2343 
2344   plusp = Stage3end_plusp(hit_low);
2345 
2346   if (plusp == true) {
2347     low_querypos = hardclip_low;
2348     high_querypos = high_querylength /*- 1*/ - hardclip_high;
2349     debug15(printf("Both substrings, plus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2350 
2351     if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2352       debug15(printf("Fails because low_querypos %d gives a NULL substring\n",low_querypos));
2353       return false;
2354     } else if (Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring) {
2355       debug15(printf("Fails because low_querypos %d - 1 gives substring %p\n",
2356 		     low_querypos,Stage3end_substring_containing(hit_low,low_querypos-1)));
2357       return false;
2358     } else if (Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring) {
2359       debug15(printf("Fails because low_querypos %d + 1 gives substring %p\n",
2360 		     low_querypos,Stage3end_substring_containing(hit_low,low_querypos+1)));
2361       return false;
2362     } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2363       debug15(printf("Fails because high_querypos %d gives a NULL substring\n",high_querypos));
2364       return false;
2365     } else if (Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring) {
2366       debug15(printf("Fails because high_querypos %d - 1 gives substring %p\n",
2367 		     high_querypos,Stage3end_substring_containing(hit_high,high_querypos-1)));
2368       return false;
2369     } else if (Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring) {
2370       debug15(printf("Fails because high_querypos %d + 1 gives substring %p\n",
2371 		     high_querypos,Stage3end_substring_containing(hit_high,high_querypos+1)));
2372       return false;
2373     } else if (Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset) {
2374       debug15(printf("Fails because low chrpos %u != high chrpos %u\n",
2375 		     Substring_genomicstart(low_substring) + low_querypos - chroffset,
2376 		     Substring_genomicstart(high_substring) + high_querypos - chroffset));
2377       return false;
2378     } else {
2379       *common_genomicpos = Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2380       debug15(printf("Succeeds with common point %u\n",*common_genomicpos - chroffset));
2381       return true;
2382     }
2383 
2384   } else {
2385     low_querypos = low_querylength /*- 1*/ - hardclip_low;
2386     high_querypos = hardclip_high;
2387     debug15(printf("Both substrings, minus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2388 
2389     if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2390       debug15(printf("Fails because low_querypos %d gives a NULL substring\n",low_querypos));
2391       return false;
2392     } else if (Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring) {
2393       debug15(printf("Fails because low_querypos %d - 1 gives substring %p\n",
2394 		     low_querypos,Stage3end_substring_containing(hit_low,low_querypos-1)));
2395       return false;
2396     } else if (Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring) {
2397       debug15(printf("Fails because low_querypos %d + 1 gives substring %p\n",
2398 		     low_querypos,Stage3end_substring_containing(hit_low,low_querypos+1)));
2399       return false;
2400     } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2401       debug15(printf("Fails because high_querypos %d gives a NULL substring\n",high_querypos));
2402       return false;
2403     } else if (Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring) {
2404       debug15(printf("Fails because high_querypos %d - 1 gives substring %p\n",
2405 		     high_querypos,Stage3end_substring_containing(hit_high,high_querypos-1)));
2406       return false;
2407     } else if (Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring) {
2408       debug15(printf("Fails because high_querypos %d + 1 gives substring %p\n",
2409 		     high_querypos,Stage3end_substring_containing(hit_high,high_querypos+1)));
2410       return false;
2411     }  else if ((Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset) {
2412       debug15(printf("Fails because low chrpos %u != high chrpos %u\n",
2413 		     (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset,
2414 		     (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset));
2415       return false;
2416     } else {
2417       *common_genomicpos = (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2418       debug15(printf("Succeeds with common point %u\n",*common_genomicpos - chroffset));
2419       return true;
2420     }
2421   }
2422 }
2423 
2424 
2425 
2426 /* Replaces adjust_hardclips in samprint.c */
2427 static Univcoord_T
adjust_hardclips_right(int * shift,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2428 adjust_hardclips_right (int *shift, int hardclip_low, Stage3end_T hit_low,
2429 			int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2430   Substring_T low_substring, high_substring;
2431   int low_querypos, high_querypos;
2432   int low_querylength, high_querylength;
2433   Chrpos_T low_chrpos, high_chrpos;
2434   bool plusp;
2435 
2436 
2437   low_querylength = hit_low->querylength;
2438   high_querylength = hit_high->querylength;
2439 
2440   debug15(printf("Entering adjust_hardclips_right with hardclip_low %d, hardclip_high %d\n",
2441 		 hardclip_low,hardclip_high));
2442   *shift = 1;			/* Making an initial move before each while loop */
2443   plusp = Stage3end_plusp(hit_low);
2444 
2445   if (plusp == true) {
2446     low_querypos = hardclip_low;
2447     high_querypos = high_querylength /*- 1*/ - hardclip_high;
2448     debug15(printf("Both substrings, plus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2449 
2450     low_querypos++;
2451     high_querypos++;
2452     debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2453     while ((low_querypos + 1) < low_querylength && (high_querypos + 1) < high_querylength &&
2454 	   ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2455 	    Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2456 	    Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2457 	    (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2458 	    Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2459 	    Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2460 	    Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset)) {
2461       (*shift) += 1;
2462       if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2463 	low_querypos++;
2464       } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2465 	high_querypos++;
2466       } else {
2467 	low_chrpos = Substring_genomicstart(low_substring) + low_querypos - chroffset;
2468 	high_chrpos = Substring_genomicstart(high_substring) + high_querypos - chroffset;
2469 	if (low_chrpos < high_chrpos) {
2470 	  debug15(printf("low_chrpos %u < high_chrpos %u, so advancing low_querypos\n",low_chrpos,high_chrpos));
2471 	  low_querypos++;
2472 	} else if (high_chrpos < low_chrpos) {
2473 	  debug15(printf("high_chrpos %u < low_chrpos %u, so advancing high_querypos\n",high_chrpos,low_chrpos));
2474 	  high_querypos++;
2475 	} else {
2476 	  low_querypos++;
2477 	  high_querypos++;
2478 	}
2479       }
2480       debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2481     }
2482 
2483     if ((low_querypos + 1) >= low_querylength ||
2484 	(high_querypos + 1) >= high_querylength ||
2485 	(low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2486 	Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2487       *shift = 0;
2488       return 0;
2489     } else {
2490       debug15(printf("Returning %u + %d\n",Substring_genomicstart(low_substring) - chroffset,
2491 		     low_querypos));
2492       assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2493       assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2494       assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2495       assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2496       assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2497       assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2498       return Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2499     }
2500 
2501   } else {
2502     low_querypos = low_querylength /*- 1*/ - hardclip_low;
2503     high_querypos = hardclip_high;
2504     debug15(printf("Both substrings, minus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2505 
2506     low_querypos--;
2507     high_querypos--;
2508     debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2509     while ((low_querypos - 1) >= 0 && (high_querypos - 1) >= 0 &&
2510 	   ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2511 	    Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2512 	    Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2513 	    (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2514 	    Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2515 	    Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2516 	    (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset)) {
2517       (*shift) += 1;
2518       if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2519 	low_querypos--;
2520       } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2521 	high_querypos--;
2522       } else {
2523 	low_chrpos = (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset;
2524 	high_chrpos = (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset;
2525 	if (low_chrpos < high_chrpos) {
2526 	  debug15(printf("low_chrpos %u < high_chrpos %u, so decreasing low_querypos\n",low_chrpos,high_chrpos));
2527 	  low_querypos--;
2528 	} else if (high_chrpos < low_chrpos) {
2529 	  debug15(printf("high_chrpos %u < low_chrpos %u, so decreasing high_querypos\n",high_chrpos,low_chrpos));
2530 	  high_querypos--;
2531 	} else {
2532 	  low_querypos--;
2533 	  high_querypos--;
2534 	}
2535       }
2536       debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2537     }
2538 
2539     if ((low_querypos - 1) < 0 ||
2540 	(high_querypos - 1) < 0 ||
2541 	(low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2542 	Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2543       *shift = 0;
2544       return 0;
2545     } else {
2546       debug15(printf("Returning %u - %d\n",Substring_genomicstart(low_substring) - chroffset,
2547 		     low_querypos));
2548       assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2549       assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2550       assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2551       assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2552       assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2553       assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2554       return (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2555     }
2556   }
2557 }
2558 
2559 
2560 /* Replaces adjust_hardclips in samprint.c */
2561 static Univcoord_T
adjust_hardclips_left(int * shift,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2562 adjust_hardclips_left (int *shift, int hardclip_low, Stage3end_T hit_low,
2563 		       int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2564   Substring_T low_substring, high_substring;
2565   int low_querypos, high_querypos;
2566   int low_querylength, high_querylength;
2567   Chrpos_T low_chrpos, high_chrpos;
2568   bool plusp;
2569 
2570 
2571   low_querylength = hit_low->querylength;
2572   high_querylength = hit_high->querylength;
2573 
2574   debug15(printf("Entering adjust_hardclips_left with hardclip_low %d, hardclip_high %d\n",
2575 		 hardclip_low,hardclip_high));
2576   *shift = 1;			/* Making an initial move before each while loop */
2577   plusp = Stage3end_plusp(hit_low);
2578 
2579   if (plusp == true) {
2580     low_querypos = hardclip_low;
2581     high_querypos = high_querylength /*- 1*/ - hardclip_high;
2582     debug15(printf("Both substrings, plus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2583 
2584     low_querypos--;
2585     high_querypos--;
2586     debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2587     while ((low_querypos - 1) >= 0 && (high_querypos - 1) >= 0 &&
2588 	   ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2589 	    Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2590 	    Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2591 	    (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2592 	    Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2593 	    Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2594 	    Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset)) {
2595       (*shift) += 1;
2596       if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2597 	low_querypos--;
2598       } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2599 	high_querypos--;
2600       } else {
2601 	low_chrpos = Substring_genomicstart(low_substring) + low_querypos - chroffset;
2602 	high_chrpos = Substring_genomicstart(high_substring) + high_querypos - chroffset;
2603 	if (low_chrpos > high_chrpos) {
2604 	  debug15(printf("low_chrpos %u > high_chrpos %u, so decreasing low_querypos\n",low_chrpos,high_chrpos));
2605 	  low_querypos--;
2606 	} else if (high_chrpos > low_chrpos) {
2607 	  debug15(printf("high_chrpos %u > low_chrpos %u, so decreasing high_querypos\n",high_chrpos,low_chrpos));
2608 	  high_querypos--;
2609 	} else {
2610 	  low_querypos--;
2611 	  high_querypos--;
2612 	}
2613       }
2614       debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2615     }
2616 
2617     if ((low_querypos - 1) < 0 || (high_querypos - 1) < 0 ||
2618 	(low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2619 	Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2620       *shift = 0;
2621       return 0;
2622     } else {
2623       debug15(printf("Returning %u + %d\n",Substring_genomicstart(low_substring) - chroffset,
2624 		     low_querypos));
2625       assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2626       assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2627       assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2628       assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2629       assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2630       assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2631       return Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2632     }
2633 
2634   } else {
2635     low_querypos = low_querylength /*- 1*/ - hardclip_low;
2636     high_querypos = hardclip_high;
2637     debug15(printf("Both substrings, minus.  low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2638 
2639     low_querypos++;
2640     high_querypos++;
2641     debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2642     while ((low_querypos + 1) < low_querylength && (high_querypos + 1) < high_querylength &&
2643 	   ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2644 	    Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2645 	    Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2646 	    (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2647 	    Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2648 	    Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2649 	    (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset)) {
2650       (*shift) += 1;
2651       if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2652 	low_querypos++;
2653       } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2654 	high_querypos++;
2655       } else {
2656 	low_chrpos = (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset;
2657 	high_chrpos = (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset;
2658 	if (low_chrpos > high_chrpos) {
2659 	  debug15(printf("low_chrpos %u > high_chrpos %u, so advancing low_querypos\n",low_chrpos,high_chrpos));
2660 	  low_querypos++;
2661 	} else if (high_chrpos > low_chrpos) {
2662 	  debug15(printf("high_chrpos %u > low_chrpos %u, so advancing high_querypos\n",high_chrpos,low_chrpos));
2663 	  high_querypos++;
2664 	} else {
2665 	  low_querypos++;
2666 	  high_querypos++;
2667 	}
2668       }
2669       debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2670     }
2671 
2672     if ((low_querypos + 1) >= low_querylength || (high_querypos + 1) >= high_querylength ||
2673 	(low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2674 	Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2675       *shift = 0;
2676       return 0;
2677     } else {
2678       debug15(printf("Returning %u - %d\n",Substring_genomicstart(low_substring) - chroffset,
2679 		     low_querypos));
2680       assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2681       assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2682       assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2683       assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2684       assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2685       assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2686       return (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2687     }
2688   }
2689 }
2690 
2691 
2692 
2693 /* Note: Do not alter this->insertlength, which is used for SAM
2694    output.  The insertlength computed here is used only for performing
2695    --clip-overlap or --merge-overlap */
2696 int
Stage3pair_overlap(int * hardclip5_low,int * hardclip5_high,int * hardclip3_low,int * hardclip3_high,Stage3pair_T this)2697 Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low, int *hardclip3_high, Stage3pair_T this) {
2698   Stage3end_T hit5, hit3;
2699   int clipdir;
2700   int ilength53, ilength35, ilength5_low, ilength5_high, ilength3_low, ilength3_high;
2701   int common_shift, common_left, common_right;
2702   Univcoord_T common_genomicpos, common_genomicpos_right, common_genomicpos_left;
2703   int shift_right, shift_left;
2704 #ifdef DEBUG15
2705   int overlap;
2706 #endif
2707 
2708 
2709   *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2710 
2711   hit5 = this->hit5;
2712   hit3 = this->hit3;
2713 
2714   debug15(printf("Entered Stage3pair_overlap with hittype %s and %s\n",
2715 		 hittype_string(hit5->hittype),hittype_string(hit3->hittype)));
2716   if (hit5->hittype == SAMECHR_SPLICE || hit5->hittype == TRANSLOC_SPLICE) {
2717     return 0;
2718   } else if (hit3->hittype == SAMECHR_SPLICE || hit3->hittype == TRANSLOC_SPLICE) {
2719     return 0;
2720   } else if (hit5->plusp != hit3->plusp) {
2721     debug15(printf("The two ends are not on the same strand, so returning 0\n"));
2722     return 0;
2723   } else {
2724     debug15(printf("hit5 trim_querystart %d + amb_start %d, trim_queryend %d + amb_end %d, hit3 trim_querystart %d + amb_start %d, trim_queryend %d + amb_end %d\n",
2725 		   hit5->trim_querystart,start_amb_length(hit5),hit5->trim_queryend,end_amb_length(hit5),
2726 		   hit3->trim_querystart,start_amb_length(hit3),hit3->trim_queryend,end_amb_length(hit3)));
2727     if (hit5->plusp == true) {
2728       /* plus */
2729 #if 0
2730       hit5_trimmed_length = hit5->querylength - hit5->trim_querystart - hit5->trim_queryend - start_amb_length(hit5) - end_amb_length(hit5);
2731       hit3_trimmed_length = hit3->querylength - hit3->trim_querystart - hit3->trim_queryend - start_amb_length(hit3) - end_amb_length(hit3);
2732       totallength = hit5_trimmed_length + hit3_trimmed_length;
2733       debug15(printf("totallength = %d, hit5 trimmed length = %d, hit3 trimmed length = %d\n",
2734 		     totallength,hit5_trimmed_length,hit3_trimmed_length));
2735       debug15(printf("original insertlength: %d, trim+amb5: %d..%d, trim+amb3: %d..%d\n",
2736 		     this->insertlength,hit5->trim_querystart + start_amb_length(hit5),
2737 		     hit5->trim_queryend + end_amb_length(hit5),hit3->trim_querystart + start_amb_length(hit3),
2738 		     hit3->trim_queryend + end_amb_length(hit3)));
2739 #endif
2740 
2741       if ((common_genomicpos = pair_common_genomicpos(hit5,hit3)) == 0) {
2742 	debug15(printf("Cannot determine a common point, so returning 0\n"));
2743 	return 0;
2744 
2745       } else if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2746 		 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2747 	debug15(printf("Cannot determine ilengths, so returning 0\n"));
2748 	return 0;
2749 
2750       } else {
2751 	debug15(printf("Inclusive: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2752 	debug15(printf("ilength53 is %d, ilength 35 is %d\n",ilength5_low + ilength3_high - 1,ilength3_low + ilength5_high - 1));
2753 
2754 	common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
2755 	common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
2756 	if (common_right > common_left) {
2757 	  common_shift = common_right/2 - (common_left - 1)/2;
2758 	  debug15(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
2759 			 common_shift,common_right,common_left));
2760 	  assert(ilength5_low > 0);
2761 	  assert(ilength3_low > 0);
2762 	  ilength5_low -= 1;
2763 	  ilength3_low -= 1;
2764 	} else {
2765 	  common_shift = (common_right - 1)/2 - common_left/2;
2766 	  debug15(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
2767 			 common_shift,common_right,common_left));
2768 	  assert(ilength5_high > 0);
2769 	  assert(ilength3_high > 0);
2770 	  ilength5_high -= 1;
2771 	  ilength3_high -= 1;
2772 	}
2773 	debug15(printf("Exclusive: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2774 
2775 
2776 	if ((ilength53 = ilength5_low + ilength3_high) >= (ilength35 = ilength3_low + ilength5_high)) {
2777 	  /* Use >=, not >, so we favor clipping heads over clipping tails in case of a tie */
2778 	  debug15(printf("plus, ilength53 is longer.  Clipping heads.\n"));
2779 	  debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
2780 			 common_left+common_right-1,common_left,common_right));
2781 	  clipdir = +1;
2782 
2783 	  /* Want to clip 5 high and 3 low */
2784 	  *hardclip5_high = ilength5_high - common_shift;
2785 	  *hardclip3_low = ilength3_low + common_shift;
2786 	  debug15(printf("Overlap clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2787 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2788 	  *hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2789 	  *hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2790 	  debug15(printf("Ambig clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2791 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2792 
2793 	  if (common_shift != 0) {
2794 	    if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == true) {
2795 	      /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
2796 	    } else {
2797 	      common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
2798 	      common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
2799 	      debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
2800 	      if (shift_right == 0 && shift_left == 0) {
2801 		/* Try original position without a shift */
2802 		*hardclip5_high = ilength5_high /*- common_shift*/;
2803 		*hardclip3_low = ilength3_low /*+ common_shift*/;
2804 		*hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2805 		*hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2806 		if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
2807 		  *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2808 		  return 0;
2809 		}
2810 	      } else if (shift_left == 0) {
2811 		common_genomicpos = common_genomicpos_right;
2812 	      } else if (shift_right == 0) {
2813 		common_genomicpos = common_genomicpos_left;
2814 	      } else if (shift_right <= shift_left) {
2815 		common_genomicpos = common_genomicpos_right;
2816 	      } else {
2817 		common_genomicpos = common_genomicpos_left;
2818 	      }
2819 	    }
2820 
2821 	    debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
2822 	    /* Recompute hardclips */
2823 	    if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2824 		find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2825 	      *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2826 	      return 0;
2827 	    } else if (ilength3_low > ilength5_high) {
2828 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2829 	      assert(ilength3_low > 0);
2830 	      ilength3_low -= 1;
2831 	    } else {
2832 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2833 	      assert(ilength5_high > 0);
2834 	      ilength5_high -= 1;
2835 	    }
2836 	    debug15(printf("Even: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2837 
2838 	    *hardclip5_high = ilength5_high /*- common_shift*/;
2839 	    *hardclip3_low = ilength3_low /*+ common_shift*/;
2840 	    debug15(printf("Initial computation of clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2841 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2842 
2843 	    *hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2844 	    *hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2845 	    debug15(printf("Recomputed clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2846 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2847 	  }
2848 
2849 #if 0
2850 	  if (*hardclip5_high < 0) {
2851 	    *hardclip5_high = 0;
2852 	  }
2853 	  if (*hardclip3_low < 0) {
2854 	    *hardclip3_low = 0;
2855 	  }
2856 	  debug15(printf("Positive clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2857 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2858 #endif
2859 
2860 	} else {
2861 	  debug15(printf("plus, ilength35 is longer.  Clipping tails.\n"));
2862 	  debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
2863 			 common_left+common_right-1,common_left,common_right));
2864 	  clipdir = -1;
2865 
2866 	  /* Want to clip 5 low and 3 high */
2867 	  *hardclip5_low = ilength5_low + common_shift;
2868 	  *hardclip3_high = ilength3_high - common_shift;
2869 	  debug15(printf("Overlap clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2870 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2871 	  *hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2872 	  *hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2873 	  debug15(printf("Ambig clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2874 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2875 
2876 	  if (common_shift != 0) {
2877 	    if (test_hardclips(&common_genomicpos,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset) == true) {
2878 	      /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
2879 	    } else {
2880 	      common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
2881 	      common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
2882 	      debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
2883 	      if (shift_right == 0 && shift_left == 0) {
2884 		/* Try original position without a shift */
2885 		*hardclip5_low = ilength5_low /*+ common_shift*/;
2886 		*hardclip3_high = ilength3_high /*- common_shift*/;
2887 		*hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2888 		*hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2889 		if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
2890 		  *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2891 		  return 0;
2892 		}
2893 	      } else if (shift_left == 0) {
2894 		common_genomicpos = common_genomicpos_right;
2895 	      } else if (shift_right == 0) {
2896 		common_genomicpos = common_genomicpos_left;
2897 	      } else if (shift_right <= shift_left) {
2898 		common_genomicpos = common_genomicpos_right;
2899 	      } else {
2900 		common_genomicpos = common_genomicpos_left;
2901 	      }
2902 	    }
2903 
2904 	    debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
2905 	    /* Recompute hardclips */
2906 	    if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2907 		find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2908 	      *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2909 	      return 0;
2910 	    } else if (ilength5_low > ilength3_high) {
2911 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2912 	      assert(ilength5_low > 0);
2913 	      ilength5_low -= 1;
2914 	    } else {
2915 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2916 	      assert(ilength3_high > 0);
2917 	      ilength3_high -= 1;
2918 	    }
2919 	    debug15(printf("Even: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2920 
2921 	    *hardclip5_low = ilength5_low /*+ common_shift*/;
2922 	    *hardclip3_high = ilength3_high /*- common_shift*/;
2923 	    debug15(printf("Initial computation of clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2924 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2925 
2926 	    *hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2927 	    *hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2928 	    debug15(printf("Recomputed clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2929 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2930 	  }
2931 
2932 #if 0
2933 	  if (*hardclip5_low < 0) {
2934 	    *hardclip5_low = 0;
2935 	  }
2936 	  if (*hardclip3_high < 0) {
2937 	    *hardclip3_high = 0;
2938 	  }
2939 	  debug15(printf("Positive clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2940 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2941 #endif
2942 	}
2943 
2944 	debug15(printf("returning clipdir %d\n",clipdir));
2945 	return clipdir;
2946       }
2947 
2948     } else {
2949       /* minus */
2950 #if 0
2951       hit5_trimmed_length = hit5->querylength - hit5->trim_querystart - hit5->trim_queryend - start_amb_length(hit5) - end_amb_length(hit5);
2952       hit3_trimmed_length = hit3->querylength - hit3->trim_querystart - hit3->trim_queryend - start_amb_length(hit3) - end_amb_length(hit3);
2953       totallength = hit5_trimmed_length + hit3_trimmed_length;
2954       debug15(printf("totallength = %d, hit5 trimmed length = %d, hit3 trimmed length = %d\n",
2955 		     totallength,hit5_trimmed_length,hit3_trimmed_length));
2956       debug15(printf("original insertlength: %d, trim+amb5: %d..%d, trim+amb3: %d..%d\n",
2957 		     this->insertlength,hit5->trim_querystart + start_amb_length(hit5),
2958 		     hit5->trim_queryend + hit5->end_amb_length,hit3->trim_querystart + start_amb_length(hit3),
2959 		     hit3->trim_queryend + hit3->end_amb_length));
2960 #endif
2961 
2962       if ((common_genomicpos = pair_common_genomicpos(hit5,hit3)) == 0) {
2963 	debug15(printf("Cannot determine a common point, so returning 0\n"));
2964 	return 0;
2965 
2966       } else if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2967 		 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2968 	debug15(printf("Cannot determine ilengths, so returning 0\n"));
2969 	return 0;
2970 
2971       } else {
2972 	debug15(printf("Inclusive: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2973 	debug15(printf("ilength53lh is %d, ilength35lh is %d\n",ilength5_low + ilength3_high - 1,ilength3_low + ilength5_high - 1));
2974 
2975 	common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
2976 	common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
2977 	if (common_right > common_left) {
2978 	  common_shift = common_right/2 - (common_left - 1)/2;
2979 	  debug15(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
2980 			 common_shift,common_right,common_left));
2981 	  assert(ilength5_low > 0);
2982 	  assert(ilength3_low > 0);
2983 	  ilength5_low -= 1;
2984 	  ilength3_low -= 1;
2985 	} else {
2986 	  common_shift = (common_right - 1)/2 - common_left/2;
2987 	  debug15(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
2988 			 common_shift,common_right,common_left));
2989 	  assert(ilength5_high > 0);
2990 	  assert(ilength3_high > 0);
2991 	  ilength5_high -= 1;
2992 	  ilength3_high -= 1;
2993 	}
2994 	debug15(printf("Exclusive: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2995 
2996 	if ((ilength53 = ilength5_low + ilength3_high) > (ilength35 = ilength3_low + ilength5_high)) {
2997 	  /* Use >, not >=, so we favor clipping heads over clipping tails in case of a tie */
2998 	  debug15(printf("minus, ilength53 is longer.  Clipping tails.\n"));
2999 	  debug15(overlap = common_left + common_right - 1);
3000 	  debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
3001 			 overlap,common_left,common_right));
3002 	  clipdir = +1;
3003 
3004 
3005 	  /* Want to clip 5 high and 3 low */
3006 	  *hardclip5_high = ilength5_high - common_shift;
3007 	  *hardclip3_low = ilength3_low + common_shift;
3008 	  debug15(printf("Overlap clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3009 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3010 	  *hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3011 	  *hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3012 	  debug15(printf("Ambig clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3013 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3014 
3015 	  if (common_shift != 0) {
3016 	    if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == true) {
3017 	      /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
3018 	    } else {
3019 	      common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
3020 	      common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
3021 	      debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
3022 	      if (shift_right == 0 && shift_left == 0) {
3023 		/* Try original position without a shift */
3024 		*hardclip5_high = ilength5_high /*- common_shift*/;
3025 		*hardclip3_low = ilength3_low /*+ common_shift*/;
3026 		*hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3027 		*hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3028 		if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
3029 		  *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3030 		  return 0;
3031 		}
3032 	      } else if (shift_left == 0) {
3033 		common_genomicpos = common_genomicpos_right;
3034 	      } else if (shift_right == 0) {
3035 		common_genomicpos = common_genomicpos_left;
3036 	      } else if (shift_right <= shift_left) {
3037 		common_genomicpos = common_genomicpos_right;
3038 	      } else {
3039 		common_genomicpos = common_genomicpos_left;
3040 	      }
3041 	    }
3042 
3043 	    debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
3044 	    /* Recompute hardclips */
3045 	    if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
3046 		find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
3047 	      *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3048 	      return 0;
3049 	    } else if (ilength3_low > ilength5_high) {
3050 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3051 	      assert(ilength3_low > 0);
3052 	      ilength3_low -= 1;
3053 	    } else {
3054 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3055 	      assert(ilength5_high > 0);
3056 	      ilength5_high -= 1;
3057 	    }
3058 	    debug15(printf("Even: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3059 
3060 	    *hardclip5_high = ilength5_high /*- common_shift*/;
3061 	    *hardclip3_low = ilength3_low /*+ common_shift*/;
3062 	    debug15(printf("Initial computation of clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3063 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3064 
3065 	    *hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3066 	    *hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3067 	    debug15(printf("Recomputed clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3068 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3069 	  }
3070 
3071 #if 0
3072 	  if (*hardclip5_high < 0) {
3073 	    *hardclip5_high = 0;
3074 	  }
3075 	  if (*hardclip3_low < 0) {
3076 	    *hardclip3_low = 0;
3077 	  }
3078 	  debug15(printf("Positive clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3079 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3080 #endif
3081 
3082 	} else {
3083 	  debug15(printf("minus, ilength35 is longer.  Clipping heads.\n"));
3084 	  debug15(overlap = common_left + common_right - 1);
3085 	  debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
3086 			 overlap,common_left,common_right));
3087 	  clipdir = -1;
3088 
3089 	  /* Want to clip 5 low and 3 high */
3090 	  *hardclip5_low = ilength5_low + common_shift;
3091 	  *hardclip3_high = ilength3_high - common_shift;
3092 	  debug15(printf("Overlap clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3093 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3094 	  *hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3095 	  *hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3096 	  debug15(printf("Ambig clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3097 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3098 
3099 	  if (common_shift != 0) {
3100 	    if (test_hardclips(&common_genomicpos,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset) == true) {
3101 	      /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
3102 	    } else {
3103 	      common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
3104 	      common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
3105 	      debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
3106 	      if (shift_right == 0 && shift_left == 0) {
3107 		/* Try original position without a shift */
3108 		*hardclip5_low = ilength5_low /*+ common_shift*/;
3109 		*hardclip3_high = ilength3_high /*- common_shift*/;
3110 		*hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3111 		*hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3112 		if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
3113 		  *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3114 		  return 0;
3115 		}
3116 	      } else if (shift_left == 0) {
3117 		common_genomicpos = common_genomicpos_right;
3118 	      } else if (shift_right == 0) {
3119 		common_genomicpos = common_genomicpos_left;
3120 	      } else if (shift_right <= shift_left) {
3121 		common_genomicpos = common_genomicpos_right;
3122 	      } else {
3123 		common_genomicpos = common_genomicpos_left;
3124 	      }
3125 	    }
3126 
3127 	    debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
3128 	    /* Recompute hardclips */
3129 	    if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
3130 		find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
3131 	      *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3132 	      return 0;
3133 	    } else if (ilength5_low > ilength3_high) {
3134 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3135 	      assert(ilength5_low > 0);
3136 	      ilength5_low -= 1;
3137 	    } else {
3138 	      debug15(printf("Uneven: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3139 	      assert(ilength3_high > 0);
3140 	      ilength3_high -= 1;
3141 	    }
3142 	    debug15(printf("Even: ilengths5: %d|%d.  ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3143 
3144 	    *hardclip5_low = ilength5_low /*+ common_shift*/;
3145 	    *hardclip3_high = ilength3_high /*- common_shift*/;
3146 	    debug15(printf("Initial computation of clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3147 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3148 
3149 	    *hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3150 	    *hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3151 	    debug15(printf("Recomputed clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3152 			   *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3153 	  }
3154 
3155 #if 0
3156 	  if (*hardclip5_low < 0) {
3157 	    *hardclip5_low = 0;
3158 	  }
3159 	  if (*hardclip3_high < 0) {
3160 	    *hardclip3_high = 0;
3161 	  }
3162 	  debug15(printf("Positive clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3163 			 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3164 #endif
3165 	}
3166       }
3167 
3168       debug15(printf("returning clipdir %d\n",clipdir));
3169       return clipdir;
3170     }
3171   }
3172 }
3173 
3174 
3175 void
Stage3pair_free(Stage3pair_T * old)3176 Stage3pair_free (Stage3pair_T *old) {
3177   debug0(printf("Freeing pair %p with hits %p and %p\n",*old,(*old)->hit5,(*old)->hit3));
3178   assert((*old)->hit3 != NULL);
3179   debug0(printf("Freeing end3 at %p\n",(*old)->hit3));
3180   Stage3end_free(&(*old)->hit3);
3181 
3182   assert((*old)->hit5 != NULL);
3183   debug0(printf("Freeing end5 at %p\n",(*old)->hit5));
3184   Stage3end_free(&(*old)->hit5);
3185 
3186   FREE_OUT(*old);
3187   return;
3188 }
3189 
3190 
3191 
/* Disabled (compiled out by #if 0).  Returns the tally for a pair:
   0 when tally_iit is NULL; otherwise the value cached in this->tally
   if it is non-negative; otherwise the sum of Stage3end_compute_tally
   over both ends, which is stored in this->tally before returning. */
#if 0
static long int
Stage3pair_tally (Stage3pair_T this) {

  if (tally_iit == NULL) {
    return 0L;
  } else if (this->tally >= 0) {
    /* Already computed */
    return this->tally;
  } else {
    this->tally = Stage3end_compute_tally(this->hit5) + Stage3end_compute_tally(this->hit3);
    return this->tally;
  }
}
#endif
3206 
3207 
/* 128-entry table indexed by character code; the make_complement
   routines in this file use it to map each character to its
   complement.  Contents come from COMPLEMENT_LC (presumably defined in
   complement.h -- confirm). */
static char complCode[128] = COMPLEMENT_LC;
3209 
/* Disabled (compiled out by #if 0).  Writes the reverse complement of
   the first length characters of sequence into the caller-supplied
   buffer complement, terminates it with '\0' at index length (so the
   buffer needs length+1 bytes), and returns complement. */
#if 0
static char *
make_complement_buffered (char *complement, char *sequence, unsigned int length) {
  int i, j;

  /* complement = (char *) CALLOC_OUT(length+1,sizeof(char)); */
  /* i walks sequence backward while j walks complement forward */
  for (i = length-1, j = 0; i >= 0; i--, j++) {
    complement[j] = complCode[(int) sequence[i]];
  }
  complement[length] = '\0';
  return complement;
}
#endif
3223 
3224 static char *
make_complement_inplace(char * sequence,unsigned int length)3225 make_complement_inplace (char *sequence, unsigned int length) {
3226   char temp;
3227   unsigned int i, j;
3228 
3229   for (i = 0, j = length-1; i < length/2; i++, j--) {
3230     temp = complCode[(int) sequence[i]];
3231     sequence[i] = complCode[(int) sequence[j]];
3232     sequence[j] = temp;
3233   }
3234   if (i == j) {
3235     sequence[i] = complCode[(int) sequence[i]];
3236   }
3237 
3238   return sequence;
3239 }
3240 
3241 char *
Stage3end_substrings_genomic_sequence(int * seqlength,T this,Genome_T genome)3242 Stage3end_substrings_genomic_sequence (int *seqlength, T this, Genome_T genome) {
3243   char *gbuffer;
3244   List_T p, q;
3245   Substring_T substring;
3246   Junction_T junction;
3247   int querypos, querystart, queryend, querylength, substring_length;
3248 
3249   *seqlength = 0;
3250   for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
3251     substring = (Substring_T) List_head(p);
3252 #ifdef NO_SOFT_CLIPS
3253     querystart = Substring_querystart_orig(substring);
3254     queryend = Substring_queryend_orig(substring);
3255 #else
3256     querystart = Substring_querystart(substring);
3257     queryend = Substring_queryend(substring);
3258 #endif
3259     *seqlength += queryend - querystart;
3260   }
3261   for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
3262     junction = (Junction_T) List_head(p);
3263     if (Junction_type(junction) == DEL_JUNCTION) {
3264       *seqlength += Junction_nindels(junction);
3265     }
3266   }
3267 
3268   gbuffer = (char *) MALLOC((*seqlength+1) * sizeof(char));
3269   if (this->plusp == true) {
3270     /* Build from querystart to queryend, so we don't wipe out sequence with terminating \0 character */
3271     querypos = 0;
3272     for (p = this->substrings_1toN, q = this->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
3273       substring = (Substring_T) List_head(p);
3274 #ifdef NO_SOFT_CLIPS
3275       querystart = Substring_querystart_orig(substring);
3276       queryend = Substring_queryend_orig(substring);
3277 #else
3278       querystart = Substring_querystart(substring);
3279       queryend = Substring_queryend(substring);
3280 #endif
3281       substring_length = queryend - querystart;
3282       Genome_fill_buffer_simple(genome,Substring_left(substring) + querystart,
3283 				substring_length,&(gbuffer[querypos]));
3284       querypos += substring_length;
3285 
3286       if (q != NULL) {
3287 	junction = (Junction_T) List_head(q);
3288 	if (Junction_type(junction) == DEL_JUNCTION) {
3289 	  substring_length = Junction_nindels(junction);
3290 	  Genome_fill_buffer_simple(genome,Junction_deletionpos(junction),
3291 				    substring_length,&(gbuffer[querypos]));
3292 	  querypos += substring_length;
3293 	}
3294       }
3295     }
3296 
3297     return gbuffer;
3298 
3299   } else {
3300     /* Build from queryend to querystart, so we don't wipe out sequence with terminating \0 character */
3301     querypos = 0;
3302     querylength = this->querylength;
3303     for (p = this->substrings_Nto1, q = this->junctions_Nto1; p != NULL; p = List_next(p), q = List_next(q)) {
3304       substring = (Substring_T) List_head(p);
3305 #ifdef NO_SOFT_CLIPS
3306       querystart = Substring_querystart_orig(substring);
3307       queryend = Substring_queryend_orig(substring);
3308 #else
3309       querystart = Substring_querystart(substring);
3310       queryend = Substring_queryend(substring);
3311 #endif
3312       substring_length = queryend - querystart;
3313       Genome_fill_buffer_simple(genome,Substring_left(substring) + (querylength - queryend),
3314 				substring_length,&(gbuffer[querypos]));
3315       querypos += substring_length;
3316 
3317       if (q != NULL) {
3318 	junction = (Junction_T) List_head(q);
3319 	if (Junction_type(junction) == DEL_JUNCTION) {
3320 	  substring_length = Junction_nindels(junction);
3321 	  Genome_fill_buffer_simple(genome,Junction_deletionpos(junction),
3322 				    substring_length,&(gbuffer[querypos]));
3323 	  querypos += substring_length;
3324 	}
3325       }
3326     }
3327 
3328     return make_complement_inplace(gbuffer,*seqlength);
3329   }
3330 }
3331 
3332 
/* Exception descriptor carrying the reason "Substring invalid during copy".
   NOTE(review): the copy routine that follows does not raise it; confirm
   whether it is still used elsewhere. */
const Except_T Copy_Substring = { "Substring invalid during copy" };
3334 
3335 static T
Stage3end_copy(T old,Listpool_T listpool)3336 Stage3end_copy (T old, Listpool_T listpool) {
3337   T new = (T) MALLOC_OUT(sizeof(*new));
3338   List_T p;
3339   Substring_T old_substring, new_substring;
3340   Junction_T old_junction, new_junction;
3341 
3342   debug0(printf("*****Copying Stage3end %p -> %p of type %s\n",
3343 		old,new,hittype_string(old->hittype)));
3344 
3345   new->hittype = old->hittype;
3346   new->method = old->method;
3347   new->level = old->level;
3348 
3349   new->querylength = old->querylength;
3350   new->querylength_adj = old->querylength_adj;
3351 
3352   new->transcripts = Transcript_copy_list(old->transcripts);
3353   new->transcripts_other = Transcript_copy_list(old->transcripts_other);
3354 
3355   new->substrings_1toN = (List_T) NULL;
3356   new->substrings_Nto1 = (List_T) NULL;
3357 
3358   new->junctions_1toN = (List_T) NULL;
3359   new->junctions_Nto1 = (List_T) NULL;
3360 
3361   for (p = old->substrings_1toN; p != NULL; p = List_next(p)) {
3362     old_substring = (Substring_T) List_head(p);
3363     new_substring = Substring_copy(old_substring);
3364     new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) new_substring);
3365   }
3366 
3367   for (p = old->junctions_1toN; p != NULL; p = List_next(p)) {
3368     old_junction = (Junction_T) List_head(p);
3369     new_junction = Junction_copy(old_junction);
3370     new->junctions_1toN = Listpool_push(new->junctions_1toN,listpool,(void *) new_junction);
3371   }
3372 
3373   new->substrings_Nto1 = Listpool_copy(new->substrings_1toN,listpool); /* Before reversal of 1toN */
3374   new->junctions_Nto1 = Listpool_copy(new->junctions_1toN,listpool);   /* Before reversal of 1toN */
3375 
3376   /* Reversals to handle builds of 1toN */
3377   new->substrings_1toN = List_reverse(new->substrings_1toN);
3378   new->junctions_1toN = List_reverse(new->junctions_1toN);
3379 
3380 
3381   new->trim_querystart = old->trim_querystart;
3382   new->trim_queryend = old->trim_queryend;
3383   new->mandatory_trim_querystart = old->mandatory_trim_querystart;
3384   new->mandatory_trim_queryend = old->mandatory_trim_queryend;
3385   new->trim_querystart_splicep = old->trim_querystart_splicep;
3386   new->trim_queryend_splicep = old->trim_queryend_splicep;
3387 
3388   new->querystart_chrbound = old->querystart_chrbound;
3389   new->queryend_chrbound = old->queryend_chrbound;
3390 
3391   new->genomicstart = old->genomicstart;
3392   new->genomicend = old->genomicend;
3393 
3394   new->low = old->low;
3395   new->high = old->high;
3396   new->genomiclength = old->genomiclength;
3397   new->guided_insertlength = old->guided_insertlength;
3398 
3399   new->distant_splice_p = old->distant_splice_p;
3400   new->chrnum = old->chrnum;
3401   new->effective_chrnum = old->effective_chrnum;
3402   new->other_chrnum = old->other_chrnum;
3403   new->chroffset = old->chroffset;
3404   new->chrhigh = old->chrhigh;
3405   new->chrlength = old->chrlength;
3406   new->plusp = old->plusp;
3407   new->genestrand = old->genestrand;
3408 
3409   new->sensedir = old->sensedir;
3410   new->sensedir_for_concordance = old->sensedir_for_concordance;
3411 
3412   new->nsplices = old->nsplices;
3413   new->splice_score = old->splice_score;
3414   new->nindels = old->nindels;
3415 
3416   new->nmismatches_bothdiff = old->nmismatches_bothdiff;
3417   new->nmismatches_refdiff = old->nmismatches_refdiff;
3418   new->nsegments = old->nsegments;
3419 
3420   new->refalt_nmatches_to_trims = old->refalt_nmatches_to_trims;
3421   new->ref_nmatches_to_trims = old->ref_nmatches_to_trims;
3422 
3423   new->ref_score_overall = old->ref_score_overall;
3424   new->refalt_score_overall = old->refalt_score_overall;
3425   new->refalt_score_within_trims = old->refalt_score_within_trims;
3426 
3427   new->refalt_nmatches_plus_spliced_trims = old->refalt_nmatches_plus_spliced_trims;
3428   new->ref_nmatches_plus_spliced_trims = old->ref_nmatches_plus_spliced_trims;
3429 
3430   new->paired_usedp = old->paired_usedp;
3431 
3432   new->circularalias = old->circularalias;
3433   new->circularpos = old->circularpos;
3434   new->altlocp = old->altlocp;
3435   debug12(printf("Copying circularpos of %d from hit %p to hit %p\n",new->circularpos,old,new));
3436 
3437   new->score_eventrim = old->score_eventrim;
3438   new->mapq_loglik = old->mapq_loglik;
3439   new->mapq_score = old->mapq_score;
3440   new->absmq_score = old->absmq_score;
3441 
3442   /* Actually, the assertion is excluded only for the JOIN hittype */
3443   assert(new->hittype == SPLICE || Substring_querystart(List_head(new->substrings_1toN)) <= Substring_querystart(List_head(new->substrings_Nto1)));
3444 
3445   return new;
3446 }
3447 
3448 
3449 static int
compute_circularpos(int * circularalias,T hit)3450 compute_circularpos (int *circularalias, T hit) {
3451   int circularpos;
3452   List_T substrings_LtoH, p;
3453   Substring_T substring;
3454 
3455 
3456   debug12(printf("Computing circularpos on hit at %u..%u, plusp %d, with trim left %d and trim right %d\n",
3457 		 hit->low - hit->chroffset,hit->high - hit->chroffset,
3458 		 hit->plusp,hit->trim_querystart,hit->trim_queryend));
3459   if (circularp[hit->chrnum] == false) {
3460     debug12(printf("Chromosome #%d is not circular\n",hit->chrnum));
3461     /* This also handles hit->chrnum == 0, where translocation cannot be circular */
3462     *circularalias = 0;
3463     return -1;
3464 
3465   } else if (hit->low - hit->chroffset >= hit->chrlength) {
3466     /* All of read after trimming is in high part.  Previously
3467        checked hit->high against hit->chrhigh, for circularalias of
3468        +2, but that should be fixed now */
3469 
3470     debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3471     debug12(printf("All of read after trimming %u..%u is in high part\n",
3472 		   hit->low - hit->chroffset,hit->high - hit->chroffset));
3473     *circularalias = +1;		/* All of read is in second copy */
3474     debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3475     return -1;
3476 
3477   } else if (hit->high - hit->chroffset < hit->chrlength) {
3478     /* All of read after trimming is in low part.  Previously
3479        checked hit->low against hit->chroffset for circularalias of
3480        -2, but that should be fixed now */
3481 
3482     debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3483     debug12(printf("All of read after trimming %u..%u is in low part\n",
3484 		   hit->low - hit->chroffset,hit->high - hit->chroffset));
3485     *circularalias = -1;		/* All of read is in first copy */
3486     debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3487     return -1;
3488 
3489   } else {
3490     *circularalias = 0;	/* Straddling middle */
3491     if (hit->plusp == true) {
3492       substrings_LtoH = hit->substrings_1toN;
3493     } else {
3494       substrings_LtoH = hit->substrings_Nto1;
3495     }
3496 
3497     debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3498     for (p = substrings_LtoH; p != NULL; p = List_next(p)) {
3499       substring = (Substring_T) List_head(p);
3500       if ((circularpos = Substring_circularpos(substring)) > 0) {
3501 	debug12(printf("For hit %p, returning circularpos %d from substring (plus)\n",hit,circularpos));
3502 	return circularpos;
3503       }
3504     }
3505     debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3506     return -1;
3507   }
3508 }
3509 
3510 
3511 /* Modified from Stage3end_new_precomputed for a single substring */
3512 T
Stage3end_new_terminal(int * found_score_overall,int * found_score_within_trims,Substring_T substring_in,int querylength,bool gplusp,int genestrand,int sensedir,Listpool_T listpool,Method_T method,int level)3513 Stage3end_new_terminal (int *found_score_overall, int *found_score_within_trims,
3514 			Substring_T substring_in, int querylength,
3515 			bool gplusp, int genestrand, int sensedir, Listpool_T listpool,
3516 			Method_T method, int level) {
3517   T new;
3518 
3519   Substring_T substring;
3520   Chrnum_T chrnum;
3521   Univcoord_T chroffset, chrhigh;
3522   Chrpos_T chrlength;
3523 
3524   Univcoord_T genomicstart, genomicend;
3525   List_T substrings;
3526   List_T p;
3527   int adj = 0;
3528 
3529 
3530   substring = Substring_copy(substring_in); /* Always make a copy of the input substring */
3531   chrnum = Substring_chrnum(substring);
3532   chroffset = Substring_chroffset(substring);
3533   chrhigh = Substring_chrhigh(substring);
3534   chrlength = Substring_chrlength(substring);
3535 
3536   debug0(printf("Entered Stage3end_new_terminal, method %s, with chrnum %d, query %d..%d\n",
3537 		Method_string(method),chrnum,Substring_querystart(substring),Substring_queryend(substring)));
3538 
3539   new = (T) MALLOC_OUT(sizeof(*new));
3540   new->hittype = SUBSTRINGS;
3541   new->method = method;
3542   new->level = level;
3543 
3544   new->querylength = querylength;
3545   new->querylength_adj = querylength + adj;
3546 
3547   /* Caller must not free these lists */
3548   new->transcripts = (List_T) NULL;
3549   new->transcripts_other = (List_T) NULL;
3550 
3551   /* Unlike Stage3end_new_substrings, where substrings and junctions
3552      are in opposite orders, substrings and junctions here are in the
3553      same order. */
3554 
3555   substrings = Listpool_push(NULL,listpool,(void *) substring);
3556   new->substrings_1toN = substrings;
3557   new->substrings_Nto1 = Listpool_copy(substrings,listpool);
3558   /* Do not use substrings after this */
3559 
3560   new->junctions_1toN = new->junctions_Nto1 = (List_T) NULL;
3561   /* There is no junctions_HtoL field */
3562   /* Do not use junctions after this */
3563 
3564 #if 0
3565   /* No need to reverse for a single substring */
3566   if (gplusp == true) {
3567     /* Substrings, head to tail, are query low to high and genome low to high */
3568     new->substrings_HtoL = List_reverse(new->substrings_HtoL);
3569   } else {
3570     /* Substrings, head to tail, are query low to high and genome high to low */
3571     new->substrings_LtoH = List_reverse(new->substrings_LtoH);
3572     new->junctions_LtoH = List_reverse(new->junctions_LtoH);
3573   }
3574 #endif
3575 
3576 #ifdef DEBUG0
3577   printf("NEW SUBSTRING\n");
3578   printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",Substring_querystart(substring),Substring_queryend(substring),
3579     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3580     Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
3581   printf("\n");
3582 #endif
3583 
3584   new->trim_querystart = Substring_trim_querystart(substring);
3585   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring);
3586   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring);
3587   new->trim_queryend = Substring_trim_queryend(substring);
3588   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substring);
3589   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substring);
3590   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
3591   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
3592 
3593   new->querystart_chrbound = Substring_querystart_chrbound(substring);
3594   new->queryend_chrbound = Substring_queryend_chrbound(substring);
3595   if (new->trim_querystart > new->querystart_chrbound) {
3596     new->querystart_chrbound = new->trim_querystart;
3597   }
3598   if (querylength - new->trim_queryend < new->queryend_chrbound) {
3599     new->queryend_chrbound = querylength - new->trim_queryend;
3600   }
3601   assert(new->querystart_chrbound < new->queryend_chrbound);
3602   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
3603 
3604 
3605   genomicstart = Substring_genomicstart(substring);
3606   genomicend = Substring_genomicend(substring);
3607   new->genomicstart = genomicstart;
3608   new->genomicend = genomicend;
3609 
3610   if (gplusp == true) {
3611     new->low = genomicstart + new->querystart_chrbound;
3612     new->high = genomicend - (querylength - new->queryend_chrbound);
3613     new->genomiclength = genomicend - genomicstart;
3614   } else {
3615     new->low = genomicend + (querylength - new->queryend_chrbound);
3616     new->high = genomicstart - new->querystart_chrbound;
3617     new->genomiclength = genomicstart - genomicend;
3618   }
3619   assert(new->low < new->high);
3620   debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
3621 
3622   new->guided_insertlength = 0U;
3623 
3624   new->distant_splice_p = false;
3625   new->chrnum = new->effective_chrnum = chrnum;
3626   new->other_chrnum = 0;
3627   new->chroffset = chroffset;
3628   new->chrhigh = chrhigh;
3629   new->chrlength = chrlength;
3630   new->plusp = gplusp;
3631   new->genestrand = genestrand;
3632 
3633   new->sensedir_for_concordance = new->sensedir = sensedir;
3634 
3635   new->nsplices = 0;
3636   new->splice_score = 0.0;
3637   new->nindels = 0;
3638 
3639   new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(substring); /* Trimmed */
3640   new->nmismatches_refdiff = Substring_nmismatches_refdiff(substring);
3641   new->nsegments = List_length(new->substrings_1toN);
3642 
3643 
3644   new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
3645   /* Note: Cannot use substrings variable here.  Need to use new->substrings_1toN */
3646   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3647     substring = (Substring_T) List_head(p);
3648     new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
3649     new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
3650   }
3651   debug0(printf("**Setting nmatches_to_trims to be %d\n",new->refalt_nmatches_to_trims));
3652 
3653   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_amb_length(substring);
3654   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_amb_length(substring);
3655   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
3656 
3657   /* Used for global comparisons */
3658   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
3659   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
3660   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
3661   if (Substring_trim_querystart_splicep(substring) == false) {
3662     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring)/END_BINSIZE);
3663   } else {
3664     new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring)/END_BINSIZE);
3665   }
3666   if (Substring_trim_queryend_splicep(substring) == false) {
3667     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substring))/END_BINSIZE);
3668   } else {
3669     new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substring))/END_BINSIZE);
3670   }
3671   /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
3672 
3673   if (chrlength < (Univcoord_T) querylength) {
3674     new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
3675     new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
3676     new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
3677   }
3678   assert(new->refalt_score_within_trims >= 0);
3679 
3680 
3681   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
3682   if (new->refalt_score_overall < *found_score_overall) {
3683     *found_score_overall = new->refalt_score_overall;
3684   }
3685   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
3686   if (new->refalt_score_within_trims < *found_score_within_trims) {
3687     *found_score_within_trims = new->refalt_score_within_trims;
3688   }
3689 
3690 
3691   /* new->penalties = 0; */
3692 
3693   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
3694   /* new->tally = -1L; */
3695 
3696   new->paired_usedp = false;
3697 
3698   /* new->query_splicepos = -1; */
3699   new->circularpos = compute_circularpos(&new->circularalias,new);
3700 
3701   if ((new->altlocp = altlocp[chrnum]) == false) {
3702     debug0(printf("*****Method %s: Stage3end_new_terminal returning primary %p at %u..%u\n\n",
3703 		  Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset));
3704     return new;
3705 
3706   } else {
3707     debug0(printf("*****Method %s: Stage3end_new_terminal returning altloc %p at %u..%u\n\n",
3708 		  Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset));
3709     return new;
3710   }
3711 }
3712 
3713 
3714 
3715 /* Called only by kmer-search.c */
3716 T
Stage3end_new_precomputed(int * found_score_overall,int * found_score_within_trims,int nmismatches_bothdiff,int nmismatches_refdiff,List_T substrings,List_T junctions,List_T transcripts,List_T transcripts_other,int querylength,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool gplusp,int genestrand,int sensedir,Listpool_T listpool,Method_T method,int level)3717 Stage3end_new_precomputed (int *found_score_overall, int *found_score_within_trims,
3718 			   int nmismatches_bothdiff, int nmismatches_refdiff,
3719 			   List_T substrings, List_T junctions, List_T transcripts, List_T transcripts_other,
3720 			   int querylength, Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
3721 			   bool gplusp, int genestrand, int sensedir, Listpool_T listpool, Method_T method, int level) {
3722   T new;
3723 
3724   Univcoord_T genomicstart, genomicend;
3725   Substring_T substring, substring1, substringN;
3726   Junction_T junction;
3727   List_T p;
3728   int adj = 0;
3729   int nsites;
3730   double prob_total;
3731 
3732 
3733 #ifdef DEBUG0
3734   printf("Entered Stage3end_new_precomputed, method %s, with gplusp %d\n",Method_string(method),gplusp);
3735   printf("%d substrings\n",List_length(substrings));
3736   printf("%d junctions\n",List_length(junctions));
3737 #endif
3738   assert(List_length(substrings) == List_length(junctions) + 1);
3739 
3740   new = (T) MALLOC_OUT(sizeof(*new));
3741   new->hittype = SUBSTRINGS;
3742   new->method = method;
3743   new->level = level;
3744 
3745   new->querylength = querylength;
3746   new->querylength_adj = querylength + adj;
3747 
3748   /* Caller must not free these lists */
3749   new->transcripts = transcripts;
3750   new->transcripts_other = transcripts_other;
3751 
3752   /* Unlike Stage3end_new_substrings, where substrings and junctions
3753      are in opposite orders, substrings and junctions here are in the
3754      same order. */
3755 
3756   new->substrings_1toN = substrings;
3757   new->substrings_Nto1 = List_reverse(Listpool_copy(substrings,listpool));
3758   new->junctions_1toN = junctions;
3759   new->junctions_Nto1 = List_reverse(Listpool_copy(junctions,listpool));
3760 
3761 
3762   /* There is no junctions_HtoL field */
3763 
3764 #if 0
3765   if (gplusp == true) {
3766     /* Substrings, head to tail, are query low to high and genome low to high */
3767     new->substrings_LtoH = Listpool_copy(new->substrings_1toN,listpool);
3768     new->substrings_HtoL = Listpool_copy(new->substrings_Nto1,listpool);
3769     new->junctions_LtoH = Listpool_copy(new->junctions_1toN,listpool);
3770     /* new->junctions_HtoL = Listpool_copy(new->junctions_Nto1,listpool); */
3771   } else {
3772     /* Substrings, head to tail, are query low to high and genome high to low */
3773     new->substrings_LtoH = Listpool_copy(new->substrings_Nto1,listpool);
3774     new->substrings_HtoL = Listpool_copy(new->substrings_1toN,listpool);
3775     new->junctions_LtoH = Listpool_copy(new->junctions_Nto1,listpool);
3776     /* new->junctions_HtoL = Listpool_copy(new->junctions_1toN,listpool); */
3777   }
3778 #endif
3779   /* Do not use substrings after this */
3780   /* Do not use junctions after this */
3781 
3782 
3783 
3784 #ifdef DEBUG0
3785   printf("NEW SUBSTRINGS (query order)\n");
3786   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3787     substring = List_head(p);
3788     if (Substring_ambiguous_p(substring) == true) {
3789       printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",Substring_querystart(substring),Substring_queryend(substring),
3790 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3791 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring),Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring)
3792 );
3793     } else {
3794       printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",Substring_querystart(substring),Substring_queryend(substring),
3795 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3796 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
3797     }
3798   }
3799   printf("\n");
3800 
3801   printf("NEW JUNCTIONS (query order)\n");
3802   for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
3803     junction = List_head(p);
3804     printf("splice distance %u, nindels %d\n",Junction_splice_distance(junction),Junction_nindels(junction));
3805   }
3806   printf("\n");
3807 #endif
3808 
3809 
3810   substring1 = (Substring_T) List_head(new->substrings_1toN);
3811   substringN = (Substring_T) List_head(new->substrings_Nto1);
3812 
3813   new->trim_querystart = Substring_trim_querystart(substring1);
3814   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
3815   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
3816   new->trim_queryend = Substring_trim_queryend(substringN);
3817   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
3818   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
3819   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
3820   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
3821 
3822   new->querystart_chrbound = Substring_querystart_chrbound(substring1);
3823   new->queryend_chrbound = Substring_queryend_chrbound(substringN);
3824   if (new->trim_querystart > new->querystart_chrbound) {
3825     new->querystart_chrbound = new->trim_querystart;
3826   }
3827   if (querylength - new->trim_queryend < new->queryend_chrbound) {
3828     new->queryend_chrbound = querylength - new->trim_queryend;
3829   }
3830   assert(new->querystart_chrbound < new->queryend_chrbound);
3831   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
3832 
3833 
3834   genomicstart = Substring_genomicstart(substring1);
3835   genomicend = Substring_genomicend(substringN);
3836   new->genomicstart = genomicstart;
3837   new->genomicend = genomicend;
3838 
3839   if (gplusp == true) {
3840     new->low = genomicstart + new->querystart_chrbound;
3841     new->high = genomicend - (querylength - new->queryend_chrbound);
3842     new->genomiclength = genomicend - genomicstart;
3843   } else {
3844     new->low = genomicend + (querylength - new->queryend_chrbound);
3845     new->high = genomicstart - new->querystart_chrbound;
3846     new->genomiclength = genomicstart - genomicend;
3847   }
3848   assert(new->low < new->high);
3849   debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
3850 
3851   new->guided_insertlength = 0U;
3852 
3853   new->distant_splice_p = false;
3854   new->chrnum = new->effective_chrnum = chrnum;
3855   new->other_chrnum = 0;
3856   new->chroffset = chroffset;
3857   new->chrhigh = chrhigh;
3858   new->chrlength = chrlength;
3859   new->plusp = gplusp;
3860   new->genestrand = genestrand;
3861 
3862 
3863   prob_total = 0.0;
3864   nsites = 0;
3865   new->nsplices = 0;
3866   for (p = junctions; p != NULL; p = List_next(p)) {
3867     junction = (Junction_T) List_head(p);
3868     if (Junction_type(junction) == SPLICE_JUNCTION) {
3869       prob_total += Junction_splice_score(junction);
3870       nsites += 2;
3871       new->nsplices += 1;
3872     }
3873   }
3874   if (nsites == 0) {
3875     new->splice_score = 0.0;
3876   } else {
3877     new->splice_score = prob_total / (double) nsites;
3878   }
3879   debug0(printf("SPLICE SCORE: %f\n",new->splice_score));
3880   new->nindels = 0;
3881 
3882 
3883   new->nmismatches_bothdiff = nmismatches_bothdiff; /* Trimmed */
3884   new->nmismatches_refdiff = nmismatches_refdiff;
3885   new->nsegments = List_length(new->substrings_1toN);
3886 
3887   /* new->nmatches_to_trims = querylength_trimmed - nmismatches_whole; */
3888   new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
3889   /* Note: Cannot use substrings variable here.  Need to use new->substrings_1toN */
3890   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3891     substring = (Substring_T) List_head(p);
3892     new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
3893     new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
3894   }
3895   debug0(printf("**Setting nmatches_to_trims to be %d\n",new->ref_nmatches_to_trims));
3896 
3897   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_start_amb_length(substring) + Substring_end_amb_length(substring);
3898   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_start_amb_length(substring) + Substring_end_amb_length(substring);
3899   for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
3900     junction = List_head(p);
3901     new->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
3902     new->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
3903   }
3904   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
3905 
3906   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
3907   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
3908   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
3909   if (Substring_trim_querystart_splicep(substring1) == false) {
3910     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
3911   } else {
3912     new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
3913   }
3914   if (Substring_trim_queryend_splicep(substringN) == false) {
3915     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substringN))/END_BINSIZE);
3916   } else {
3917     new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substringN))/END_BINSIZE);
3918   }
3919   /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
3920 
3921   if (chrlength < (Univcoord_T) querylength) {
3922     new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
3923     new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
3924     new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
3925   }
3926   assert(new->refalt_score_within_trims >= 0);
3927 
3928 
3929   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
3930   if (new->refalt_score_overall < *found_score_overall) {
3931     *found_score_overall = new->refalt_score_overall;
3932   }
3933   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
3934   if (new->refalt_score_within_trims < *found_score_within_trims) {
3935     *found_score_within_trims = new->refalt_score_within_trims;
3936   }
3937 
3938 
3939   /* new->penalties = 0; */
3940 
3941   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
3942   /* new->tally = -1L; */
3943 
3944   new->sensedir_for_concordance = new->sensedir = sensedir;
3945 
3946   new->paired_usedp = false;
3947 
3948   /* new->query_splicepos = -1; */
3949   new->circularpos = compute_circularpos(&new->circularalias,new);
3950 
3951   if ((new->altlocp = altlocp[chrnum]) == false) {
3952     debug0(printf("*****Method %s: Stage3end_new_precomputed returning primary %p at %u..%u with splice_score %f\n\n",
3953 		  Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->splice_score));
3954     return new;
3955 
3956   } else {
3957     debug0(printf("*****Method %s: Stage3end_new_precomputed returning altloc %p at %u..%u with splice_score %f\n\n",
3958 		  Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->splice_score));
3959     return new;
3960   }
3961 }
3962 
3963 
3964 int
Stage3end_nmatches_substrings(int * ref_nmatches,Intlist_T endpoints,Univcoordlist_T univdiagonals,Intlist_T nmismatches_list,Intlist_T ref_nmismatches_list,List_T junctions,int querylength,Compress_T query_compress,Substring_T qend_alts,Substring_T qstart_alts,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice5p_in,bool splice3p_in,Listpool_T listpool)3965 Stage3end_nmatches_substrings (int *ref_nmatches, Intlist_T endpoints, Univcoordlist_T univdiagonals,
3966 			       Intlist_T nmismatches_list, Intlist_T ref_nmismatches_list, List_T junctions,
3967 			       int querylength, Compress_T query_compress,
3968 			       Substring_T qend_alts, Substring_T qstart_alts, bool plusp, int genestrand,
3969 			       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
3970 			       bool splice5p_in, bool splice3p_in, Listpool_T listpool) {
3971   int nmatches, substring_nmatches, substring_ref_nmatches;
3972   int qstart, qend;
3973   Univcoord_T univdiagonal, left;
3974   Intlist_T r, x, y;
3975   Univcoordlist_T q;
3976   Junction_T junction;
3977 #ifdef MAKE_JUNCTION
3978   Junction_T qstart_junction = NULL, qend_junction = NULL;
3979   double donor_prob, acceptor_prob;
3980 #endif
3981   List_T newjunctions, p, j;
3982   bool splice5p, splice3p;
3983   int adj0;			/* deletions - insertions */
3984   int nmismatches, ref_nmismatches, indel_score = 0, nindels = 0;
3985   int nindelbreaks, n_large_indels;
3986   /* double donor_prob, acceptor_prob; */
3987 
3988 
3989   debug0(printf("Entered Stage3end_nmatches_substrings with %s, plusp %d, splice5p %d, splice3p %d\n",
3990 		Intlist_to_string(endpoints),plusp,splice5p_in,splice3p_in));
3991   nmatches = 0;
3992   *ref_nmatches = 0;
3993 
3994 #ifdef DEBUG7
3995   printf("Entered Stage3end_nmatches_substrings, at univdiagonal %u [%u], with chrnum #%d, plusp %d, and endpoints %s\n",
3996 	 Univcoordlist_head(univdiagonals),Univcoordlist_head(univdiagonals) - chroffset,chrnum,plusp,Intlist_to_string(endpoints));
3997   printf("There are %d endpoints, %d univdiagonals, %d nmismatches, and %d junctions\n",
3998 	 Intlist_length(endpoints),Univcoordlist_length(univdiagonals),Intlist_length(nmismatches_list),List_length(junctions));
3999   if (qstart_alts != NULL) {
4000     printf("qstart_alts at %d..%d\n",Substring_querystart(qstart_alts),Substring_queryend(qstart_alts));
4001   }
4002   if (qend_alts != NULL) {
4003     printf("qend_alts at %d..%d\n",Substring_querystart(qend_alts),Substring_queryend(qend_alts));
4004   }
4005   printf("Endpoints: %s\n",Intlist_to_string(endpoints));
4006   printf("Univdiagonals: %s\n",Univcoordlist_to_string_offset(univdiagonals,chroffset));
4007   printf("Mismatches: %s\n",Intlist_to_string(nmismatches_list));
4008   printf("Ref mismatches: %s\n",Intlist_to_string(ref_nmismatches_list));
4009 #endif
4010 
4011   assert(Univcoordlist_length(univdiagonals) == Intlist_length(endpoints) - 1);
4012   assert(Intlist_length(nmismatches_list) == Intlist_length(endpoints) - 1);
4013   assert(Intlist_length(ref_nmismatches_list) == Intlist_length(endpoints) - 1);
4014   assert(List_length(junctions) == Intlist_length(endpoints) - 2);
4015 
4016 
4017   newjunctions = Listpool_copy(junctions,listpool);
4018 
4019 
4020 #ifdef DEBUG0
4021   for (p = junctions; p != NULL; p = List_next(p)) {
4022     Junction_print((Junction_T) List_head(p));
4023   }
4024   printf("\n");
4025 #endif
4026 
4027 
4028   qstart = Intlist_head(endpoints);
4029   nmismatches = Intlist_head(nmismatches_list);
4030   ref_nmismatches = Intlist_head(ref_nmismatches_list);
4031 
4032   if (plusp == true) {
4033     j = newjunctions;		/* Put here before we handle querystart_alts */
4034     if (qstart_alts != NULL) {
4035       debug7(printf("Adding %d matches for qstart_alts\n",Substring_nmatches(qstart_alts)));
4036       nmatches += Substring_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4037       *ref_nmatches += Substring_ref_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4038 #ifdef MAKE_JUNCTION
4039       donor_prob = Substring_amb_donor_prob(qstart_alts);
4040       acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4041       qstart_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4042       newjunctions = Listpool_push(newjunctions,listpool,(void *) qstart_junction);
4043 #else
4044       newjunctions = Listpool_push(newjunctions,listpool,(void *) NULL);
4045 #endif
4046       splice5p = false;
4047     } else {
4048       splice5p = splice5p_in;
4049     }
4050 
4051     /* Add qpos to get alignstart/alignend */
4052     for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4053 	 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4054       qend = Intlist_head(r);
4055       nmismatches = Intlist_head(x);
4056       ref_nmismatches = Intlist_head(y);
4057       univdiagonal = Univcoordlist_head(q);
4058       left = univdiagonal - (Univcoord_T) querylength;
4059       debug0(printf("Stage3end_nmatches_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4060 		    qstart,qend,univdiagonal,univdiagonal - chroffset));
4061 
4062       /* genomicstart = left; */
4063       /* genomicend = left + querylength; */
4064       /* alignstart = genomicstart + qstart; */
4065       /* alignend = genomicstart + qend; */
4066 
4067       if (nmismatches >= 0 && ref_nmismatches >= 0) {
4068 	debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",univdiagonal - chroffset,qstart,qend));
4069 	debug7(printf("%d vs %d\n",nmismatches,
4070 		      Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4071 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand)));
4072 #ifdef CHECK_NMISMATCHES
4073 	assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4074 								/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand));
4075 #endif
4076       } else {
4077 	nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4078 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
4079 	Intlist_head_set(x,nmismatches);		/* Save for Stage3end_new_substrings */
4080 	Intlist_head_set(y,ref_nmismatches);		/* Save for Stage3end_new_substrings */
4081 	debug7(printf("%d (%d ref) mismatches from genome over querypos %d..%d\n",
4082 		      nmismatches,ref_nmismatches,qstart,qend));
4083       }
4084       if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4085 	splice3p = false;
4086       } else {
4087 	splice3p = splice3p_in;
4088       }
4089 
4090       if (splice5p == false && splice3p == false) {
4091 	/* Could potentially check here if qstart < qend, but relying upon caller to use endpoints_acceptable_p */
4092 	debug7(printf("Shortcut computes matches of %d = (%d - %d) - nmismatches %d\n",
4093 		      (qend-qstart)-nmismatches,qend,qstart,nmismatches));
4094 	nmatches += (qend - qstart) - nmismatches;
4095 	*ref_nmatches += (qend - qstart) - ref_nmismatches;
4096       } else {
4097         substring_nmatches =
4098 	  Substring_compute_nmatches(&substring_ref_nmatches,left,/*querystart*/qstart,/*queryend*/qend,querylength,
4099 				     /*plusp*/true,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4100 				     /*splice_querystart_p*/splice5p,/*splice_queryend_p*/splice3p,/*chrnum_fixed_p*/true);
4101 	if (substring_nmatches < 0) {
4102 	  /* Don't know how to fix junctions */
4103 	  debug0(printf("Poor substring (plus) for %d..%d, so returning -1 from Stage3end_nmatches_substrings\n",
4104 			qstart,qend));
4105 	  *ref_nmatches = -1;
4106 	  return -1;
4107 	} else {
4108 	  debug7(printf("Substring_compute_nmatches returns nmatches %d over querypos %d..%d\n",
4109 			substring_nmatches,qstart,qend));
4110 	  nmatches += substring_nmatches;
4111 	  *ref_nmatches += substring_ref_nmatches;
4112 	}
4113       }
4114 
4115       /* Prepare for next iteration */
4116       qstart = qend;
4117       if (j != NULL) {
4118 	if ((junction = (Junction_T) List_head(j)) == NULL) {
4119 	  /* qstart_junction */
4120 	} else if ((adj0 = Junction_adj(junction)) != 0) {
4121 	  /* adj += adj0; */
4122 	  indel_score += indel_penalty_middle;
4123 	  nindels += Junction_nindels(junction);
4124 	  if (adj0 < 0) {
4125 	    debug7(printf("Adjusting qstart %d up by %d\n",qstart,-adj0));
4126 	    qstart -= adj0;	/* Insertion */
4127 	  }
4128 	}
4129       }
4130       splice5p = false;
4131     }
4132 
4133   } else {
4134     j = newjunctions;		/* Put here before we handle querystart_alts */
4135     if (qstart_alts != NULL) {
4136       debug7(printf("Adding %d matches for qstart_alts\n",Substring_nmatches(qstart_alts)));
4137       nmatches += Substring_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4138       *ref_nmatches += Substring_ref_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4139 #ifdef MAKE_JUNCTION
4140       donor_prob = Substring_amb_donor_prob(qstart_alts);
4141       acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4142       qstart_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4143       /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4144       newjunctions = Listpool_push(newjunctions,listpool,(void *) qstart_junction);
4145 #else
4146       newjunctions = Listpool_push(newjunctions,listpool,(void *) NULL);
4147 #endif
4148       splice5p = false;
4149     } else {
4150       splice5p = splice5p_in;
4151     }
4152 
4153     /* Subtract querypos to get alignstart/alignend */
4154     for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4155 	 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4156       qend = Intlist_head(r);
4157       nmismatches = Intlist_head(x);
4158       ref_nmismatches = Intlist_head(y);
4159       univdiagonal = Univcoordlist_head(q);
4160       left = univdiagonal - (Univcoord_T) querylength;
4161       debug0(printf("Stage3end_nmatches_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4162 		    qstart,qend,univdiagonal,univdiagonal - chroffset));
4163 
4164       /* genomicend = left; */
4165       /* genomicstart = left + querylength; */
4166       /* genomicend_adj = genomicend - adj; */
4167       /* genomicstart_adj = genomicend - adj; */
4168       /* alignstart = genomicstart - (querylength - qend); */
4169       /* alignend = genomicstart - (querylength - qstart); */
4170 
4171       if (nmismatches >= 0 && ref_nmismatches >= 0) {
4172 #ifdef CHECK_NMISMATCHES
4173 	assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4174 								/*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand));
4175 #endif
4176       } else {
4177 	nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4178 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
4179 	Intlist_head_set(x,nmismatches);		/* Save for Stage3end_new_substrings */
4180 	Intlist_head_set(y,ref_nmismatches);		/* Save for Stage3end_new_substrings */
4181 	debug7(printf("%d (%d ref) mismatches from genome over querypos %d..%d\n",
4182 		      nmismatches,ref_nmismatches,querylength - qend,querylength - qstart));
4183       }
4184       if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4185 	splice3p = false;
4186       } else {
4187 	splice3p = splice3p_in;
4188       }
4189 
4190       if (splice5p == false && splice3p == false) {
4191 	/* Could potentially check here if qstart < qend, but relying upon caller to use endpoints_acceptable_p */
4192 	debug7(printf("Shortcut computes matches of %d = (%d - %d) - nmismatches %d\n",
4193 		      (qend-qstart)-nmismatches,querylength - qstart,querylength - qend,nmismatches));
4194 	nmatches += (qend - qstart) - nmismatches;
4195 	*ref_nmatches += (qend - qstart) - ref_nmismatches;
4196       } else {
4197 	substring_nmatches =
4198 	  Substring_compute_nmatches(&substring_ref_nmatches,left,/*querystart*/querylength - qend,
4199 				     /*queryend*/querylength - qstart,querylength,
4200 				     /*plusp*/false,genestrand,query_compress,
4201 				     chrnum,chroffset,chrhigh,chrlength,/*splice_querystart_p*/splice3p,
4202 				     /*splice_queryend_p*/splice5p,/*chrnum_fixed_p*/true);
4203 	if (substring_nmatches < 0) {
4204 	  /* Don't know how to fix junctions */
4205 	  debug0(printf("Poor substring (minus) for querypos %d..%d, so returning -1 from Stage3end_new_substrings\n",
4206 			querylength - qend,querylength - qstart));
4207 	  *ref_nmatches = -1;
4208 	  return -1;
4209 	} else {
4210 	  debug7(printf("Substring_compute_nmatches returns nmatches %d over querypos %d..%d\n",
4211 			substring_nmatches,querylength - qend,querylength - qstart));
4212 	  nmatches += substring_nmatches;
4213 	  *ref_nmatches += substring_ref_nmatches;
4214 	}
4215       }
4216 
4217       /* Prepare for next iteration */
4218       qstart = qend;
4219       if (j != NULL) {
4220 	if ((junction = (Junction_T) List_head(j)) == NULL) {
4221 	  /* qstart_junction */
4222 	} else if ((adj0 = Junction_adj(junction)) != 0) {
4223 	  /* adj += adj0; */
4224 	  indel_score += indel_penalty_middle;
4225 	  nindels += Junction_nindels(junction);
4226 	  if (adj0 < 0) {
4227 	    debug7(printf("Adjusting qstart %d up by %d\n",qstart,-adj0));
4228 	    qstart -= adj0;	/* Insertion */
4229 	  }
4230 	}
4231       }
4232       splice5p = false;
4233     }
4234   }
4235 
4236   if (qend_alts != NULL) {
4237     debug7(printf("Adding %d matches for qend_alts\n",Substring_nmatches(qend_alts)));
4238     nmatches += Substring_nmatches(qend_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4239     *ref_nmatches += Substring_ref_nmatches(qend_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4240 #ifdef MAKE_JUNCTION
4241     newjunctions = List_reverse(newjunctions);
4242     donor_prob = Substring_amb_donor_prob(qend_alts);
4243     acceptor_prob = Substring_amb_acceptor_prob(qend_alts);
4244     qend_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4245     /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4246     newjunctions = Listpool_push(newjunctions,listpool,(void *) qend_junction);
4247     newjunctions = List_reverse(newjunctions);
4248 #endif
4249   }
4250 
4251 
4252   nindelbreaks = 0;
4253   n_large_indels = 0;
4254 
4255   for (p = newjunctions; p != NULL; p = List_next(p)) {
4256     junction = (Junction_T) List_head(p);
4257     /* CHIMERA_JUNCTION not possible */
4258     if (junction == NULL) {
4259       /* qstart_junction */
4260     } else if (Junction_type(junction) == SPLICE_JUNCTION) {
4261       /* No indel breaks.  ? Add penalty for bad splice probs */
4262 
4263     } else if (Junction_type(junction) == INS_JUNCTION) {
4264       nindelbreaks++;
4265       if (Junction_nindels(junction) > 6) {
4266 	n_large_indels++;
4267       }
4268     } else if (Junction_type(junction) == DEL_JUNCTION) {
4269       nindelbreaks++;
4270       if (Junction_nindels(junction) > 6) {
4271 	n_large_indels++;
4272       }
4273     }
4274   }
4275 
4276 #if 0
4277   nmatches = nmatches - nindelbreaks*indel_penalty_middle - n_large_indels*3;
4278   for (p = newjunctions; p != NULL; p = List_next(p)) {
4279     if ((junction = List_head(p)) != NULL) {
4280       nmatches += Junction_ninserts(junction);
4281     }
4282   }
4283 #endif
4284 
4285 
4286 #ifdef MAKE_JUNCTION
4287   Junction_free(&qstart_junction);
4288   Junction_free(&qend_junction);
4289 #endif
4290 
4291   debug7(printf("Stage3end_nmatches_substrings returning %d matches\n",nmatches));
4292   /* List_free(&newjunctions); -- allocated by Listpool_push */
4293 
4294   assert(nmatches <= querylength);
4295   return nmatches;
4296 }
4297 
4298 
4299 
4300 /* endpoints are all in qstart/qend convention.  Need to convert to
4301    querystart and queryend when creating Substring_T objects */
4302 /* Three actions at each end: extend, chop, or compute_trim */
4303 T
Stage3end_new_substrings(int * found_score_overall,int * found_score_within_trims,Intlist_T endpoints,Univcoordlist_T univdiagonals,Intlist_T nmismatches_list,Intlist_T ref_nmismatches_list,List_T junctions,int querylength,Compress_T query_compress,Substring_T qend_alts,Substring_T qstart_alts,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice5p_in,Splicetype_T splicetype5,double ambig_prob_5,bool splice3p_in,Splicetype_T splicetype3,double ambig_prob_3,Listpool_T listpool,Method_T method,int level)4304 Stage3end_new_substrings (int *found_score_overall, int *found_score_within_trims,
4305 			  Intlist_T endpoints, Univcoordlist_T univdiagonals,
4306 			  Intlist_T nmismatches_list, Intlist_T ref_nmismatches_list, List_T junctions,
4307 			  int querylength, Compress_T query_compress,
4308 			  Substring_T qend_alts, Substring_T qstart_alts,
4309 			  bool plusp, int genestrand, int sensedir,
4310 			  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
4311 			  bool splice5p_in, Splicetype_T splicetype5, double ambig_prob_5,
4312 			  bool splice3p_in, Splicetype_T splicetype3, double ambig_prob_3,
4313 			  Listpool_T listpool, Method_T method, int level) {
4314   T new;
4315 
4316   Univcoord_T genomicstart, genomicend;
4317   int querylength_trimmed = 0;
4318   int qstart, qend, queryspan;
4319   Univcoord_T univdiagonal, left;
4320   Intlist_T r, x, y;
4321   Univcoordlist_T q;
4322   Substring_T substring, substring1, substringN;
4323   Junction_T junction;
4324   List_T substrings_HtoL, substrings_LtoH, junctions_LtoH;
4325   List_T substrings = NULL, p, j;
4326   List_T newjunctions;
4327   bool splice5p, splice3p, passp;
4328   int adj = 0, adj0;			/* deletions - insertions */
4329   int nmismatches, ref_nmismatches, indel_score = 0, nindels = 0;
4330   int nmismatches_bothdiff = 0, nmismatches_refdiff = 0;
4331   int new_sensedir;
4332   bool contradictionp;
4333   int nsites, nindelbreaks, n_large_indels;
4334   double prob_total, donor_prob, acceptor_prob;
4335 
4336 
4337   debug7(printf("Entered Stage3end_new_substrings, method %s, with %s, plusp %d, splice5p %d, splice3p %d\n",
4338 		Method_string(method),Intlist_to_string(endpoints),plusp,splice5p_in,splice3p_in));
4339 
4340 #ifdef DEBUG0
4341   printf("Entered Stage3end_new_substrings, method %s, at univdiagonal %u [%u], with chrnum #%d, plusp %d, sensedir %d, and endpoints %s\n",
4342 	 Method_string(method),Univcoordlist_head(univdiagonals),Univcoordlist_head(univdiagonals) - chroffset,chrnum,plusp,sensedir,Intlist_to_string(endpoints));
4343   printf("There are %d endpoints, %d univdiagonals, %d nmismatches, and %d junctions\n",
4344 	 Intlist_length(endpoints),Univcoordlist_length(univdiagonals),Intlist_length(nmismatches_list),List_length(junctions));
4345   if (qstart_alts != NULL) {
4346     printf("qstart_alts at %d..%d.  ",Substring_querystart(qstart_alts),Substring_queryend(qstart_alts));
4347     Substring_print_alts_coords(qstart_alts);
4348     printf("\n");
4349   }
4350   if (qend_alts != NULL) {
4351     printf("qend_alts at %d..%d.  ",Substring_querystart(qend_alts),Substring_queryend(qend_alts));
4352     Substring_print_alts_coords(qend_alts);
4353     printf("\n");
4354   }
4355   printf("Endpoints: %s\n",Intlist_to_string(endpoints));
4356   printf("Univdiagonals: %s\n",Univcoordlist_to_string_offset(univdiagonals,chroffset));
4357   printf("Mismatches: %s\n",Intlist_to_string(nmismatches_list));
4358   printf("Ref mismatches: %s\n",Intlist_to_string(ref_nmismatches_list));
4359 #endif
4360 
4361   assert(Univcoordlist_length(univdiagonals) == Intlist_length(endpoints) - 1);
4362   assert(Intlist_length(nmismatches_list) == Intlist_length(endpoints) - 1);
4363   assert(Intlist_length(ref_nmismatches_list) == Intlist_length(endpoints) - 1);
4364   assert(List_length(junctions) == Intlist_length(endpoints) - 2);
4365 
4366 
4367   newjunctions = Junction_copy_list(junctions,listpool);
4368 
4369 #ifdef DEBUG0
4370   for (p = newjunctions; p != NULL; p = List_next(p)) {
4371     Junction_print((Junction_T) List_head(p));
4372   }
4373   printf("\n");
4374 #endif
4375 
4376   qstart = Intlist_head(endpoints);
4377   nmismatches = Intlist_head(nmismatches_list);
4378   ref_nmismatches = Intlist_head(ref_nmismatches_list);
4379 
4380   if (plusp == true) {
4381     j = newjunctions;		/* Put here before we handle qstart_alts */
4382     if (qstart_alts != NULL) {
4383       substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qstart_alts));
4384       donor_prob = Substring_amb_donor_prob(qstart_alts);
4385       acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4386       junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4387       newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4388       splice5p = false;
4389     } else {
4390       splice5p = splice5p_in;
4391     }
4392 
4393     /* Add qpos to get alignstart/alignend */
4394     for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4395 	 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4396       qend = Intlist_head(r);
4397       nmismatches = Intlist_head(x);
4398       ref_nmismatches = Intlist_head(y);
4399       univdiagonal = Univcoordlist_head(q);
4400       left = univdiagonal - (Univcoord_T) querylength;
4401       debug0(printf("Stage3end_new_substrings: qstart %d..qend %d at univdiagonal %u [%u}\n",
4402 		    qstart,qend,univdiagonal,univdiagonal - chroffset));
4403 
4404       /* genomicstart = left; */
4405       /* genomicend = left + querylength; */
4406       /* alignstart = genomicstart + qstart; */
4407       /* alignend = genomicstart + queryend; */
4408 
4409       if (nmismatches >= 0 && ref_nmismatches >= 0) {
4410 	debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",univdiagonal - chroffset,qstart,qend));
4411 	debug7(printf("%d vs %d\n",nmismatches,
4412 		      Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4413 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand)));
4414 #ifdef CHECK_NMISMATCHES
4415 	assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4416 								/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand));
4417 #endif
4418       } else {
4419 	nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4420 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
4421       }
4422       if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4423 	splice3p = false;
4424       } else {
4425 	splice3p = splice3p_in;
4426       }
4427 
4428       if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/qstart,/*queryend*/qend,querylength,
4429 				     /*plusp*/true,genestrand,query_compress,
4430 				     chrnum,chroffset,chrhigh,chrlength,
4431 				     /*splice_querystart_p*/splice5p,/*splicetype_querystart*/splicetype5,
4432 				     /*ambig_prob_querystart*/ambig_prob_5,
4433 				     /*splice_queryend_p*/splice3p,/*splicetype_queryend*/splicetype3,
4434 				     /*ambig_prob_queryend*/ambig_prob_3,sensedir)) == NULL) {
4435 	/* Don't know how to fix junctions */
4436 	debug0(printf("Poor substring (plus) for %d..%d, so returning NULL from Stage3end_new_substrings\n",
4437 		      qstart,qend));
4438 	for (p = substrings; p != NULL; p = List_next(p)) {
4439 	  substring = (Substring_T) List_head(p);
4440 	  if (substring == qstart_alts) {
4441 	    /* qstart_alts freed by calling procedure.  Need to free junction created for querystart_alts. */
4442 	    /* junctions = List_pop(junctions,(void **) &junction); */
4443 	    /* Junction_free(&junction); */
4444 	  } else {
4445 	    Substring_free(&substring);
4446 	  }
4447 	}
4448 	/* List_free(&substrings); -- allocated by Listpool_push */
4449 	debug0(printf("Stage3end_new_substrings returning NULL\n"));
4450 	Junction_list_gc(&newjunctions);
4451 	return (T) NULL;
4452 
4453       } else {
4454 	debug7(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4455 		      Substring_nmismatches_bothdiff(substring),
4456 		      Substring_nmatches(substring),Substring_ambiguous_p(substring),
4457 		      Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4458 
4459 	debug0(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4460 		      Substring_nmismatches_bothdiff(substring),
4461 		      Substring_nmatches(substring),Substring_ambiguous_p(substring),
4462 		      Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4463 	substrings = Listpool_push(substrings,listpool,(void *) substring);
4464 	nmismatches_bothdiff += Substring_nmismatches_bothdiff(substring);
4465 	nmismatches_refdiff += Substring_nmismatches_refdiff(substring);
4466 	querylength_trimmed += Substring_match_length(substring);
4467       }
4468 
4469       /* Prepare for next iteration */
4470       qstart = qend;
4471       if (j != NULL) {
4472 	junction = (Junction_T) List_head(j);
4473 	if ((adj0 = Junction_adj(junction)) != 0) {
4474 	  adj += adj0;
4475 	  indel_score += indel_penalty_middle;
4476 	  nindels += Junction_nindels(junction);
4477 	  if (adj0 < 0) {
4478 	    qstart -= adj0;	/* Insertion */
4479 	  }
4480 	}
4481       }
4482       splice5p = false;
4483     }
4484 
4485   } else {
4486     j = newjunctions;		/* Put here before we handle querystart_alts */
4487     if (qstart_alts != NULL) {
4488       substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qstart_alts));
4489       donor_prob = Substring_amb_donor_prob(qstart_alts);
4490       acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4491       junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4492       /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4493       newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4494       splice5p = false;
4495     } else {
4496       splice5p = splice5p_in;
4497     }
4498 
4499     /* Subtract qpos to get alignstart/alignend */
4500     for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4501 	 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4502       qend = Intlist_head(r);
4503       nmismatches = Intlist_head(x);
4504       ref_nmismatches = Intlist_head(y);
4505       univdiagonal = Univcoordlist_head(q);
4506       left = univdiagonal - (Univcoord_T) querylength;
4507       debug0(printf("Stage3end_new_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4508 		    qstart,qend,univdiagonal,univdiagonal - chroffset));
4509 
4510       /* genomicend = left; */
4511       /* genomicstart = left + querylength; */
4512       /* genomicend_adj = genomicend - adj; */
4513       /* genomicstart_adj = genomicend - adj; */
4514       /* alignstart = genomicstart - (querylength - qend); */
4515       /* alignend = genomicstart - (querylength - qstart); */
4516 
4517       if (nmismatches >= 0 && ref_nmismatches >= 0) {
4518 #ifdef CHECK_NMISMATCHES
4519 	assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4520 								/*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand));
4521 #endif
4522       } else {
4523 	nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4524 							/*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
4525       }
4526       if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4527 	splice3p = false;
4528       } else {
4529 	splice3p = splice3p_in;
4530       }
4531 
4532       if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/querylength - qend,
4533 				     /*queryend*/querylength - qstart,querylength,
4534 				     /*plusp*/false,genestrand,query_compress,
4535 				     chrnum,chroffset,chrhigh,chrlength,
4536 				     /*splice_querystart_p*/splice3p,/*splicetype_querystart*/splicetype3,
4537 				     /*ambig_prob_querystart*/ambig_prob_3,
4538 				     /*splice_queryend_p*/splice5p,/*splicetype_queryend*/splicetype5,
4539 				     /*ambig_prob_queryend*/ambig_prob_5,sensedir)) == NULL) {
4540 	/* Don't know how to fix junctions */
4541 	debug0(printf("Poor substring (minus) for %d..%d, so returning NULL from Stage3end_new_substrings\n",
4542 		      querylength - qend,querylength - qstart));
4543 	for (p = substrings; p != NULL; p = List_next(p)) {
4544 	  substring = (Substring_T) List_head(p);
4545 	  if (substring == qstart_alts) {
4546 	    /* querystart_alts freed by calling procedure.  Need to free junction created for querystart_alts. */
4547 	    /* junctions = List_pop(junctions,(void **) &junction); */
4548 	    /* Junction_free(&junction); */
4549 	  } else {
4550 	    Substring_free(&substring);
4551 	  }
4552 	}
4553 	/* List_free(&substrings); -- allocated by Listpool_push */
4554 
4555 	debug0(printf("Stage3end_new_substrings returning NULL\n"));
4556 	Junction_list_gc(&newjunctions);
4557 	return (T) NULL;
4558 
4559       } else {
4560 	debug7(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4561 		      Substring_nmismatches_bothdiff(substring),
4562 		      Substring_nmatches(substring),Substring_ambiguous_p(substring),
4563 		      Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4564 
4565 	debug0(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4566 		      Substring_nmismatches_bothdiff(substring),
4567 		      Substring_nmatches(substring),Substring_ambiguous_p(substring),
4568 		      Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4569 	substrings = Listpool_push(substrings,listpool,(void *) substring);
4570 	nmismatches_bothdiff += Substring_nmismatches_bothdiff(substring);
4571 	nmismatches_refdiff += Substring_nmismatches_refdiff(substring);
4572 	querylength_trimmed += Substring_match_length(substring);
4573       }
4574 
4575       /* Prepare for next iteration */
4576       qstart = qend;
4577       if (j != NULL) {
4578 	junction = (Junction_T) List_head(j);
4579 	if ((adj0 = Junction_adj(junction)) != 0) {
4580 	  adj += adj0;
4581 	  indel_score += indel_penalty_middle;
4582 	  nindels += Junction_nindels(junction);
4583 	  if (adj0 < 0) {
4584 	    qstart -= adj0;	/* Insertion */
4585 	  }
4586 	}
4587       }
4588       splice5p = false;
4589     }
4590   }
4591 
4592   if (qend_alts != NULL) {
4593     substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qend_alts));
4594     newjunctions = List_reverse(newjunctions);
4595     donor_prob = Substring_amb_donor_prob(qend_alts);
4596     acceptor_prob = Substring_amb_acceptor_prob(qend_alts);
4597     junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4598     /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4599     newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4600     newjunctions = List_reverse(newjunctions);
4601   }
4602 
4603 #ifdef DEBUG0
4604   printf("NEW JUNCTIONS\n");
4605   for (p = newjunctions; p != NULL; p = List_next(p)) {
4606     Junction_print(List_head(p));
4607   }
4608   printf("\n");
4609 #endif
4610 
4611 
4612   if (plusp == true) {
4613     substring1 = List_last_value(substrings);
4614     substringN = List_head(substrings);
4615   } else {
4616     substring1 = List_head(substrings);
4617     substringN = List_last_value(substrings);
4618   }
4619 
4620   debug0(printf("Trim left: %d.  Trim right: %d\n",
4621 		Substring_trim_querystart(substring1),Substring_trim_queryend(substringN)));
4622 
4623   passp = true;
4624   if (Substring_chrnum(substring1) != Substring_chrnum(substringN)) {
4625     debug0(printf("ABORTING BECAUSE SUBSTRINGS HAVE DIFFERENT CHRNUMS: %d AND %d\n",
4626 		  Substring_chrnum(substring1),Substring_chrnum(substringN)));
4627     passp = false;
4628 
4629   } else if (circularp[chrnum] == true && plusp == true && Substring_alignend_trim(substringN) - Substring_alignstart_trim(substring1) >= chrlength) {
4630     debug0(printf("ABORTING BECAUSE CIRCULAR CHROMOSOME CHRLENGTH %u AND ALIGNMENT %u..%u\n",
4631 		  chrlength,Substring_alignstart_trim(substring1),Substring_alignend_trim(substringN)));
4632     passp = false;
4633 
4634   } else if (circularp[chrnum] == true && plusp == false && Substring_alignstart_trim(substring1) - Substring_alignend_trim(substringN) >= chrlength) {
4635     debug0(printf("ABORTING BECAUSE CIRCULAR CHROMOSOME CHRLENGTH %u AND ALIGNMENT %u..%u\n",
4636 		  chrlength,Substring_alignstart_trim(substring1),Substring_alignend_trim(substringN)));
4637     passp = false;
4638 
4639   } else if ((queryspan = Substring_queryend(substringN) - Substring_querystart(substring1)) == querylength) {
4640     /* Allow short queries to match completely */
4641 
4642 #if 0
4643   } else if (queryspan < MIN_ALIGNMENT_LEN) {
4644     debug0(printf("ABORTING BECAUSE QUERYSPAN %d < MIN_ALIGNMENT_LEN %d\n",
4645 		  queryspan,MIN_ALIGNMENT_LEN));
4646     passp = false;
4647 #endif
4648 
4649   }
4650 
4651 
4652   if (passp == false) {
4653     for (p = substrings; p != NULL; p = List_next(p)) {
4654       substring = (Substring_T) List_head(p);
4655       if (substring == qstart_alts || substring == qend_alts) {
4656 	/* qstart_alts and qend_alts freed by calling procedure */
4657       } else {
4658 	Substring_free(&substring);
4659       }
4660     }
4661     /* List_free(&substrings); -- allocated by Listpool_push */
4662 
4663     debug0(printf("Stage3end_new_substrings returning NULL\n"));
4664     Junction_list_gc(&newjunctions);
4665     return (T) NULL;
4666   }
4667 
4668 
4669   new = (T) MALLOC_OUT(sizeof(*new));
4670   new->hittype = SUBSTRINGS;
4671   new->method = method;
4672   new->level = level;
4673 
4674   new->transcripts = (List_T) NULL;
4675   new->transcripts_other = (List_T) NULL;
4676 
4677   new->querylength = querylength;
4678   new->querylength_adj = querylength + adj;
4679 
4680   /* Note differences between substrings and junctions.  Substrings
4681      were pushed onto lists above, and junctions were created by the
4682      caller, so they are originally in opposite orders */
4683   substrings_HtoL = substrings;
4684   substrings_LtoH = List_reverse(Listpool_copy(substrings,listpool));
4685   junctions_LtoH = newjunctions;
4686 
4687   if (plusp == true) {
4688     new->substrings_1toN = substrings_LtoH;
4689     new->substrings_Nto1 = substrings_HtoL;
4690 
4691     new->junctions_1toN = junctions_LtoH;
4692     new->junctions_Nto1 = List_reverse(Listpool_copy(junctions_LtoH,listpool));
4693 
4694   } else {
4695     new->substrings_1toN = substrings_HtoL;
4696     new->substrings_Nto1 = substrings_LtoH;
4697 
4698     new->junctions_1toN = List_reverse(Listpool_copy(junctions_LtoH,listpool));
4699     new->junctions_Nto1 = junctions_LtoH;
4700   }
4701 
4702 
4703 #ifdef DEBUG0
4704   printf("NEW SUBSTRINGS\n");
4705   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4706     substring = List_head(p);
4707     if (Substring_has_alts_p(substring) == true) {
4708       printf("%d..%d\t#%d\talts\tmatches_to_trims: %d\tamb:%d\t%d common_prob:%f alts:",
4709 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4710 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
4711 	     Substring_alts_ncoords(substring),Substring_alts_common_prob(substring));
4712       Substring_print_alts_coords(substring);
4713       printf("\n");
4714 
4715     } else if (Substring_ambiguous_p(substring) == true) {
4716       printf("%d..%d\t#%d\t%u..%u\tambig\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
4717 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4718 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),
4719 	     Substring_nmismatches_bothdiff(substring),Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
4720 	     Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
4721     } else {
4722       printf("%d..%d\t#%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
4723 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4724 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),
4725 	     Substring_nmismatches_bothdiff(substring),Substring_nmatches_to_trims(substring),
4726 	     Substring_amb_length(substring));
4727     }
4728   }
4729   printf("\n");
4730 #endif
4731 
4732 
4733   substring1 = (Substring_T) List_head(new->substrings_1toN);
4734   substringN = (Substring_T) List_head(new->substrings_Nto1);
4735 
4736   new->trim_querystart = Substring_trim_querystart(substring1);
4737   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
4738   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
4739   new->trim_queryend = Substring_trim_queryend(substringN);
4740   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
4741   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
4742   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
4743   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
4744 
4745   new->querystart_chrbound = Substring_querystart_chrbound(substring1);
4746   new->queryend_chrbound = Substring_queryend_chrbound(substringN);
4747   if (new->trim_querystart > new->querystart_chrbound) {
4748     new->querystart_chrbound = new->trim_querystart;
4749   }
4750   if (querylength - new->trim_queryend < new->queryend_chrbound) {
4751     new->queryend_chrbound = querylength - new->trim_queryend;
4752   }
4753   assert(new->querystart_chrbound < new->queryend_chrbound);
4754   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
4755 
4756 
4757   genomicstart = Substring_genomicstart(substring1);
4758   genomicend = Substring_genomicend(substringN);
4759   new->genomicstart = genomicstart;
4760   new->genomicend = genomicend;
4761 
4762   if (plusp == true) {
4763     new->low = genomicstart + new->querystart_chrbound;
4764     new->high = genomicend - (querylength - new->queryend_chrbound);
4765     new->genomiclength = genomicend - genomicstart;
4766   } else {
4767     new->low = genomicend + (querylength - new->queryend_chrbound);
4768     new->high = genomicstart - new->querystart_chrbound;
4769     new->genomiclength = genomicstart - genomicend;
4770   }
4771   assert(new->low < new->high);
4772   debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
4773 
4774   new->guided_insertlength = 0U;
4775 
4776   new->distant_splice_p = false;
4777   new->chrnum = new->effective_chrnum = chrnum;
4778   new->other_chrnum = 0;
4779   new->chroffset = chroffset;
4780   new->chrhigh = chrhigh;
4781   new->chrlength = chrlength;
4782   new->plusp = plusp;
4783   new->genestrand = genestrand;
4784 
4785   if (sensedir != SENSE_NULL) {
4786     debug0(printf("sensedir is %d (original)\n",sensedir));
4787     new->sensedir = sensedir;
4788   } else {
4789     new->sensedir = SENSE_NULL;
4790     contradictionp = false;
4791     for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4792       substring = (Substring_T) List_head(p);
4793       debug0(printf("substring has sensedir %d\n",Substring_sensedir(substring)));
4794       if (Substring_sensedir(substring) == SENSE_NULL) {
4795 	/* Ignore */
4796       } else if (new_sensedir == SENSE_NULL) {
4797 	new_sensedir = Substring_sensedir(substring);
4798       } else if (Substring_sensedir(substring) != new_sensedir) {
4799 	contradictionp = true;
4800       }
4801     }
4802 
4803     for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
4804       junction = (Junction_T) List_head(p);
4805       debug0(printf("junction has sensedir %d\n",Junction_sensedir(junction)));
4806       if (Junction_sensedir(junction) == SENSE_NULL) {
4807 	/* Ignore.  Probably an indel. */
4808       } else if (new_sensedir == SENSE_NULL) {
4809 	new_sensedir = Junction_sensedir(junction);
4810       } else if (Junction_sensedir(junction) != new_sensedir) {
4811 	contradictionp = true;
4812       }
4813     }
4814 
4815     if (contradictionp == true) {
4816       debug0(printf("CONTRADICTION IN SENSEDIR\n"));
4817       new->sensedir = SENSE_NULL;
4818     } else {
4819       debug0(printf("sensedir is %d\n",new_sensedir));
4820       new->sensedir = new_sensedir;
4821     }
4822   }
4823   new->sensedir_for_concordance = new->sensedir;
4824 
4825   prob_total = 0.0;
4826   nsites = 0;
4827   if (splice5p_in == true) {
4828     prob_total += ambig_prob_5;
4829     nsites++;
4830   }
4831   if (splice3p_in == true) {
4832     prob_total += ambig_prob_3;
4833     nsites++;
4834   }
4835 
4836   new->nsplices = 0;
4837   for (p = newjunctions; p != NULL; p = List_next(p)) {
4838     junction = (Junction_T) List_head(p);
4839     if (Junction_type(junction) == SPLICE_JUNCTION) {
4840       prob_total += Junction_splice_score(junction);
4841       nsites += 2;
4842       new->nsplices += 1;
4843     }
4844   }
4845   if (nsites == 0) {
4846     new->splice_score = 0.0;
4847   } else {
4848     new->splice_score = prob_total / (double) nsites;
4849   }
4850   debug0(printf("SPLICE SCORE: %f\n",new->splice_score));
4851 
4852 
4853   nindelbreaks = 0;
4854   n_large_indels = 0;
4855   for (p = newjunctions; p != NULL; p = List_next(p)) {
4856     junction = (Junction_T) List_head(p);
4857     /* CHIMERA_JUNCTION not possible */
4858     if (Junction_type(junction) == INS_JUNCTION) {
4859       nindelbreaks++;
4860       if (Junction_nindels(junction) > 6) {
4861 	n_large_indels++;
4862       }
4863     } else if (Junction_type(junction) == DEL_JUNCTION) {
4864       nindelbreaks++;
4865       if (Junction_nindels(junction) > 6) {
4866 	n_large_indels++;
4867       }
4868     }
4869   }
4870 
4871 
4872   /* nmismatches_bothdiff is computed after trimming */
4873   new->nindels = nindels;
4874   new->nmismatches_bothdiff = nmismatches_bothdiff; /* Trimmed */
4875   new->nmismatches_refdiff = nmismatches_refdiff;
4876   new->nsegments = List_length(new->substrings_1toN);
4877 
4878 
4879   new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
4880   /* Note: Cannot use substrings variable here.  Need to use new->substrings_1toN */
4881   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4882     substring = (Substring_T) List_head(p);
4883     new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
4884     new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
4885   }
4886   debug0(printf("Setting nmatches_to_trims to be %d\n",new->refalt_nmatches_to_trims));
4887 
4888   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(substringN);
4889   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(substringN);
4890   debug0(printf("Setting nmatches_plus_spliced_trims to be %d = %d + %d + %d\n",
4891 		new->ref_nmatches_plus_spliced_trims,new->ref_nmatches_to_trims,
4892 		Substring_start_amb_length(substring1),Substring_end_amb_length(substringN)));
4893 
4894   for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
4895     junction = List_head(p);
4896     new->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
4897     new->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
4898   }
4899   assert(new->refalt_nmatches_plus_spliced_trims >= 0);
4900   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
4901 
4902   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
4903   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
4904   /* Needed to make -m flag work properly.  Generally improves alignments */
4905 #if 1
4906   new->ref_score_overall += indel_score; /* -nindels was an attempt to compensate for missing matches */
4907   new->refalt_score_overall += indel_score; /* -nindels was an attempt to compensate for missing matches */
4908 #endif
4909   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
4910   if (Substring_trim_querystart_splicep(substring1) == false) {
4911     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
4912   } else {
4913     new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
4914   }
4915   if (Substring_trim_queryend_splicep(substringN) == false) {
4916     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substringN))/END_BINSIZE);
4917   } else {
4918     new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substringN))/END_BINSIZE);
4919   }
4920   /* was Substring_start_amb_length(substring1)/AMB_PENALTY - Substring_end_amb_length(substringN)/AMB_PENALTY, but doesn't work for DNA-seq */
4921 
4922   if (chrlength < (Univcoord_T) querylength) {
4923     new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
4924     new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
4925     new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
4926   }
4927   assert(new->refalt_score_within_trims >= 0);
4928 
4929 
4930   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
4931   if (new->refalt_score_overall < *found_score_overall) {
4932     *found_score_overall = new->refalt_score_overall;
4933   }
4934   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
4935   if (new->refalt_score_within_trims < *found_score_within_trims) {
4936     *found_score_within_trims = new->refalt_score_within_trims;
4937   }
4938 
4939 
4940   /* new->penalties = 0; */
4941 
4942   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
4943   /* new->tally = -1L; */
4944 
4945   new->paired_usedp = false;
4946 
4947   /* new->query_splicepos = -1; */
4948   new->circularpos = compute_circularpos(&new->circularalias,new);
4949 
4950   debug0(printf("%d substrings\n",List_length(new->substrings_1toN)));
4951   debug0(printf("%d junctions\n",List_length(new->junctions_1toN)));
4952   assert(List_length(new->substrings_1toN) == List_length(new->junctions_1toN) + 1);
4953 
4954 
4955   /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
4956 
4957   debug7(printf("Stage3end_new_substrings returning %d matches_plus_spliced_trims\n",
4958 		new->refalt_nmatches_plus_spliced_trims));
4959 
4960   if (new->circularpos >= 0) {
4961     new->altlocp = false;
4962     debug0(printf("*****Method %s: Stage3end_new_substrings returning circular %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4963 		  Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4964 		  new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4965     return new;
4966 
4967   } else if ((new->altlocp = altlocp[chrnum]) == false) {
4968     debug0(printf("*****Method %s: Stage3end_new_substrings returning primary %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4969 		  Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4970 		  new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4971     return new;
4972 
4973   } else {
4974     debug0(printf("*****Method %s: Stage3end_new_substrings returning altloc %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4975 		  Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4976 		  new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4977     return new;
4978   }
4979 }
4980 
4981 
/* Saturating add and subtract for coordinates.

   add_bounded(x,plusterm,highbound) yields x + plusterm, except that a
   result at or beyond highbound is replaced by highbound - 1.

   subtract_bounded(x,minusterm,lowbound) yields x - minusterm, except
   that a result below lowbound is replaced by lowbound.  The test is
   written as x < lowbound + minusterm so that the subtraction is never
   performed when it would go below lowbound, which matters for
   unsigned operands.

   Every argument is parenthesized so that expressions containing
   lower-precedence operators can be passed safely.  Arguments are
   still evaluated more than once, so they must not have side
   effects. */
#define add_bounded(x,plusterm,highbound) (((x) + (plusterm) >= (highbound)) ? ((highbound) - 1) : ((x) + (plusterm)))
#define subtract_bounded(x,minusterm,lowbound) (((x) < (lowbound) + (minusterm)) ? (lowbound) : ((x) - (minusterm)))
4984 
4985 
4986 T
Stage3end_new_substitution(int * found_score_overall,int * found_score_within_trims,Univcoord_T univdiagonal,int pos5,int pos3,int querylength,int * mismatch_positions_alloc,Compress_T query_compress,bool plusp,int genestrand,int sensedir,int nmismatches_allowed,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Listpool_T listpool,Method_T method,int level)4987 Stage3end_new_substitution (int *found_score_overall, int *found_score_within_trims,
4988 			    Univcoord_T univdiagonal, int pos5, int pos3, int querylength,
4989 			    int *mismatch_positions_alloc, Compress_T query_compress,
4990 			    bool plusp, int genestrand, int sensedir, int nmismatches_allowed,
4991 			    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
4992 			    Chrpos_T chrlength, Listpool_T listpool, Method_T method, int level) {
4993   T new;
4994   Univcoord_T left;
4995   Substring_T substring;
4996   int qstart, qend, nmismatches, ref_nmismatches;
4997   bool splice_querystart_p, splice_queryend_p;
4998   Splicetype_T splicetype_querystart, splicetype_queryend;
4999   double ambig_prob_querystart, ambig_prob_queryend;
5000 
5001 
5002   debug0(printf("Entered Stage3end_new_substitution, method %s, sensedir %d at univdiagonal %u [%u] and chrhigh %u\n",
5003 		Method_string(method),sensedir,univdiagonal,univdiagonal - chroffset,chrhigh));
5004 
5005   left = univdiagonal - (Univcoord_T) querylength;
5006 
5007   if (plusp == true) {
5008     splice_querystart_p = Substring_qstart_trim(&qstart,&splicetype_querystart,&ambig_prob_querystart,
5009 						univdiagonal,pos3,querylength,plusp,genestrand,
5010 						mismatch_positions_alloc,query_compress,chroffset,sensedir);
5011     splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
5012 					    univdiagonal,pos5,querylength,plusp,genestrand,
5013 					    mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
5014 
5015     debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
5016     debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
5017 
5018     if (qstart < 0 || qend < 0) {
5019       debug0(printf("Returning NULL\n"));
5020       return (T) NULL;
5021 
5022     } else if (qend <= qstart) {
5023       /* Otherwise, calling Genome_count_mismatches_substring will not be defined */
5024       debug0(printf("Returning NULL\n"));
5025       return (T) NULL;
5026 
5027     } else {
5028       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
5029 						      /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
5030       if (nmismatches > nmismatches_allowed) {
5031 	debug0(printf("Returning NULL\n"));
5032 	return (T) NULL;
5033 
5034       } else if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/qstart,/*queryend*/qend,
5035 					    querylength,/*plusp*/true,genestrand,query_compress,
5036 					    chrnum,chroffset,chrhigh,chrlength,
5037 					    splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
5038 					    splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
5039 					    sensedir)) == NULL) {
5040 	debug0(printf("Returning NULL\n"));
5041 	return (T) NULL;
5042       }
5043     }
5044 
5045   } else {
5046     /* trim_querystart and trim_queryend Genome_count_mismatches_substring are flipped, but not for Substring_new */
5047     splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
5048 					      univdiagonal,pos5,querylength,plusp,genestrand,
5049 					      mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
5050     splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
5051 					      univdiagonal,pos3,querylength,plusp,genestrand,
5052 					      mismatch_positions_alloc,query_compress,chroffset,sensedir);
5053 
5054     debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
5055     debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
5056 
5057     if (qstart < 0 || qend < 0) {
5058       debug0(printf("Returning NULL\n"));
5059       return (T) NULL;
5060 
5061     } else if (qend <= qstart) {
5062       /* Otherwise, calling Genome_count_mismatches_substring will not be defined */
5063       debug0(printf("Returning NULL\n"));
5064       return (T) NULL;
5065 
5066     } else {
5067       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
5068 						      /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
5069 
5070       if (nmismatches > nmismatches_allowed) {
5071 	debug0(printf("Returning NULL\n"));
5072 	return (T) NULL;
5073 
5074       } else if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/querylength - qend,/*queryend*/querylength - qstart,
5075 					    querylength,/*plusp*/false,genestrand,query_compress,
5076 					    chrnum,chroffset,chrhigh,chrlength,
5077 					    splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
5078 					    splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
5079 					    sensedir)) == NULL) {
5080 	debug0(printf("Returning NULL\n"));
5081 	return (T) NULL;
5082       }
5083     }
5084   }
5085 
5086   new = (T) MALLOC_OUT(sizeof(*new));
5087   debug0(printf("Stage3end_new_substitution %p: univdiagonal %llu, chrnum %d, nmismatches %d\n",
5088 		new,(unsigned long long) univdiagonal,Substring_chrnum(substring),nmismatches));
5089 
5090   new->substrings_1toN = Listpool_push(NULL,listpool,(void *) substring);
5091   new->substrings_Nto1 = Listpool_push(NULL,listpool,(void *) substring);
5092 
5093   new->junctions_1toN = (List_T) NULL;
5094   new->junctions_Nto1 = (List_T) NULL;
5095 
5096 #if 0
5097   if (plusp) {
5098     new->trim_querystart = qstart;
5099     new->trim_queryend = querylength - qend;
5100   } else {
5101     new->trim_querystart = querylength - qend;
5102     new->trim_queryend = qstart;
5103   }
5104   new->trim_querystart_splicep = splice_querystart_p;
5105   new->trim_queryend_splicep = splice_queryend_p;
5106 #else
5107   new->trim_querystart = Substring_trim_querystart(substring);
5108   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring);
5109   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring);
5110   new->trim_queryend = Substring_trim_queryend(substring);
5111   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substring);
5112   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substring);
5113 #endif
5114   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5115   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5116 
5117   new->querystart_chrbound = Substring_querystart_chrbound(substring);
5118   new->queryend_chrbound = Substring_queryend_chrbound(substring);
5119   if (new->trim_querystart > new->querystart_chrbound) {
5120     new->querystart_chrbound = new->trim_querystart;
5121   }
5122   if (querylength - new->trim_queryend < new->queryend_chrbound) {
5123     new->queryend_chrbound = querylength - new->trim_queryend;
5124   }
5125   assert(new->querystart_chrbound < new->queryend_chrbound);
5126   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5127 
5128 
5129   new->transcripts = (List_T) NULL;
5130   new->transcripts_other = (List_T) NULL;
5131 
5132   new->querylength_adj = new->querylength = querylength;
5133   if (plusp == true) {
5134     new->genomicstart = left;
5135     new->genomicend = left + (Univcoord_T) querylength;
5136     new->low = new->genomicstart + (Univcoord_T) new->querystart_chrbound;
5137     new->high = new->genomicend - (Univcoord_T) (querylength - new->queryend_chrbound);
5138   } else {
5139     new->genomicend = left;
5140     new->genomicstart = left + (Univcoord_T) querylength;
5141     new->low = new->genomicend + (Univcoord_T) (querylength - new->queryend_chrbound);
5142     new->high = new->genomicstart - (Univcoord_T) new->querystart_chrbound;
5143   }
5144   assert(new->low < new->high);
5145   debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
5146 
5147   new->genomiclength = querylength;
5148 
5149   new->guided_insertlength = 0U;
5150 
5151 #if 0
5152   if (nmismatches == 0) {
5153     /* Proper hittype needed so we can eliminate identical hits */
5154     new->hittype = EXACT;
5155   } else {
5156     new->hittype = SUB;
5157   }
5158 #else
5159   new->hittype = SUB;
5160 #endif
5161   new->method = method;
5162   new->level = level;
5163 
5164 
5165   /* Note: It is possible that Substring_new has assigned a new chrnum, different from the one given */
5166   new->distant_splice_p = false;
5167   new->chrnum = new->effective_chrnum = Substring_chrnum(substring);
5168   new->other_chrnum = 0;
5169   new->chroffset = Substring_chroffset(substring);
5170   new->chrhigh = Substring_chrhigh(substring);
5171   new->chrlength = Substring_chrlength(substring);
5172   new->plusp = plusp;
5173   new->genestrand = genestrand;
5174 
5175   new->sensedir_for_concordance = new->sensedir = sensedir;
5176 
5177 #if 0
5178   new->mapq_loglik = Substring_mapq_loglik(substring);
5179   new->mapq_score = 0;
5180   new->absmq_score = 0;
5181 #endif
5182 
5183   new->nindels = 0;
5184   new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(substring);
5185   new->nmismatches_refdiff = Substring_nmismatches_refdiff(substring);
5186   new->nsegments = 1;
5187 
5188 
5189   new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(substring);
5190   new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(substring);
5191   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_amb_length(substring);
5192   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_amb_length(substring);
5193   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5194 
5195   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5196   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5197   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
5198   if (Substring_trim_querystart_splicep(substring) == false) {
5199     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring)/END_BINSIZE);
5200   } else {
5201     new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring)/END_BINSIZE);
5202   }
5203   if (Substring_trim_queryend_splicep(substring) == false) {
5204     new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substring))/END_BINSIZE);
5205   } else {
5206     new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substring))/END_BINSIZE);
5207   }
5208   /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
5209 
5210   if (chrlength < (Univcoord_T) querylength) {
5211     new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
5212     new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
5213     new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
5214   }
5215   assert(new->refalt_score_within_trims >= 0);
5216 
5217 
5218   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5219   if (new->refalt_score_overall < *found_score_overall) {
5220     *found_score_overall = new->refalt_score_overall;
5221   }
5222   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5223   if (new->refalt_score_within_trims < *found_score_within_trims) {
5224     *found_score_within_trims = new->refalt_score_within_trims;
5225   }
5226 
5227 
5228   /* new->penalties = 0; */
5229 
5230   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5231   /* new->tally = -1L; */
5232 
5233   new->nsplices = 0;
5234   if (splice_querystart_p == true && splice_queryend_p == true) {
5235     new->splice_score = (ambig_prob_querystart + ambig_prob_queryend)/2.0;
5236   } else if (splice_querystart_p == true) {
5237     new->splice_score = ambig_prob_querystart;
5238   } else if (splice_queryend_p == true) {
5239     new->splice_score = ambig_prob_queryend;
5240   } else {
5241     new->splice_score = 0.0;
5242   }
5243 
5244   new->paired_usedp = false;
5245 
5246   /* new->query_splicepos = -1; */
5247   new->circularpos = compute_circularpos(&new->circularalias,new);
5248 
5249   debug0(printf("*****Method %s: Stage3end_new_substitution returning %p at %u..%u with nmatches_to_trims %d and amb length %d+%d\n\n",
5250 		Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->ref_nmatches_to_trims,
5251 		start_amb_length(new),end_amb_length(new)));
5252 
5253   /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5254 
5255   if (new->circularpos >= 0) {
5256     new->altlocp = false;
5257     return new;
5258 
5259   } else if ((new->altlocp = altlocp[chrnum]) == false) {
5260     return new;
5261 
5262   } else {
5263     return new;
5264   }
5265 }
5266 
5267 
5268 
5269 /* Previously allowed donor or acceptor to be NULL, when we performed Splice_group_by_segment */
5270 /* Previously new->substring1 was donor and new->substring2 was acceptor */
5271 /* TODO: Modify a Stage3end_new_splice to take two Stage3end_T parts, somewhat like a Stage3pair_T */
5272 T
Stage3end_new_splice(int * found_score_overall,int * found_score_within_trims,Substring_T donor,Substring_T acceptor,Chrpos_T distance,bool shortdistancep,int querylength,bool copy_donor_p,bool copy_acceptor_p,bool first_read_p,int orig_sensedir,Listpool_T listpool,Method_T method,int level)5273 Stage3end_new_splice (int *found_score_overall, int *found_score_within_trims,
5274 		      Substring_T donor, Substring_T acceptor,
5275 		      Chrpos_T distance, bool shortdistancep, int querylength,
5276 		      bool copy_donor_p, bool copy_acceptor_p, bool first_read_p, int orig_sensedir,
5277 		      Listpool_T listpool, Method_T method, int level) {
5278   T new;
5279   Substring_T substring_for_concordance; /* always the inner substring */
5280   Substring_T substring_other;		 /* the outer substring */
5281   Substring_T substring1, substringN;
5282   Junction_T junction;
5283 
5284   List_T transcripts;
5285   char *remap_sequence;
5286   int remap_seqlength;
5287   double donor_prob, acceptor_prob;
5288 #ifdef DEBUG0
5289   Substring_T substring;
5290   List_T p;
5291 #endif
5292 
5293 
5294   if (Substring_nmatches_to_trims(donor) < 15 ||
5295       Substring_nmatches_to_trims(acceptor) < 15) {
5296     /* Not enough evidence to find each end of the translocation */
5297     return (T) NULL;
5298   } else {
5299     new = (T) MALLOC_OUT(sizeof(*new));
5300   }
5301 
5302   donor_prob = Substring_siteD_prob(donor);
5303   acceptor_prob = Substring_siteA_prob(acceptor);
5304 
5305   debug0(printf("Stage3end_new_splice, method %s: %p with first_read_p %d, sensedir %d, donor substring %p and acceptor substring %p, donor_prob %f and acceptor_prob %f\n",
5306 		Method_string(method),new,first_read_p,orig_sensedir,donor,acceptor,donor_prob,acceptor_prob));
5307 
5308 #if 0
5309   assert(Substring_match_length_orig(donor) + Substring_match_length_orig(acceptor) + amb_length == querylength);
5310 #endif
5311 
5312   new->querylength_adj = new->querylength = querylength;
5313 
5314   new->nindels = 0;
5315 
5316   new->transcripts = (List_T) NULL;
5317   new->transcripts_other = (List_T) NULL;
5318 
5319   new->splice_score = donor_prob + acceptor_prob;
5320 
5321   new->method = method;
5322   new->level = level;
5323 
5324   if (shortdistancep == true) {
5325     new->distant_splice_p = false;
5326 
5327     new->hittype = SPLICE;
5328     new->genestrand = Substring_genestrand(donor);
5329     new->chrnum = Substring_chrnum(donor);
5330     new->chroffset = Substring_chroffset(donor);
5331     new->chrhigh = Substring_chrhigh(donor);
5332     new->chrlength = Substring_chrlength(donor);
5333 
5334     assert(Substring_plusp(donor) == Substring_plusp(acceptor));
5335     assert(SENSE_CONSISTENT_P(Substring_sensedir(donor),Substring_sensedir(acceptor)));
5336 
5337   } else {
5338     new->distant_splice_p = true;
5339 
5340     if (Substring_chrnum(donor) == Substring_chrnum(acceptor) &&
5341 	Substring_plusp(donor) == Substring_plusp(acceptor) &&
5342 	SENSE_CONSISTENT_P(Substring_sensedir(donor),Substring_sensedir(acceptor))) {
5343       new->genestrand = Substring_genestrand(donor);
5344       new->hittype = SAMECHR_SPLICE;
5345       new->chrnum = Substring_chrnum(donor);
5346       new->chroffset = Substring_chroffset(donor);
5347       new->chrhigh = Substring_chrhigh(donor);
5348       new->chrlength = Substring_chrlength(donor);
5349     } else {
5350       new->hittype = TRANSLOC_SPLICE;
5351       new->genestrand = 0;
5352       new->chrnum = 0;
5353       new->chroffset = 0;
5354       new->chrhigh = 0;
5355       new->chrlength = 0;
5356     }
5357   }
5358 
5359   /* printf("Making splice with shortdistancep = %d, donor chrnum %d, and acceptor chrnum %d => chrnum %d\n",
5360      shortdistancep,Substring_chrnum(donor),Substring_chrnum(acceptor),new->chrnum); */
5361 
5362   new->guided_insertlength = 0U;
5363   new->nsegments = 2;
5364   new->nsplices = 1;
5365 
5366   /* Define substrings and junctions */
5367   if (new->chrnum != 0) {
5368     new->sensedir = orig_sensedir;
5369     junction = Junction_new_splice(distance,orig_sensedir,donor_prob,acceptor_prob);
5370 
5371   } else if (Substring_querystart(donor) < Substring_querystart(acceptor)) {
5372     /* Translocation, sense */
5373     new->sensedir = SENSE_FORWARD;
5374     junction = Junction_new_chimera(/*sensedir:SENSE_FORWARD,*/donor_prob,acceptor_prob);
5375 
5376   } else {
5377     /* Translocation, antisense */
5378     new->sensedir = SENSE_ANTI;
5379     junction = Junction_new_chimera(/*sensedir:SENSE_ANTI,*/donor_prob,acceptor_prob);
5380   }
5381   new->sensedir_for_concordance = new->sensedir;
5382 
5383   debug0(printf("donor querypos %d..%d\n",Substring_querystart(donor),Substring_queryend(donor)));
5384   debug0(printf("acceptor querypos %d..%d\n",Substring_querystart(acceptor),Substring_queryend(acceptor)));
5385   debug0(printf("sensedir %d\n",new->sensedir));
5386 
5387 
5388   /* new->junctions_LtoH = Listpool_push(NULL,listpool,(void *) junction); */
5389   /* new->junctions_HtoL = Listpool_push(NULL,listpool,(void *) junction); */
5390   new->junctions_1toN = Listpool_push(NULL,listpool,(void *) junction);
5391   new->junctions_Nto1 = Listpool_push(NULL,listpool,(void *) junction);
5392 
5393   donor = copy_donor_p ? Substring_copy(donor) : donor;
5394   acceptor = copy_acceptor_p ? Substring_copy(acceptor) : acceptor;
5395   if (new->sensedir != SENSE_ANTI) {
5396     /* SENSE_FORWARD or SENSE_NULL */
5397     /* Order is donor (substring1), acceptor (substring2) */
5398     new->substrings_1toN = Listpool_push(NULL,listpool,(void *) acceptor);
5399     new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) donor);
5400   } else {
5401     /* SENSE_ANTI */
5402     /* Order is acceptor (substring1), donor (substring2) */
5403     new->substrings_1toN = Listpool_push(NULL,listpool,(void *) donor);
5404     new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) acceptor);
5405   }
5406   new->substrings_Nto1 = List_reverse(Listpool_copy(new->substrings_1toN,listpool));
5407   assert(Substring_querystart(List_head(new->substrings_1toN)) < Substring_querystart(List_head(new->substrings_Nto1)));
5408   /* Done assigning substrings */
5409 
5410 
5411   substring1 = (Substring_T) List_head(new->substrings_1toN);
5412   substringN = (Substring_T) List_head(new->substrings_Nto1);
5413 
5414   new->trim_querystart = Substring_trim_querystart(substring1);
5415   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
5416   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
5417   new->trim_queryend = Substring_trim_queryend(substringN);
5418   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
5419   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
5420   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5421   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5422 
5423   new->querystart_chrbound = Substring_querystart_chrbound(substring1);
5424   new->queryend_chrbound = Substring_queryend_chrbound(substringN);
5425   if (new->trim_querystart > new->querystart_chrbound) {
5426     new->querystart_chrbound = new->trim_querystart;
5427   }
5428   if (querylength - new->trim_queryend < new->queryend_chrbound) {
5429     new->queryend_chrbound = querylength - new->trim_queryend;
5430   }
5431   assert(new->querystart_chrbound < new->queryend_chrbound);
5432   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5433 
5434 
5435   if (new->chrnum != 0) {
5436     /* Ordinary splice.  No need to distinguish effective_chrnum and other_chrnum */
5437     substring_for_concordance = substring_other = (Substring_T) NULL;
5438     new->effective_chrnum = new->chrnum;
5439     new->other_chrnum = 0;
5440 
5441     /* Define coordinates as usual */
5442     new->genomicstart = Substring_genomicstart(substring1);
5443     new->genomicend = Substring_genomicend(substringN);
5444     new->plusp = Substring_plusp(substring1);
5445 
5446   } else {
5447     /* Translocation.  Concordant substring is the inner one */
5448     if (first_read_p == true) {
5449       substring_for_concordance = substringN;  /* (Substring_T) List_head(new->substrings_Nto1); */
5450       substring_other = substring1;  /* (Substring_T) List_head(new->substrings_1toN); */
5451       debug0(printf("Since first read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5452     } else {
5453       substring_for_concordance = substring1;  /* (Substring_T) List_head(new->substrings_1toN); */
5454       substring_other = substringN;  /* (Substring_T) List_head(new->substrings_Nto1); */
5455       debug0(printf("Since second read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5456     }
5457 
5458     new->effective_chrnum = Substring_chrnum(substring_for_concordance);
5459     new->other_chrnum = Substring_chrnum(substring_other);
5460 
5461     /* Define coordinates based on substring for concordance */
5462     new->genomicstart = Substring_genomicstart(substring_for_concordance);
5463     new->genomicend = Substring_genomicend(substring_for_concordance);
5464 
5465     /* This plusp is somewhat artificial, based on substring_for_concordance,
5466        but it defines order of substrings_LtoH */
5467     new->plusp = Substring_plusp(substring_for_concordance);
5468   }
5469 
5470 #ifdef DEBUG0
5471   printf("NEW SUBSTRINGS (query order)\n");
5472   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
5473     substring = List_head(p);
5474     if (Substring_ambiguous_p(substring) == true) {
5475       printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
5476 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5477 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5478 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
5479 	     Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
5480     } else {
5481       printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
5482 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5483 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5484 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
5485     }
5486   }
5487   printf("\n");
5488 #endif
5489 
5490 
5491   /* genomicstart and genomicend could be reversed for a scramble */
5492   if (new->genomicstart < new->genomicend) {
5493     new->low = new->genomicstart + new->querystart_chrbound;
5494     new->high = new->genomicend - (querylength - new->queryend_chrbound);
5495     new->genomiclength = new->genomicend - new->genomicstart;
5496   } else {
5497     new->low = new->genomicend + (querylength - new->queryend_chrbound);
5498     new->high = new->genomicstart - new->querystart_chrbound;
5499     new->genomiclength = new->genomicstart - new->genomicend;
5500   }
5501   /* assert(new->low < new->high); */
5502   debug0(printf("low %u, high %u\n",new->low,new->high));
5503 
5504   debug0(printf("  hittype is %s, plusp %d, genomicpos %u..%u\n",
5505 		hittype_string(new->hittype),new->plusp,new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
5506 
5507 
5508   new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(donor) + Substring_nmismatches_bothdiff(acceptor);
5509   new->nmismatches_refdiff = Substring_nmismatches_refdiff(donor) + Substring_nmismatches_refdiff(acceptor);
5510 
5511   new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(donor) + Substring_nmatches_to_trims(acceptor);
5512   new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(donor) + Substring_ref_nmatches_to_trims(acceptor);
5513   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims;
5514   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims;
5515   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5516 
5517   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5518   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5519   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims; /* Should not have any trims at the ends */
5520   if (new->chrlength == 0) {
5521     /* Cannot compare querylength with chrlength, which is 0 */
5522   } else if (new->chrlength < (Univcoord_T) querylength) {
5523     new->ref_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5524     new->refalt_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5525     new->refalt_score_within_trims -= ((Univcoord_T) querylength - new->chrlength);
5526   }
5527   assert(new->refalt_score_within_trims >= 0);
5528 
5529   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5530   if (new->refalt_score_overall < *found_score_overall) {
5531     *found_score_overall = new->refalt_score_overall;
5532   }
5533   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5534   if (new->refalt_score_within_trims < *found_score_within_trims) {
5535     *found_score_within_trims = new->refalt_score_within_trims;
5536   }
5537 
5538   debug0(printf("New splice has donor %d + acceptor %d matches, sensedir %d\n",
5539 		Substring_nmatches(donor),Substring_nmatches(acceptor),new->sensedir));
5540 
5541   /* new->penalties = splicing_penalty; */
5542 
5543   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5544   /* new->tally = -1L; */
5545 
5546 #if 0
5547   new->mapq_score = 0;
5548   new->absmq_score = 0;
5549 #endif
5550 
5551   new->paired_usedp = false;
5552 
5553 #if 0
5554   if (new->sensedir != SENSE_ANTI) {
5555     assert(Substring_queryend(donor) == Substring_querystart(acceptor));
5556     /* new->query_splicepos = Substring_queryend(donor); */
5557   } else {
5558     assert(Substring_queryend(acceptor) == Substring_querystart(donor));
5559     /* new->query_splicepos = Substring_queryend(acceptor); */
5560   }
5561   assert(new->query_splicepos > 0 && new->query_splicepos < querylength - 1);
5562 #endif
5563 
5564   new->circularpos = compute_circularpos(&new->circularalias,new);
5565   /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5566 
5567   if (new->circularpos >= 0) {
5568     new->altlocp = false;
5569   } else if ((new->altlocp = altlocp[new->chrnum]) == false) {
5570   } else {
5571   }
5572 
5573   if (transcriptomep == true && remap_transcriptome_p == true && substring_for_concordance != NULL) {
5574     /* Remap substring_for_concordance */
5575     remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_for_concordance,genomecomp);
5576     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->effective_chrnum,
5577 						Substring_chrpos_low(substring_for_concordance),
5578 						Substring_chrpos_high(substring_for_concordance),
5579 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5580       new->transcripts = transcripts;
5581     }
5582     FREE(remap_sequence);
5583 
5584     /* Remap substring_other */
5585     remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_other,genomecomp);
5586     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->other_chrnum,
5587 						Substring_chrpos_low(substring_other),
5588 						Substring_chrpos_high(substring_other),
5589 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5590       new->transcripts_other = transcripts;
5591     }
5592     FREE(remap_sequence);
5593   }
5594 
5595   debug0(printf("*****Method %s: Returning new splice %p at genomic %u..%u, donor %p (%u => %u), acceptor %p (%u => %u), score %d\n\n",
5596 		Method_string(method),new,new->genomicstart - new->chroffset,new->genomicend - new->chroffset,donor,
5597 		donor == NULL ? 0 : Substring_left_genomicseg(donor),
5598 		donor == NULL ? 0 : Substring_splicecoord_D(donor),
5599 		acceptor,acceptor == NULL ? 0 : Substring_left_genomicseg(acceptor),
5600 		acceptor == NULL ? 0 : Substring_splicecoord_A(acceptor),new->refalt_score_within_trims));
5601   debug0(printf("sensedir %d\n",new->sensedir));
5602   return new;
5603 }
5604 
5605 
5606 T
Stage3end_new_distant(int * found_score_overall,int * found_score_within_trims,Substring_T startfrag,Substring_T endfrag,int splice_pos,int nmismatches1,int nmismatches2,double prob1,double prob2,int sensedir_distant_guess,Chrpos_T distance,bool shortdistancep,int querylength,bool first_read_p,Listpool_T listpool,int level)5607 Stage3end_new_distant (int *found_score_overall, int *found_score_within_trims,
5608 		       Substring_T startfrag, Substring_T endfrag, int splice_pos,
5609 		       int nmismatches1, int nmismatches2,
5610 		       double prob1, double prob2, int sensedir_distant_guess,
5611 		       Chrpos_T distance, bool shortdistancep, int querylength,
5612 		       bool first_read_p, Listpool_T listpool, int level) {
5613   T new;
5614   Substring_T substring_for_concordance; /* always the inner substring */
5615   Substring_T substring_other;		 /* the outer substring */
5616   Substring_T substring1, substringN;
5617   Substring_T donor, acceptor;
5618   Junction_T junction;
5619 
5620   List_T transcripts;
5621   char *remap_sequence;
5622   int remap_seqlength;
5623 #ifdef DEBUG0
5624   Substring_T substring;
5625   List_T p;
5626 #endif
5627 
5628   new = (T) MALLOC_OUT(sizeof(*new));
5629 
5630   debug0(printf("Stage3end_new_distant: %p with first_read_p %d, shortdistancep %d, sensedir guessed to be %d\n",
5631 		new,first_read_p,shortdistancep,sensedir_distant_guess));
5632 
5633   new->querylength_adj = new->querylength = querylength;
5634 
5635   new->nindels = 0;
5636 
5637   new->transcripts = (List_T) NULL;
5638   new->transcripts_other = (List_T) NULL;
5639 
5640   new->splice_score = 0.0;
5641 
5642   new->method = DISTANT_DNA;
5643   new->level = level;
5644 
5645   debug0(printf("chrnum: %d and %d, plusp: %d and %d, sensedir: %d and %d\n",
5646 		Substring_chrnum(startfrag),Substring_chrnum(endfrag),
5647 		Substring_plusp(startfrag),Substring_plusp(endfrag),
5648 		Substring_sensedir(startfrag),Substring_sensedir(endfrag)));
5649 
5650   if (shortdistancep == true) {
5651     new->distant_splice_p = false;
5652 
5653     new->hittype = SPLICE;
5654     new->genestrand = Substring_genestrand(startfrag);
5655     new->chrnum = Substring_chrnum(startfrag);
5656     new->chroffset = Substring_chroffset(startfrag);
5657     new->chrhigh = Substring_chrhigh(startfrag);
5658     new->chrlength = Substring_chrlength(startfrag);
5659 
5660     assert(Substring_plusp(startfrag) == Substring_plusp(endfrag));
5661     assert(SENSE_CONSISTENT_P(Substring_sensedir(startfrag),Substring_sensedir(endfrag)));
5662 
5663   } else {
5664     new->distant_splice_p = true;
5665 
5666     new->hittype = TRANSLOC_SPLICE;
5667     new->genestrand = 0;
5668     new->chrnum = 0;
5669     new->chroffset = 0;
5670     new->chrhigh = 0;
5671     new->chrlength = 0;
5672   }
5673 
5674   /* printf("Making splice with shortdistancep = %d, startfrag chrnum %d, and endfrag chrnum %d => chrnum %d\n",
5675      shortdistancep,Substring_chrnum(startfrag),Substring_chrnum(endfrag),new->chrnum); */
5676 
5677   new->guided_insertlength = 0U;
5678   new->nsegments = 2;
5679   new->nsplices = 1;
5680 
5681   /* Trim startfrag and endfrag at splice_pos */
5682   startfrag = Substring_trim_startfrag(nmismatches1,/*old*/startfrag,/*new_queryend*/splice_pos);
5683   endfrag = Substring_trim_endfrag(nmismatches2,/*old*/endfrag,/*new_querystart*/splice_pos);
5684 
5685   /* Define substrings and junctions */
5686   new->sensedir_for_concordance = sensedir_distant_guess; /* was SENSE_NULL */
5687   new->sensedir = sensedir_distant_guess;
5688   if (sensedir_distant_guess != SENSE_ANTI) {
5689     /* Order is donor (substring1), acceptor (substring2) */
5690     donor = startfrag;
5691     Substring_label_donor(donor,splice_pos,prob1,sensedir_distant_guess);
5692 
5693     acceptor = endfrag;
5694     Substring_label_acceptor(acceptor,splice_pos,prob2,sensedir_distant_guess);
5695 
5696     new->substrings_1toN = Listpool_push(NULL,listpool,(void *) acceptor);
5697     new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) donor);
5698 
5699   } else {
5700     /* Order is acceptor (substring1), donor (substring2) */
5701     acceptor = startfrag;
5702     Substring_label_acceptor(acceptor,splice_pos,prob1,sensedir_distant_guess);
5703 
5704     donor = endfrag;
5705     Substring_label_donor(donor,splice_pos,prob2,sensedir_distant_guess);
5706 
5707     new->substrings_1toN = Listpool_push(NULL,listpool,(void *) donor);
5708     new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) acceptor);
5709   }
5710 
5711   if (shortdistancep == true) {
5712     junction = Junction_new_splice(distance,sensedir_distant_guess,Substring_siteD_prob(donor),Substring_siteA_prob(acceptor));
5713   } else {
5714     junction = Junction_new_chimera(/*sensedir_distant_guess,*/Substring_siteD_prob(donor),Substring_siteA_prob(acceptor));
5715   }
5716 
5717   /* new->junctions_LtoH = Listpool_push(NULL,listpool,(void *) junction); */
5718   /* new->junctions_HtoL = Listpool_push(NULL,listpool,(void *) junction); */
5719   new->junctions_1toN = Listpool_push(NULL,listpool,(void *) junction);
5720   new->junctions_Nto1 = Listpool_push(NULL,listpool,(void *) junction);
5721 
5722   new->substrings_Nto1 = List_reverse(Listpool_copy(new->substrings_1toN,listpool));
5723   assert(Substring_querystart(List_head(new->substrings_1toN)) < Substring_querystart(List_head(new->substrings_Nto1)));
5724   /* Done assigning substrings */
5725 
5726 
5727   substring1 = (Substring_T) List_head(new->substrings_1toN);
5728   substringN = (Substring_T) List_head(new->substrings_Nto1);
5729 
5730   new->trim_querystart = Substring_trim_querystart(substring1);
5731   new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
5732   new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
5733   new->trim_queryend = Substring_trim_queryend(substringN);
5734   new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
5735   new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
5736   debug0(printf("  trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5737   debug0(printf("  trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5738 
5739   new->querystart_chrbound = Substring_querystart_chrbound(substring1);
5740   new->queryend_chrbound = Substring_queryend_chrbound(substringN);
5741   if (new->trim_querystart > new->querystart_chrbound) {
5742     new->querystart_chrbound = new->trim_querystart;
5743   }
5744   if (querylength - new->trim_queryend < new->queryend_chrbound) {
5745     new->queryend_chrbound = querylength - new->trim_queryend;
5746   }
5747   assert(new->querystart_chrbound < new->queryend_chrbound);
5748   debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5749 
5750 
5751   /* Translocation.  Concordant substring is the inner one */
5752   if (first_read_p == true) {
5753     substring_for_concordance = substringN;  /* (Substring_T) List_head(new->substrings_Nto1); */
5754     substring_other = substring1;  /* (Substring_T) List_head(new->substrings_1toN); */
5755     debug0(printf("Since first read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5756   } else {
5757     substring_for_concordance = substring1;  /* (Substring_T) List_head(new->substrings_1toN); */
5758     substring_other = substringN;  /* (Substring_T) List_head(new->substrings_Nto1); */
5759     debug0(printf("Since second read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5760   }
5761 
5762   new->effective_chrnum = Substring_chrnum(substring_for_concordance);
5763   new->other_chrnum = Substring_chrnum(substring_other);
5764 
5765   /* Define coordinates based on substring for concordance */
5766   new->genomicstart = Substring_genomicstart(substring_for_concordance);
5767   new->genomicend = Substring_genomicend(substring_for_concordance);
5768 
5769   /* This plusp is somewhat artificial, based on substring_for_concordance,
5770      but it defines order of substrings_LtoH */
5771   new->plusp = Substring_plusp(substring_for_concordance);
5772 
5773 #ifdef DEBUG0
5774   printf("NEW SUBSTRINGS (query order)\n");
5775   for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
5776     substring = List_head(p);
5777     if (Substring_ambiguous_p(substring) == true) {
5778       printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
5779 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5780 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5781 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
5782 	     Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
5783     } else {
5784       printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
5785 	     Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5786 	     Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5787 	     Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
5788     }
5789   }
5790   printf("\n");
5791 #endif
5792 
5793   /* genomicstart and genomicend could be reversed for a scramble */
5794   if (new->genomicstart < new->genomicend) {
5795     new->low = new->genomicstart + new->querystart_chrbound;
5796     new->high = new->genomicend - (querylength - new->queryend_chrbound);
5797     new->genomiclength = new->genomicend - new->genomicstart;
5798   } else {
5799     new->low = new->genomicend + (querylength - new->queryend_chrbound);
5800     new->high = new->genomicstart - new->querystart_chrbound;
5801     new->genomiclength = new->genomicstart - new->genomicend;
5802   }
5803   /* assert(new->low < new->high); */
5804   debug0(printf("low %u, high %u\n",new->low,new->high));
5805 
5806   debug0(printf("  hittype is %s, plusp %d, genomicpos %u..%u\n",
5807 		hittype_string(new->hittype),new->plusp,new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
5808 
5809   new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(startfrag) + Substring_nmismatches_bothdiff(endfrag);
5810   new->nmismatches_refdiff = Substring_nmismatches_refdiff(startfrag) + Substring_nmismatches_refdiff(endfrag);
5811 
5812   new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(startfrag) + Substring_nmatches_to_trims(endfrag);
5813   new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(startfrag) + Substring_ref_nmatches_to_trims(endfrag);
5814   new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims;
5815   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims;
5816   assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5817 
5818   new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5819   new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5820   new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims; /* Should not have any trims at the ends */
5821   if (new->chrlength == 0) {
5822     /* Cannot compare querylength with chrlength, which is 0 */
5823   } else if (new->chrlength < (Univcoord_T) querylength) {
5824     new->ref_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5825     new->refalt_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5826     new->refalt_score_within_trims -= ((Univcoord_T) querylength - new->chrlength);
5827   }
5828   assert(new->refalt_score_within_trims >= 0);
5829 
5830   /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5831   if (new->refalt_score_overall < *found_score_overall) {
5832     *found_score_overall = new->refalt_score_overall;
5833   }
5834   /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5835   if (new->refalt_score_within_trims < *found_score_within_trims) {
5836     *found_score_within_trims = new->refalt_score_within_trims;
5837   }
5838 
5839   debug0(printf("New distant has startfrag %d + endfrag %d matches, sensedir %d, score %d overall and %d within trims\n",
5840 		Substring_nmatches(startfrag),Substring_nmatches(endfrag),new->sensedir,
5841 		new->refalt_score_overall,new->refalt_score_within_trims));
5842 
5843   /* new->penalties = splicing_penalty; */
5844 
5845   /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5846   /* new->tally = -1L; */
5847 
5848 #if 0
5849   new->mapq_score = 0;
5850   new->absmq_score = 0;
5851 #endif
5852 
5853   new->paired_usedp = false;
5854   /* new->query_splicepos = splice_pos; */
5855 
5856   new->circularpos = compute_circularpos(&new->circularalias,new);
5857   /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5858 
5859   if (new->circularpos >= 0) {
5860     new->altlocp = false;
5861   } else if ((new->altlocp = altlocp[new->chrnum]) == false) {
5862   } else {
5863   }
5864 
5865   if (transcriptomep == true && remap_transcriptome_p == true && substring_for_concordance != NULL) {
5866     /* Remap substring_for_concordance */
5867     remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_for_concordance,genomecomp);
5868     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->effective_chrnum,
5869 						Substring_chrpos_low(substring_for_concordance),
5870 						Substring_chrpos_high(substring_for_concordance),
5871 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5872       new->transcripts = transcripts;
5873     }
5874     FREE(remap_sequence);
5875 
5876     /* Remap substring_other */
5877     remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_other,genomecomp);
5878     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->other_chrnum,
5879 						Substring_chrpos_low(substring_other),
5880 						Substring_chrpos_high(substring_other),
5881 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5882       new->transcripts_other = transcripts;
5883     }
5884     FREE(remap_sequence);
5885   }
5886 
5887   debug0(printf("*****Method distant: Returning new distant %p at genomic %u..%u, startfrag %p (%u => ), endfrag %p (%u => ), score %d\n\n",
5888 		new,new->genomicstart - new->chroffset,new->genomicend - new->chroffset,
5889 		startfrag,Substring_left_genomicseg(startfrag),endfrag,Substring_left_genomicseg(endfrag),
5890 		new->refalt_score_within_trims));
5891   return new;
5892 }
5893 
5894 
5895 static int
Stage3end_output_cmp(const void * a,const void * b)5896 Stage3end_output_cmp (const void *a, const void *b) {
5897   T x = * (T *) a;
5898   T y = * (T *) b;
5899 
5900   if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
5901     return -1;
5902   } else if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
5903     return +1;
5904   } else if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
5905     return -1;
5906   } else if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
5907     return +1;
5908   } else if (x->mapq_loglik > y->mapq_loglik) {
5909     return -1;
5910   } else if (y->mapq_loglik > x->mapq_loglik) {
5911     return +1;
5912   } else if (x->distant_splice_p == false && y->distant_splice_p == true) {
5913     return -1;
5914   } else if (y->distant_splice_p == false && x->distant_splice_p == true) {
5915     return +1;
5916   } else if (x->guided_insertlength > 0 && y->guided_insertlength == 0) {
5917     return -1;
5918   } else if (y->guided_insertlength > 0 && x->guided_insertlength == 0) {
5919     return +1;
5920   } else if (x->guided_insertlength < y->guided_insertlength) {
5921     return -1;
5922   } else if (y->guided_insertlength < x->guided_insertlength) {
5923     return +1;
5924   } else if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
5925     return -1;
5926   } else if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
5927     return +1;
5928 
5929     /* This genomic ordering will be undone if want_random_p is true */
5930   } else if (x->genomicstart < y->genomicstart) {
5931     return -1;
5932   } else if (y->genomicstart < x->genomicstart) {
5933     return +1;
5934 
5935   } else if (x->genomicend < y->genomicend) {
5936     return -1;
5937   } else if (y->genomicend < x->genomicend) {
5938     return +1;
5939 
5940   } else if (x->plusp == true && y->plusp == false) {
5941     return -1;
5942   } else if (x->plusp == false && y->plusp == true) {
5943     return +1;
5944 
5945   } else if (x->hittype < y->hittype) {
5946     return -1;
5947   } else if (y->hittype < x->hittype) {
5948     return +1;
5949 
5950   } else {
5951     return 0;
5952   }
5953 }
5954 
5955 
5956 static int
Stage3pair_output_cmp(const void * a,const void * b)5957 Stage3pair_output_cmp (const void *a, const void *b) {
5958   Stage3pair_T x = * (Stage3pair_T *) a;
5959   Stage3pair_T y = * (Stage3pair_T *) b;
5960 
5961 #ifdef USE_BINGO
5962   if (x->absdifflength_bingo_p == true && y->absdifflength_bingo_p == false) {
5963     return -1;
5964   } else if (y->absdifflength_bingo_p == true && x->absdifflength_bingo_p == false) {
5965     return +1;
5966   }
5967 #endif
5968 
5969   if (x->hit5->refalt_nmatches_plus_spliced_trims +
5970       x->hit3->refalt_nmatches_plus_spliced_trims >
5971       y->hit5->refalt_nmatches_plus_spliced_trims +
5972       y->hit3->refalt_nmatches_plus_spliced_trims) {
5973     return -1;
5974   } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
5975 	     y->hit3->refalt_nmatches_plus_spliced_trims >
5976 	     x->hit5->refalt_nmatches_plus_spliced_trims +
5977 	     x->hit3->refalt_nmatches_plus_spliced_trims) {
5978     return +1;
5979   } else if (x->hit5->ref_nmatches_plus_spliced_trims +
5980 	     x->hit3->ref_nmatches_plus_spliced_trims >
5981 	     y->hit5->ref_nmatches_plus_spliced_trims +
5982 	     y->hit3->ref_nmatches_plus_spliced_trims) {
5983     return -1;
5984   } else if (y->hit5->ref_nmatches_plus_spliced_trims +
5985 	     y->hit3->ref_nmatches_plus_spliced_trims >
5986 	     x->hit5->ref_nmatches_plus_spliced_trims +
5987 	     x->hit3->ref_nmatches_plus_spliced_trims) {
5988     return +1;
5989   } else if (x->mapq_loglik > y->mapq_loglik) {
5990     return -1;
5991   } else if (y->mapq_loglik > x->mapq_loglik) {
5992     return +1;
5993   } else if (x->insertlength > 0 && y->insertlength == 0) {
5994     return -1;
5995   } else if (y->insertlength > 0 && x->insertlength == 0) {
5996     return +1;
5997   } else if (x->insertlength < y->insertlength) {
5998     return -1;
5999   } else if (y->insertlength < x->insertlength) {
6000     return +1;
6001   } else if (x->hit5->refalt_score_within_trims +
6002 	     x->hit3->refalt_score_within_trims <
6003 	     y->hit5->refalt_score_within_trims +
6004 	     y->hit3->refalt_score_within_trims) {
6005     return -1;
6006   } else if (y->hit5->refalt_score_within_trims +
6007 	     y->hit3->refalt_score_within_trims <
6008 	     x->hit5->refalt_score_within_trims +
6009 	     x->hit3->refalt_score_within_trims) {
6010     return +1;
6011 
6012     /* This genomic ordering will be undone if want_random_p is true */
6013   } else if (x->low < y->low) {
6014     return -1;
6015   } else if (y->low < x->low) {
6016     return +1;
6017 
6018   } else if (x->high < y->high) {
6019     return -1;
6020   } else if (y->high < x->high) {
6021     return +1;
6022 
6023   } else {
6024     return 0;
6025   }
6026 }
6027 
6028 
6029 
6030 static float
Stage3end_compute_mapq(Stage3end_T this,char * quality_string)6031 Stage3end_compute_mapq (Stage3end_T this, char *quality_string) {
6032   List_T p;
6033   Substring_T substring;
6034 
6035   if (this == NULL) {
6036     return 0.0;
6037 
6038   } else {
6039     this->mapq_loglik = 0.0;
6040     for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
6041       substring = (Substring_T) List_head(p);
6042       this->mapq_loglik += Substring_compute_mapq(substring,quality_string);
6043     }
6044   }
6045 
6046   return this->mapq_loglik;
6047 }
6048 
6049 
6050 
6051 static void
Stage3end_display_prep(Stage3end_T this,char * queryuc_ptr,bool first_read_p)6052 Stage3end_display_prep (Stage3end_T this, char *queryuc_ptr, bool first_read_p) {
6053   List_T p, q;
6054   Substring_T substring;
6055   Junction_T pre_junction, post_junction;
6056   Junctiontype_T type;
6057   int extraleft, extraright;
6058   bool sam_print_xt_p = false;
6059   /* int type; */
6060   /* int extralow, extrahigh; */
6061 
6062   if (this != NULL) {
6063     if (output_type == SAM_OUTPUT) {
6064       if (this->hittype == TRANSLOC_SPLICE ||
6065 	  (this->hittype == SAMECHR_SPLICE && merge_samechr_p == false)) {
6066 	  /* This is the condition in samprint to print the XT field, which needs the splice information */
6067 	sam_print_xt_p = true;
6068       }
6069     }
6070 
6071     debug0(printf("Doing a display prep of end %p\n",this));
6072 
6073     this->nmismatches_refdiff = 0;
6074 
6075     /* First segments */
6076     /* For operations on substrings, proceed in 1toN order, not LtoH order */
6077     substring = (Substring_T) List_head(this->substrings_1toN);
6078     if (output_type == STD_OUTPUT) {
6079       extraleft = Substring_querystart(substring); /* terminal start */
6080     } else {
6081       extraleft = 0;
6082     }
6083 
6084     if (List_length(this->substrings_1toN) == 1) {
6085       post_junction = (Junction_T) NULL;
6086       if (output_type == STD_OUTPUT) {
6087 	extraright = this->querylength - Substring_queryend(substring); /* terminal end */
6088       } else {
6089 	extraright = 0;
6090       }
6091     } else {
6092       post_junction = (Junction_T) List_head(this->junctions_1toN);
6093       /* Junction_print(post_junction); */
6094 
6095       if (output_type == M8_OUTPUT) {
6096 	extraright = 0;
6097       } else if ((type = Junction_type(post_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6098 	extraright = 2;
6099       } else if (output_type == SAM_OUTPUT) {
6100 	extraright = 0;
6101       } else if (type == SPLICE_JUNCTION) {
6102 	extraright = 2;
6103       } else if (first_read_p == true && type == DEL_JUNCTION) {
6104 	extraright = Junction_nindels(post_junction);
6105       } else {
6106 	extraright = 0;
6107       }
6108     }
6109 
6110     if (Substring_has_alts_p(substring) == true) {
6111       /* Skip */
6112     } else {
6113       this->nmismatches_refdiff +=
6114 	Substring_display_prep(substring,queryuc_ptr,this->querylength,
6115 			       extraleft,extraright,genomecomp);
6116     }
6117 
6118     assert(List_length(this->substrings_1toN) == List_length(this->junctions_1toN) + 1);
6119     if ((p = List_next(this->substrings_1toN)) == NULL) {
6120       /* No middle segments */
6121     } else {
6122       for (q = List_next(this->junctions_1toN); q != NULL; p = List_next(p), q = List_next(q)) {
6123 	/* Middle segments */
6124 	pre_junction = post_junction;
6125 	post_junction = List_head(q);
6126 
6127 	/* Junction_print(pre_junction); */
6128 	/* Junction_print(post_junction); */
6129 
6130 	if (output_type == M8_OUTPUT) {
6131 	  extraleft = 0;
6132 	} else if ((type = Junction_type(pre_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6133 	  extraleft = 2;
6134 	} else if (output_type == SAM_OUTPUT) {
6135 	  extraleft = 0;
6136 	} else if (type == SPLICE_JUNCTION) {
6137 	  extraleft = 2;
6138 	} else if (first_read_p == false && type == DEL_JUNCTION) {
6139 	  extraleft = Junction_nindels(pre_junction);
6140 	} else {
6141 	  extraleft = 0;
6142 	}
6143 
6144 	if (output_type == M8_OUTPUT) {
6145 	  extraright = 0;
6146 	} else if ((type = Junction_type(post_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6147 	  extraright = 2;
6148 	} else if (output_type == SAM_OUTPUT) {
6149 	  extraright = 0;
6150 	} else if (type == SPLICE_JUNCTION) {
6151 	  extraright = 2;
6152 	} else if (first_read_p == true && type == DEL_JUNCTION) {
6153 	  extraright = Junction_nindels(post_junction);
6154 	} else {
6155 	  extraright = 0;
6156 	}
6157 
6158 	substring = (Substring_T) List_head(p);
6159 	if (Substring_has_alts_p(substring) == true) {
6160 	  /* Skip */
6161 	} else {
6162 	  this->nmismatches_refdiff +=
6163 	    Substring_display_prep(substring,queryuc_ptr,this->querylength,
6164 				   extraleft,extraright,genomecomp);
6165 	}
6166       }
6167 
6168       /* Last segment */
6169       pre_junction = post_junction;
6170       /* Junction_print(pre_junction); */
6171 
6172       if (output_type == M8_OUTPUT) {
6173 	extraleft = 0;
6174       } else if ((type = Junction_type(pre_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6175 	extraleft = 2;
6176       } else if (output_type == SAM_OUTPUT) {
6177 	extraleft = 0;
6178       } else if (type == SPLICE_JUNCTION) {
6179 	extraleft = 2;
6180       } else if (first_read_p == false && type == DEL_JUNCTION) {
6181 	extraleft = Junction_nindels(pre_junction);
6182       } else {
6183 	extraleft = 0;
6184       }
6185 
6186       substring = (Substring_T) List_head(p);
6187       if (output_type == STD_OUTPUT) {
6188 	extraright = this->querylength - Substring_queryend(substring);
6189       } else {
6190 	extraright = 0;
6191       }
6192 
6193       if (Substring_has_alts_p(substring) == true) {
6194 	/* Skip */
6195       } else {
6196 	this->nmismatches_refdiff +=
6197 	  Substring_display_prep(substring,queryuc_ptr,this->querylength,
6198 				 extraleft,extraright,genomecomp);
6199       }
6200     }
6201   }
6202 
6203   return;
6204 }
6205 
6206 
6207 List_T
Stage3end_filter(List_T hits,Hitlistpool_T hitlistpool,int max_mismatches_refalt,int max_mismatches_ref,int min_coverage)6208 Stage3end_filter (List_T hits, Hitlistpool_T hitlistpool,
6209 		  int max_mismatches_refalt, int max_mismatches_ref, int min_coverage) {
6210   List_T newhits = NULL, p;
6211   Stage3end_T hit;
6212 
6213   debug1(printf("Entered Stage3end_filter with max_mismatches_refalt %d, max_mismatches_ref %d, and min_coverage %d\n",
6214 		max_mismatches_refalt,max_mismatches_ref,min_coverage));
6215 
6216   if (filter_within_trims_p == false) {
6217     /* Generally want overall mismatches for DNA-seq, so use refalt_score_overall */
6218     for (p = hits; p != NULL; p = List_next(p)) {
6219       hit = (Stage3end_T) List_head(p);
6220       debug1(printf("DNA-seq: Comparing refalt score %d against max_mismatches_refalt %d, ref %d against %d, and coverage %d against min_coverage %d\n",
6221 		    hit->refalt_score_overall,max_mismatches_refalt,
6222 		    hit->ref_score_overall,max_mismatches_ref,
6223 		    hit->querylength - hit->trim_querystart - hit->trim_queryend,min_coverage));
6224       debug1(printf("Coverage is querylength %d - trim_querystart %d - trim_queryend %d + mandatory %d + mandatory %d\n",
6225 		    hit->querylength,hit->trim_querystart,hit->trim_queryend,hit->mandatory_trim_querystart,hit->mandatory_trim_queryend));
6226 
6227       if (hit->refalt_score_overall > max_mismatches_refalt) {
6228 	debug1(printf(" => FREE\n"));
6229 	Stage3end_free(&hit);
6230       } else if (hit->ref_score_overall > max_mismatches_ref) {
6231 	debug1(printf(" => FREE\n"));
6232 	Stage3end_free(&hit);
6233       } else if (hit->querylength - hit->trim_querystart - hit->trim_queryend + hit->mandatory_trim_querystart + hit->mandatory_trim_queryend < min_coverage) {
6234 	debug1(printf(" => FREE\n"));
6235 	Stage3end_free(&hit);
6236       } else {
6237 	debug1(printf(" => KEEP\n"));
6238 	newhits = Hitlist_push(newhits,hitlistpool,(void *) hit);
6239       }
6240     }
6241 
6242   } else {
6243     /* Generally expect trims for RNA-seq, so use refalt_score_within_trims */
6244     for (p = hits; p != NULL; p = List_next(p)) {
6245       hit = (Stage3end_T) List_head(p);
6246       debug1(printf("RNA-seq: Comparing refalt score %d against max_mismatches_refalt %d, and coverage %d against min_coverage %d\n",
6247 		    hit->refalt_score_within_trims,max_mismatches_refalt,hit->querylength - hit->trim_querystart - hit->trim_queryend,min_coverage));
6248       debug1(printf("Coverage is querylength %d - trim_querystart %d - trim_queryend %d + mandatory %d + mandatory %d\n",
6249 		    hit->querylength,hit->trim_querystart,hit->trim_queryend,hit->mandatory_trim_querystart,hit->mandatory_trim_queryend));
6250 
6251       if (hit->refalt_score_within_trims > max_mismatches_refalt) {
6252 	debug1(printf(" => FREE\n"));
6253 	Stage3end_free(&hit);
6254       } else if (hit->querylength - hit->trim_querystart - hit->trim_queryend + hit->mandatory_trim_querystart + hit->mandatory_trim_queryend < min_coverage) {
6255 	debug1(printf(" => FREE\n"));
6256 	Stage3end_free(&hit);
6257       } else {
6258 	debug1(printf(" => KEEP\n"));
6259 	newhits = Hitlist_push(newhits,hitlistpool,(void *) hit);
6260       }
6261     }
6262   }
6263 
6264 
6265   Hitlist_free(&hits);
6266   return newhits;
6267 }
6268 
6269 
6270 
6271 
6272 Stage3end_T *
Stage3end_eval_and_sort(int npaths,int * first_absmq,int * second_absmq,Stage3end_T * stage3array,char * queryuc_ptr,char * quality_string,bool displayp)6273 Stage3end_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
6274 			 Stage3end_T *stage3array, char *queryuc_ptr, char *quality_string,
6275 			 bool displayp) {
6276   float maxlik, loglik;
6277   float total, q;		/* For Bayesian mapq calculation */
6278   int compute_npaths;
6279 
6280   int randomi, i;
6281   Stage3end_T temp, hit;
6282 
6283   if (npaths == 0) {
6284     /* Skip */
6285     *first_absmq = 0;
6286     *second_absmq = 0;
6287 
6288   } else if (npaths == 1) {
6289     hit = stage3array[0];
6290     hit->mapq_loglik = MAPQ_MAXIMUM_SCORE;
6291     hit->mapq_score = MAPQ_max_quality_score(quality_string,hit->querylength);
6292     hit->absmq_score = MAPQ_MAXIMUM_SCORE;
6293 
6294     if (displayp == true) {
6295       Stage3end_display_prep(hit,queryuc_ptr,/*first_read_p*/true);
6296     }
6297     *first_absmq = hit->absmq_score;
6298     *second_absmq = 0;
6299 
6300   } else {
6301     /* Compute mapq_loglik */
6302     for (i = 0; i < npaths; i++) {
6303       Stage3end_compute_mapq(stage3array[i],quality_string);
6304     }
6305 
6306     /* Sort by nmatches, then mapq */
6307     qsort(stage3array,npaths,sizeof(Stage3end_T),Stage3end_output_cmp);
6308 
6309     if (want_random_p) {
6310       /* Randomize among best alignments */
6311       i = 1;
6312       while (i < npaths && Stage3end_output_cmp(&(stage3array[i]),&(stage3array[0])) == 0) {
6313 	i++;
6314       }
6315       if (i > 1) {		/* i is number of ties */
6316 	/* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
6317 	randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
6318 	/* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
6319 	temp = stage3array[0];
6320 	stage3array[0] = stage3array[randomi];
6321 	stage3array[randomi] = temp;
6322       }
6323     }
6324 
6325     /* Enforce monotonicity */
6326     for (i = npaths - 1; i > 0; i--) {
6327       if (stage3array[i-1]->mapq_loglik < stage3array[i]->mapq_loglik) {
6328 	stage3array[i-1]->mapq_loglik = stage3array[i]->mapq_loglik;
6329       }
6330     }
6331     maxlik = stage3array[0]->mapq_loglik;
6332 
6333     /* Subtract maxlik to avoid underflow */
6334     for (i = 0; i < npaths; i++) {
6335       stage3array[i]->mapq_loglik -= maxlik;
6336     }
6337 
6338 #if 0
6339     /* Save on computation if possible */
6340     /* Not possible, since we are going to select randomly from among all npaths */
6341     if (npaths < maxpaths) {
6342       compute_npaths = npaths;
6343     } else {
6344       compute_npaths = maxpaths;
6345     }
6346     if (compute_npaths < 2) {
6347       compute_npaths = 2;
6348     }
6349 #else
6350     compute_npaths = npaths;
6351 #endif
6352 
6353     /* Compute absolute mapq */
6354     for (i = 0; i < compute_npaths; i++) {
6355       loglik = stage3array[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
6356       if (loglik < 0.0) {
6357 	loglik = 0.0;
6358       }
6359       stage3array[i]->absmq_score = rint(loglik);
6360     }
6361     *first_absmq = stage3array[0]->absmq_score;
6362     *second_absmq = stage3array[1]->absmq_score;
6363 
6364 
6365     /* Compute Bayesian mapq */
6366     total = 0.0;
6367     for (i = 0; i < npaths; i++) {
6368       total += (stage3array[i]->mapq_loglik = fasterexp(stage3array[i]->mapq_loglik));
6369     }
6370 
6371     /* Obtain posterior probabilities of being true */
6372     for (i = 0; i < compute_npaths; i++) {
6373       stage3array[i]->mapq_loglik /= total;
6374     }
6375 
6376     /* Convert to Phred scores */
6377     for (i = 0; i < compute_npaths; i++) {
6378       if ((q = 1.0 - stage3array[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
6379 	stage3array[i]->mapq_score = 96;
6380       } else {
6381 	stage3array[i]->mapq_score = rint(-10.0 * log10(q));
6382       }
6383     }
6384 
6385     if (displayp == true) {
6386       /* Prepare for display */
6387       for (i = 0; i < compute_npaths; i++) {
6388 	Stage3end_display_prep(stage3array[i],queryuc_ptr,/*first_read_p*/true);
6389       }
6390     }
6391 
6392 #if 0
6393     /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
6394     if (stage3array[0]->mapq_score >= mapq_unique_score &&
6395 	stage3array[1]->mapq_score < mapq_unique_score) {
6396       for (i = 1; i < *npaths; i++) {
6397 	Stage3end_free(&(stage3array[i]));
6398       }
6399       *npaths = 1;
6400     }
6401 #endif
6402   }
6403 
6404   return stage3array;
6405 }
6406 
6407 
6408 static int
insertlength_expected(Chrpos_T insertlength)6409 insertlength_expected (Chrpos_T insertlength) {
6410   if (insertlength < expected_pairlength_low) {
6411     return -1;
6412   } else if (insertlength > expected_pairlength_very_high) {
6413     return -1;
6414   } else if (insertlength > expected_pairlength_high) {
6415     return 0;
6416   } else {
6417     return +1;
6418   }
6419 }
6420 
6421 
6422 /* For concordant ends */
6423 static Chrpos_T
pair_insert_length(int * pair_relationship,Stage3end_T hit5,Stage3end_T hit3)6424 pair_insert_length (int *pair_relationship, Stage3end_T hit5, Stage3end_T hit3) {
6425   List_T p, q;
6426   Substring_T substring5, substring3;
6427 
6428   if (hit5->plusp != hit3->plusp) {
6429     debug10(printf("pair_insert_length: hit5->plusp %d != hit3->plusp %d, so returning 0\n",
6430 		   hit5->plusp,hit3->plusp));
6431     *pair_relationship = 0;
6432     return 0;
6433   }
6434 
6435   if (hit5->chrnum != 0 && hit3->chrnum != 0) {
6436     for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
6437       substring3 = (Substring_T) List_head(q);
6438       for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
6439 	substring5 = (Substring_T) List_head(p);
6440 	if (Substring_overlap_p(substring5,substring3)) {
6441 	  debug10(printf("Calling Substring_insert_length on %d..%d and %d..%d\n",
6442 			 Substring_querystart(substring5),Substring_queryend(substring5),
6443 			 Substring_querystart(substring3),Substring_queryend(substring3)));
6444 	  return Substring_insert_length(&(*pair_relationship),substring5,substring3);
6445 	}
6446       }
6447     }
6448   }
6449 
6450   /* No overlap found between any combination of substrings */
6451   if (hit5->plusp == true) {
6452     if (hit5->genomicend > hit3->genomicstart + hit5->querylength + hit3->querylength) {
6453       debug10(printf("pair_insert_length: no overlap found, and %u - %u + %d + %d < 0, so returning 0\n",
6454 		     hit3->genomicstart - hit3->chroffset,hit5->genomicend - hit5->chroffset,
6455 		     hit5->querylength,hit3->querylength));
6456       *pair_relationship = 0;
6457       return 0;
6458     } else {
6459       debug10(printf("pair_insert_length: no overlap found, so returning %u - %u + %d + %d\n",
6460 		     hit3->genomicstart - hit3->chroffset,hit5->genomicend - hit5->chroffset,
6461 		     hit5->querylength,hit3->querylength));
6462     }
6463     *pair_relationship = +1;
6464     return hit3->genomicstart - hit5->genomicend + hit5->querylength + hit3->querylength;
6465 
6466   } else {
6467     if (hit3->genomicstart > hit5->genomicend + hit5->querylength + hit3->querylength) {
6468       debug10(printf("pair_insert_length: no overlap found, and %u - %u + %d + %d < 0, so returning 0\n",
6469 		     hit5->genomicend - hit5->chroffset,hit3->genomicstart - hit3->chroffset,
6470 		     hit5->querylength,hit3->querylength));
6471       *pair_relationship = 0;
6472       return 0;
6473     } else {
6474       debug10(printf("pair_insert_length: no overlap found, so returning %u - %u + %d + %d\n",
6475 		     hit5->genomicend - hit5->chroffset,hit3->genomicstart - hit3->chroffset,
6476 		     hit5->querylength,hit3->querylength));
6477       *pair_relationship = -1;
6478       return hit5->genomicend - hit3->genomicstart + hit5->querylength + hit3->querylength;
6479     }
6480   }
6481 }
6482 
6483 
6484 
6485 /* For unpaired ends */
6486 static Chrpos_T
pair_insert_length_unpaired(Stage3end_T hit5,Stage3end_T hit3)6487 pair_insert_length_unpaired (Stage3end_T hit5, Stage3end_T hit3) {
6488 
6489   if (hit5->effective_chrnum != hit3->effective_chrnum) {
6490     debug10(printf("pair_insert_length: hit5->plusp %d != hit3->plusp %d, so returning 0\n",
6491 		   hit5->plusp,hit3->plusp));
6492     return 0;
6493   } else if (hit5->distant_splice_p == true) {
6494     return 0;
6495   } else if (hit3->distant_splice_p == true) {
6496     return 0;
6497   } else if (hit5->high < hit3->low) {
6498     /* was hit3->low - hit5->high + hit5->querylength + hit3->querylength; */
6499     return hit3->genomicstart - hit5->genomicstart;
6500   } else if (hit3->high < hit5->low) {
6501     /* was hit5->low - hit3->high + hit5->querylength + hit3->querylength; */
6502     return hit5->genomicstart - hit3->genomicstart;
6503   } else {
6504     return hit5->querylength + hit3->querylength;
6505   }
6506 }
6507 
6508 
6509 Stage3end_T *
Stage3end_eval_and_sort_guided(int npaths,int * first_absmq,int * second_absmq,Stage3end_T guide,Stage3end_T * stage3array,char * queryuc_ptr,char * quality_string,bool displayp)6510 Stage3end_eval_and_sort_guided (int npaths, int *first_absmq, int *second_absmq, Stage3end_T guide,
6511 				Stage3end_T *stage3array, char *queryuc_ptr, char *quality_string,
6512 				bool displayp) {
6513   float maxlik, loglik;
6514   float total, q;		/* For Bayesian mapq calculation */
6515   int compute_npaths;
6516 
6517   int randomi, i;
6518   Stage3end_T temp, hit;
6519 
6520   if (npaths == 0) {
6521     /* Skip */
6522     *first_absmq = 0;
6523     *second_absmq = 0;
6524 
6525   } else if (npaths == 1) {
6526     hit = stage3array[0];
6527     hit->mapq_loglik = MAPQ_MAXIMUM_SCORE;
6528     hit->mapq_score = MAPQ_max_quality_score(quality_string,hit->querylength);
6529     hit->absmq_score = MAPQ_MAXIMUM_SCORE;
6530 
6531     if (displayp == true) {
6532       Stage3end_display_prep(hit,queryuc_ptr,/*first_read_p*/true);
6533     }
6534     *first_absmq = hit->absmq_score;
6535     *second_absmq = 0;
6536 
6537   } else {
6538     /* Compute mapq_loglik */
6539     for (i = 0; i < npaths; i++) {
6540       Stage3end_compute_mapq(stage3array[i],quality_string);
6541     }
6542 
6543     /* Compute insert_length relative to guide.  This is the only change from the unguided procedure. */
6544     for (i = 0; i < npaths; i++) {
6545       stage3array[i]->guided_insertlength = pair_insert_length_unpaired(stage3array[i],guide);
6546     }
6547 
6548     /* Sort by nmatches, then mapq */
6549     qsort(stage3array,npaths,sizeof(Stage3end_T),Stage3end_output_cmp);
6550 
6551     if (want_random_p) {
6552       /* Randomize among best alignments */
6553       i = 1;
6554       while (i < npaths && Stage3end_output_cmp(&(stage3array[i]),&(stage3array[0])) == 0) {
6555 	i++;
6556       }
6557       if (i > 1) {		/* i is number of ties */
6558 	/* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
6559 	randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
6560 	/* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
6561 	temp = stage3array[0];
6562 	stage3array[0] = stage3array[randomi];
6563 	stage3array[randomi] = temp;
6564       }
6565     }
6566 
6567     /* Enforce monotonicity */
6568     for (i = npaths - 1; i > 0; i--) {
6569       if (stage3array[i-1]->mapq_loglik < stage3array[i]->mapq_loglik) {
6570 	stage3array[i-1]->mapq_loglik = stage3array[i]->mapq_loglik;
6571       }
6572     }
6573     maxlik = stage3array[0]->mapq_loglik;
6574 
6575     /* Subtract maxlik to avoid underflow */
6576     for (i = 0; i < npaths; i++) {
6577       stage3array[i]->mapq_loglik -= maxlik;
6578     }
6579 
6580 #if 0
6581     /* Save on computation if possible */
6582     /* Not possible, since we are going to select randomly from among all paths */
6583     if (npaths < maxpaths) {
6584       compute_npaths = npaths;
6585     } else {
6586       compute_npaths = maxpaths;
6587     }
6588     if (compute_npaths < 2) {
6589       compute_npaths = 2;
6590     }
6591 #else
6592     compute_npaths = npaths;
6593 #endif
6594 
6595     /* Compute absolute mapq */
6596     for (i = 0; i < compute_npaths; i++) {
6597       loglik = stage3array[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
6598       if (loglik < 0.0) {
6599 	loglik = 0.0;
6600       }
6601       stage3array[i]->absmq_score = rint(loglik);
6602     }
6603     *first_absmq = stage3array[0]->absmq_score;
6604     *second_absmq = stage3array[1]->absmq_score;
6605 
6606 
6607     /* Compute Bayesian mapq */
6608     total = 0.0;
6609     for (i = 0; i < npaths; i++) {
6610       total += (stage3array[i]->mapq_loglik = fasterexp(stage3array[i]->mapq_loglik));
6611     }
6612 
6613     /* Obtain posterior probabilities of being true */
6614     for (i = 0; i < compute_npaths; i++) {
6615       stage3array[i]->mapq_loglik /= total;
6616     }
6617 
6618     /* Convert to Phred scores */
6619     for (i = 0; i < compute_npaths; i++) {
6620       if ((q = 1.0 - stage3array[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
6621 	stage3array[i]->mapq_score = 96;
6622       } else {
6623 	stage3array[i]->mapq_score = rint(-10.0 * log10(q));
6624       }
6625     }
6626 
6627     if (displayp == true) {
6628       /* Prepare for display */
6629       for (i = 0; i < compute_npaths; i++) {
6630 	Stage3end_display_prep(stage3array[i],queryuc_ptr,/*first_read_p*/true);
6631       }
6632     }
6633 
6634 #if 0
6635     /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
6636     if (stage3array[0]->mapq_score >= mapq_unique_score &&
6637 	stage3array[1]->mapq_score < mapq_unique_score) {
6638       for (i = 1; i < *npaths; i++) {
6639 	Stage3end_free(&(stage3array[i]));
6640       }
6641       *npaths = 1;
6642     }
6643 #endif
6644   }
6645 
6646   return stage3array;
6647 }
6648 
6649 
6650 /* Note: single-end terminals can be present with non-terminals when
6651    paired-end reads are searched for concordance, which can accumulate
6652    terminal alignments */
6653 
6654 /* Pre-final: max (max-terminal, min-other)
6655    Final: max (min-terminal, max-GMAP, min-other) */
6656 
6657 
6658 static List_T
Stage3end_optimal_score_prefinal(bool * eliminatedp,List_T hitlist,Hitlistpool_T hitlistpool,int querylength)6659 Stage3end_optimal_score_prefinal (bool *eliminatedp, List_T hitlist,
6660 				  Hitlistpool_T hitlistpool, int querylength) {
6661   List_T optimal = NULL, p, q;
6662   T hit;
6663   Substring_T substring;
6664   Junction_T junction;
6665   int n;
6666   int cutoff_level, ref_nmismatches;
6667   int minscore = querylength;
6668   int trim_querystart = 0, trim_queryend = 0, trim_querystart_0, trim_queryend_0;
6669 
6670 
6671 #ifdef DISTANT_SPLICE_SPECIAL
6672   bool shortdistance_p = false;
6673 #endif
6674 
6675 
6676   *eliminatedp = false;
6677   n = List_length(hitlist);
6678   debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));
6679 
6680   if (n <= 1) {
6681     return hitlist;
6682   }
6683 
6684   /* Use eventrim for comparing alignments.  Previously picked
6685      smallest trims, but now picking largest ones */
6686   for (p = hitlist; p != NULL; p = p->rest) {
6687     hit = (T) p->first;
6688 
6689     debug4(printf("hit %u..%u method %s, nsegments %d, nindels %d, trim_querystart: %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d.  sensedir %d\n",
6690 		  hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,Method_string(hit->method),
6691 		  hit->nsegments,hit->nindels,hit->trim_querystart,hit->trim_querystart_splicep ? " (splice)" : "",
6692 		  hit->trim_queryend,hit->trim_queryend_splicep ? " (splice)" : "",
6693 		  start_amb_length(hit),end_amb_length(hit),hit->sensedir));
6694 
6695     if (hit->trim_querystart_splicep == true) {
6696       /* Skip */
6697     } else if (hit->trim_querystart > trim_querystart) {
6698       trim_querystart = hit->trim_querystart;
6699     }
6700     if (hit->trim_queryend_splicep == true) {
6701       /* Skip */
6702     } else if (hit->trim_queryend > trim_queryend) {
6703       trim_queryend = hit->trim_queryend;
6704     }
6705   }
6706 
6707   if (trim_querystart == querylength) {
6708     trim_querystart = 0;
6709   }
6710   if (trim_queryend == querylength) {
6711     trim_queryend = 0;
6712   }
6713   debug4(printf("trim_querystart: %d, trim_queryend %d\n",trim_querystart,trim_queryend));
6714 
6715   for (p = hitlist; p != NULL; p = p->rest) {
6716     hit = (T) p->first;
6717 
6718 #ifdef CONSIDER_ENDS_IN_EVAL
6719     hit->score_eventrim = hit->trim_querystart / 8 + hit->trim_queryend / 8;
6720 #else
6721     hit->score_eventrim = 0;
6722 #endif
6723 
6724     debug4(printf("score OTHER:"));
6725 
6726     if (trim_querystart + trim_queryend >= querylength) {
6727       for (q = hit->substrings_1toN; q != NULL; q = List_next(q)) {
6728 	substring = (Substring_T) List_head(q);
6729 	hit->score_eventrim += Substring_nmismatches_bothdiff(substring);
6730       }
6731 
6732     } else {
6733       for (q = hit->substrings_1toN; q != NULL; q = List_next(q)) {
6734 	substring = (Substring_T) List_head(q);
6735 	trim_querystart_0 = trim_querystart;
6736 	trim_queryend_0 = trim_queryend;
6737 	if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
6738 	  trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
6739 	}
6740 	if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
6741 	  trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
6742 	}
6743 	hit->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
6744 	debug4(printf("  substring (%d..%d) %d.",trim_querystart,trim_queryend,
6745 		      Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
6746       }
6747     }
6748 
6749     for (q = hit->junctions_1toN; q != NULL; q = List_next(q)) {
6750       junction = (Junction_T) List_head(q);
6751       if (Junction_nindels(junction) > 0) {
6752 	hit->score_eventrim += indel_penalty_middle;
6753 	debug4(printf(" => add %d.",indel_penalty_middle));
6754       }
6755     }
6756 
6757 
6758 #if 0
6759     /* Accept a single indel */
6760 #ifdef SCORE_INDELS_EVENTRIM
6761     if (hit->hittype == INSERTION || hit->hittype == DELETION) {
6762       debugee(printf("  indel at %d",hit->indel_pos));
6763       if (hit->indel_pos > trim_querystart && hit->indel_pos < querylength - trim_queryend) {
6764 	hit->score_eventrim += indel_penalty_middle;
6765 	debug4(printf(" => add %d.",indel_penalty_middle));
6766       }
6767     }
6768 #endif
6769 #endif
6770     debug4(printf("  RESULT: %d\n",hit->score_eventrim));
6771 
6772     if (hit->score_eventrim < minscore) {
6773       minscore = hit->score_eventrim;
6774     }
6775   }
6776   debug4(printf("MINSCORE: %d\n",minscore));
6777 
6778 
6779   /* Prefinal: Use score_eventrim */
6780   debug4(printf("Stage3end_optimal_score over %d hits: minscore = %d + subopt:%d\n",
6781 		n,minscore,subopt_levels));
6782   minscore += subopt_levels;
6783   cutoff_level = minscore;
6784 
6785   for (p = hitlist; p != NULL; p = p->rest) {
6786     hit = (T) p->first;
6787 
6788     if (hit->score_eventrim > cutoff_level + SCORE_EVENTRIM_SLOP) {
6789       debug4(printf("Prefinal: Eliminating hit %p at %u..%u with score_eventrim %d > cutoff_level %d\n",
6790 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
6791 		    hit->score_eventrim,cutoff_level));
6792       Stage3end_free(&hit);
6793       *eliminatedp = true;
6794 
6795     } else {
6796       debug4(printf("Prefinal: Keeping hit %p at %u..%u with score_eventrim %d <= cutoff_level %d\n",
6797 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
6798 		    hit->score_eventrim,cutoff_level));
6799       optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
6800     }
6801   }
6802   Hitlist_free(&hitlist);
6803 
6804 
6805 #if 0
6806   /* Filter on nsegments */
6807   if (finalp == true && optimal != NULL) {
6808     hitlist = optimal;
6809     optimal = (List_T) NULL;
6810 
6811     hit = (T) hitlist->first;
6812     best_nsegments = hit->nsegments;
6813 
6814     for (p = hitlist; p != NULL; p = p->rest) {
6815       hit = (T) p->first;
6816       if (hit->nsegments < best_nsegments) {
6817 	best_nsegments = hit->nsegments;
6818       }
6819     }
6820 
6821     for (p = hitlist; p != NULL; p = p->rest) {
6822       hit = (T) p->first;
6823       if (hit->nsegments > best_nsegments + 2) {
6824 	debug4(printf("Eliminating a hit with nsegments %d\n",hit->nsegments));
6825 	Stage3end_free(&hit);
6826 	*eliminatedp = true;
6827       } else {
6828 	debug4(printf("Keeping a hit with nsegments %d, nindels %d\n",hit->nsegments,hit->nindels));
6829 	optimal = Hitlist_push(optimal,hitlitpool,(void *) hit);
6830       }
6831     }
6832 
6833     Hitlist_free(&hitlist);
6834   }
6835 #endif
6836 
6837   debug4(printf("hitlist now has %d entries\n",List_length(optimal)));
6838   return optimal;
6839 }
6840 
6841 
6842 static int
hit_position_cmp(const void * a,const void * b)6843 hit_position_cmp (const void *a, const void *b) {
6844   T x = * (T *) a;
6845   T y = * (T *) b;
6846 
6847   if (x->plusp < y->plusp) {
6848     return -1;
6849   } else if (y->plusp < x->plusp) {
6850     return +1;
6851   } else if (x->low < y->low) {
6852     return -1;
6853   } else if (y->low < x->low) {
6854     return +1;
6855   } else if (x->high > y->high) {
6856     return -1;
6857   } else if (y->high > x->high) {
6858     return +1;
6859   } else {
6860     return 0;
6861   }
6862 }
6863 
6864 static bool
hit_equal(Stage3end_T x,Stage3end_T y)6865 hit_equal (Stage3end_T x, Stage3end_T y) {
6866   List_T p, q;
6867   Substring_T substring_x, substring_y;
6868 
6869   if (x->plusp != y->plusp) {
6870     return false;		/* Different strands */
6871   } else {
6872     p = x->substrings_1toN;
6873     q = y->substrings_1toN;
6874     while (p != NULL && q != NULL) {
6875       substring_x = (Substring_T) p->first;
6876       substring_y = (Substring_T) q->first;
6877       if (Substring_equal(substring_x,substring_y) == false) {
6878 	return false;
6879       }
6880       p = List_next(p);
6881       q = List_next(q);
6882     }
6883     if (p != NULL || q != NULL) {
6884       return false;
6885     }
6886 
6887     return true;
6888   }
6889 }
6890 
6891 
6892 static bool
hit_overlap_p(T x,T y)6893 hit_overlap_p (T x, T y) {
6894   if (x->chrnum != y->chrnum) {
6895     return false;		/* Different chrnums */
6896   } else if (x->plusp != y->plusp) {
6897     return false;		/* Different strands */
6898   } else if (x->high < y->low) {
6899     return false;
6900   } else if (x->low > y->high) {
6901     return false;
6902   } else {
6903     return true;
6904   }
6905 }
6906 
#if 0
/* Disabled (compiled out by the enclosing #if 0): earlier version of
   the final pruning step.  It applies five filters in sequence, each
   one rebuilding the hit list:
     (1) refalt_nmatches_plus_spliced_trims against max - subopt_levels
     (2) ref_nmatches_plus_spliced_trims against max - subopt_levels
     (3) refalt_nmatches_to_trims against max - subopt_levels
     (4) within groups of overlapping hits: refalt_nmatches_to_trims
         below the group maximum
     (5) within groups of overlapping hits: more segments than the
         group minimum, or splice_score below the group maximum minus
         SPLICE_SCORE_SLOP
   Sets *eliminatedp to true if any hit was freed and returns the
   pruned list.  A list with 0 or 1 hits is returned unchanged. */
static List_T
Stage3end_optimal_score_final_old (bool *eliminatedp, List_T hitlist, Hitlistpool_T hitlistpool,
				   int querylength) {
  List_T optimal = NULL, p;
  T *hits, hit;
  int n, i, j, k;
  int best_nsegments;
  int best_nmatches_to_trims;
  double max_splice_score;
  int max_nmatches = 0, cutoff_level;
  /* int trim_querystart, trim_queryend, min_trim; */
  bool *eliminate, keptp;


  *eliminatedp = false;
  n = List_length(hitlist);
  debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));

  if (n <= 1) {
    return hitlist;
  }

#ifdef DEBUG4
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    printf("%p %u..%u method %s, score_eventrim %d, nmatches %d (%d to_trims)\n",
		  hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
	   Method_string(hit->method),hit->score_eventrim,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims);
  }
#endif

  /* Prune based on refalt_nmatches_plus_spliced_trims (to get the splice ends) */
  max_nmatches = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->refalt_nmatches_plus_spliced_trims > max_nmatches) {
      max_nmatches = hit->refalt_nmatches_plus_spliced_trims;
      assert(max_nmatches <= querylength);
    }
  }

  cutoff_level = max_nmatches - subopt_levels;
  debug4(printf("(1) refalt cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));

  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->refalt_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
      debug4(printf("Final (nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
		    hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
		    hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  /* Kept hits become the input of the next filter */
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Prune based on ref_nmatches_plus_spliced_trims (to get the splice ends) */
  max_nmatches = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->ref_nmatches_plus_spliced_trims > max_nmatches) {
      max_nmatches = hit->ref_nmatches_plus_spliced_trims;
      assert(max_nmatches <= querylength);
    }
  }

  /* May not want to be greedy on cutoff level here.  Might want to raise subopt_levels */
  cutoff_level = max_nmatches - subopt_levels;
  debug4(printf("(2) ref cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));

  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->ref_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
      debug4(printf("Final (nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
		    hit->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
		    hit->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Prune based on nmatches_to_trims */
  best_nmatches_to_trims = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->refalt_nmatches_to_trims > best_nmatches_to_trims) {
      best_nmatches_to_trims = hit->refalt_nmatches_to_trims;
      assert(best_nmatches_to_trims <= querylength);
    }
  }

  cutoff_level = best_nmatches_to_trims - subopt_levels;
  debug4(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));

  /* Do not allow slop for final */
  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->refalt_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
      debug4(printf("Final (nmatches_to_trims %d < %d): Eliminating hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) < cutoff_level %d\n",
		    hit->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches_to_trims %d >= %d): Keeping hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) >= cutoff_level %d\n",
		    hit->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Eliminate within loci (1): refalt_nmatches_to_trims only */
  keptp = false;
  hits = (T *) List_to_array_n(&n,hitlist);
  eliminate = (bool *) CALLOC(n,sizeof(bool));
  qsort(hits,n,sizeof(T),hit_position_cmp);
  i = 0;
  while (i < n) {
    /* Extend j over the run of sorted hits that overlap hits[i] */
    j = i+1;
    while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
      j++;
    }
    if (j - i > 1) {
      debug4(printf("Found a group from %d to %d\n",i,j));
      best_nmatches_to_trims = 0;
      for (k = i; k < j; k++) {
	hit = hits[k];
	if (hit->refalt_nmatches_to_trims > best_nmatches_to_trims) {
	  best_nmatches_to_trims = hit->refalt_nmatches_to_trims;
	}
      }
      debug4(printf("best_nmatches_to_trims %d\n",best_nmatches_to_trims));

      for (k = i; k < j; k++) {
	hit = hits[k];
	/* Do not allow slop for final */
	if (hit->refalt_nmatches_to_trims < best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/) {
	  debug4(printf("Within loci end (nmatches_to_trims): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
			hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
			hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
	  eliminate[k] = true;
	} else {
	  keptp = true;
	}
      }
    }

    i = j;
  }

  /* If no hit was marked as kept, the list passes through unchanged */
  if (keptp == false) {
    optimal = hitlist;
  } else {
    for (k = 0; k < n; k++) {
      hit = hits[k];
      if (eliminate[k] == true) {
	debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
	Stage3end_free(&hit);
	*eliminatedp = true;
      } else {
	optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
      }
    }
    Hitlist_free(&hitlist);
  }
  FREE(hits);
  FREE(eliminate);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Eliminate within loci (2): nsegments and splice score */
  keptp = false;
  hits = (T *) List_to_array_n(&n,hitlist);
  eliminate = (bool *) CALLOC(n,sizeof(bool));
  qsort(hits,n,sizeof(T),hit_position_cmp);
  i = 0;
  while (i < n) {
    j = i+1;
    while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
      j++;
    }
    if (j - i > 1) {
      debug4(printf("Found a group from %d to %d\n",i,j));
      /* Fewest segments in the group, and the highest splice_score
	 among the hits that have that many segments */
      best_nsegments = querylength;
      max_splice_score = 0.0;
      for (k = i; k < j; k++) {
	hit = hits[k];
	if (hit->nsegments < best_nsegments) {
	  best_nsegments = hit->nsegments;
	  max_splice_score = hit->splice_score;

	} else if (hit->nsegments == best_nsegments) {
	  if (hit->splice_score > max_splice_score) {
	    max_splice_score = hit->splice_score;
	  }
	}
      }
      debug8(printf("best_nsegments %d, max_splice_score %f\n",
		    best_nsegments,max_splice_score));

      for (k = i; k < j; k++) {
	hit = hits[k];
	if (hit->nsegments > best_nsegments) {
	debug4(printf("Within loci end (nsegments %d > %d): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
		      hit->nsegments,best_nsegments,
		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
	  eliminate[k] = true;

	} else if (hit->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
	debug4(printf("Within loci end (splice score w/slop %f < %f): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
		      hit->splice_score,max_splice_score,
		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
	  eliminate[k] = true;

	} else {
	  keptp = true;
	}
      }
    }

    i = j;
  }

  if (keptp == false) {
    optimal = hitlist;
  } else {
    for (k = 0; k < n; k++) {
      hit = hits[k];
      if (eliminate[k] == true) {
	debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
	Stage3end_free(&hit);
	*eliminatedp = true;
      } else {
	optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
      }
    }
    Hitlist_free(&hitlist);
  }
  FREE(hits);
  FREE(eliminate);
  hitlist = optimal;
  /* optimal = (List_T) NULL; */

#if 0
  /* Filter on trim amount */
  optimal = (List_T) NULL;
  min_trim = querylength;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    if (hit->trim_querystart_splicep == true) {
      /* Skip */
      trim_querystart = 0;
    } else {
      trim_querystart = hit->trim_querystart;
    }
    if (hit->trim_queryend_splicep == true) {
      /* Skip */
      trim_queryend = 0;
    } else {
      trim_queryend = hit->trim_queryend;
    }

    if (trim_querystart + trim_queryend < min_trim) {
      min_trim = trim_querystart + trim_queryend;
    }
  }

  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    if (hit->trim_querystart_splicep == true) {
      /* Skip */
      trim_querystart = 0;
    } else {
      trim_querystart = hit->trim_querystart;
    }
    if (hit->trim_queryend_splicep == true) {
      /* Skip */
      trim_queryend = 0;
    } else {
      trim_queryend = hit->trim_queryend;
    }

    if (trim_querystart + trim_queryend > min_trim) {
      debug4(printf("Final: Eliminating hit %p at %u..%u with trim %d + %d > min_trim %d\n",
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    trim_querystart,trim_queryend,min_trim));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final: Keeping hit %p at %u..%u with trim %d + %d == min_trim %d\n",
		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
		    trim_querystart,trim_queryend,min_trim));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
#endif


  debug4(printf("Exiting Stage3end_optimal_score_final with %d hits\n",List_length(hitlist)));
  return hitlist;
}
#endif
7252 
7253 
7254 static List_T
Stage3end_optimal_score_final(bool * eliminatedp,List_T hitlist,Hitlistpool_T hitlistpool,int querylength)7255 Stage3end_optimal_score_final (bool *eliminatedp, List_T hitlist, Hitlistpool_T hitlistpool,
7256 			       int querylength) {
7257   List_T optimal = NULL, p;
7258   T hit;
7259   int n;
7260   int max_adj_nmatches, score;
7261   int best_nmatches_to_trims;
7262   int cutoff_level;
7263   /* int trim_querystart, trim_queryend, min_trim; */
7264 
7265 
7266   *eliminatedp = false;
7267   n = List_length(hitlist);
7268   debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));
7269 
7270   if (n <= 1) {
7271     return hitlist;
7272   }
7273 
7274 #ifdef DEBUG4
7275   for (p = hitlist; p != NULL; p = p->rest) {
7276     hit = (Stage3end_T) p->first;
7277     printf("%p %u..%u method %s, score_eventrim %d, nmatches %d (%d to_trims), refalt score %d\n",
7278 		  hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7279 	   Method_string(hit->method),hit->score_eventrim,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
7280 	   hit->refalt_score_overall);
7281   }
7282   printf("\n");
7283 #endif
7284 
7285   /* (1) Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
7286   max_adj_nmatches = 0;
7287   for (p = hitlist; p != NULL; p = p->rest) {
7288     hit = (Stage3end_T) p->first;
7289     if ((score = hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall) > max_adj_nmatches) {
7290       max_adj_nmatches = score;
7291     }
7292   }
7293 
7294   cutoff_level = max_adj_nmatches - subopt_levels;
7295   debug4(printf("(1) refalt cutoff level %d = max_adj_nmatches %d - subopt_levels %d\n",
7296 		cutoff_level,max_adj_nmatches,subopt_levels));
7297 
7298   for (p = hitlist; p != NULL; p = List_next(p)) {
7299     hit = (Stage3end_T) p->first;
7300 
7301     if (hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall < cutoff_level /*- NMATCHES_SLOP*/) {
7302       debug4(printf("Final (adj nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
7303 		    hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_within_trims,cutoff_level /*- NMATCHES_SLOP*/,
7304 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7305 		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
7306       Stage3end_free(&hit);
7307       *eliminatedp = true;
7308 
7309     } else {
7310       debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
7311 		    hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
7312 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7313 		    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
7314       optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
7315     }
7316   }
7317   Hitlist_free(&hitlist);
7318   hitlist = optimal;
7319   optimal = (List_T) NULL;
7320 
7321 
7322   /* (2) Prune based on ref_nmatches_to_trims */
7323   best_nmatches_to_trims = 0;
7324   for (p = hitlist; p != NULL; p = p->rest) {
7325     hit = (Stage3end_T) p->first;
7326     if (hit->ref_nmatches_to_trims > best_nmatches_to_trims) {
7327       best_nmatches_to_trims = hit->ref_nmatches_to_trims;
7328       assert(best_nmatches_to_trims <= querylength);
7329     }
7330   }
7331 
7332   cutoff_level = best_nmatches_to_trims - subopt_levels;
7333   debug4(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
7334 
7335   /* Do not allow slop for final */
7336   for (p = hitlist; p != NULL; p = List_next(p)) {
7337     hit = (Stage3end_T) p->first;
7338 
7339     if (hit->ref_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
7340       debug4(printf("Final (nmatches_to_trims %d < %d): Eliminating hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) < cutoff_level %d\n",
7341 		    hit->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
7342 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7343 		    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
7344       Stage3end_free(&hit);
7345       *eliminatedp = true;
7346 
7347     } else {
7348       debug4(printf("Final (nmatches_to_trims %d >= %d): Keeping hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) >= cutoff_level %d\n",
7349 		    hit->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
7350 		    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7351 		    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
7352       optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
7353     }
7354   }
7355   Hitlist_free(&hitlist);
7356   hitlist = optimal;
7357   /* optimal = (List_T) NULL; */
7358 
7359   /* Shouldn't need to eliminate within loci, since that was done during prefinal */
7360 
7361   debug4(printf("Exiting Stage3end_optimal_score_final with %d hits\n",List_length(hitlist)));
7362   return hitlist;
7363 }
7364 
7365 
7366 
7367 List_T
Stage3end_optimal_score(List_T hitlist,Hitlistpool_T hitlistpool,int querylength,bool finalp)7368 Stage3end_optimal_score (List_T hitlist, Hitlistpool_T hitlistpool, int querylength, bool finalp) {
7369   List_T optimal;
7370   bool eliminatedp;
7371 
7372   if (finalp == false) {
7373     optimal = Stage3end_optimal_score_prefinal(&eliminatedp,hitlist,hitlistpool,querylength);
7374     while (eliminatedp == true) {
7375       optimal = Stage3end_optimal_score_prefinal(&eliminatedp,optimal,hitlistpool,querylength);
7376     }
7377 
7378   } else {
7379     optimal = Stage3end_optimal_score_final(&eliminatedp,hitlist,hitlistpool,querylength);
7380     while (eliminatedp == true) {
7381       optimal = Stage3end_optimal_score_final(&eliminatedp,optimal,hitlistpool,querylength);
7382     }
7383   }
7384 
7385   return optimal;
7386 }
7387 
7388 
7389 static void
unalias_circular(T hit)7390 unalias_circular (T hit) {
7391   Chrpos_T chrlength = hit->chrlength;
7392   List_T p;
7393   Substring_T substring;
7394 
7395   assert(hit->circularalias == +1);
7396   debug12(printf("Calling unalias_circular on substrings\n"));
7397   for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
7398     substring = (Substring_T) List_head(p);
7399     Substring_unalias_circular(substring);
7400   }
7401 
7402   /* Doesn't fix hitpair->low and hitpair->high */
7403   hit->genomicstart -= chrlength;
7404   hit->genomicend -= chrlength;
7405   hit->low -= chrlength;
7406   hit->high -= chrlength;
7407 
7408   hit->circularalias = -1;
7409 
7410   return;
7411 }
7412 
7413 
#if 0
/* Disabled (compiled out by the enclosing #if 0): walks hitlist and
   calls unalias_circular on every hit whose circularalias is +1.
   Returns the same list it was given. */
List_T
Stage3end_unalias_circular (List_T hitlist) {
  List_T p;
  T hit;

  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    if (hit->circularalias == +1) {
      unalias_circular(hit);
    }
  }

  return hitlist;
}
#endif
7430 
7431 List_T
Stage3end_remove_circular_alias(List_T hitlist,Hitlistpool_T hitlistpool)7432 Stage3end_remove_circular_alias (List_T hitlist, Hitlistpool_T hitlistpool) {
7433   List_T newlist = NULL, p;
7434   T hit;
7435 
7436   debug12(printf("Calling Stage3end_remove_circular_alias on %d hits\n",List_length(hitlist)));
7437   for (p = hitlist; p != NULL; p = p->rest) {
7438     hit = (T) p->first;
7439 
7440     if (hit->circularalias == +1) {
7441       /* First, try to salvage alias +1 */
7442       unalias_circular(hit);
7443     }
7444 
7445     if (hit->chrnum == 0) {
7446       /* Translocation */
7447       newlist = Hitlist_push(newlist,hitlistpool,(void *) hit);
7448 
7449     } else if (hit->low - hit->chroffset >= hit->chrlength) {
7450       /* All in circular alias */
7451       debug12(printf("Freeing hit because all is in circular alias\n"));
7452       Stage3end_free(&hit);
7453 
7454     } else {
7455       newlist = Hitlist_push(newlist,hitlistpool,(void *) hit);
7456     }
7457   }
7458 
7459   Hitlist_free(&hitlist);
7460   return newlist;
7461 }
7462 
7463 
#if 0
/* Disabled (compiled out by the enclosing #if 0): returns the number of
   hits in hitlist whose score equals the lowest score seen.  minscore
   starts at querylength, so hits that score above querylength are
   never counted. */
int
Stage3end_noptimal (List_T hitlist, int querylength) {
  int noptimal;
  List_T p;
  T hit;
  int minscore = querylength;

  noptimal = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    /* A new minimum restarts the count */
    if (hit->score < minscore) {
      minscore = hit->score;
      noptimal = 0;
    }
    if (hit->score == minscore) {
      noptimal++;
    }
  }

  return noptimal;
}
#endif
7487 
7488 
7489 static Univcoord_T
normalize_coord(Univcoord_T orig,int circularalias,Chrpos_T chrlength)7490 normalize_coord (Univcoord_T orig, int circularalias, Chrpos_T chrlength) {
7491   if (circularalias == +1) {
7492     return orig - chrlength;
7493   } else {
7494     return orig;
7495   }
7496 }
7497 
7498 
7499 
7500 static int
duplicate_sort_cmp(const void * a,const void * b)7501 duplicate_sort_cmp (const void *a, const void *b) {
7502   int cmp;
7503   T x = * (T *) a;
7504   T y = * (T *) b;
7505   Univcoord_T x_genomicstart, y_genomicstart;
7506   Univcoord_T x_genomicend, y_genomicend;
7507   List_T p, q;
7508   Substring_T x_substring, y_substring;
7509 
7510   if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
7511     if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
7512 	alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
7513       /* The primary regions overlap */
7514       return 0;
7515     } else if (alias_starts[x->chrnum] < alias_starts[y->chrnum]) {
7516       return -1;
7517     } else if (alias_starts[y->chrnum] < alias_starts[x->chrnum]) {
7518       return +1;
7519     } else if (alias_ends[x->chrnum] < alias_ends[y->chrnum]) {
7520       return -1;
7521     } else if (alias_ends[y->chrnum] < alias_ends[x->chrnum]) {
7522       return +1;
7523     } else {
7524       return 0;
7525     }
7526 
7527   } else if (altlocp[x->chrnum] == true) {
7528     if (y->genomicend >= alias_starts[x->chrnum] &&
7529 	y->genomicstart <= alias_ends[x->chrnum]) {
7530       /* y overlaps with the primary region for x */
7531       return +1;		/* Put primary region first */
7532     }
7533     /* Don't overlap, so fall through to rest of procedure */
7534 
7535   } else if (altlocp[y->chrnum] == true) {
7536     if (alias_ends[y->chrnum] >= x->genomicstart &&
7537 	alias_starts[y->chrnum] <= x->genomicend) {
7538       /* x overlaps with the primary region for y */
7539       return -1;		/* Put primary region first */
7540     }
7541     /* Don't overlap, so fall through to rest of procedure */
7542   }
7543 
7544 
7545   x_genomicstart = normalize_coord(x->genomicstart,x->circularalias,x->chrlength);
7546   x_genomicend = normalize_coord(x->genomicend,x->circularalias,x->chrlength);
7547 
7548   y_genomicstart = normalize_coord(y->genomicstart,y->circularalias,y->chrlength);
7549   y_genomicend = normalize_coord(y->genomicend,y->circularalias,y->chrlength);
7550 
7551 
7552   if (x_genomicstart < y_genomicstart) {
7553     return -1;
7554   } else if (x_genomicstart > y_genomicstart) {
7555     return +1;
7556   } else if (x->hittype < y->hittype) {
7557     return -1;
7558   } else if (x->hittype > y->hittype) {
7559     return +1;
7560   } else if (x_genomicend < y_genomicend) {
7561     return -1;
7562   } else if (x_genomicend > y_genomicend) {
7563     return +1;
7564 
7565     /* sensedir is relevant for transcriptome-guided alignment, with overlapping genes */
7566   } else if (x->sensedir > y->sensedir) {
7567     return -1;
7568   } else if (y->sensedir > x->sensedir) {
7569     return +1;
7570 
7571   } else {
7572     for (p = x->substrings_1toN, q = y->substrings_1toN; p != NULL && q != NULL; p = List_next(p), q = List_next(q)) {
7573       x_substring = (Substring_T) List_head(p);
7574       y_substring = (Substring_T) List_head(q);
7575       if ((cmp = Substring_compare(x_substring,y_substring,x->circularalias,y->circularalias,x->chrlength,y->chrlength)) != 0) {
7576 	return cmp;
7577       }
7578     }
7579     if (p == NULL && q != NULL) {
7580       return -1;
7581     } else if (p != NULL && q == NULL) {
7582       return +1;
7583     }
7584 
7585 #if 0
7586     /* Need to change to search on junctions */
7587     if (x->indel_low < y->indel_low) {
7588       return -1;
7589     } else if (y->indel_low < x->indel_low) {
7590       return +1;
7591     }
7592 #endif
7593 
7594     return 0;
7595   }
7596 }
7597 
7598 /* Same as duplicate_sort_cmp, except for indel_low */
7599 static int
duplicate_equiv_cmp(const void * a,const void * b)7600 duplicate_equiv_cmp (const void *a, const void *b) {
7601   int cmp;
7602   T x = * (T *) a;
7603   T y = * (T *) b;
7604   List_T p, q;
7605   Substring_T x_substring, y_substring;
7606 
7607   Univcoord_T x_genomicstart, x_genomicend, y_genomicstart, y_genomicend;
7608 
7609   if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
7610     if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
7611 	alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
7612       /* The primary regions overlap */
7613       return 0;
7614     }
7615 
7616   } else if (altlocp[x->chrnum] == true) {
7617     if (y->genomicend >= alias_starts[x->chrnum] &&
7618 	y->genomicstart <= alias_ends[x->chrnum]) {
7619       /* y overlaps with the primary region for x */
7620       return 0;
7621     }
7622 
7623   } else if (altlocp[y->chrnum] == true) {
7624     if (alias_ends[y->chrnum] >= x->genomicstart &&
7625 	alias_starts[y->chrnum] <= x->genomicend) {
7626       /* x overlaps with the primary region for y */
7627       return 0;
7628     }
7629   }
7630 
7631   x_genomicstart = normalize_coord(x->genomicstart,x->circularalias,x->chrlength);
7632   x_genomicend = normalize_coord(x->genomicend,x->circularalias,x->chrlength);
7633 
7634   y_genomicstart = normalize_coord(y->genomicstart,y->circularalias,y->chrlength);
7635   y_genomicend = normalize_coord(y->genomicend,y->circularalias,y->chrlength);
7636 
7637   if (x_genomicstart < y_genomicstart) {
7638     return -1;
7639   } else if (x_genomicstart > y_genomicstart) {
7640     return +1;
7641 #if 0
7642   } else if (x->hittype < y->hittype) {
7643     return -1;
7644   } else if (x->hittype > y->hittype) {
7645     return +1;
7646 #endif
7647   } else if (x_genomicend < y_genomicend) {
7648     return -1;
7649   } else if (x_genomicend > y_genomicend) {
7650     return +1;
7651 
7652     /* sensedir is relevant for transcriptome-guided alignment, with overlapping genes */
7653   } else if (x->sensedir > y->sensedir) {
7654     return -1;
7655   } else if (y->sensedir > x->sensedir) {
7656     return +1;
7657 
7658   } else {
7659     for (p = x->substrings_1toN, q = y->substrings_1toN; p != NULL && q != NULL; p = List_next(p), q = List_next(q)) {
7660       x_substring = (Substring_T) List_head(p);
7661       y_substring = (Substring_T) List_head(q);
7662       if ((cmp = Substring_compare(x_substring,y_substring,x->circularalias,y->circularalias,x->chrlength,y->chrlength)) != 0) {
7663 	return cmp;
7664       }
7665     }
7666     if (p == NULL && q != NULL) {
7667       return -1;
7668     } else if (p != NULL && q == NULL) {
7669       return +1;
7670     } else {
7671       return 0;
7672     }
7673   }
7674 }
7675 
7676 
#if defined(DEBUG0) || defined(DEBUG4)
/* Debugging aid: writes one token per entry of hit->substrings_1toN to
   stdout, either "NA " for a NULL entry or
   "#<chrnum>:<alignstart_trim>..<alignend_trim> ". */
static void
Stage3end_print_substrings (Stage3end_T hit) {
  List_T cell = hit->substrings_1toN;
  Substring_T substring;

  while (cell != NULL) {
    substring = (Substring_T) List_head(cell);
    if (substring != NULL) {
      printf("#%d:%llu..%llu ",
             Substring_chrnum(substring),
             (unsigned long long) Substring_alignstart_trim(substring),
             (unsigned long long) Substring_alignend_trim(substring));
    } else {
      printf("NA ");
    }
    cell = List_next(cell);
  }
}
#endif
7696 
7697 
7698 const Except_T Duplicate_Pairing = { "Duplicates both seen in pairing" };
7699 
7700 List_T
Stage3end_remove_duplicates(List_T hitlist,Hitlistpool_T hitlistpool)7701 Stage3end_remove_duplicates (List_T hitlist, Hitlistpool_T hitlistpool) {
7702 #ifdef DEBUG4
7703   List_T p;
7704 #endif
7705   T x, y, *hits;
7706   int n, usedi, i, j, k;
7707   bool *eliminate, eliminatep;
7708 
7709   debug4(printf("Entered Stage3end_remove_duplicates with %d hits\n",List_length(hitlist)));
7710   if ((n = List_length(hitlist)) == 0) {
7711     return (List_T) NULL;
7712   } else {
7713 #ifdef USE_ALLOCA_FOR_HITS
7714     eliminate = (bool *) CALLOCA(n,sizeof(bool));
7715     hits = (T *) MALLOCA(n * sizeof(T));
7716     List_fill_array((void **) hits,hitlist); /* hitlist is a return value */
7717 #else
7718     eliminate = (bool *) CALLOC(n,sizeof(bool));
7719     hits = (T *) List_to_array(hitlist,NULL);
7720 #endif
7721   }
7722 
7723 
7724   /* By equivalence */
7725   debug4(printf("Stage3end_remove_duplicates: checking %d hits by equivalence class\n",n));
7726   qsort(hits,n,sizeof(T),duplicate_sort_cmp);
7727 
7728   debug4(
7729 	 for (i = 0; i < n; i++) {
7730 	   x = hits[i];
7731 	   printf("  Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d, sense %d ",
7732 		  i,Method_string(x->method),x,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7733 		  x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,x->sensedir);
7734 	   Stage3end_print_substrings(x);
7735 	   if (x->transcripts != NULL) {
7736 	     Transcript_print_list(x->transcripts);
7737 	   }
7738 	   printf("\n");
7739 	 }
7740 	 );
7741 
7742   eliminatep = false;
7743   i = 0;
7744   while (i < n) {
7745     j = i+1;
7746     while (j < n && duplicate_equiv_cmp(&(hits[j]),&(hits[i])) == 0) {
7747       j++;
7748     }
7749 
7750     if (j > i+1) {
7751       debug4(printf("Equivalence class #%d through #%d.  ",i,j-1));
7752 
7753       x = hits[i];
7754       if (x->paired_usedp == true) {
7755 	usedi = i;
7756       } else {
7757 	usedi = -1;
7758       }
7759 
7760       for (k = i+1; k < j; k++) {
7761 	y = hits[k];
7762 	if (y->paired_usedp == true) {
7763 	  if (usedi >= 0) {
7764 	    debug4(printf("  #%d equivalent to #%d and both used (%p and %p)\n",k,usedi,hits[k],hits[usedi]));
7765 #if 0
7766 	    /* This doesn't matter anymore.  Example from NM_001033853:
7767 	       TTGCCCTTGGTCACCCCGATGACGTCGATCATCTCATCCTGCCCAAACACTTGGTTCACAGGTACCTGCTGCTCA
7768 	       AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
7769 	    fprintf(stderr,"Duplicates of Stage3end_T both seen\n");
7770 #if 0
7771 	    /* No longer providing queryseq1 and queryseq2 */
7772 	    Shortread_print_query_pairedend_fasta(stderr,queryseq1,queryseq2,
7773 						  /*invert_first_p*/false,/*invert_second_p*/true);
7774 #endif
7775 	    Except_raise(&Duplicate_Pairing, __FILE__, __LINE__);
7776 #endif
7777 	  } else {
7778 	    usedi = k;
7779 	  }
7780 	}
7781       }
7782 
7783       if (usedi < 0) {
7784 	debug4(printf("None used yet so eliminating #%d through #%d\n",i+1,j-1));
7785 	for (k = i+1; k < j; k++) {
7786 	  y = hits[k];
7787 	  if (y->transcripts != NULL) {
7788 	    x->transcripts = List_append(y->transcripts,x->transcripts);
7789 	    y->transcripts = (List_T) NULL;
7790 	  }
7791 	  eliminate[k] = true;
7792 	  eliminatep = true;
7793 	}
7794       } else {
7795 	debug4(printf("One used already so eliminating all but #%d\n",usedi));
7796 	for (k = i; k < j; k++) {
7797 	  if (k != usedi) {
7798 	    y = hits[k];
7799 	    if (y->transcripts != NULL) {
7800 	      x->transcripts = List_append(y->transcripts,x->transcripts);
7801 	      y->transcripts = (List_T) NULL;
7802 	    }
7803 	    eliminate[k] = true;
7804 	    eliminatep = true;
7805 	  }
7806 	}
7807       }
7808     }
7809 
7810     i = j;
7811   }
7812 
7813 
7814 #if 0
7815   nkept = 0;
7816   for (i = 0; i < n; i++) {
7817     if (eliminate[i] == false) {
7818       nkept++;
7819     }
7820   }
7821   if (nkept == 0) {
7822     /* All entries eliminated one another, so keep the first one */
7823     eliminate[0] = false;
7824   }
7825 #endif
7826 
7827   if (eliminatep == false) {
7828     debug4(printf("No eliminations, so hitlist is unchanged\n"));
7829   } else {
7830     Hitlist_free(&hitlist);
7831     for (i = n-1; i >= 0; i--) {
7832       x = hits[i];
7833       if (eliminate[i] == false) {
7834 #ifdef DEBUG4
7835 	printf("  Keeping #%d at chr #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
7836 	       i,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7837 	       x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
7838 	Stage3end_print_substrings(x);
7839 	if (x->transcripts != NULL) {
7840 	  Transcript_print_nums(x->transcripts);
7841 	}
7842 	printf("\n");
7843 #endif
7844 	hitlist = Hitlist_push(hitlist,hitlistpool,(void *) x);
7845 
7846       } else {
7847 #ifdef DEBUG4
7848 	printf("  Eliminating #%d at chr #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
7849 	       i,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7850 	       x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
7851 	Stage3end_print_substrings(x);
7852 	if (x->transcripts != NULL) {
7853 	  Transcript_print_nums(x->transcripts);
7854 	}
7855 	printf("\n");
7856 #endif
7857 	Stage3end_free(&x);
7858       }
7859     }
7860   }
7861 
7862 #ifdef USE_ALLOCA_FOR_HITS
7863   FREEA(hits);
7864   FREEA(eliminate);
7865 #else
7866   FREE(hits);
7867   FREE(eliminate);
7868 #endif
7869 
7870 #ifdef DEBUG4
7871   for (p = hitlist, i = 0; p != NULL; p = p->rest, i++) {
7872     x = (T) p->first;
7873     printf("  Final %d: #%d:%u..%u (plusp = %d, sensedir = %d) ",
7874 	   i,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp,x->sensedir);
7875     Stage3end_print_substrings(x);
7876     if (x->transcripts != NULL) {
7877       Transcript_print_nums(x->transcripts);
7878     }
7879     printf("\n");
7880   }
7881 #endif
7882 
7883   debug4(printf("Exited Stage3end_remove_duplicates with %d hits\n",List_length(hitlist)));
7884   return hitlist;
7885 }
7886 
7887 
7888 
7889 T *
Stage3end_remove_duplicates_array(int * nunique,List_T * duplicates,T * hits,int nhits,Hitlistpool_T hitlistpool)7890 Stage3end_remove_duplicates_array (int *nunique, List_T *duplicates, T *hits, int nhits,
7891 				   Hitlistpool_T hitlistpool) {
7892   T *unique, *out, x, y;
7893   int usedi, i, j, k;
7894   bool *eliminate, eliminatep;
7895 
7896   debug4(printf("Entered Stage3end_remove_duplicates_array with %d hits\n",nhits));
7897   if (nhits == 0) {
7898     *nunique = 0;
7899     return (T *) NULL;
7900 
7901   } else {
7902     eliminate = (bool *) CALLOC(nhits,sizeof(bool));
7903   }
7904 
7905 
7906   /* By equivalence */
7907   debug4(printf("Stage3end_remove_duplicates_array: checking %d hits by equivalence class\n",nhits));
7908   qsort(hits,nhits,sizeof(T),duplicate_sort_cmp);
7909 
7910   debug4(
7911 	 for (i = 0; i < nhits; i++) {
7912 	   x = hits[i];
7913 	   printf("  Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d, sense %d ",
7914 		  i,Method_string(x->method),x,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
7915 		  x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,x->sensedir);
7916 	   Stage3end_print_substrings(x);
7917 	   if (x->transcripts != NULL) {
7918 	     Transcript_print_list(x->transcripts);
7919 	   }
7920 	   printf("\n");
7921 	 }
7922 	 );
7923 
7924   eliminatep = false;
7925   i = 0;
7926   while (i < nhits) {
7927     j = i+1;
7928     while (j < nhits && duplicate_equiv_cmp(&(hits[j]),&(hits[i])) == 0) {
7929       j++;
7930     }
7931 
7932     if (j > i+1) {
7933       debug4(printf("Equivalence class #%d through #%d.  ",i,j-1));
7934 
7935       x = hits[i];
7936       if (x->paired_usedp == true) {
7937 	usedi = i;
7938       } else {
7939 	usedi = -1;
7940       }
7941 
7942       for (k = i+1; k < j; k++) {
7943 	y = hits[k];
7944 	if (y->paired_usedp == true) {
7945 	  if (usedi >= 0) {
7946 	    debug4(printf("  #%d equivalent to #%d and both used (%p and %p)\n",k,usedi,hits[k],hits[usedi]));
7947 #if 0
7948 	    /* This doesn't matter anymore.  Example from NM_001033853:
7949 	       TTGCCCTTGGTCACCCCGATGACGTCGATCATCTCATCCTGCCCAAACACTTGGTTCACAGGTACCTGCTGCTCA
7950 	       AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
7951 	    fprintf(stderr,"Duplicates of Stage3end_T both seen\n");
7952 #if 0
7953 	    /* No longer providing queryseq1 and queryseq2 */
7954 	    Shortread_print_query_pairedend_fasta(stderr,queryseq1,queryseq2,
7955 						  /*invert_first_p*/false,/*invert_second_p*/true);
7956 #endif
7957 	    Except_raise(&Duplicate_Pairing, __FILE__, __LINE__);
7958 #endif
7959 	  } else {
7960 	    usedi = k;
7961 	  }
7962 	}
7963       }
7964 
7965       if (usedi < 0) {
7966 	debug4(printf("None used yet so eliminating #%d through #%d\n",i+1,j-1));
7967 	for (k = i+1; k < j; k++) {
7968 	  y = hits[k];
7969 	  if (y->transcripts != NULL) {
7970 	    x->transcripts = List_append(y->transcripts,x->transcripts);
7971 	    y->transcripts = (List_T) NULL;
7972 	  }
7973 	  eliminate[k] = true;
7974 	  eliminatep = true;
7975 	}
7976       } else {
7977 	debug4(printf("One used already so eliminating all but #%d\n",usedi));
7978 	for (k = i; k < j; k++) {
7979 	  if (k != usedi) {
7980 	    y = hits[k];
7981 	    if (y->transcripts != NULL) {
7982 	      x->transcripts = List_append(y->transcripts,x->transcripts);
7983 	      y->transcripts = (List_T) NULL;
7984 	    }
7985 	    eliminate[k] = true;
7986 	    eliminatep = true;
7987 	  }
7988 	}
7989       }
7990     }
7991 
7992     i = j;
7993   }
7994 
7995 
7996 #if 0
7997   nkept = 0;
7998   for (i = 0; i < nhits; i++) {
7999     if (eliminate[i] == false) {
8000       nkept++;
8001     }
8002   }
8003   if (nkept == 0) {
8004     /* All entries eliminated one another, so keep the first one */
8005     eliminate[0] = false;
8006   }
8007 #endif
8008 
8009   if (eliminatep == false) {
8010     debug4(printf("No eliminations, so hits are unchanged\n"));
8011     unique = hits;
8012     *nunique = nhits;
8013 
8014   } else {
8015     /* Caller needs (*nunique)+1, but since we are guaranteed to have one elimination, nhits will suffice */
8016     out = unique = (T *) MALLOC(nhits*sizeof(T));
8017 
8018     for (i = nhits-1; i >= 0; i--) {
8019       x = hits[i];
8020       if (eliminate[i] == false) {
8021 #ifdef DEBUG4
8022 	printf("  Keeping #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
8023 	       x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
8024 	       x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
8025 	Stage3end_print_substrings(x);
8026 	if (x->transcripts != NULL) {
8027 	  Transcript_print_nums(x->transcripts);
8028 	}
8029 	printf("\n");
8030 #endif
8031 	*out++ = x;
8032 
8033       } else {
8034 #ifdef DEBUG4
8035 	printf("  Eliminating #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
8036 	       x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
8037 	       x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
8038 	Stage3end_print_substrings(x);
8039 	if (x->transcripts != NULL) {
8040 	  Transcript_print_nums(x->transcripts);
8041 	}
8042 	printf("\n");
8043 #endif
8044 	/* Stage3end_free(&x); -- Cannot free, because newladder and ladder might share this hit */
8045 	*duplicates = Hitlist_push(*duplicates,hitlistpool,(void *) x);
8046       }
8047     }
8048 
8049     *nunique = out - unique;
8050     FREE(hits);
8051   }
8052 
8053   FREE(eliminate);
8054 
8055 #ifdef DEBUG4
8056   for (i = 0; i < *nunique; i++) {
8057     x = unique[i];
8058     printf("  Final %d: #%d:%u..%u (plusp = %d, sensedir = %d) ",
8059 	   i,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp,x->sensedir);
8060     Stage3end_print_substrings(x);
8061     if (x->transcripts != NULL) {
8062       Transcript_print_nums(x->transcripts);
8063     }
8064     printf("\n");
8065   }
8066 #endif
8067 
8068   debug4(printf("Exited Stage3end_remove_duplicates_array with %d hits\n",*nunique));
8069   return unique;
8070 }
8071 
8072 
8073 
#if 0
/* Currently compiled out.  Returns true when the list begins, or ends,
   with a run of more than one substring for which Substring_ambiguous_p
   is true.  The tail is examined by reversing the list in place with
   List_reverse and reversing it back afterwards.  Each scan stops at the
   end of the list, so a list made up entirely of ambiguous substrings
   (or an empty list) no longer walks past the last cell. */
static bool
extra_ambiguous_ends_p (List_T substrings) {
  int nambiguous;
  List_T p;

  /* Leading run */
  nambiguous = 0;
  for (p = substrings;
       p != NULL && Substring_ambiguous_p((Substring_T) List_head(p)) == true;
       p = List_next(p)) {
    nambiguous += 1;
  }
  if (nambiguous > 1) {
    return true;
  }

  /* Trailing run, scanned on the reversed list */
  substrings = List_reverse(substrings);

  nambiguous = 0;
  for (p = substrings;
       p != NULL && Substring_ambiguous_p((Substring_T) List_head(p)) == true;
       p = List_next(p)) {
    nambiguous += 1;
  }

  substrings = List_reverse(substrings);

  if (nambiguous > 1) {
    return true;
  } else {
    return false;
  }
}
#endif
8108 
8109 
#if 0
/* Currently compiled out.  Walks hits and frees (Stage3end_free) every
   hit whose trim_querystart + trim_queryend reaches reject_trimlength;
   the remaining hits are pushed onto a new list built from hitlistpool.
   The input list is released with Hitlist_free and the new list is
   returned. */
List_T
Stage3end_reject_trimlengths (List_T hits, Hitlistpool_T hitlistpool) {
  List_T kept = NULL;
  List_T cell = hits;
  T candidate;

  while (cell != NULL) {
    candidate = (T) cell->first;
    if (candidate->trim_querystart + candidate->trim_queryend >= reject_trimlength) {
      Stage3end_free(&candidate);
    } else {
      kept = Hitlist_push(kept,hitlistpool,(void *) candidate);
    }
    cell = cell->rest;
  }

  Hitlist_free(&hits);
  return kept;
}
#endif
8129 
8130 
8131 /* Used for eliminating exact duplicates.  Also sorts secondarily by hittype. */
8132 static int
hit_sort_cmp(const void * a,const void * b)8133 hit_sort_cmp (const void *a, const void *b) {
8134   Stage3end_T x = * (Stage3end_T *) a;
8135   Stage3end_T y = * (Stage3end_T *) b;
8136 
8137   debug4(printf("Comparing %s: #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d with %s: #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d\n",
8138 		Method_string(x->method),x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
8139 		x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,
8140 		Method_string(y->method),y->chrnum,y->genomicstart-y->chroffset,y->genomicend-y->chroffset,
8141 		y->circularalias,y->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,y->refalt_score_within_trims));
8142 
8143   if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
8144     if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
8145 	alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
8146       /* The primary regions overlap */
8147       return 0;
8148     } else if (alias_starts[x->chrnum] < alias_starts[y->chrnum]) {
8149       return -1;
8150     } else if (alias_starts[y->chrnum] < alias_starts[x->chrnum]) {
8151       return +1;
8152     } else if (alias_ends[x->chrnum] < alias_ends[y->chrnum]) {
8153       return -1;
8154     } else if (alias_ends[y->chrnum] < alias_ends[x->chrnum]) {
8155       return +1;
8156     } else {
8157       return 0;
8158     }
8159 
8160   } else if (altlocp[x->chrnum] == true) {
8161     if (y->genomicend >= alias_starts[x->chrnum] &&
8162 	y->genomicstart <= alias_ends[x->chrnum]) {
8163       /* y overlaps with the primary region for x */
8164       return +1;		/* Put primary region first */
8165     }
8166     /* Don't overlap, so fall through to rest of procedure */
8167 
8168   } else if (altlocp[y->chrnum] == true) {
8169     if (alias_ends[y->chrnum] >= x->genomicstart &&
8170 	alias_starts[y->chrnum] <= x->genomicend) {
8171       /* x overlaps with the primary region for y */
8172       return -1;		/* Put primary region first */
8173     }
8174     /* Don't overlap, so fall through to rest of procedure */
8175   }
8176 
8177 
8178   if (x->plusp > y->plusp) {
8179     return -1;
8180   } else if (y->plusp > x->plusp) {
8181     return +1;
8182 
8183 
8184   } else if (x->low < y->low) {
8185     debug4(printf("Returning -1 for low\n"));
8186     return -1;
8187   } else if (y->low < x->low) {
8188     debug4(printf("Returning +1 for low\n"));
8189     return +1;
8190 
8191   } else if (x->high < y->high) {
8192     debug4(printf("Returning -1 for high\n"));
8193     return -1;
8194   } else if (y->high < x->high) {
8195     debug4(printf("Returning +1 for high\n"));
8196     return +1;
8197 
8198 
8199   } else if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
8200     return -1;
8201   } else if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
8202     return +1;
8203   } else if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
8204     return -1;
8205   } else if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
8206     return +1;
8207   } else if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
8208     return -1;
8209   } else if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
8210     return +1;
8211 
8212     /* Prioritize last method used */
8213   } else if (x->method > y->method) {
8214     return -1;
8215   } else if (y->method > x->method) {
8216     return +1;
8217 
8218   } else if (x->altlocp < y->altlocp) {
8219     return -1;
8220   } else if (y->altlocp < x->altlocp) {
8221     return +1;
8222 
8223 
8224   } else if (x->sensedir != 0 && y->sensedir == 0) {
8225     return -1;
8226   } else if (y->sensedir != 0 && x->sensedir == 0) {
8227     return +1;
8228 
8229   } else if (x->splice_score > y->splice_score) {
8230     debug4(printf(" => loses by splice score\n"));
8231     return -1;
8232   } else if (y->splice_score > x->splice_score) {
8233     debug4(printf(" => wins by splice score\n"));
8234     return +1;
8235 
8236   } else {
8237     debug4(printf("Returning 0 for equivalent\n"));
8238     return 0;
8239   }
8240 }
8241 
8242 
8243 
#if 0
/* Currently compiled out.  Equivalence counterpart of hit_sort_cmp: any
   alt-locus overlap yields 0 immediately, and the method, altlocp and
   sensedir keys are not compared.  Note that the comparison on high
   returns +1 when x->high is the smaller one.  Keys once tried here and
   dropped: nsplices (caused hits to not be recognized as equivalent),
   ambiguous end lengths, indel_low, and sensedir. */
static int
hit_equiv_cmp (Stage3end_T x, Stage3end_T y) {
  const bool x_altp = (altlocp[x->chrnum] == true);
  const bool y_altp = (altlocp[y->chrnum] == true);

  if (x_altp && y_altp) {
    if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
        alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
      /* The primary regions overlap */
      return 0;
    }
  } else if (x_altp) {
    if (y->genomicend >= alias_starts[x->chrnum] &&
        y->genomicstart <= alias_ends[x->chrnum]) {
      /* y overlaps with the primary region for x */
      return 0;
    }
  } else if (y_altp) {
    if (alias_ends[y->chrnum] >= x->genomicstart &&
        alias_starts[y->chrnum] <= x->genomicend) {
      /* x overlaps with the primary region for y */
      return 0;
    }
  }

  /* Strand */
  if (x->plusp > y->plusp) {
    return -1;
  }
  if (y->plusp > x->plusp) {
    return +1;
  }

  /* Endpoints */
  if (x->low < y->low) {
    return -1;
  }
  if (y->low < x->low) {
    return +1;
  }
  if (x->high < y->high) {
    return +1;
  }
  if (y->high < x->high) {
    return -1;
  }

  /* Score and match counts */
  if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
    return -1;
  }
  if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
    return +1;
  }
  if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
    return +1;
  }
  if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
    return +1;
  }

  /* Splice score */
  if (x->splice_score > y->splice_score) {
    debug4(printf(" => loses by splice score\n"));
    return -1;
  }
  if (y->splice_score > x->splice_score) {
    debug4(printf(" => wins by splice score\n"));
    return +1;
  }

  debug4(printf(" => identical for sorting purposes\n"));
  return 0;
}
#endif
8352 
8353 
8354 int
Stage3end_hit_goodness_cmp(bool * equalp,Stage3end_T hit,Stage3end_T best_hit,bool finalp)8355 Stage3end_hit_goodness_cmp (bool *equalp, Stage3end_T hit,
8356 			    Stage3end_T best_hit, bool finalp) {
8357   double prob1, prob2;
8358 
8359 #ifdef PRE_RESOLVE_MULTIMAPPING
8360   if (Stage3end_tally(x) > TALLY_RATIO*Stage3end_tally(y)) {
8361     debug4(printf("  #%d overlaps #%d and tally %ld > %f*%ld, so marking %d for elimination\n",
8362 		  i,j,x->tally,TALLY_RATIO,y->tally,j));
8363     eliminate[j] = true;
8364   } else if (Stage3end_tally(y) > TALLY_RATIO*Stage3end_tally(x)) {
8365     debug4(printf("  #%d overlaps #%d and tally %f*%ld < %ld, so marking %d for elimination\n",
8366 		  i,j,TALLY_RATIO,x->tally,y->tally,i));
8367     eliminate[i] = true;
8368   }
8369 #endif
8370 
8371   *equalp = false;
8372 
8373 #if 0
8374   /* Don't want to use nmatches_to_trims */
8375   /* Favors definitive splices over ambiguous ones (by using nmatches_to_trims) */
8376   if (known_ambiguous_p(hit) == true && known_ambiguous_p(best_hit) == false) {
8377     return -1;
8378   } else if (known_ambiguous_p(hit) == false && known_ambiguous_p(best_hit) == true) {
8379     return +1;
8380   }
8381 #endif
8382 
8383   if (hit->refalt_nmatches_plus_spliced_trims > best_hit->refalt_nmatches_plus_spliced_trims + NMATCHES_SLOP) {
8384     /* Significantly more matches */
8385     debug4(printf("More matches (to_trims)\n"));
8386     return +1;
8387   } else if (hit->refalt_nmatches_plus_spliced_trims < best_hit->refalt_nmatches_plus_spliced_trims - NMATCHES_SLOP) {
8388     /* Significantly fewer matches */
8389     debug4(printf("Fewer matches (to_trims)\n"));
8390     return -1;
8391 
8392 #if 0
8393   } else if (hit->nsplices > best_hit->nsplices) {
8394     debug4(printf("  => loses by nsplices: %d > %d in best\n",hit->nsplices,best_hit->nsplices));
8395     return -1;
8396   } else if (hit->nsplices < best_hit->nsplices) {
8397     debug4(printf("  => wins by nsplices: %d < %d in best\n",hit->nsplices,best_hit->nsplices));
8398     return +1;
8399 #endif
8400 
8401   } else if (hit->hittype > best_hit->hittype) {
8402     debug4(printf("  => loses by hittype\n"));
8403     return -1;
8404   } else if (hit->hittype < best_hit->hittype) {
8405     debug4(printf("  => wins by hittype\n"));
8406     return +1;
8407 
8408 #if 0
8409   } else if (start_amb_length(hit) + end_amb_length(hit) > 0 &&
8410 	     start_amb_length(best_hit) + end_amb_length(best_hit) == 0) {
8411     debug4(printf("  => loses by ambiguity\n"));
8412     return -1;
8413   } else if (start_amb_length(hit) + end_amb_length(hit) == 0 &&
8414 	     start_amb_length(best_hit) + end_amb_length(best_hit) > 0) {
8415     debug4(printf("  => wins by ambiguity\n"));
8416     return +1;
8417 #endif
8418 
8419   } else if (hit->nindels > best_hit->nindels) {
8420     debug4(printf("  => loses by nindels\n"));
8421     return -1;
8422   } else if (hit->nindels < best_hit->nindels) {
8423     debug4(printf("  => wins by nindels\n"));
8424     return +1;
8425 
8426   } else if (hit->distant_splice_p == true && best_hit->distant_splice_p == false) {
8427     debug4(printf("  => loses because distant splice\n"));
8428     return -1;
8429   } else if (hit->distant_splice_p == false && best_hit->distant_splice_p == true) {
8430     debug4(printf("  => wins because not distant splice\n"));
8431     return +1;
8432 
8433   } else if (finalp == false) {
8434     debug4(printf("  => indistinguishable\n"));
8435     return 0;
8436 
8437   } else if (hit->hittype == TRANSLOC_SPLICE && best_hit->hittype == TRANSLOC_SPLICE) {
8438     prob1 = hit->splice_score;
8439     prob2 = best_hit->splice_score;
8440 
8441     if (prob1 < prob2) {
8442       debug4(printf("  => loses by TRANSLOC_SPLICE splice prob %f vs %f\n",prob1,prob2));
8443       return -1;
8444     } else if (prob1 > prob2) {
8445       debug4(printf("  => wins by TRANSLOC_SPLICE splice prob %f vs %f\n",prob1,prob2));
8446       return +1;
8447     } else {
8448       debug4(printf("  => equal\n"));
8449       *equalp = true;
8450       return 0;
8451     }
8452 
8453   } else {
8454     prob1 = Stage3end_prob(hit);
8455     prob2 = Stage3end_prob(best_hit);
8456     if (prob1 < prob2) {
8457       debug4(printf("  => loses by splice prob %f vs %f\n",prob1,prob2));
8458       return -1;
8459     } else if (prob1 > prob2) {
8460       debug4(printf("  => wins by splice prob %f vs %f\n",prob1,prob2));
8461       return +1;
8462     }
8463 
8464     if (hit->genomiclength > best_hit->genomiclength) {
8465       debug4(printf("  => loses by genomiclength: %u > %u\n",
8466 		    hit->genomiclength,best_hit->genomiclength));
8467       return -1;
8468     } else if (hit->genomiclength < best_hit->genomiclength) {
8469       debug4(printf("  => wins by genomiclength: %u < %u\n",
8470 		    hit->genomiclength,best_hit->genomiclength));
8471       return +1;
8472 
8473     } else {
8474       debug4(printf("  => equal\n"));
8475       *equalp = true;
8476       return 0;
8477     }
8478   }
8479 }
8480 
8481 
8482 /* Not clear how to handle altloc */
8483 static bool
hit_subsumption(Stage3end_T x,Stage3end_T y)8484 hit_subsumption (Stage3end_T x, Stage3end_T y) {
8485   if (x->chrnum != y->chrnum) {
8486     /* Previously true for straddles, but then corrected that issue */
8487     /* Now potentially true for lefts below 0 */
8488     return false;
8489   } else if (x->plusp != y->plusp) {
8490     return false;		/* Different strands */
8491   } else if (x->low <= y->low && x->high >= y->high) {
8492     return true;
8493   } else if (y->low <= x->low && y->high >= x->high) {
8494     return true;
8495   } else {
8496     return false;
8497   }
8498 }
8499 
8500 /* Not clear how to handle altloc */
8501 static bool
hit_endpoint_equivp(Stage3end_T x,Stage3end_T y)8502 hit_endpoint_equivp (Stage3end_T x, Stage3end_T y) {
8503   if (x->plusp != y->plusp) {
8504     return false;		/* Different strands */
8505   } else if (x->genomicstart != y->genomicstart) {
8506     return false;
8507   } else if (x->genomicend != y->genomicend) {
8508     return false;
8509   } else {
8510     return true;
8511   }
8512 }
8513 
8514 
8515 static bool
hit_bad_superstretch_p(Stage3end_T hit_k,Stage3end_T * hits,int k,int j,bool finalp)8516 hit_bad_superstretch_p (Stage3end_T hit_k, Stage3end_T *hits, int k, int j, bool finalp) {
8517   int a;
8518   bool equalp;
8519 
8520   for (a = k+1; a <= j; a++) {
8521     if (hit_subsumption(hit_k,hits[a]) == true) {
8522       debug4(printf("Testing %d because stretches over %d",k,a));
8523       if (Stage3end_hit_goodness_cmp(&equalp,hits[a],hit_k,finalp) > 0 || equalp == true) {
8524 	debug4(printf(" => eliminating\n"));
8525 	return true;
8526       }
8527       debug4(printf("\n"));
8528     }
8529   }
8530   return false;
8531 }
8532 
8533 
8534 static List_T
remove_overlaps_distant(List_T hitlist,Hitlistpool_T hitlistpool)8535 remove_overlaps_distant (List_T hitlist, Hitlistpool_T hitlistpool) {
8536   List_T unique = NULL;
8537   T best_hit, hit, *hits;
8538   int cmp;
8539   int n, i, j, k, besti;
8540   bool *eliminate, equalp;
8541 #ifdef PRE_RESOLVE_MULTIMAPPING
8542   long int best_tally;
8543 #endif
8544 
8545   if ((n = List_length(hitlist)) == 0) {
8546     return (List_T) NULL;
8547   } else {
8548 #ifdef USE_ALLOCA_FOR_HITS
8549     eliminate = (bool *) CALLOCA(n,sizeof(bool));
8550     hits = (T *) MALLOCA(n * sizeof(T));
8551     List_fill_array((void **) hits,hitlist);
8552     Hitlist_free(&hitlist);
8553 #else
8554     eliminate = (bool *) CALLOC(n,sizeof(bool));
8555     hits = (T *) List_to_array(hitlist,NULL);
8556     Hitlist_free(&hitlist);
8557 #endif
8558   }
8559 
8560   debug4(printf("Step 0.  Checking for duplicates among distant\n"));
8561   qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
8562 
8563   /* Find clusters from left */
8564   i = 0;
8565   while (i < n) {
8566     j = i;
8567     while (j+1 < n && hit_endpoint_equivp(hits[i],hits[j+1]) == true) {
8568       j = j+1;
8569     }
8570 
8571     if (j > i) {
8572       debug4(printf("Cluster from %d up through %d\n",i,j));
8573 
8574       best_hit = hits[i];
8575       besti = i;
8576       debug4(printf("Assume best is %d\n",besti));
8577 
8578       for (k = i+1; k <= j; k++) {
8579 	cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,/*finalp*/true);
8580 	debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
8581 	if (cmp > 0) {
8582 	  best_hit = hits[k];
8583 	  besti = k;
8584 	  debug4(printf("Best is now %d\n",besti));
8585 	}
8586       }
8587 
8588       for (k = i; k <= j; k++) {
8589 	if (k == besti) {
8590 	  /* Skip */
8591 	} else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,/*finalp*/true) < 0 || equalp == true) {
8592 	  debug4(printf("  Eliminating hit %d from left, because beaten by %d\n",k,besti));
8593 	  eliminate[k] = true;
8594 	}
8595       }
8596     }
8597 
8598     i = j+1;
8599   }
8600 
8601   for (i = n-1; i >= 0; i--) {
8602     hit = hits[i];
8603     if (eliminate[i] == false) {
8604       unique = Hitlist_push(unique,hitlistpool,(void *) hit);
8605     } else if (hit->paired_usedp == true) {
8606       unique = Hitlist_push(unique,hitlistpool,(void *) hit);
8607     } else {
8608       Stage3end_free(&hit);
8609     }
8610   }
8611 
8612 #ifdef USE_ALLOCA_FOR_HITS
8613   FREEA(hits);
8614   FREEA(eliminate);
8615 #else
8616   FREE(hits);
8617   FREE(eliminate);
8618 #endif
8619 
8620   debug4(printf("Returning %d unique distant splices\n",List_length(unique)));
8621   return unique;
8622 }
8623 
8624 
8625 
8626 
/* Stage3end_remove_overlaps_old: compiled out by the #if 0 below.

   Splits hitlist into local hits (distant_splice_p == false) and distant
   hits.  Distant hits are reduced by remove_overlaps_distant and appended
   to the result at the very end.  If there are no local hits, the reduced
   distant list is returned directly.  Otherwise the local hits go through
   four passes over an array sorted with hit_sort_cmp:
     Step 1: consecutive hits that are hit_equal to the first hit of their
             run are eliminated, and their transcripts are transferred to
             the most recently kept hit.
     Step 2: within clusters defined by hit_subsumption, hits for which
             hit_bad_superstretch_p returns true are eliminated.
     Step 3: within subsumption clusters, found once scanning from the left
             and once from the right, every hit that loses to or ties with
             the cluster's best hit (Stage3end_hit_goodness_cmp) is
             eliminated.
     Step 4: hit_equal duplicates are eliminated again, with transcripts
             transferred to the hit recorded in parenti.
   In every pass a hit whose paired_usedp is true is retained even when it
   has been marked for elimination.

   The querylength parameter is not referenced anywhere in this body. */
8627 #if 0
8628 List_T
8629 Stage3end_remove_overlaps_old (List_T hitlist, Hitlistpool_T hitlistpool,
8630 			       int querylength, bool finalp) {
8631   List_T unique = NULL, distant = NULL, local = NULL, p;
8632   T best_hit, hit, parent, *hits, *prev;
8633   int cmp;
8634   int nkept, n, i, j, k, besti;
8635   bool *eliminate, equalp;
8636   int *parenti;
8637 #ifdef PRE_RESOLVE_MULTIMAPPING
8638   long int best_tally;
8639 #endif
8640 
8641 
8642   debug4(printf("Entered Stage3end_remove_overlaps with %d hits: %s\n",
8643 		List_length(hitlist),finalp == true ? "FINAL" : "not final"));
8644 
  /* Partition the input into local and distant lists according to distant_splice_p */
8645   for (p = hitlist; p != NULL; p = List_next(p)) {
8646     hit = (T) List_head(p);
8647     if (hit->distant_splice_p == false) {
8648       local = Hitlist_push(local,hitlistpool,(void *) hit);
8649     } else {
8650       distant = Hitlist_push(distant,hitlistpool,(void *) hit);
8651     }
8652   }
8653   Hitlist_free(&hitlist);
8654 
8655   distant = remove_overlaps_distant(distant,hitlistpool);
8656 
8657   if ((n = List_length(local)) == 0) {
8658     return distant;
8659   } else {
8660 #ifdef USE_ALLOCA_FOR_HITS
8661     eliminate = (bool *) CALLOCA(n,sizeof(bool));
8662     hits = (T *) MALLOCA(n * sizeof(T));
8663     List_fill_array((void **) hits,local);
8664     Hitlist_free(&local);
8665 #else
8666     eliminate = (bool *) CALLOC(n,sizeof(bool));
8667     hits = (T *) List_to_array(local,NULL);
8668     Hitlist_free(&local);
8669 #endif
8670   }
8671 
8672 
8673   /* Step 1.  Check for exact duplicates */
8674   /* Probably don't want to eliminate aliases at this point */
8675   debug4(printf("Step 1.  Checking for exact duplicates\n"));
8676   qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
8677 
8678   debug4(
8679 	 for (i = 0; i < n; i++) {
8680 	   hit = hits[i];
8681 	   printf("  Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d",
8682 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
8683 		  hit->circularalias,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
8684 	   if (hit->transcripts != NULL) {
8685 	     Transcript_print_list(hit->transcripts);
8686 	   }
8687 	   printf("\n");
8688 	 }
8689 	 );
8690 
  /* Each run starts at i; every following hit equal to hits[i] is marked, so
     the first hit of a run is never marked in this step */
8691   i = 0;
8692   while (i < n) {
8693     j = i+1;
8694     debug4(printf(" %d,%d",i,j));
8695     while (j < n && hit_equal(hits[j],hits[i]) == true) {
8696       debug4(printf("  %d is identical to %d => eliminating\n",j,i));
8697       eliminate[j] = true;
8698       j++;
8699     }
8700     i = j;
8701   }
8702   debug4(printf("\n"));
8703 
8704 
8705   nkept = 0;
8706   for (i = 0; i < n; i++) {
8707     if (eliminate[i] == false) {
8708       nkept++;
8709     } else if (hits[i]->paired_usedp == true) {
8710       nkept++;
8711     }
8712   }
8713   if (nkept == 0) {
8714     /* All entries eliminated one another, so keep the first one */
8715     eliminate[0] = false;
8716     nkept = 1;
8717   }
8718 
  /* Compact the survivors into a new array of nkept entries; prev keeps the
     old array until it is freed below */
8719   prev = hits;
8720 #ifdef USE_ALLOCA_FOR_HITS
8721   hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
8722 #else
8723   hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
8724 #endif
8725 
8726   for (i = 0, j = 0; i < n; i++) {
8727     hit = prev[i];
8728     if (eliminate[i] == false) {
8729       debug4(printf("  Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8730 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8731 		    hit->plusp,hit->sensedir));
      /* best_hit tracks the most recently kept, non-eliminated hit; it receives
         the transcripts of the eliminated hits that follow it */
8732       best_hit = hits[j++] = hit;
8733     } else if (hit->paired_usedp == true) {
8734       debug4(printf("  Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8735 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8736 		    hit->plusp,hit->sensedir));
8737       hits[j++] = hit;
8738     } else {
8739       debug4(printf("  Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8740 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8741 		    hit->plusp,hit->sensedir));
8742       Stage3end_transfer_transcripts_one(best_hit,hit);
8743       Stage3end_free(&hit);
8744     }
8745   }
8746 
8747 #ifdef USE_ALLOCA_FOR_HITS
8748   FREEA(prev);
8749 #else
8750   FREE(prev);
8751 #endif
8752 
8753 
8754   /* Step 2: Check for superstretches */
8755   n = nkept;
8756   debug4(printf("Step 2.  Checking for superstretches among %d hits within subsumption clusters\n",n));
8757 
  /* eliminate was allocated with the original n entries and nkept never
     exceeds that count, so the same array is reset and reused here */
8758   for (i = 0; i < n; i++) {
8759     eliminate[i] = false;
8760   }
8761 
8762   debug4(
8763 	 for (i = 0; i < n; i++) {
8764 	   hit = hits[i];
8765 	   printf("  Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
8766 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
8767 		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
8768 	   if (hit->transcripts != NULL) {
8769 	     Transcript_print_list(hit->transcripts);
8770 	   }
8771 	   printf("\n");
8772 	 }
8773 	 );
8774 
8775   /* Find clusters */
8776   i = 0;
8777   while (i < n) {
8778     j = i;
8779     /* Previously checked if (hits[i]->distant_splice_p == false) */
8780     while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
8781       j = j+1;
8782     }
8783 
8784     if (j > i) {
8785       debug4(printf("Cluster from %d up through %d\n",i,j));
8786 
8787       /* Find bad superstretches */
8788       for (k = i; k <= j; k++) {
8789 	/* Previously checked if (hits[i]->distant_splice_p == false) */
8790 	if (hit_bad_superstretch_p(hits[k],hits,k,j,finalp) == true) {
8791 	  eliminate[k] = true;
8792 	  /* parenti[k] = j; */
8793 	}
8794       }
8795     }
8796 
8797     i = j+1;
8798   }
8799 
8800   nkept = 0;
8801   for (i = 0; i < n; i++) {
8802     if (eliminate[i] == false) {
8803       nkept++;
8804     } else if (hits[i]->paired_usedp == true) {
8805       nkept++;
8806     }
8807   }
8808   if (nkept == 0) {
8809     /* All entries eliminated one another, so keep the first one */
8810     eliminate[0] = false;
8811     nkept = 1;
8812   }
8813 
8814   prev = hits;
8815 #ifdef USE_ALLOCA_FOR_HITS
8816   hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
8817 #else
8818   hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
8819 #endif
8820 
8821   for (i = 0, j = 0; i < n; i++) {
8822     hit = prev[i];
8823     if (eliminate[i] == false) {
8824       debug4(printf("  Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8825 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8826 		    hit->plusp,hit->sensedir));
8827       hits[j++] = hit;
8828     } else if (hit->paired_usedp == true) {
8829       debug4(printf("  Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8830 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8831 		    hit->plusp,hit->sensedir));
8832       hits[j++] = hit;
8833     } else {
8834       debug4(printf("  Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8835 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8836 		    hit->plusp,hit->sensedir));
8837       /* parent = prev[parenti[i]]; */
8838       /* Stage3end_transfer_transcripts_one(parent,hit); */
8839       Stage3end_free(&hit);
8840     }
8841   }
8842 
8843 #ifdef USE_ALLOCA_FOR_HITS
8844   FREEA(prev);
8845 #else
8846   FREE(prev);
8847 #endif
8848 
8849 
8850   /* Step 3: Check for best within subsumption clusters */
8851   n = nkept;
8852   debug4(printf("Checking for best among %d hits within subsumption clusters\n",n));
8853 
8854   for (i = 0; i < n; i++) {
8855     eliminate[i] = false;
8856   }
8857   /* qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp); -- No need since original order was kept */
8858 
8859   debug4(
8860 	 for (i = 0; i < n; i++) {
8861 	   hit = hits[i];
8862 	   printf("  Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
8863 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
8864 		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
8865 	   if (hit->transcripts != NULL) {
8866 	     Transcript_print_list(hit->transcripts);
8867 	   }
8868 	   printf("\n");
8869 	 }
8870 	 );
8871 
8872   /* Find clusters from left */
8873   i = 0;
8874   while (i < n) {
8875     j = i;
8876     /* Previously checked if (hits[i]->distant_splice_p == false) */
8877     while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
8878       j = j+1;
8879     }
8880 
8881     if (j > i) {
8882       debug4(printf("Cluster from %d up through %d\n",i,j));
8883 
8884       best_hit = hits[i];
8885       besti = i;
8886       debug4(printf("Assume best is %d\n",besti));
8887 
8888       for (k = i+1; k <= j; k++) {
8889 	/* Previously checked if (hits[i]->distant_splice_p == false) */
8890 	cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp);
8891 	debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
8892 	if (cmp > 0) {
8893 	  best_hit = hits[k];
8894 	  besti = k;
8895 	  debug4(printf("Best is now %d\n",besti));
8896 	}
8897       }
8898 
8899       for (k = i; k <= j; k++) {
8900 	if (k == besti) {
8901 	  /* Skip */
8902 	  /* Previously checked if (hits[i]->distant_splice_p == false) */
8903 	} else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp) < 0 || equalp == true) {
8904 	  debug4(printf("  Eliminating hit %d from left, because beaten by %d\n",k,besti));
8905 	  eliminate[k] = true;
8906 	  /* parenti[k] = i; */
8907 	}
8908       }
8909     }
8910 
8911     i = j+1;
8912   }
8913 
8914 
8915   /* Find clusters starting from right */
  /* Here the cluster is anchored on hits[j] and extended downward, so it can
     group hits differently from the left-anchored scan above; marks from both
     scans accumulate in eliminate */
8916   j = n - 1;
8917   while (j >= 0) {
8918     i = j;
8919     /* Previously checked if (hits[i]->distant_splice_p == false) */
8920     while (i-1 >= 0 && hit_subsumption(hits[j],hits[i-1]) == true) {
8921       i = i-1;
8922     }
8923 
8924     if (i < j) {
8925       debug4(printf("Cluster from %d down through %d\n",j,i));
8926       best_hit = hits[i];
8927       besti = i;
8928       debug4(printf("Assume best is %d\n",besti));
8929 
8930       for (k = i+1; k <= j; k++) {
8931 	/* Previously checked if (hits[i]->distant_splice_p == false) */
8932 	cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp);
8933 	debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
8934 	if (cmp > 0) {
8935 	  best_hit = hits[k];
8936 	  besti = k;
8937 	  debug4(printf("Best is now %d\n",besti));
8938 	}
8939       }
8940 
8941       for (k = i; k <= j; k++) {
8942 	if (k == besti) {
8943 	  /* Skip */
8944 	  /* Previously checked if (hits[i]->distant_splice_p == false) */
8945 	} else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp) < 0 || equalp == true) {
8946 	  debug4(printf("  Eliminating hit %d from right, because beaten by %d\n",k,besti));
8947 	  eliminate[k] = true;
8948 	  /* parenti[k] = i; */
8949 	}
8950       }
8951     }
8952 
8953     j = i-1;
8954   }
8955 
8956 
8957   nkept = 0;
8958   for (i = 0; i < n; i++) {
8959     if (eliminate[i] == false) {
8960       nkept++;
8961     } else if (hits[i]->paired_usedp == true) {
8962       nkept++;
8963     }
8964   }
8965   if (nkept == 0) {
8966     eliminate[0] = false;
8967     nkept = 1;
8968   }
8969 
8970   prev = hits;
8971 #ifdef USE_ALLOCA_FOR_HITS
8972   hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
8973 #else
8974   hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
8975 #endif
8976 
8977   for (i = 0, j = 0; i < n; i++) {
8978     hit = prev[i];
8979     if (eliminate[i] == false) {
8980       debug4(printf("  Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8981 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8982 		    hit->plusp,hit->sensedir));
8983       hits[j++] = hit;
8984     } else if (hit->paired_usedp == true) {
8985       debug4(printf("  Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8986 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8987 		    hit->plusp,hit->sensedir));
8988       hits[j++] = hit;
8989     } else {
8990       debug4(printf("  Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
8991 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
8992 		    hit->plusp,hit->sensedir));
8993       /* parent = prev[parenti[i]]; */
8994       /* Stage3end_transfer_transcripts_one(parent,hit); */
8995       Stage3end_free(&hit);
8996     }
8997   }
8998 
  /* parenti is allocated only now, sized for the Step 4 array: parenti[j]
     records the index of the first hit of the run that hit j duplicates */
8999 #ifdef USE_ALLOCA_FOR_HITS
9000   FREEA(prev);
9001   parenti = (int *) CALLOCA(nkept,sizeof(int));
9002 #else
9003   FREE(prev);
9004   parenti = (int *) CALLOC(nkept,sizeof(int));
9005 #endif
9006 
9007 
9008   /* Step 4: Check for identity */
9009   n = nkept;
9010   debug4(printf("Checking for duplicates among %d hits by identity\n",n));
9011 
9012   for (i = 0; i < n; i++) {
9013     eliminate[i] = false;
9014   }
9015   /* qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp); -- No need since original order was kept */
9016 
9017   debug4(
9018 	 for (i = 0; i < n; i++) {
9019 	   hit = hits[i];
9020 	   printf("  Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
9021 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
9022 		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
9023 	   if (hit->transcripts != NULL) {
9024 	     Transcript_print_list(hit->transcripts);
9025 	   }
9026 	   printf("\n");
9027 	 }
9028 	 );
9029 
9030   i = 0;
9031   while (i < n) {
9032     debug4(printf("Looking at %d with score %d\n",i,hits[i]->refalt_score_within_trims));
9033     j = i+1;
9034     while (j < n && hit_equal(hits[j],hits[i]) == true) {
9035       debug4(printf("  %d equal to %d\n",j,i));
9036       eliminate[j] = true;
9037       parenti[j] = i;
9038       j++;
9039     }
9040 
9041     i = j;
9042   }
9043 
  /* Walk from the last hit to the first: a parent always has a lower index
     than its duplicates, so it has not been visited (or freed) yet when the
     duplicate's transcripts are transferred to it */
9044   for (i = n-1; i >= 0; i--) {
9045     hit = hits[i];
9046     if (eliminate[i] == false) {
9047       unique = Hitlist_push(unique,hitlistpool,(void *) hit);
9048     } else if (hit->paired_usedp == true) {
9049       unique = Hitlist_push(unique,hitlistpool,(void *) hit);
9050     } else {
9051       parent = hits[parenti[i]]; /* Not prev, since we are using hits instead */
9052       Stage3end_transfer_transcripts_one(parent,hit);
9053       Stage3end_free(&hit);
9054     }
9055   }
9056 
9057 #ifdef USE_ALLOCA_FOR_HITS
9058   FREEA(hits);
9059   FREEA(eliminate);
9060   FREEA(parenti);
9061 #else
9062   FREE(hits);
9063   FREE(eliminate);
9064   FREE(parenti);
9065 #endif
9066 
9067 
  /* Optional tally filter: only hits whose tally reaches the best tally are
     pushed back onto unique */
9068 #ifdef PRE_RESOLVE_MULTIMAPPING
9069   if (use_tally_p == true && tally_iit != NULL) {
9070     if ((n = List_length(unique)) > 1) {
9071 #ifdef USE_ALLOCA_FOR_HITS
9072       hits = (T *) MALLOCA(n * sizeof(T));
9073       List_fill_array((void **) hits,unique);
9074       Hitlist_free(&unique);
9075 #else
9076       hits = (T *) List_to_array(unique,NULL);
9077       Hitlist_free(&unique);
9078 #endif
9079 
9080       best_tally = 0;
9081       for (i = 0; i < n; i++) {
9082 	if (hits[i]->tally < 0) {
9083 	  hits[i]->tally = Stage3end_compute_tally(hits[i]);
9084 	}
9085 	if (hits[i]->tally > best_tally) {
9086 	  best_tally = hits[i]->tally;
9087 	}
9088       }
9089 
9090       unique = (List_T) NULL;
9091       for (i = 0; i < n; i++) {
9092 	if (hits[i]->tally < best_tally) {
	  /* NOTE(review): the hit is dropped from the list here but not freed,
	     because the call below is commented out -- confirm who owns it */
9093 	  /* Stage3end_free(&(hits[i])); */
9094 	} else {
9095 	  unique = Hitlist_push(unique,hitlistpool,(void *) hits[i]);
9096 	}
9097       }
9098 
9099 #ifdef USE_ALLOCA_FOR_HITS
9100       FREEA(hits);
9101 #else
9102       FREE(hits);
9103 #endif
9104     }
9105   }
9106 #endif
9107 
9108   unique = List_append(unique,distant);
9109   debug4(printf("Exited Stage3end_remove_overlaps with %d hits\n",List_length(unique)));
9110   return unique;
9111 }
9112 #endif
9113 
9114 
9115 /* Tries to match Stage3pair_remove_overlaps */
9116 List_T
Stage3end_remove_overlaps(List_T hitlist,Hitlistpool_T hitlistpool,int querylength,bool finalp)9117 Stage3end_remove_overlaps (List_T hitlist, Hitlistpool_T hitlistpool, int querylength, bool finalp) {
9118   List_T optimal, distant = NULL, local = NULL, p;
9119   T best_hit, hit, *hits, *prev;
9120 
9121   int max_adj_nmatches, score;
9122   int best_nsegments;
9123   double max_splice_score;
9124 
9125   int nkept, n, i, j, k;
9126   bool *eliminate, keptp;
9127 #ifdef PRE_RESOLVE_MULTIMAPPING
9128   long int best_tally;
9129 #endif
9130 
9131 
9132   debug4(printf("Entered Stage3end_remove_overlaps with %d hits: %s\n",
9133 		List_length(hitlist),finalp == true ? "FINAL" : "not final"));
9134 
9135   for (p = hitlist; p != NULL; p = List_next(p)) {
9136     hit = (T) List_head(p);
9137     if (hit->distant_splice_p == false) {
9138       local = Hitlist_push(local,hitlistpool,(void *) hit);
9139     } else {
9140       distant = Hitlist_push(distant,hitlistpool,(void *) hit);
9141     }
9142   }
9143   Hitlist_free(&hitlist);
9144 
9145   distant = remove_overlaps_distant(distant,hitlistpool);
9146 
9147   if ((n = List_length(local)) == 0) {
9148     return distant;
9149   } else {
9150 #ifdef USE_ALLOCA_FOR_HITS
9151     eliminate = (bool *) CALLOCA(n,sizeof(bool));
9152     hits = (T *) MALLOCA(n * sizeof(T));
9153     List_fill_array((void **) hits,local);
9154     Hitlist_free(&local);
9155 #else
9156     eliminate = (bool *) CALLOC(n,sizeof(bool));
9157     hits = (T *) List_to_array(local,NULL);
9158     Hitlist_free(&local);
9159 #endif
9160   }
9161   /* local alignments are in hits, but distant alignments are in distant */
9162 
9163 
9164   /* Step 1.  Check for exact duplicates */
9165   /* Probably don't want to eliminate aliases at this point */
9166   debug4(printf("Step 1.  Checking for exact duplicates\n"));
9167   qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
9168 
9169   debug4(
9170 	 for (i = 0; i < n; i++) {
9171 	   hit = hits[i];
9172 	   printf("  Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d",
9173 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
9174 		  hit->circularalias,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
9175 	   if (hit->transcripts != NULL) {
9176 	     Transcript_print_list(hit->transcripts);
9177 	   }
9178 	   printf("\n");
9179 	 }
9180 	 );
9181 
9182   i = 0;
9183   while (i < n) {
9184     j = i+1;
9185     debug4(printf(" %d,%d",i,j));
9186     while (j < n && hit_equal(hits[j],hits[i]) == true) {
9187       debug4(printf("  %d is identical to %d => eliminating\n",j,i));
9188       eliminate[j] = true;
9189       j++;
9190     }
9191     i = j;
9192   }
9193   debug4(printf("\n"));
9194 
9195 
9196   nkept = 0;
9197   for (i = 0; i < n; i++) {
9198     if (eliminate[i] == false) {
9199       nkept++;
9200     } else if (hits[i]->paired_usedp == true) {
9201       nkept++;
9202     }
9203   }
9204   if (nkept == 0) {
9205     /* All entries eliminated one another, so keep the first one */
9206     eliminate[0] = false;
9207     nkept = 1;
9208   }
9209 
9210   prev = hits;
9211 #ifdef USE_ALLOCA_FOR_HITS
9212   hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
9213 #else
9214   hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
9215 #endif
9216 
9217   for (i = 0, j = 0; i < n; i++) {
9218     hit = prev[i];
9219     if (eliminate[i] == false) {
9220       debug4(printf("  Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9221 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9222 		    hit->plusp,hit->sensedir));
9223       best_hit = hits[j++] = hit;
9224     } else if (hit->paired_usedp == true) {
9225       debug4(printf("  Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9226 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9227 		    hit->plusp,hit->sensedir));
9228       hits[j++] = hit;
9229     } else {
9230       debug4(printf("  Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9231 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9232 		    hit->plusp,hit->sensedir));
9233       Stage3end_transfer_transcripts_one(best_hit,hit);
9234       Stage3end_free(&hit);
9235     }
9236   }
9237 
9238 #ifdef USE_ALLOCA_FOR_HITS
9239   FREEA(prev);
9240 #else
9241   FREE(prev);
9242 #endif
9243 
9244 
9245   /* Step 2: Check for superstretches */
9246   hitlist = (List_T) NULL;
9247   n = nkept;
9248   debug4(printf("Step 2.  Checking for superstretches among %d hits within subsumption clusters\n",n));
9249 
9250   for (i = 0; i < n; i++) {
9251     eliminate[i] = false;
9252   }
9253 
9254   debug4(
9255 	 for (i = 0; i < n; i++) {
9256 	   hit = hits[i];
9257 	   printf("  Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
9258 		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
9259 		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
9260 	   if (hit->transcripts != NULL) {
9261 	     Transcript_print_list(hit->transcripts);
9262 	   }
9263 	   printf("\n");
9264 	 }
9265 	 );
9266 
9267   /* Find clusters */
9268   i = 0;
9269   while (i < n) {
9270     j = i;
9271     /* Previously checked if (hits[i]->distant_splice_p == false) */
9272     while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
9273       j = j+1;
9274     }
9275 
9276     if (j > i) {
9277       debug4(printf("Cluster from %d up through %d\n",i,j));
9278 
9279       /* Find bad superstretches */
9280       for (k = i; k <= j; k++) {
9281 	/* Previously checked if (hits[i]->distant_splice_p == false) */
9282 	if (hit_bad_superstretch_p(hits[k],hits,k,j,finalp) == true) {
9283 	  eliminate[k] = true;
9284 	  /* parenti[k] = j; */
9285 	}
9286       }
9287     }
9288 
9289     i = j+1;
9290   }
9291 
9292   nkept = 0;
9293   for (i = 0; i < n; i++) {
9294     if (eliminate[i] == false) {
9295       nkept++;
9296     } else if (hits[i]->paired_usedp == true) {
9297       nkept++;
9298     }
9299   }
9300   if (nkept == 0) {
9301     /* All entries eliminated one another, so keep the first one */
9302     eliminate[0] = false;
9303     nkept = 1;
9304   }
9305 
9306 
9307   for (i = 0, j = 0; i < n; i++) {
9308     hit = hits[i];
9309     if (eliminate[i] == false) {
9310       debug4(printf("  Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9311 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9312 		    hit->plusp,hit->sensedir));
9313       hitlist = Hitlist_push(hitlist,hitlistpool,(void *) hit);
9314     } else if (hit->paired_usedp == true) {
9315       debug4(printf("  Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9316 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9317 		    hit->plusp,hit->sensedir));
9318       hitlist = Hitlist_push(hitlist,hitlistpool,(void *) hit);
9319     } else {
9320       debug4(printf("  Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9321 		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9322 		    hit->plusp,hit->sensedir));
9323       /* parent = prev[parenti[i]]; */
9324       /* Stage3end_transfer_transcripts_one(parent,hit); */
9325       Stage3end_free(&hit);
9326     }
9327   }
9328 
9329 #ifdef USE_ALLOCA_FOR_HITS
9330   FREEA(hits);
9331   FREEA(eliminate);
9332 #else
9333   FREE(hits);
9334   FREE(eliminate);
9335 #endif
9336 
9337 
9338   /* Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
9339   /* Same as step 1 of Stage3pair_optimal_score_final */
9340   debug8(printf("  Step 3.  Maximize nmatches adjusted by score (with slop)\n"));
9341   optimal = (List_T) NULL;
9342 
9343   keptp = false;
9344   hits = (T *) List_to_array_n(&n,hitlist);
9345   eliminate = (bool *) CALLOC(n,sizeof(bool));
9346   qsort(hits,n,sizeof(T),hit_position_cmp);
9347   i = 0;
9348   while (i < n) {
9349     j = i+1;
9350     while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
9351       j++;
9352     }
9353     if (j - 1 > 1) {
9354       debug4(printf("Found a group from %d to %d\n",i,j));
9355       max_adj_nmatches = 0;
9356       for (k = i; k < j; k++) {
9357 	hit = hits[k];
9358 	if ((score = hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall) > max_adj_nmatches) {
9359 	  max_adj_nmatches = score;
9360 	}
9361       }
9362       debug4(printf("max_adj_nmatches = %d\n",max_adj_nmatches));
9363 
9364       for (k = i; k < j; k++) {
9365 	hit = hits[k];
9366 	if (hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall < max_adj_nmatches - ADJ_NMATCHES_SLOP) {
9367 	  debug4(printf("Within loci end (adj score %d (%d-%d) < %d w/slop): Eliminating hit %p at %u..%u with nmatches %d (%d+ to trims)\n",
9368 			hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall,
9369 			hit->refalt_nmatches_plus_spliced_trims,hit->refalt_score_overall,max_adj_nmatches,
9370 			hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9371 			hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims));
9372 	  eliminate[k] = true;
9373 	} else {
9374 	  debug4(printf("Within loci end (adj score %d (%d-%d) == %d w/slop): Keeping hit %p at %u..%u with nmatches %d (%d+ to trims)\n",
9375 			hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall,
9376 			hit->refalt_nmatches_plus_spliced_trims,hit->refalt_score_overall,max_adj_nmatches,
9377 			hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9378 			hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims));
9379 	  keptp = true;
9380 	}
9381       }
9382     }
9383 
9384     i = j;
9385   }
9386 
9387   if (keptp == false) {
9388     optimal = hitlist;
9389   } else {
9390     for (k = 0; k < n; k++) {
9391       hit = hits[k];
9392       if (eliminate[k] == true) {
9393 	debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9394 		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9395 		      hit->nsegments,
9396 		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
9397 		      hit->sensedir,hit->splice_score));
9398 	Stage3end_free(&hit);
9399 	/* eliminatedp = true; */
9400       } else {
9401 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
9402       }
9403     }
9404     Hitlist_free(&hitlist);
9405   }
9406   FREE(hits);
9407   FREE(eliminate);
9408   hitlist = optimal;
9409 
9410 
9411   /* Eliminate within loci: minimize nsegments and maximize splice score */
9412   /* Since we have achieved same number of matches, we should minimize nsegments to achieve parsimony */
9413   debug4(printf("  Step 4.  Minimize nsegments and splice score\n"));
9414   optimal = (List_T) NULL;
9415 
9416   keptp = false;
9417   hits = (T *) List_to_array_n(&n,hitlist);
9418   eliminate = (bool *) CALLOC(n,sizeof(bool));
9419   qsort(hits,n,sizeof(T),hit_position_cmp);
9420   i = 0;
9421   while (i < n) {
9422     j = i+1;
9423     while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
9424       j++;
9425     }
9426     if (j - 1 > 1) {
9427       debug4(printf("Found a group from %d to %d\n",i,j));
9428       best_nsegments = querylength;
9429       max_splice_score = 0.0;
9430       for (k = i; k < j; k++) {
9431 	hit = hits[k];
9432 	if (hit->nsegments < best_nsegments) {
9433 	  best_nsegments = hit->nsegments;
9434 	  max_splice_score = hit->splice_score;
9435 	} else if (hit->nsegments == best_nsegments) {
9436 	  max_splice_score = hit->splice_score;
9437 	}
9438       }
9439       debug4(printf("best_nsegments %d, max_splice_score %f\n",
9440 		    best_nsegments,max_splice_score));
9441 
9442       for (k = i; k < j; k++) {
9443 	hit = hits[k];
9444 	if (hit->nsegments > best_nsegments) {
9445 	  debug4(printf("Within loci end (nsegments %d < %d): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9446 			hit->nsegments,best_nsegments,hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9447 			hit->nsegments,hit->refalt_nmatches_plus_spliced_trims,
9448 			hit->refalt_nmatches_to_trims,hit->sensedir,hit->splice_score));
9449 	  eliminate[k] = true;
9450 	} else if (hit->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
9451 	  debug4(printf("Within loci end (splice_score w/slop %f < %f): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9452 			hit->splice_score,max_splice_score,hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9453 			hit->nsegments,hit->refalt_nmatches_plus_spliced_trims,
9454 			hit->refalt_nmatches_to_trims,hit->sensedir,hit->splice_score));
9455 	  eliminate[k] = true;
9456 	} else {
9457 	  keptp = true;
9458 	}
9459       }
9460     }
9461 
9462     i = j;
9463   }
9464 
9465   if (keptp == false) {
9466     optimal = hitlist;
9467   } else {
9468     for (k = 0; k < n; k++) {
9469       hit = hits[k];
9470       if (eliminate[k] == true) {
9471 	debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9472 		      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9473 		      hit->nsegments,
9474 		      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
9475 		      hit->sensedir,hit->splice_score));
9476 	Stage3end_free(&hit);
9477 	/* eliminatedp = true; */
9478       } else {
9479 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
9480       }
9481     }
9482     Hitlist_free(&hitlist);
9483   }
9484   FREE(hits);
9485   FREE(eliminate);
9486   hitlist = optimal;
9487 
9488   /* Step 5.  Removing outerlength not applicable to a single end */
9489 
9490   return List_append(hitlist,distant);
9491 }
9492 
9493 
9494 
9495 #ifdef PRE_RESOLVE_MULTIMAPPING
9496 List_T
Stage3end_resolve_multimapping(List_T hits,Hitlistpool_T hitlistpool)9497 Stage3end_resolve_multimapping (List_T hits, Hitlistpool_T hitlistpool) {
9498   List_T resolve1, resolve2, resolve3, p;
9499   Stage3end_T hit;
9500 
9501   /* Overlap_T best_overlap; */
9502   long int best_tally;
9503   double tally_threshold;
9504   bool runlengthp;
9505 
9506   if (List_length(hits) <= 1) {
9507     return hits;
9508   }
9509 
9510   resolve1 = hits;
9511 
9512   if (tally_iit == NULL) {
9513     resolve2 = resolve1;
9514   } else {
9515     best_tally = 0L;
9516     for (p = resolve1; p != NULL; p = p->rest) {
9517       hit = (Stage3end_T) p->first;
9518       if ((hit->tally = Stage3end_compute_tally(hit)) > best_tally) {
9519 	best_tally = hit->tally;
9520       }
9521     }
9522     if (best_tally == 0L) {
9523       resolve2 = resolve1;
9524     } else {
9525       resolve2 = (List_T) NULL;
9526 #ifdef USE_TALLY_RATIO
9527       tally_threshold = (double) best_tally / TALLY_RATIO;
9528 #else
9529       tally_threshold = 1.0;
9530 #endif
9531       for (p = resolve1; p != NULL; p = p->rest) {
9532 	hit = (Stage3end_T) p->first;
9533 	if ((double) hit->tally < tally_threshold) {
9534 	  Stage3end_free(&hit);
9535 	} else {
9536 	  resolve2 = Hitlist_push(resolve2,hitlistpool,(void *) hit);
9537 	}
9538       }
9539       Hitlist_free(&resolve1);
9540     }
9541   }
9542 
9543 
9544   if (List_length(resolve2) <= 1) {
9545     return resolve2;
9546   }
9547 
9548   if (runlength_iit == NULL) {
9549     resolve3 = resolve2;
9550   } else {
9551     runlengthp = false;
9552     for (p = resolve2; p != NULL; p = p->rest) {
9553       hit = (Stage3end_T) p->first;
9554       if (Stage3end_runlength_p(hit) == true) {
9555 	runlengthp = true;
9556       }
9557     }
9558     if (runlengthp == false) {
9559       resolve3 = resolve2;
9560     } else {
9561       resolve3 = (List_T) NULL;
9562       for (p = resolve2; p != NULL; p = p->rest) {
9563 	hit = (Stage3end_T) p->first;
9564 	if (Stage3end_runlength_p(hit) == false) {
9565 	  Stage3end_free(&hit);
9566 	} else {
9567 	  resolve3 = Hitlist_push(resolve3,hitlistpool,(void *) hit);
9568 	}
9569       }
9570       Hitlist_free(&resolve2);
9571     }
9572   }
9573 
9574 
9575   return resolve3;
9576 }
9577 #endif
9578 
9579 
9580 Pairtype_T
Stage3_determine_pairtype(T hit5,T hit3,Stage3pair_T stage3pair)9581 Stage3_determine_pairtype (T hit5, T hit3, Stage3pair_T stage3pair) {
9582   int pairmax;
9583 
9584   debug14(printf("Entered Stage3_determine_pairtype\n"));
9585   if (hit5->effective_chrnum != hit3->effective_chrnum) {
9586     debug14(printf("Returning unpaired\n"));
9587     return UNPAIRED;
9588   } else if (hit5->plusp != hit3->plusp) {
9589     debug14(printf("Returning paired_inversion\n"));
9590     return PAIRED_INVERSION;
9591   } else if (hit5->plusp == true) {
9592     if (hit3->genomicend < hit5->genomicstart) {
9593       debug14(printf("Returning paired_scramble\n"));
9594       return PAIRED_SCRAMBLE;
9595     } else {
9596       if (circularp[hit5->effective_chrnum] == true) {
9597 	pairmax = pairmax_circular;
9598       } else {
9599 	pairmax = pairmax_linear;
9600       }
9601       if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9602 	debug14(printf("Returning concordant based on transcriptome\n"));
9603 	return CONCORDANT;
9604       } else if (hit3->genomicstart > hit5->genomicend + pairmax) {
9605 	debug14(printf("Returning paired_toolong\n"));
9606 	return PAIRED_TOOLONG;
9607       } else {
9608 	debug14(printf("Returning concordant\n"));
9609 	return CONCORDANT;
9610       }
9611     }
9612   } else {
9613     if (hit3->genomicend > hit5->genomicstart) {
9614       debug14(printf("Returning paired_scramble\n"));
9615       return PAIRED_SCRAMBLE;
9616     } else {
9617       if (circularp[hit3->effective_chrnum] == true) {
9618 	pairmax = pairmax_circular;
9619       } else {
9620 	pairmax = pairmax_linear;
9621       }
9622       if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9623 	debug14(printf("Returning concordant based on transcriptome\n"));
9624 	return CONCORDANT;
9625       } else if (hit3->genomicstart + pairmax < hit5->genomicend) {
9626 	debug14(printf("Returning paired_toolong\n"));
9627 	return PAIRED_TOOLONG;
9628       } else {
9629 	debug14(printf("Returning concordant\n"));
9630 	return CONCORDANT;
9631       }
9632     }
9633   }
9634 }
9635 
9636 
9637 #if 0
9638 /* Previously, samprint.c called this, but it can lead to incorrect answers when transcripts are added later */
9639 Pairtype_T
9640 Stage3pair_pairtype (Stage3pair_T this) {
9641   return this->pairtype;
9642 }
9643 #else
9644 Pairtype_T
Stage3pair_determine_pairtype(Stage3pair_T this)9645 Stage3pair_determine_pairtype (Stage3pair_T this) {
9646   return Stage3_determine_pairtype(this->hit5,this->hit3,this);
9647 }
9648 #endif
9649 
9650 bool
Stage3pair_circularp(Stage3pair_T this)9651 Stage3pair_circularp (Stage3pair_T this) {
9652   return this->circularp;
9653 }
9654 
9655 bool
Stage3pair_altlocp(Stage3pair_T this)9656 Stage3pair_altlocp (Stage3pair_T this) {
9657   if (altlocp[this->hit5->chrnum] == true) {
9658     return true;
9659   } else if (altlocp[this->hit3->chrnum] == true) {
9660     return true;
9661   } else {
9662     return false;
9663   }
9664 }
9665 
9666 
#if 0
/* Disabled code (compiled out by the enclosing #if 0).
   Maps a pair of hits to a text label:
     - different chrnum                       -> UNPAIRED_INTERCHROM_TEXT
     - different plusp                        -> PAIRED_INVERSION_TEXT
     - same strand, genomicstart values in
       the unexpected order for that strand   -> PAIRED_SCRAMBLE_TEXT
     - otherwise                              -> UNPAIRED_TOOLONG_TEXT
   NOTE(review): the *_TEXT macros returned here are not among the ones
   defined in the visible top of this file (CONCORDANT_TEXT, PAIRED_TEXT,
   UNPAIRED_TEXT); confirm they exist before re-enabling this function. */
static char *
unpaired_type_text (T hit5, T hit3) {
  if (hit5->chrnum != hit3->chrnum) {
    /* Ends are on different chromosomes */
    return UNPAIRED_INTERCHROM_TEXT;
  } else if (hit5->plusp != hit3->plusp) {
    /* Ends are on opposite strands */
    return PAIRED_INVERSION_TEXT;
  } else if (hit5->plusp == true) {
    /* Both plus: hit3 starting before hit5 is reported as a scramble */
    if (hit3->genomicstart < hit5->genomicstart) {
      return PAIRED_SCRAMBLE_TEXT;
    } else {
      return UNPAIRED_TOOLONG_TEXT;
    }
  } else {
    /* Both minus: hit5 starting before hit3 is reported as a scramble */
    if (hit5->genomicstart < hit3->genomicstart) {
      return PAIRED_SCRAMBLE_TEXT;
    } else {
      return UNPAIRED_TOOLONG_TEXT;
    }
  }
}
#endif
9689 
9690 
9691 
9692 
9693 /* Has a copy in pair.c */
9694 static void
print_pair_info(Filestring_T fp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype)9695 print_pair_info (Filestring_T fp, T hit5, T hit3, int insertlength, int pairscore,
9696 		 Pairtype_T pairtype) {
9697 
9698   assert(hit5->effective_chrnum == hit3->effective_chrnum); /* Same chromosomes */
9699 
9700 #if 0
9701   /* Doesn't hold for paired (inversion) */
9702   assert(hit5->plusp == hit3->plusp);	/* Same direction */
9703 #endif
9704 
9705 #ifndef NO_COMPARE
9706   FPRINTF(fp,"pair_score:%d",pairscore);
9707   FPRINTF(fp,",insert_length:%d",insertlength);
9708 #endif
9709 
9710   switch (pairtype) {
9711   case CONCORDANT: break;
9712   case PAIRED_SCRAMBLE: FPRINTF(fp,",pairtype:scramble"); break;
9713   case PAIRED_INVERSION: FPRINTF(fp,",pairtype:inversion"); break;
9714   case PAIRED_TOOLONG: FPRINTF(fp,",pairtype:toolong"); break;
9715   case CONCORDANT_TRANSLOCATIONS: break;
9716   case PAIRED_UNSPECIFIED: abort();
9717   case UNPAIRED: abort();
9718   case UNSPECIFIED: abort();
9719   }
9720 
9721   return;
9722 }
9723 
9724 
9725 
9726 
9727 static void
print_substrings(Filestring_T fp,Stage3pair_T stage3pair,T this,int score,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,bool invertp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype,int mapq_score,bool first_read_p)9728 print_substrings (Filestring_T fp, Stage3pair_T stage3pair, T this,
9729 		  int score, Univ_IIT_T chromosome_iit, Shortread_T queryseq,
9730 		  Shortread_T headerseq, char *acc_suffix, bool invertp, T hit5, T hit3, int insertlength,
9731 		  int pairscore, Pairtype_T pairtype, int mapq_score, bool first_read_p) {
9732   char *single_chr, *chr;
9733   bool allocp, alloc1p, pairinfo_printed_p = false;
9734   List_T substrings, junctions, p, q;
9735   Substring_T substring;
9736   Junction_T pre_junction, post_junction;
9737   int nblocks;
9738 
9739   if (this->chrnum == 0) {
9740     single_chr = (char *) NULL;
9741     alloc1p = false;
9742   } else {
9743     single_chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc1p);
9744   }
9745   if (invertp == true) {
9746     substrings = this->substrings_Nto1;
9747     junctions = this->junctions_Nto1;
9748   } else {
9749     substrings = this->substrings_1toN;
9750     junctions = this->junctions_1toN;
9751   }
9752 
9753   if (output_type == M8_OUTPUT) {
9754     for (p = substrings; p != NULL; p = List_next(p)) {
9755       substring = (Substring_T) List_head(p);
9756       if (Substring_has_alts_p(substring) == true) {
9757 	/* Skip */
9758       } else {
9759 	if ((chr = single_chr) == NULL) {
9760 	  chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9761 	}
9762 	Substring_print_m8(fp,substring,headerseq,acc_suffix,chr,invertp);
9763 	if (single_chr == NULL && allocp == true) {
9764 	  FREE(chr);
9765 	}
9766       }
9767     }
9768 
9769   } else {
9770     if ((nblocks = List_length(substrings)) == 1) {
9771       post_junction = (Junction_T) NULL;
9772     } else {
9773       post_junction = (Junction_T) List_head(junctions);
9774     }
9775     substring = (Substring_T) List_head(substrings);
9776     if (Substring_has_alts_p(substring) == true) {
9777       nblocks -= 1;
9778     }
9779     substring = (Substring_T) List_last_value(substrings);
9780     if (Substring_has_alts_p(substring) == true) {
9781       nblocks -= 1;
9782     }
9783 
9784 
9785     /* First line */
9786     substring = (Substring_T) List_head(substrings);
9787     if (Substring_has_alts_p(substring) == true) {
9788       /* Skip */
9789     } else {
9790       if ((chr = single_chr) == NULL) {
9791 	chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9792       }
9793       FPRINTF(fp," ");
9794       Substring_print_alignment(fp,/*pre_junction*/NULL,substring,post_junction,queryseq,genomecomp,chr,invertp);
9795       if (single_chr == NULL && allocp == true) {
9796 	FREE(chr);
9797       }
9798 
9799       /* Alignment info */
9800 #ifndef NO_COMPARE
9801       FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9802       if (method_print_p == true) {
9803 	Method_print(fp,this->method);
9804       }
9805 #endif
9806 
9807       /* Transcriptome info */
9808       if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9809 	Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9810 				     /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9811       } else if (this->transcripts != NULL) {
9812 	Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9813       }
9814 
9815       /* Pairing info */
9816       if (hit5 != NULL && hit3 != NULL) {
9817 	FPRINTF(fp,"\t");
9818 	print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9819       }
9820       pairinfo_printed_p = true;
9821 
9822       FPRINTF(fp,"\n");
9823     }
9824 
9825     if ((p = List_next(substrings)) == NULL) {
9826       /* Done */
9827     } else {
9828       /* Middle lines */
9829       for (q = List_next(junctions); q != NULL; p = List_next(p), q = List_next(q)) {
9830 	pre_junction = post_junction;
9831 	post_junction = List_head(q);
9832 
9833 	substring = (Substring_T) List_head(p);
9834 	if (Substring_has_alts_p(substring) == true) {
9835 	  /* Skip */
9836 	} else {
9837 	  if (pairinfo_printed_p == true) {
9838 	    FPRINTF(fp,",");
9839 	  } else {
9840 	    FPRINTF(fp," ");
9841 	  }
9842 	  if ((chr = single_chr) == NULL) {
9843 	    chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9844 	  }
9845 	  Substring_print_alignment(fp,pre_junction,substring,post_junction,queryseq,genomecomp,chr,invertp);
9846 	  if (single_chr == NULL && allocp == true) {
9847 	    FREE(chr);
9848 	  }
9849 
9850 	  if (pairinfo_printed_p == false) {
9851 	    /* Alignment info if not already printed */
9852 #ifndef NO_COMPARE
9853 	    FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9854 	    if (method_print_p == true) {
9855 	      Method_print(fp,this->method);
9856 	    }
9857 #endif
9858 
9859 	    /* Transcriptome info if not already printed */
9860 	    if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9861 	      Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9862 					   /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9863 	    } else if (this->transcripts != NULL) {
9864 	      Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9865 	    }
9866 
9867 	    /* Pairing info if not already printed */
9868 	    if (hit5 != NULL && hit3 != NULL) {
9869 	      FPRINTF(fp,"\t");
9870 	      print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9871 	    }
9872 	    pairinfo_printed_p = true;
9873 	  }
9874 
9875 	  FPRINTF(fp,"\n");
9876 	}
9877       }
9878 
9879       /* Last line */
9880       pre_junction = post_junction;
9881 
9882       substring = (Substring_T) List_head(p);
9883       if (Substring_has_alts_p(substring) == true) {
9884 	/* Skip */
9885       } else {
9886 	if (pairinfo_printed_p == true) {
9887 	  FPRINTF(fp,",");
9888 	} else {
9889 	  FPRINTF(fp," ");
9890 	}
9891 	if ((chr = single_chr) == NULL) {
9892 	  chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9893 	}
9894 	Substring_print_alignment(fp,pre_junction,substring,/*post_junction*/NULL,queryseq,genomecomp,chr,invertp);
9895 	if (single_chr == NULL && allocp == true) {
9896 	  FREE(chr);
9897 	}
9898 
9899 	if (pairinfo_printed_p == false) {
9900 	  /* Alignment info if not already printed */
9901 #ifndef NO_COMPARE
9902 	  FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9903 	  if (method_print_p == true) {
9904 	    Method_print(fp,this->method);
9905 	  }
9906 #endif
9907 
9908 	  /* Transcriptome info if not already printed */
9909 	  if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9910 	    Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9911 					 /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9912 	  } else if (this->transcripts != NULL) {
9913 	    Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9914 	  }
9915 
9916 	  /* Pairing info if not already printed */
9917 	  if (hit5 != NULL && hit3 != NULL) {
9918 	    FPRINTF(fp,"\t");
9919 	    print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9920 	  }
9921 	  /* pairinfo_printed_p = true; */
9922 	}
9923 	FPRINTF(fp,"\n");
9924       }
9925     }
9926   }
9927 
9928   if (alloc1p == true) {
9929     FREE(single_chr);
9930   }
9931 }
9932 
9933 
9934 
9935 /* May substitute paired-end loglik for single-end loglik */
9936 void
Stage3end_print(Filestring_T fp,Stage3pair_T stage3pair,T this,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,bool invertp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype,int mapq_score,bool first_read_p)9937 Stage3end_print (Filestring_T fp, Stage3pair_T stage3pair, T this,
9938 		 Univ_IIT_T chromosome_iit, Shortread_T queryseq, Shortread_T headerseq,
9939 		 char *acc_suffix, bool invertp, T hit5, T hit3, int insertlength,
9940 		 int pairscore, Pairtype_T pairtype, int mapq_score, bool first_read_p) {
9941 
9942   /* TODO: Instead of score_within_trims, which contains penalties for
9943      ambiguous lengths, use (querylength - this->nmatches_plus_spliced_trims) instead */
9944   print_substrings(fp,stage3pair,this,this->refalt_score_within_trims,
9945 		   chromosome_iit,queryseq,headerseq,acc_suffix,invertp,
9946 		   hit5,hit3,insertlength,pairscore,pairtype,mapq_score,first_read_p);
9947 
9948   return;
9949 }
9950 
9951 
9952 static void
print_query_header(Filestring_T fp,char initchar,Shortread_T queryseq,bool invertp)9953 print_query_header (Filestring_T fp, char initchar, Shortread_T queryseq, bool invertp) {
9954   FPRINTF(fp,"%c",initchar);
9955   if (invertp == false) {
9956     Shortread_print_oneline(fp,queryseq);
9957   } else {
9958     Shortread_print_oneline_revcomp(fp,queryseq);
9959   }
9960 
9961   return;
9962 }
9963 
9964 
9965 
9966 static void
print_barcode_and_quality(Filestring_T fp,Shortread_T queryseq,bool invertp,int quality_shift)9967 print_barcode_and_quality (Filestring_T fp, Shortread_T queryseq, bool invertp, int quality_shift) {
9968   char *barcode;
9969 
9970   if ((barcode = Shortread_barcode(queryseq)) != NULL) {
9971     FPRINTF(fp,"\tbarcode:%s",barcode);
9972   }
9973 
9974   if (Shortread_quality_string(queryseq) != NULL) {
9975     FPRINTF(fp,"\t");
9976     if (invertp == false) {
9977       Shortread_print_quality(fp,queryseq,/*hardclip_low*/0,/*hardclip_high*/0,
9978 			      quality_shift,/*show_chopped_p*/true);
9979     } else {
9980       Shortread_print_quality_revcomp(fp,queryseq,/*hardclip_low*/0,/*hardclip_high*/0,
9981 				      quality_shift,/*show_chopped_p*/true);
9982     }
9983   }
9984 
9985   return;
9986 }
9987 
9988 
9989 void
Stage3pair_print_end(Filestring_T fp,Filestring_T fp_failedinput,Result_T result,Resulttype_T resulttype,char initchar,bool firstp,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq1,Shortread_T headerseq2,int maxpaths,bool quiet_if_excessive_p,bool invertp,int quality_shift)9990 Stage3pair_print_end (Filestring_T fp, Filestring_T fp_failedinput,
9991 		      Result_T result, Resulttype_T resulttype,
9992 		      char initchar, bool firstp, Univ_IIT_T chromosome_iit,
9993 		      Shortread_T queryseq, Shortread_T headerseq1, Shortread_T headerseq2,
9994 		      int maxpaths, bool quiet_if_excessive_p,
9995 		      bool invertp, int quality_shift) {
9996   Stage3pair_T *stage3pairarray, stage3pair;
9997   T *stage3array, *stage3array_mate, this, hit5, hit3;
9998   int npaths_primary, npaths_altloc, npaths_mate_primary, npaths_mate_altloc, pathnum;
9999   int first_absmq, second_absmq;
10000   bool excessivep, translocationp;
10001 
10002   if (resulttype == PAIREDEND_NOMAPPING) {
10003     if (output_type != M8_OUTPUT) {
10004       Filestring_set_split_output(fp,OUTPUT_NM);
10005       print_query_header(fp,initchar,queryseq,invertp);
10006       FPRINTF(fp,"\t0 %s",UNPAIRED_TEXT);
10007 
10008       print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10009 
10010       FPRINTF(fp,"\t");
10011       Shortread_print_header(fp,headerseq1,headerseq2);
10012       FPRINTF(fp,"\n");
10013     }
10014     /* If failedinput_root != NULL, then this case is handled by calling procedure */
10015 
10016   } else if (resulttype == CONCORDANT_UNIQ) {
10017     stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10018     stage3pair = stage3pairarray[0];
10019     hit5 = stage3pair->hit5;
10020     hit3 = stage3pair->hit3;
10021 
10022     if (stage3pair->circularp == true) {
10023       Filestring_set_split_output(fp,OUTPUT_CC);
10024     } else {
10025       Filestring_set_split_output(fp,OUTPUT_CU);
10026     }
10027 
10028     if (omit_concordant_uniq_p == true && stage3pair->circularp == false) {
10029       /* Skip printing */
10030       Filestring_set_split_output(fp,OUTPUT_NONE);
10031 
10032     } else {
10033       if (output_type != M8_OUTPUT) {
10034 	print_query_header(fp,initchar,queryseq,invertp);
10035 	FPRINTF(fp,"\t1 %s",CONCORDANT_TEXT);
10036 
10037 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10038 
10039 	FPRINTF(fp,"\t");
10040 	Shortread_print_header(fp,headerseq1,headerseq2);
10041       }
10042 
10043       if (firstp == true) {
10044 	Stage3end_print(fp,stage3pair,hit5,
10045 			chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10046 			invertp,hit5,hit3,stage3pair->insertlength,
10047 			/*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10048 			stage3pair->hit3->refalt_score_within_trims,
10049 			stage3pair->pairtype,stage3pair->mapq_score,
10050 			/*first_read_p*/true);
10051       } else {
10052 	Stage3end_print(fp,stage3pair,hit3,
10053 			chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10054 			invertp,hit5,hit3,stage3pair->insertlength,
10055 			/*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10056 			stage3pair->hit3->refalt_score_within_trims,
10057 			stage3pair->pairtype,stage3pair->mapq_score,
10058 			/*first_read_p*/false);
10059       }
10060 
10061       if (output_type != M8_OUTPUT) {
10062 	FPRINTF(fp,"\n");
10063       }
10064     }
10065 
10066   } else if (resulttype == CONCORDANT_TRANSLOC) {
10067     Filestring_set_split_output(fp,OUTPUT_CT);
10068     stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10069 
10070     if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10071       if (output_type != M8_OUTPUT) {
10072 	/* No xs category for transloc, so ignore quiet-if-excessive_p */
10073 	print_query_header(fp,initchar,queryseq,invertp);
10074 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10075 	FPRINTF(fp," (transloc)");
10076 
10077 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10078 
10079 	FPRINTF(fp,"\t");
10080 	Shortread_print_header(fp,headerseq1,headerseq2);
10081 
10082 	/* No further output */
10083 	FPRINTF(fp,"\n");
10084       }
10085 
10086       if (failedinput_root != NULL) {
10087 	Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10088       }
10089 
10090     } else {
10091       if (output_type != M8_OUTPUT) {
10092 	print_query_header(fp,initchar,queryseq,invertp);
10093 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10094 	FPRINTF(fp," (transloc)");
10095 
10096 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10097 
10098 	FPRINTF(fp,"\t");
10099 	Shortread_print_header(fp,headerseq1,headerseq2);
10100       }
10101 
10102       for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10103 	stage3pair = stage3pairarray[pathnum-1];
10104 	hit5 = stage3pair->hit5;
10105 	hit3 = stage3pair->hit3;
10106 
10107 	if (firstp == true) {
10108 	  Stage3end_print(fp,stage3pair,hit5,
10109 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10110 			  invertp,hit5,hit3,stage3pair->insertlength,
10111 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10112 			  stage3pair->hit3->refalt_score_within_trims,
10113 			  stage3pair->pairtype,stage3pair->mapq_score,
10114 			  /*first_read_p*/true);
10115 	} else {
10116 	  Stage3end_print(fp,stage3pair,hit3,
10117 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10118 			  invertp,hit5,hit3,stage3pair->insertlength,
10119 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10120 			  stage3pair->hit3->refalt_score_within_trims,
10121 			  stage3pair->pairtype,stage3pair->mapq_score,
10122 			  /*first_read_p*/false);
10123 	}
10124       }
10125 
10126       if (output_type != M8_OUTPUT) {
10127 	FPRINTF(fp,"\n");
10128       }
10129     }
10130 
10131 
10132   } else if (resulttype == CONCORDANT_MULT) {
10133     stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10134 
10135     if (omit_concordant_mult_p == true) {
10136       /* Skip printing */
10137       Filestring_set_split_output(fp,OUTPUT_NONE);
10138 
10139     } else if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10140       Filestring_set_split_output(fp,OUTPUT_CX);
10141       if (output_type != M8_OUTPUT) {
10142 	print_query_header(fp,initchar,queryseq,invertp);
10143 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10144 
10145 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10146 
10147 	FPRINTF(fp,"\t");
10148 	Shortread_print_header(fp,headerseq1,headerseq2);
10149 
10150 	/* No further output */
10151 	FPRINTF(fp,"\n");
10152 
10153 	if (failedinput_root != NULL) {
10154 	  Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10155 	}
10156       }
10157 
10158     } else {
10159       Filestring_set_split_output(fp,OUTPUT_CM);
10160       if (output_type != M8_OUTPUT) {
10161 	print_query_header(fp,initchar,queryseq,invertp);
10162 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10163 
10164 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10165 
10166 	FPRINTF(fp,"\t");
10167 	Shortread_print_header(fp,headerseq1,headerseq2);
10168       }
10169 
10170       for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10171 	stage3pair = stage3pairarray[pathnum-1];
10172 	hit5 = stage3pair->hit5;
10173 	hit3 = stage3pair->hit3;
10174 
10175 	if (firstp == true) {
10176 	  Stage3end_print(fp,stage3pair,hit5,
10177 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10178 			  invertp,hit5,hit3,stage3pair->insertlength,
10179 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10180 			  stage3pair->hit3->refalt_score_within_trims,
10181 			  stage3pair->pairtype,stage3pair->mapq_score,
10182 			  /*first_read_p*/true);
10183 	} else {
10184 	  Stage3end_print(fp,stage3pair,hit3,
10185 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10186 			  invertp,hit5,hit3,stage3pair->insertlength,
10187 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10188 			  stage3pair->hit3->refalt_score_within_trims,
10189 			  stage3pair->pairtype,stage3pair->mapq_score,
10190 			  /*first_read_p*/false);
10191 	}
10192       }
10193 
10194       if (output_type != M8_OUTPUT) {
10195 	FPRINTF(fp,"\n");
10196       }
10197     }
10198 
10199   } else if (resulttype == PAIRED_UNIQ) {
10200     stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10201     stage3pair = stage3pairarray[0];
10202 
10203     if (stage3pair->circularp == true) {
10204       Filestring_set_split_output(fp,OUTPUT_PC);
10205     } else if (stage3pair->pairtype == PAIRED_INVERSION) {
10206       Filestring_set_split_output(fp,OUTPUT_PI);
10207     } else if (stage3pair->pairtype == PAIRED_SCRAMBLE) {
10208       Filestring_set_split_output(fp,OUTPUT_PS);
10209     } else if (stage3pair->pairtype == PAIRED_TOOLONG) {
10210       Filestring_set_split_output(fp,OUTPUT_PL);
10211     } else {
10212       fprintf(stderr,"Unexpected pairtype %d\n",stage3pair->pairtype);
10213       abort();
10214     }
10215 
10216     if (output_type != M8_OUTPUT) {
10217       print_query_header(fp,initchar,queryseq,invertp);
10218       FPRINTF(fp,"\t1 %s",PAIRED_TEXT);
10219 
10220       print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10221 
10222       FPRINTF(fp,"\t");
10223       Shortread_print_header(fp,headerseq1,headerseq2);
10224     }
10225 
10226     hit5 = stage3pair->hit5;
10227     hit3 = stage3pair->hit3;
10228 
10229     if (firstp == true) {
10230       Stage3end_print(fp,stage3pair,hit5,
10231 		      chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10232 		      invertp,hit5,hit3,stage3pair->insertlength,
10233 		      /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10234 		      stage3pair->hit3->refalt_score_within_trims,
10235 		      stage3pair->pairtype,stage3pair->mapq_score,
10236 		      /*first_read_p*/true);
10237     } else {
10238       Stage3end_print(fp,stage3pair,hit3,
10239 		      chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10240 		      invertp,hit5,hit3,stage3pair->insertlength,
10241 		      stage3pair->hit5->refalt_score_within_trims +
10242 		      stage3pair->hit3->refalt_score_within_trims,
10243 		      stage3pair->pairtype,stage3pair->mapq_score,
10244 		      /*first_read_p*/false);
10245     }
10246 
10247     if (output_type != M8_OUTPUT) {
10248       FPRINTF(fp,"\n");
10249     }
10250 
10251   } else if (resulttype == PAIRED_MULT) {
10252     stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10253 
10254     if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10255       Filestring_set_split_output(fp,OUTPUT_PX);
10256       if (output_type != M8_OUTPUT) {
10257 	print_query_header(fp,initchar,queryseq,invertp);
10258 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,PAIRED_TEXT);
10259 
10260 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10261 
10262 	FPRINTF(fp,"\t");
10263 	Shortread_print_header(fp,headerseq1,headerseq2);
10264 
10265 	/* No further output */
10266 	FPRINTF(fp,"\n");
10267 
10268 	if (failedinput_root != NULL) {
10269 	  Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10270 	}
10271       }
10272 
10273     } else {
10274       Filestring_set_split_output(fp,OUTPUT_PM);
10275       if (output_type != M8_OUTPUT) {
10276 	print_query_header(fp,initchar,queryseq,invertp);
10277 	FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,PAIRED_TEXT);
10278 
10279 	print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10280 
10281 	FPRINTF(fp,"\t");
10282 	Shortread_print_header(fp,headerseq1,headerseq2);
10283       }
10284 
10285       for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10286 	stage3pair = stage3pairarray[pathnum-1];
10287 	hit5 = stage3pair->hit5;
10288 	hit3 = stage3pair->hit3;
10289 
10290 	if (firstp == true) {
10291 	  Stage3end_print(fp,stage3pair,hit5,
10292 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10293 			  invertp,hit5,hit3,stage3pair->insertlength,
10294 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10295 			  stage3pair->hit3->refalt_score_within_trims,
10296 			  stage3pair->pairtype,stage3pair->mapq_score,
10297 			  /*first_read_p*/true);
10298 	} else {
10299 	  Stage3end_print(fp,stage3pair,hit3,
10300 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10301 			  invertp,hit5,hit3,stage3pair->insertlength,
10302 			  /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10303 			  stage3pair->hit3->refalt_score_within_trims,
10304 			  stage3pair->pairtype,stage3pair->mapq_score,
10305 			  /*first_read_p*/false);
10306 	}
10307       }
10308 
10309       if (output_type != M8_OUTPUT) {
10310 	FPRINTF(fp,"\n");
10311       }
10312     }
10313 
10314 
10315   } else {
10316     /* Print as singles */
10317     if (firstp == true) {
10318       /* Get stage3array_mate first to avoid incorrect values for npaths */
10319       stage3array_mate = (T *) Result_array2(&npaths_mate_primary,&npaths_mate_altloc,&first_absmq,&second_absmq,result);
10320       stage3array = (T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10321     } else {
10322       /* Get stage3array_mate first to avoid incorrect values for npaths */
10323       stage3array_mate = (T *) Result_array(&npaths_mate_primary,&npaths_mate_altloc,&first_absmq,&second_absmq,result);
10324       stage3array = (T *) Result_array2(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10325     }
10326 
10327     excessivep = false;
10328     translocationp = false;
10329     if (resulttype == HALFMAPPING_UNIQ) {
10330       if (npaths_primary + npaths_altloc > 0 && Stage3end_circularpos(stage3array[0]) > 0) {
10331 	Filestring_set_split_output(fp,OUTPUT_HC);
10332       } else if (npaths_mate_primary + npaths_mate_altloc > 0 && Stage3end_circularpos(stage3array_mate[0]) > 0) {
10333 	Filestring_set_split_output(fp,OUTPUT_HC);
10334       } else {
10335 	Filestring_set_split_output(fp,OUTPUT_HU);
10336       }
10337 
10338     } else if (resulttype == HALFMAPPING_TRANSLOC) {
10339       Filestring_set_split_output(fp,OUTPUT_HT);
10340       translocationp = true;
10341 
10342     } else if (resulttype == HALFMAPPING_MULT) {
10343       if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10344 	Filestring_set_split_output(fp,OUTPUT_HX);
10345 	excessivep = true;
10346       } else {
10347 	Filestring_set_split_output(fp,OUTPUT_HM);
10348       }
10349 
10350     } else if (resulttype == UNPAIRED_UNIQ) {
10351       if (npaths_primary + npaths_altloc > 0 && Stage3end_circularpos(stage3array[0]) > 0) {
10352 	Filestring_set_split_output(fp,OUTPUT_UC);
10353       } else if (npaths_mate_primary + npaths_mate_altloc > 0 && Stage3end_circularpos(stage3array_mate[0]) > 0) {
10354 	Filestring_set_split_output(fp,OUTPUT_UC);
10355       } else {
10356 	Filestring_set_split_output(fp,OUTPUT_UU);
10357       }
10358 
10359     } else if (resulttype == UNPAIRED_TRANSLOC) {
10360       Filestring_set_split_output(fp,OUTPUT_UT);
10361       translocationp = true;
10362 
10363     } else if (resulttype == UNPAIRED_MULT) {
10364       if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10365 	Filestring_set_split_output(fp,OUTPUT_UX);
10366 	excessivep = true;
10367       } else {
10368 	Filestring_set_split_output(fp,OUTPUT_UM);
10369       }
10370 
10371     } else {
10372       fprintf(stderr,"Resulttype is %s\n",Resulttype_string(resulttype));
10373       abort();
10374     }
10375 
10376     if (output_type != M8_OUTPUT) {
10377       print_query_header(fp,initchar,queryseq,invertp);
10378       FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,UNPAIRED_TEXT);
10379       if (translocationp == true) {
10380 	FPRINTF(fp," (transloc)");
10381       }
10382 
10383       print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10384 
10385       FPRINTF(fp,"\t");
10386       Shortread_print_header(fp,headerseq1,headerseq2);
10387     }
10388 
10389     if (excessivep == true) {
10390       /* No output */
10391       if (failedinput_root != NULL) {
10392 	Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10393       }
10394 
10395     } else {
10396       if (firstp == true) {
10397 	for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10398 	  this = stage3array[pathnum-1];
10399 	  Stage3end_print(fp,/*stage3pair*/NULL,this,
10400 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10401 			  invertp,/*hit5*/(T) NULL,/*hit3*/(T) NULL,
10402 			  /*insertlength*/0,/*pairscore*/0,/*pairtype*/UNPAIRED,this->mapq_score,
10403 			  /*first_read_p*/true);
10404 	}
10405       } else {
10406 	for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10407 	  this = stage3array[pathnum-1];
10408 	  Stage3end_print(fp,/*stage3pair*/NULL,this,
10409 			  chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10410 			  invertp,/*hit5*/(T) NULL,/*hit3*/(T) NULL,
10411 			  /*insertlength*/0,/*pairscore*/0,/*pairtype*/UNPAIRED,this->mapq_score,
10412 			  /*first_read_p*/false);
10413 	}
10414       }
10415     }
10416 
10417     if (output_type != M8_OUTPUT) {
10418       FPRINTF(fp,"\n");
10419     }
10420   }
10421 
10422   return;
10423 }
10424 
10425 
10426 
10427 /* Used only for --merge-overlap features, so obey hardclip and not querystart/queryend */
10428 /* If use querylength_adj, ss.bug.4 fails.  If use querylength, ss.bug.3 fails */
10429 static List_T
Stage3end_convert_to_pairs_out(List_T pairs,T hit,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)10430 Stage3end_convert_to_pairs_out (List_T pairs, T hit, Shortread_T queryseq,
10431 				int hardclip_low, int hardclip_high, int queryseq_offset) {
10432   List_T p, q;
10433   /* Chrpos_T genomicpos1, genomicpos2; */
10434   Substring_T substring, prev_substring;
10435   Junction_T junction;
10436   Junctiontype_T type;
10437   char *deletion_string;
10438 
10439   if (hit->hittype == TRANSLOC_SPLICE) {
10440     /* Cannot handle translocations within a single GMAP alignment */
10441     abort();
10442     return NULL;
10443 
10444   } else {
10445     p = hit->substrings_1toN;
10446     prev_substring = (Substring_T) List_head(p);
10447     pairs = Substring_convert_to_pairs_out(pairs,prev_substring,hit->querylength,
10448 					   queryseq,hardclip_low,hardclip_high,queryseq_offset);
10449 
10450     for (q = hit->junctions_1toN, p = List_next(p); p != NULL; q = List_next(q), p = List_next(p)) {
10451       junction = (Junction_T) List_head(q);
10452       substring = (Substring_T) List_head(p);
10453 
10454       if ((type = Junction_type(junction)) == INS_JUNCTION) {
10455 	pairs = Substring_add_insertion_out(pairs,prev_substring,substring,hit->querylength,
10456 					    /*insertionlength*/Junction_nindels(junction),queryseq,
10457 					    hardclip_low,hardclip_high,queryseq_offset);
10458       } else if (type == DEL_JUNCTION) {
10459 	deletion_string = Junction_deletion_string(junction,genomecomp,hit->plusp);
10460 	pairs = Substring_add_deletion_out(pairs,prev_substring,substring,hit->querylength,
10461 					   deletion_string,/*deletionlength*/Junction_nindels(junction),
10462 					   hardclip_low,hardclip_high,queryseq_offset);
10463       } else if (type == SPLICE_JUNCTION) {
10464 	pairs = Substring_add_intron_out(pairs,prev_substring,substring,hit->querylength,
10465 					 hardclip_low,hardclip_high,queryseq_offset);
10466 
10467       } else {
10468 	abort();
10469       }
10470 
10471       pairs = Substring_convert_to_pairs_out(pairs,substring,hit->querylength,
10472 					     queryseq,hardclip_low,hardclip_high,queryseq_offset);
10473       prev_substring = substring;
10474     }
10475 
10476     debug15(Simplepair_dump_list(pairs,true));
10477     return pairs;
10478   }
10479 }
10480 
10481 
10482 /* Don't want querylength_adj */
10483 struct Simplepair_T *
Stage3pair_merge(int * npairs,int * querylength_merged,char ** queryseq_merged,char ** quality_merged,Stage3pair_T this,Shortread_T queryseq5,Shortread_T queryseq3,int querylength5,int querylength3,int clipdir,int hardclip5_low,int hardclip5_high,int hardclip3_low,int hardclip3_high)10484 Stage3pair_merge (int *npairs, int *querylength_merged, char **queryseq_merged, char **quality_merged,
10485 		  Stage3pair_T this, Shortread_T queryseq5, Shortread_T queryseq3,
10486 		  int querylength5, int querylength3, int clipdir,
10487 		  int hardclip5_low, int hardclip5_high, int hardclip3_low, int hardclip3_high) {
10488   struct Simplepair_T *pairarray, *newpair;
10489   Simplepair_T oldpair;
10490   List_T pairs, pairs5, pairs3, p;
10491   T hit5, hit3;
10492   int querylengthA, querylengthB;
10493   char *queryseq_ptr_5, *queryseq_ptr_3, *quality_ptr_5, *quality_ptr_3;
10494 #ifdef CHECK_ASSERTIONS
10495   Chrpos_T genomicpos1, genomicpos2;
10496 #endif
10497 
10498   hit5 = this->hit5;
10499   hit3 = this->hit3;
10500   queryseq_ptr_5 = Shortread_fullpointer_uc(queryseq5);
10501   queryseq_ptr_3 = Shortread_fullpointer_uc(queryseq3);
10502   quality_ptr_5 = Shortread_quality_string(queryseq5);
10503   quality_ptr_3 = Shortread_quality_string(queryseq3);
10504 
10505   if (hit5->plusp == true) {
10506     if (clipdir > 0) {
10507       pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,/*queryseq_offset*/0);
10508       pairs5 = Simplepair_strip_gaps_at_head(pairs5);
10509 
10510       pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,
10511 					      /*queryseq_offset*/querylength5-hardclip5_low-hardclip5_high-hardclip3_low-hardclip3_high);
10512       pairs3 = Simplepair_strip_gaps_at_tail(pairs3);
10513 
10514 #ifdef CHECK_ASSERTIONS
10515       genomicpos1 = Simplepair_head_genomepos(pairs5);
10516       genomicpos2 = Simplepair_last_genomepos(pairs3);
10517       if (genomicpos2 != genomicpos1 + 1U) {
10518 	printf("Accession %s, plus\n",Shortread_accession(queryseq5));
10519 	printf("Expected genomicpos2 %u == genomicpos1 %u + 1\n",genomicpos2,genomicpos1);
10520 	Simplepair_dump_list(pairs5,true);
10521 	Simplepair_dump_list(pairs3,true);
10522 	abort();
10523       }
10524 #endif
10525 
10526       pairs = List_append(pairs3,pairs5);
10527 
10528       querylengthA = querylength5 - hardclip5_low - hardclip5_high;
10529       querylengthB = querylength3 - hardclip3_low - hardclip3_high;
10530       *querylength_merged = querylengthA + querylengthB;
10531 
10532       *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10533       strncpy(*queryseq_merged,queryseq_ptr_5,querylengthA);
10534       strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_3[querylength3 - querylengthB]),querylengthB);
10535       (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10536 
10537       if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10538 	*quality_merged = (char *) NULL;
10539       } else {
10540 	*quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10541 	strncpy(*quality_merged,quality_ptr_5,querylengthA);
10542 	strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_3[querylength3 - querylengthB]),querylengthB);
10543 	(*quality_merged)[querylengthA+querylengthB] = '\0';
10544       }
10545 
10546     } else if (clipdir < 0) {
10547       pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,/*queryseq_offset*/0);
10548       pairs3 = Simplepair_strip_gaps_at_head(pairs3);
10549 
10550       pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,
10551 					      /*queryseq_offset*/querylength3-hardclip3_low-hardclip3_high-hardclip5_low-hardclip5_high);
10552       pairs5 = Simplepair_strip_gaps_at_tail(pairs5);
10553 
10554 #ifdef CHECK_ASSERTIONS
10555       genomicpos1 = Simplepair_head_genomepos(pairs3);
10556       genomicpos2 = Simplepair_last_genomepos(pairs5);
10557       if (genomicpos2 != genomicpos1 + 1U) {
10558 	printf("Accession %s, plus, clipdir %d\n",Shortread_accession(queryseq5),clipdir);
10559 	printf("Expected genomicpos2 %u == genomicpos1 %u + 1\n",genomicpos2,genomicpos1);
10560 	printf("Begin of pairs3\n");
10561 	Simplepair_dump_list(pairs3,true);
10562 	printf("Begin of pairs5\n");
10563 	Simplepair_dump_list(pairs5,true);
10564 	abort();
10565       }
10566 #endif
10567 
10568       pairs = List_append(pairs5,pairs3);
10569 
10570       querylengthA = querylength3 - hardclip3_low - hardclip3_high;
10571       querylengthB = querylength5 - hardclip5_low - hardclip5_high;
10572       *querylength_merged = querylengthA + querylengthB;
10573 
10574       *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10575       strncpy(*queryseq_merged,queryseq_ptr_3,querylengthA);
10576       strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_5[querylength5 - querylengthB]),querylengthB);
10577       (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10578 
10579       if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10580 	*quality_merged = (char *) NULL;
10581       } else {
10582 	*quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10583 	strncpy(*quality_merged,quality_ptr_3,querylengthA);
10584 	strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_5[querylength5 - querylengthB]),querylengthB);
10585 	(*quality_merged)[querylengthA+querylengthB] = '\0';
10586       }
10587 
10588     } else {
10589       abort();
10590     }
10591 
10592   } else {
10593     if (clipdir > 0) {
10594       pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,/*queryseq_offset*/0);
10595       pairs3 = Simplepair_strip_gaps_at_head(pairs3);
10596 
10597       pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,
10598 					      /*queryseq_offset*/querylength3-hardclip3_low-hardclip3_high-hardclip5_low-hardclip5_high);
10599       pairs5 = Simplepair_strip_gaps_at_tail(pairs5);
10600 
10601 #ifdef CHECK_ASSERTIONS
10602       genomicpos1 = Simplepair_head_genomepos(pairs3);
10603       genomicpos2 = Simplepair_last_genomepos(pairs5);
10604       if (genomicpos2 != genomicpos1 - 1U) {
10605 	printf("Accession %s, minus\n",Shortread_accession(queryseq5));
10606 	printf("Expected genomicpos2 %u == genomicpos1 %u - 1\n",genomicpos2,genomicpos1);
10607 	Simplepair_dump_list(pairs3,true);
10608 	Simplepair_dump_list(pairs5,true);
10609 	abort();
10610       }
10611 #endif
10612 
10613       pairs = List_append(pairs5,pairs3);
10614 
10615       querylengthA = querylength3 - hardclip3_low - hardclip3_high;
10616       querylengthB = querylength5 - hardclip5_low - hardclip5_high;
10617       *querylength_merged = querylengthA + querylengthB;
10618 
10619       *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10620       strncpy(*queryseq_merged,queryseq_ptr_3,querylengthA);
10621       strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_5[querylength5 - querylengthB]),querylengthB);
10622       (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10623 
10624       if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10625 	*quality_merged = (char *) NULL;
10626       } else {
10627 	*quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10628 	strncpy(*quality_merged,quality_ptr_3,querylengthA);
10629 	strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_5[querylength5 - querylengthB]),querylengthB);
10630 	(*quality_merged)[querylengthA+querylengthB] = '\0';
10631       }
10632 
10633     } else if (clipdir < 0) {
10634       pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,/*queryseq_offset*/0);
10635       pairs5 = Simplepair_strip_gaps_at_head(pairs5);
10636 
10637       pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,
10638 					      /*queryseq_offset*/querylength5-hardclip5_low-hardclip5_high-hardclip3_low-hardclip3_high);
10639       pairs3 = Simplepair_strip_gaps_at_tail(pairs3);
10640 
10641 #ifdef CHECK_ASSERTIONS
10642       genomicpos1 = Simplepair_head_genomepos(pairs5);
10643       genomicpos2 = Simplepair_last_genomepos(pairs3);
10644       if (genomicpos2 != genomicpos1 - 1U) {
10645 	printf("Accession %s, minus\n",Shortread_accession(queryseq5));
10646 	printf("Expected genomicpos2 %u == genomicpos1 %u - 1\n",genomicpos2,genomicpos1);
10647 	Simplepair_dump_list(pairs5,true);
10648 	Simplepair_dump_list(pairs3,true);
10649 	abort();
10650       }
10651 #endif
10652 
10653       pairs = List_append(pairs3,pairs5);
10654 
10655       querylengthA = querylength5 - hardclip5_low - hardclip5_high;
10656       querylengthB = querylength3 - hardclip3_low - hardclip3_high;
10657       *querylength_merged = querylengthA + querylengthB;
10658 
10659       *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10660       strncpy(*queryseq_merged,queryseq_ptr_5,querylengthA);
10661       strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_3[querylength3 - querylengthB]),querylengthB);
10662       (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10663 
10664       if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10665 	*quality_merged = (char *) NULL;
10666       } else {
10667 	*quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10668 	strncpy(*quality_merged,quality_ptr_5,querylengthA);
10669 	strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_3[querylength3 - querylengthB]),querylengthB);
10670 	(*quality_merged)[querylengthA+querylengthB] = '\0';
10671       }
10672 
10673     } else {
10674       abort();
10675     }
10676   }
10677 
10678   pairs = List_reverse(pairs);
10679   /* Simplepair_dump_list(pairs,true); */
10680 
10681   *npairs = List_length(pairs);
10682   newpair = pairarray = (struct Simplepair_T *) MALLOC_OUT((*npairs)*sizeof(struct Simplepair_T));
10683   for (p = pairs; p != NULL; p = p->rest) {
10684     oldpair = (Simplepair_T) p->first;
10685     memcpy(newpair++,oldpair,sizeof(struct Simplepair_T));
10686     Simplepair_free_out(&oldpair);
10687   }
10688   List_free_out(&pairs);
10689 
10690   return pairarray;
10691 }
10692 
10693 
#if 0
/* Disabled (compiled out by #if 0).  Computes an insert length for the two
   ends of a pair from their genomic coordinates and query lengths, and
   stores a relationship code in *pair_relationship for the cases handled
   directly here: 0 for opposite strands, +1 for non-overlapping ends on the
   plus strand, -1 for non-overlapping ends on the minus strand.  The
   remaining cases are delegated to pair_insert_length_unpaired or
   pair_insert_length. */
static int
compute_insertlength (int *pair_relationship, Stage3pair_T this) {
  T end5, end3;
  int len5, len3;

  end5 = this->hit5;
  end3 = this->hit3;
  len5 = end5->querylength;
  len3 = end3->querylength;

  debug10(printf("Computing insertlength on %u..%u to %u..%u\n",
                 end5->genomicstart - end5->chroffset,end5->genomicend - end5->chroffset,
                 end3->genomicend - end3->chroffset,end3->genomicstart - end3->chroffset));

  if (end5->plusp == true && end3->plusp == false) {
    /* Have 5-start..end and 3-end..start */
    /*   or 3-end..start and 5-start..end */
    *pair_relationship = 0;
    if (end5->genomicend < end3->genomicend) {
      return (end3->genomicend - end5->genomicend) + len5 + len3;
    }
    if (end3->genomicstart < end5->genomicstart) {
      return (end5->genomicstart - end3->genomicstart) + len5 + len3;
    }
    return pair_insert_length_unpaired(end5,end3);
  }

  if (end5->plusp == false && end3->plusp == true) {
    /* Have 5-end..start and 3-start..end */
    /*   or 3-start..end and 5-end..start */
    *pair_relationship = 0;
    if (end5->genomicstart < end3->genomicstart) {
      return (end3->genomicstart - end5->genomicstart) + len5 + len3;
    }
    if (end3->genomicend < end5->genomicend) {
      return (end5->genomicend - end3->genomicend) + len5 + len3;
    }
    return pair_insert_length_unpaired(end5,end3);
  }

  if (end5->plusp == true) {
    /* Concordant directions on same chromosome (plus) */
    debug10(printf("Concordant on plus strand\n"));
    /* Have 5-start..end and 3-start..end */
    if (end5->genomicend < end3->genomicstart) {
      /* No overlap */
      *pair_relationship = +1;
      return (end3->genomicstart - end5->genomicend) + len5 + len3;
    }
    return pair_insert_length(pair_relationship,end5,end3);
  }

  /* Concordant directions on same chromosome (minus) */
  debug10(printf("Concordant on minus strand\n"));
  /* Have 3-end..start and 5-end..start */
  if (end3->genomicstart < end5->genomicend) {
    /* No overlap */
    *pair_relationship = -1;
    return (end5->genomicend - end3->genomicstart) + len5 + len3;
  }
  return pair_insert_length(pair_relationship,end5,end3);
}
#endif
10762 
10763 
10764 /* Need to make a copy of the hit before calling */
10765 static void
resolve_ambiguity_5(T this,int * mismatch_positions_alloc,Compress_T query_compress,int alts_resolve)10766 resolve_ambiguity_5 (T this, int *mismatch_positions_alloc, Compress_T query_compress, int alts_resolve) {
10767   Substring_T substring, anchor, substring1;
10768   Junction_T junction;
10769   Univcoord_T left, ignore;
10770   Chrpos_T splice_distance;
10771   double donor_prob, acceptor_prob;
10772   List_T p;
10773 
10774   substring = (Substring_T) List_head(this->substrings_Nto1);
10775   anchor = (Substring_T) List_head(List_next(this->substrings_Nto1));
10776   junction = (Junction_T) List_head(this->junctions_Nto1);
10777   left = Substring_set_alt(&donor_prob,&acceptor_prob,&ignore,&this->genomicend,substring,alts_resolve);
10778   if (this->plusp == true) {
10779     splice_distance = left - Substring_left(anchor);
10780     this->high = this->genomicend - (this->querylength - this->queryend_chrbound);
10781   } else {
10782     splice_distance = Substring_left(anchor) - left;
10783     this->low = this->genomicend + (this->querylength - this->queryend_chrbound);
10784   }
10785   assert(this->low < this->high);
10786 
10787   if (splice_distance > 0) {
10788     Junction_set_unambiguous(junction,splice_distance,donor_prob,acceptor_prob);
10789   } else {
10790     this->substrings_Nto1 = List_next(this->substrings_Nto1);
10791     this->junctions_Nto1 = List_next(this->junctions_Nto1);
10792     this->substrings_1toN = List_drop_last(this->substrings_1toN,(void **) &substring);
10793     this->junctions_1toN = List_drop_last(this->junctions_1toN,(void **) &junction);
10794 
10795     this->nsplices -= 1;
10796     if (this->nsplices == 0) {
10797       this->splice_score = 0.0;
10798     } else {
10799       this->splice_score = (this->splice_score * 2*(this->nsplices+1) - Junction_splice_score(junction)) / (2*this->nsplices);
10800     }
10801     this->nsegments -= 1;
10802 
10803     anchor = Substring_extend_anchor_queryend(anchor,substring,mismatch_positions_alloc,query_compress);
10804     List_head_set(this->substrings_Nto1,(void *) anchor);
10805     List_last_set(this->substrings_1toN,(void *) anchor);
10806     Substring_free(&substring);
10807     Junction_free(&junction);
10808 
10809     /* Update information for hit */
10810     this->trim_queryend = Substring_trim_queryend(anchor);
10811     this->mandatory_trim_queryend = Substring_mandatory_trim_queryend(anchor);
10812     this->trim_queryend_splicep = Substring_trim_queryend_splicep(anchor);
10813 
10814     this->refalt_nmatches_to_trims = this->ref_nmatches_to_trims = 0;
10815     for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
10816       substring = (Substring_T) List_head(p);
10817       this->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
10818       this->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
10819     }
10820 
10821     substring1 = (Substring_T) List_head(this->substrings_1toN);
10822     this->refalt_nmatches_plus_spliced_trims = this->refalt_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(anchor);
10823     this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(anchor);
10824     for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
10825       junction = List_head(p);
10826       this->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10827       this->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10828     }
10829     this->ref_score_overall = this->querylength - this->ref_nmatches_to_trims;
10830     this->refalt_score_overall = this->querylength - this->refalt_nmatches_to_trims;
10831     this->refalt_score_within_trims = this->querylength - this->refalt_nmatches_plus_spliced_trims;
10832     if (Substring_trim_querystart_splicep(substring1) == false) {
10833       this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
10834     } else {
10835       this->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
10836     }
10837     if (Substring_trim_queryend_splicep(anchor) == false) {
10838       this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((this->querylength - Substring_queryend(anchor))/END_BINSIZE);
10839     } else {
10840       this->refalt_score_within_trims += SPLICED_END_PENALTY*((this->querylength - Substring_queryend(anchor))/END_BINSIZE);
10841     }
10842 
10843     if (this->chrlength < (Univcoord_T) this->querylength) {
10844       this->ref_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10845       this->refalt_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10846       this->refalt_score_within_trims -= ((Univcoord_T) this->querylength - this->chrlength);
10847     }
10848     assert(this->refalt_score_within_trims >= 0);
10849   }
10850 
10851   return;
10852 }
10853 
10854 
10855 /* Need to make a copy of the hit before calling */
10856 static void
resolve_ambiguity_3(T this,int * mismatch_positions_alloc,Compress_T query_compress,int alts_resolve)10857 resolve_ambiguity_3 (T this, int *mismatch_positions_alloc, Compress_T query_compress, int alts_resolve) {
10858   Substring_T substring, anchor, substringN;
10859   Junction_T junction;
10860   Univcoord_T left, ignore;
10861   Chrpos_T splice_distance;
10862   double donor_prob, acceptor_prob;
10863   List_T p;
10864 
10865   substring = (Substring_T) List_head(this->substrings_1toN);
10866   anchor = (Substring_T) List_head(List_next(this->substrings_1toN));
10867   junction = (Junction_T) List_head(this->junctions_1toN);
10868   left = Substring_set_alt(&donor_prob,&acceptor_prob,&this->genomicstart,&ignore,substring,alts_resolve);
10869   if (this->plusp == true) {
10870     splice_distance = Substring_left(anchor) - left;
10871     this->low = this->genomicstart + this->querystart_chrbound;
10872   } else {
10873     splice_distance = left - Substring_left(anchor);
10874     this->high = this->genomicstart - this->querystart_chrbound;
10875   }
10876   assert(this->low < this->high);
10877 
10878   if (splice_distance > 0) {
10879     Junction_set_unambiguous(junction,splice_distance,donor_prob,acceptor_prob);
10880   } else {
10881     this->substrings_1toN = List_next(this->substrings_1toN);
10882     this->junctions_1toN = List_next(this->junctions_1toN);
10883     this->substrings_Nto1 = List_drop_last(this->substrings_Nto1,(void **) &substring);
10884     this->junctions_Nto1 = List_drop_last(this->junctions_Nto1,(void **) &junction);
10885     this->splice_score = (this->splice_score * 2*this->nsplices - Junction_splice_score(junction)) / (2*(this->nsplices - 1));
10886 
10887     this->nsplices -= 1;
10888     if (this->nsplices == 0) {
10889       this->splice_score = 0.0;
10890     } else {
10891       this->splice_score = (this->splice_score * 2*(this->nsplices+1) - Junction_splice_score(junction)) / (2*this->nsplices);
10892     }
10893     this->nsegments -= 1;
10894 
10895     anchor = Substring_extend_anchor_querystart(anchor,substring,mismatch_positions_alloc,query_compress);
10896     List_head_set(this->substrings_1toN,(void *) anchor);
10897     List_last_set(this->substrings_Nto1,(void *) anchor);
10898     Substring_free(&substring);
10899     Junction_free(&junction);
10900 
10901     /* Update information for hit */
10902     this->trim_querystart = Substring_trim_querystart(anchor);
10903     this->mandatory_trim_querystart = Substring_mandatory_trim_querystart(anchor);
10904     this->trim_querystart_splicep = Substring_trim_querystart_splicep(anchor);
10905 
10906     this->refalt_nmatches_to_trims = this->ref_nmatches_to_trims = 0;
10907     for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
10908       substring = (Substring_T) List_head(p);
10909       this->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
10910       this->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
10911     }
10912 
10913     substringN = (Substring_T) List_head(this->substrings_Nto1);
10914     this->refalt_nmatches_plus_spliced_trims = this->refalt_nmatches_to_trims + Substring_start_amb_length(anchor) + Substring_end_amb_length(substringN);
10915     this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims + Substring_start_amb_length(anchor) + Substring_end_amb_length(substringN);
10916     for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
10917       junction = List_head(p);
10918       this->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10919       this->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10920     }
10921     this->ref_score_overall = this->querylength - this->ref_nmatches_to_trims;
10922     this->refalt_score_overall = this->querylength - this->refalt_nmatches_to_trims;
10923     this->refalt_score_within_trims = this->querylength - this->refalt_nmatches_plus_spliced_trims;
10924     if (Substring_trim_querystart_splicep(anchor) == false) {
10925       this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(anchor)/END_BINSIZE);
10926     } else {
10927       this->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(anchor)/END_BINSIZE);
10928     }
10929     if (Substring_trim_queryend_splicep(substringN) == false) {
10930       this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((this->querylength - Substring_queryend(substringN))/END_BINSIZE);
10931     } else {
10932       this->refalt_score_within_trims += SPLICED_END_PENALTY*((this->querylength - Substring_queryend(substringN))/END_BINSIZE);
10933     }
10934 
10935     if (this->chrlength < (Univcoord_T) this->querylength) {
10936       this->ref_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10937       this->refalt_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10938       this->refalt_score_within_trims -= ((Univcoord_T) this->querylength - this->chrlength);
10939     }
10940     assert(this->refalt_score_within_trims >= 0);
10941   }
10942 
10943   return;
10944 }
10945 
10946 
10947 
10948 /* Should not set ambiguous flag in substrings, because resolution of
10949    an ambiguity depends on a particular pair of ends */
10950 
10951 static void
resolve_inside_alts_splice_plus(int * alts_resolve_5,int * alts_resolve_3,int * alts_status_inside,T hit5,T hit3,int querylength5,int querylength3)10952 resolve_inside_alts_splice_plus (int *alts_resolve_5, int *alts_resolve_3,
10953 				 int *alts_status_inside, T hit5, T hit3, int querylength5, int querylength3) {
10954   Chrpos_T best_insertlength, insertlength;
10955   Univcoord_T genomicstart, genomicend;
10956   int besti5 = -1, besti3 = -1, i, j;
10957   int best_nmismatches, nmismatches;
10958 
10959   Substring_T substring5, substring3;
10960   Univcoord_T *end_alts_coords, *start_alts_coords;
10961   int *end_alts_nmismatches, *start_alts_nmismatches;
10962   int end_amb_length_5, start_amb_length_3;
10963 
10964 
10965   debug9(printf("resolve plus: hit5 %p (%s) and hit3 %p (%s)\n",
10966 		hit5,Method_string(hit5->method),hit3,Method_string(hit3->method)));
10967 
10968   substring5 = (Substring_T) List_head(hit5->substrings_Nto1); /* the substring for concordance */
10969   debug9(printf("Testing substring5 %p %d..%d alts_p %d\n",
10970 		substring5,Stage3end_substrings_querystart(hit5),Stage3end_substrings_queryend(hit5),
10971 		Substring_has_alts_p(substring5)));
10972 
10973   substring3 = (Substring_T) List_head(hit3->substrings_1toN); /* the substring for concordance (was Nto1) */
10974   debug9(printf("Testing substring3 %p %d..%d alts_p %d\n",
10975 		substring3,Stage3end_substrings_querystart(hit3),Stage3end_substrings_queryend(hit3),
10976 		Substring_has_alts_p(substring3)));
10977 
10978   if (substring5 != NULL && Substring_has_alts_p(substring5) == true &&
10979       substring3 != NULL && Substring_has_alts_p(substring3) == true) {
10980     debug9(printf("Resolve plus case 1: Got alts at 5' and alts at 3':"));
10981     end_alts_coords = Substring_alts_coords(substring5);
10982     end_alts_nmismatches = Substring_alts_nmismatches(substring5);
10983     start_alts_coords = Substring_alts_coords(substring3);
10984     start_alts_nmismatches = Substring_alts_nmismatches(substring3);
10985     end_amb_length_5 = end_amb_length(hit5);
10986     start_amb_length_3 = start_amb_length(hit3);
10987 
10988     best_insertlength = (Chrpos_T) -1;
10989     best_nmismatches = querylength5 + querylength3;
10990     for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
10991       genomicend = end_alts_coords[i] + end_amb_length_5;
10992       for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
10993 	genomicstart = start_alts_coords[j] - start_amb_length_3;
10994 	debug9(printf(" %u,%u",(Chrpos_T) (genomicend - hit5->chroffset),(Chrpos_T) (genomicstart - hit3->chroffset)));
10995 	if (genomicend < genomicstart) {
10996 	  /* Look for valid insertlength */
10997 	  insertlength = genomicstart - genomicend + querylength5 + querylength3;
10998 	  debug9(printf(" (insertlength %u)",insertlength));
10999 
11000 	  if (insertlength < best_insertlength) {
11001 	    besti5 = i;
11002 	    besti3 = j;
11003 	    best_insertlength = insertlength;
11004 	    best_nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j];
11005 	    debug9(printf("*"));
11006 	  } else if (insertlength == best_insertlength &&
11007 		     (nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j]) < best_nmismatches) {
11008 	    besti5 = i;
11009 	    besti3 = j;
11010 	    best_nmismatches = nmismatches;
11011 	    debug9(printf("*"));
11012 	  } else if (nmismatches == best_nmismatches) {
11013 	    debug9(printf("tie"));
11014 	  }
11015 	}
11016       }
11017     }
11018 
11019     if (besti5 >= 0 && besti3 >= 0) {
11020       debug9(printf("\nBEST HAS INSERTLENGTH %u AND NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11021       *alts_resolve_5 = besti5;
11022       *alts_resolve_3 = besti3;
11023       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11024       hit5->genomicend = end_alts_coords[besti5] + end_amb_length_5;
11025       hit3->genomicstart = start_alts_coords[besti3] - start_amb_length_3;
11026     }
11027     debug9(printf("\n"));
11028 
11029   } else if (substring5 != NULL && Substring_has_alts_p(substring5) == true) {
11030     debug9(printf("Resolve plus case 2: Got alts at 5':"));
11031     end_alts_coords = Substring_alts_coords(substring5);
11032     end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11033     end_amb_length_5 = end_amb_length(hit5);
11034 
11035     best_insertlength = (Chrpos_T) -1;
11036     best_nmismatches = querylength5;
11037     for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11038       genomicend = end_alts_coords[i] + end_amb_length_5;
11039       debug9(printf(" %u",(Chrpos_T) (genomicend - hit5->chroffset)));
11040       if (genomicend < hit3->genomicstart /*allow overlap*/+ querylength3) {
11041 	/* Look for valid insertlength */
11042 	insertlength = hit3->genomicstart - genomicend + querylength5 + querylength3;
11043 	debug9(printf(" (insertlength %u)",insertlength));
11044 
11045 	if (insertlength < best_insertlength) {
11046 	  besti5 = i;
11047 	  best_insertlength = insertlength;
11048 	  best_nmismatches = end_alts_nmismatches[i];
11049 	  debug9(printf("*"));
11050 	} else if (insertlength == best_insertlength &&
11051 		   (nmismatches = end_alts_nmismatches[i]) < best_nmismatches) {
11052 	  besti5 = i;
11053 	  best_nmismatches = nmismatches;
11054 	  debug9(printf("*"));
11055 	} else if (nmismatches == best_nmismatches) {
11056 	  debug9(printf("tie"));
11057 	}
11058       }
11059     }
11060 
11061     if (besti5 >= 0) {
11062       debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11063       *alts_resolve_5 = besti5;
11064       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11065       hit5->genomicend = end_alts_coords[besti5] + end_amb_length_5;
11066     }
11067     debug9(printf("\n"));
11068 
11069   } else if (substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11070     debug9(printf("Resolve plus case 3: Got alts at 3':"));
11071     start_alts_coords = Substring_alts_coords(substring3);
11072     start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11073     start_amb_length_3 = start_amb_length(hit3);
11074 
11075     best_insertlength = (Chrpos_T) -1;
11076     best_nmismatches = querylength3;
11077     for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11078       genomicstart = start_alts_coords[j] - start_amb_length_3;
11079       debug9(printf(" %u",(Chrpos_T) (genomicstart - hit3->chroffset)));
11080       if (hit5->genomicend < genomicstart /*allow overlap*/+ querylength5) {
11081 	/* Look for valid insertlength */
11082 	insertlength = genomicstart - hit5->genomicend + querylength5 + querylength3;
11083 	debug9(printf(" (insertlength %u)",insertlength));
11084 
11085 	if (insertlength < best_insertlength) {
11086 	  besti3 = j;
11087 	  best_insertlength = insertlength;
11088 	  best_nmismatches = start_alts_nmismatches[j];
11089 	  debug9(printf("*"));
11090 	} else if (insertlength == best_insertlength &&
11091 		   (nmismatches = start_alts_nmismatches[j]) < best_nmismatches) {
11092 	  besti3 = j;
11093 	  best_nmismatches = nmismatches;
11094 	  debug9(printf("*"));
11095 	} else if (nmismatches == best_nmismatches) {
11096 	  debug9(printf("tie"));
11097 	}
11098       }
11099     }
11100 
11101     if (besti3 >= 0) {
11102       debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11103       *alts_resolve_3 = besti3;
11104       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11105       hit3->genomicstart = start_alts_coords[besti3] - start_amb_length_3;
11106     }
11107     debug9(printf("\n"));
11108   }
11109 
11110   return;
11111 }
11112 
11113 
11114 static void
resolve_inside_alts_splice_minus(int * alts_resolve_5,int * alts_resolve_3,int * alts_status_inside,T hit5,T hit3,int querylength5,int querylength3)11115 resolve_inside_alts_splice_minus (int *alts_resolve_5, int *alts_resolve_3,
11116 				  int *alts_status_inside, T hit5, T hit3, int querylength5, int querylength3) {
11117   Chrpos_T best_insertlength, insertlength;
11118   Univcoord_T genomicstart, genomicend;
11119   int besti5 = -1, besti3 = -1, i, j;
11120   int best_nmismatches, nmismatches;
11121 
11122   Substring_T substring5, substring3;
11123   Univcoord_T *end_alts_coords, *start_alts_coords;
11124   int *end_alts_nmismatches, *start_alts_nmismatches;
11125   int end_amb_length_5, start_amb_length_3;
11126 
11127 
11128   debug9(printf("resolve minus: hit5 %p (%s) and hit3 %p (%s)\n",
11129 		hit5,Method_string(hit5->method),hit3,Method_string(hit3->method)));
11130 
11131   substring5 = (Substring_T) List_head(hit5->substrings_Nto1); /* the substring for concordance */
11132   debug9(printf("Testing substring5 %p %d..%d alts_p %d\n",
11133 		substring5,Stage3end_substrings_querystart(hit5),Stage3end_substrings_queryend(hit5),
11134 		Substring_has_alts_p(substring5)));
11135 
11136   substring3 = (Substring_T) List_head(hit3->substrings_1toN); /* the substring for concordance */
11137   debug9(printf("Testing substring3 %p %d..%d alts_p %d\n",
11138 		substring3,Stage3end_substrings_querystart(hit3),Stage3end_substrings_queryend(hit3),
11139 		Substring_has_alts_p(substring3)));
11140 
11141   if (substring5 != NULL && Substring_has_alts_p(substring5) == true &&
11142       substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11143     debug9(printf("Resolve minus case 1: Got alts at 5' and alts at 3':"));
11144     end_alts_coords = Substring_alts_coords(substring5);
11145     end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11146     start_alts_coords = Substring_alts_coords(substring3);
11147     start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11148     end_amb_length_5 = end_amb_length(hit5);
11149     start_amb_length_3 = start_amb_length(hit3);
11150 
11151     best_insertlength = (Chrpos_T) -1;
11152     best_nmismatches = querylength5 + querylength3;
11153     for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11154       genomicend = end_alts_coords[i] - end_amb_length_5;
11155       for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11156 	genomicstart = start_alts_coords[j] + start_amb_length_3;
11157 	debug9(printf(" %u,%u",(Chrpos_T) (genomicend - hit5->chroffset),(Chrpos_T) (genomicstart - hit3->chroffset)));
11158 	if (genomicstart < genomicend) {
11159 	  /* Look for valid insertlength */
11160 	  insertlength = genomicend - genomicstart + querylength5 + querylength3;
11161 	  debug9(printf(" (insertlength %u)",insertlength));
11162 
11163 	  if (insertlength < best_insertlength) {
11164 	    besti5 = i;
11165 	    besti3 = j;
11166 	    best_insertlength = insertlength;
11167 	    best_nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j];
11168 	    debug9(printf("*"));
11169 	  } else if (insertlength == best_insertlength &&
11170 		     (nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j]) < best_nmismatches) {
11171 	    besti5 = i;
11172 	    besti3 = j;
11173 	    best_nmismatches = nmismatches;
11174 	    debug9(printf("*"));
11175 	  } else if (nmismatches == best_nmismatches) {
11176 	    debug9(printf("tie"));
11177 	  }
11178 	}
11179       }
11180     }
11181 
11182     if (besti5 >= 0 && besti3 >= 0) {
11183       debug9(printf("\nBEST HAS INSERTLENGTH %u AND NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11184       *alts_resolve_5 = besti5;
11185       *alts_resolve_3 = besti3;
11186       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11187       hit5->genomicend = end_alts_coords[besti5] - end_amb_length_5;
11188       hit3->genomicstart = start_alts_coords[besti3] + start_amb_length_3;
11189     }
11190     debug9(printf("\n"));
11191 
11192   } else if (substring5 != NULL && Substring_has_alts_p(substring5) == true) {
11193     debug9(printf("Resolve minus case 2: Got alts at 5':"));
11194     end_alts_coords = Substring_alts_coords(substring5);
11195     end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11196     end_amb_length_5 = end_amb_length(hit5);
11197 
11198     best_insertlength = (Chrpos_T) -1;
11199     best_nmismatches = querylength5;
11200     for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11201       genomicend = end_alts_coords[i] - end_amb_length_5;
11202       debug9(printf(" %u",(Chrpos_T) (genomicend - hit5->chroffset)));
11203       debug9(printf(" (%u <? %u + %d)",hit3->genomicstart,genomicend,querylength3));
11204       if (hit3->genomicstart < genomicend /*allow overlap*/+ querylength3) {
11205 	/* Look for valid insertlength */
11206 	insertlength = genomicend - hit3->genomicstart + querylength5 + querylength3;
11207 	debug9(printf(" (insertlength %u)",insertlength));
11208 
11209 	if (insertlength < best_insertlength) {
11210 	  besti5 = i;
11211 	  best_insertlength = insertlength;
11212 	  best_nmismatches = end_alts_nmismatches[i];
11213 	  debug9(printf("*"));
11214 	} else if (insertlength == best_insertlength &&
11215 		   (nmismatches = end_alts_nmismatches[i]) < best_nmismatches) {
11216 	  besti5 = i;
11217 	  best_nmismatches = nmismatches;
11218 	  debug9(printf("*"));
11219 	} else if (nmismatches == best_nmismatches) {
11220 	  debug9(printf("tie"));
11221 	}
11222       }
11223     }
11224 
11225     if (besti5 >= 0) {
11226       debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11227       *alts_resolve_5 = besti5;
11228       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11229       hit5->genomicend = end_alts_coords[besti5] - end_amb_length_5;
11230     }
11231     debug9(printf("\n"));
11232 
11233   } else if (substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11234     debug9(printf("Resolve minus case 3: Got alts at 3':"));
11235     start_alts_coords = Substring_alts_coords(substring3);
11236     start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11237     start_amb_length_3 = start_amb_length(hit3);
11238 
11239     best_insertlength = (Chrpos_T) -1;
11240     best_nmismatches = querylength3;
11241     for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11242       genomicstart = start_alts_coords[j] + start_amb_length_3;
11243       debug9(printf(" %u",(Chrpos_T) (genomicstart - hit3->chroffset)));
11244       if (genomicstart < hit5->genomicend /*allow overlap*/+ querylength5) {
11245 	/* Look for valid insertlength */
11246 	insertlength = hit5->genomicend - genomicstart + querylength5 + querylength3;
11247 	debug9(printf(" (insertlength %u)",insertlength));
11248 
11249 	if (insertlength < best_insertlength) {
11250 	  besti3 = j;
11251 	  best_insertlength = insertlength;
11252 	  best_nmismatches = start_alts_nmismatches[j];
11253 	  debug9(printf("*"));
11254 	} else if (insertlength == best_insertlength &&
11255 		   (nmismatches = start_alts_nmismatches[j]) < best_nmismatches) {
11256 	  besti3 = j;
11257 	  best_nmismatches = nmismatches;
11258 	  debug9(printf("*"));
11259 	} else if (nmismatches == best_nmismatches) {
11260 	  debug9(printf("tie"));
11261 	}
11262       }
11263     }
11264 
11265     if (besti3 >= 0) {
11266       debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11267       *alts_resolve_3 = besti3;
11268       *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11269       hit3->genomicstart = start_alts_coords[besti3] + start_amb_length_3;
11270     }
11271     debug9(printf("\n"));
11272   }
11273 
11274   return;
11275 }
11276 
11277 
11278 
11279 static void
alias_circular(T hit)11280 alias_circular (T hit) {
11281   Chrpos_T chrlength = hit->chrlength;
11282   List_T p;
11283   Substring_T substring;
11284 
11285   assert(hit->circularalias == -1);
11286   for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
11287     substring = (Substring_T) List_head(p);
11288     Substring_alias_circular(substring);
11289   }
11290 
11291   /* Doesn't fix hitpair->low and hitpair->high */
11292   hit->genomicstart += chrlength;
11293   hit->genomicend += chrlength;
11294   hit->low += chrlength;
11295   hit->high += chrlength;
11296 
11297   hit->circularalias = +1;
11298 
11299   return;
11300 }
11301 
11302 
11303 /* Previously allowed for private5p or private3p to be true.  But now
11304    always copying (because concordance procedure can delete hits), and
11305    so private5p and private3p are essentially true. */
11306 Stage3pair_T
Stage3pair_new(T hit5_orig,T hit3_orig,int genestrand,int sensedir,Pairtype_T pairtype,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,Listpool_T listpool,bool expect_concordant_p,bool transcriptome_guided_p)11307 Stage3pair_new (T hit5_orig, T hit3_orig, int genestrand, int sensedir, Pairtype_T pairtype,
11308 		int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
11309 		Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
11310 		Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
11311 		Listpool_T listpool, bool expect_concordant_p, bool transcriptome_guided_p) {
11312   Stage3pair_T new;
11313   Stage3end_T hit5, hit3;
11314   Substring_T substring1, substringN;
11315   int alts_resolve_5, alts_resolve_3;
11316 
11317   /* int found_score = 0; */
11318   bool overreach5p, overreach3p;
11319   Chrpos_T pairmax;
11320 
11321   int querylength5 = hit5_orig->querylength;
11322   int querylength3 = hit3_orig->querylength;
11323 
11324   char *remap_sequence;
11325   int remap_seqlength;
11326   List_T transcripts;
11327 
11328 
11329   debug0(printf("\nStage3pair_new called with pairtype %s and chrnum %d, %d (effective %d, %d), expect_concordant_p %d\n",
11330 		Pairtype_string(pairtype),hit5_orig->chrnum,hit3_orig->chrnum,
11331 		hit5_orig->effective_chrnum,hit3_orig->effective_chrnum,expect_concordant_p));
11332 
11333   /* Always make a copy, because concordance procedure might delete the hit */
11334   hit5 = Stage3end_copy(hit5_orig,listpool);
11335   hit3 = Stage3end_copy(hit3_orig,listpool);
11336 
11337   new = (Stage3pair_T) MALLOC_OUT(sizeof(*new));
11338 
11339   if (pairtype == PAIRED_UNSPECIFIED || pairtype == UNSPECIFIED) {
11340     /* Can get here from running GMAP improvement on a paired result */
11341     pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/NULL);
11342     debug10(printf("  Changing pairtype to %s\n",Pairtype_string(pairtype)));
11343     if (pairtype == CONCORDANT) {
11344       expect_concordant_p = true;
11345     }
11346   }
11347   new->pairtype = pairtype;
11348   new->genestrand = genestrand;
11349   new->sensedir = sensedir;
11350 
11351   alts_resolve_5 = -1;
11352   alts_resolve_3 = -1;
11353   new->alts_status_inside = ALTS_NOT_AMBIGUOUS;
11354 
11355 
11356 #if 0
11357   new->mapq_loglik = hit5->mapq_loglik + hit3->mapq_loglik;
11358   new->mapq_score = 0;
11359   new->absmq_score = 0;
11360 #endif
11361 
11362   if (hit5->plusp == true && hit3->plusp == false) {
11363     debug10(printf("plus/minus\n"));
11364     new->dir = 0;
11365 
11366     /* Have 5-start..end and 3-end..start */
11367     /*   or 3-end..start and 5-start..end */
11368 
11369     new->pair_relationship = 0;
11370     if (hit5->genomicend < hit3->genomicend) {
11371       new->insertlength = (hit3->genomicend - hit5->genomicend) + querylength5 + querylength3;
11372       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11373     } else if (hit3->genomicstart < hit5->genomicstart) {
11374       new->insertlength = (hit5->genomicstart - hit3->genomicstart) + querylength5 + querylength3;
11375       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11376     } else {
11377       new->insertlength = pair_insert_length_unpaired(hit5,hit3); /* was 0 */
11378       new->insertlength_expected_sign = false;
11379     }
11380 
11381   } else if (hit5->plusp == false && hit3->plusp == true) {
11382     debug10(printf("minus/plus\n"));
11383     new->dir = 0;
11384 
11385     /* Have 5-end..start and 3-start..end */
11386     /*   or 3-start..end and 5-end..start */
11387 
11388     new->pair_relationship = 0;
11389     if (hit5->genomicstart < hit3->genomicstart) {
11390       new->insertlength = (hit3->genomicstart - hit5->genomicstart) + querylength5 + querylength3;
11391       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11392     } else if (hit3->genomicend < hit5->genomicend) {
11393       new->insertlength = (hit5->genomicend - hit3->genomicend) + querylength5 + querylength3;
11394       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11395     } else {
11396       new->insertlength = pair_insert_length_unpaired(hit5,hit3); /* was 0 */
11397       new->insertlength_expected_sign = false;
11398     }
11399 
11400   } else if (hit5->plusp == true) {
11401     /* Concordant directions on same chromosome (plus) */
11402     debug10(printf("*Concordant on plus strand\n"));
11403     new->dir = +1;
11404 
11405     if (expect_concordant_p == true) {
11406       overreach5p = overreach3p = false;
11407       if (hit5->hittype == SPLICE) {
11408 
11409 	substringN = (Substring_T) List_head(hit5->substrings_Nto1);
11410 	if (Substring_alignstart_trim(substringN) > hit3->genomicend) {
11411 	  substring1 = (Substring_T) List_head(hit5->substrings_1toN);
11412 	  if (Substring_alignend_trim(substring1) < hit3->genomicstart) {
11413 	    overreach5p = true;
11414 	  }
11415 	}
11416       }
11417       if (hit3->hittype == SPLICE) {
11418 	substring1 = (Substring_T) List_head(hit3->substrings_1toN);
11419 	if (Substring_alignend_trim(substring1) < hit5->genomicstart) {
11420 	  substringN = (Substring_T) List_head(hit3->substrings_Nto1);
11421 	  if (Substring_alignstart_trim(substringN) > hit5->genomicend) {
11422 	    overreach3p = true;
11423 	  }
11424 	}
11425       }
11426 
11427       if (overreach5p == true || overreach3p == true) {
11428 	/* Either overreach */
11429 	debug0(printf("  Returning NULL because of dual overreach\n"));
11430 	Stage3end_free(&hit5);	/* This was the copy */
11431 	Stage3end_free(&hit3);	/* This was the copy */
11432 	FREE_OUT(new);
11433 	return (Stage3pair_T) NULL;
11434 
11435 #if 0
11436       } else if (overreach5p == true) {
11437 	/* Overreach of hit5 */
11438 	debug9(printf("Overreach of hit5 of type SPLICE.  Removing substring2\n"));
11439 	if (hit5->sensedir == SENSE_FORWARD) {
11440 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit5->substring1),
11441 				      /*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
11442 				      /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11443 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11444 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11445 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11446 				      /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
11447 				      /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11448 	} else if (hit5->sensedir == SENSE_ANTI) {
11449 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11450 				      /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit5->substring1),/*donor*/NULL,
11451 				      /*acceptor*/hit5->substring1,/*distance*/0U,
11452 				      /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11453 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11454 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11455 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11456 				      /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
11457 				      /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11458 	} else {
11459 	  abort();
11460 	}
11461 	Stage3end_free(&hit5);	/* This was the copy */
11462 	hit5 = copy;
11463 
11464       } else if (overreach3p == true) {
11465 	/* Overreach of hit3 */
11466 	debug9(printf("Overreach of hit3 of type SPLICE.  Removing substring1\n"));
11467 	if (hit3->sensedir == SENSE_FORWARD) {
11468 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11469 				      /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit3->substring2),/*donor*/NULL,
11470 				      /*acceptor*/hit3->substring2,/*distance*/0U,
11471 				      /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11472 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11473 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11474 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11475 				      /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
11476 				      /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11477 	} else if (hit3->sensedir == SENSE_ANTI) {
11478 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit3->substring2),
11479 				      /*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
11480 				      /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11481 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11482 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11483 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11484 				      /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
11485 				      /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11486 	} else {
11487 	  abort();
11488 	}
11489 	Stage3end_free(&hit3);	/* This was the copy */
11490 	hit3 = copy;
11491 #endif
11492       }
11493 
11494       /* Try to resolve ambiguity on inside of concordant ends */
11495       debug9(printf("Calling resolve_inside_alts_splice_plus\n"));
11496       resolve_inside_alts_splice_plus(&alts_resolve_5,&alts_resolve_3,
11497 				      &new->alts_status_inside,hit5,hit3,querylength5,querylength3);
11498       if (alts_resolve_5 >= 0) {
11499 	resolve_ambiguity_5(hit5,mismatch_positions_alloc_5,query5_compress_fwd,alts_resolve_5);
11500       }
11501       if (alts_resolve_3 >= 0) {
11502 	resolve_ambiguity_3(hit3,mismatch_positions_alloc_3,query3_compress_fwd,alts_resolve_3);
11503       }
11504 
11505       debug9(printf("For pair %p (%p and %p), set alts_resolve_5 to be %d and alts_resolve_3 to be %d\n",
11506 		    new,hit5,hit3,alts_resolve_5,alts_resolve_3));
11507     }
11508 
11509     /* Have 5-start..end and 3-start..end */
11510     if (hit5->genomicend < hit3->genomicstart) {
11511       /* No overlap */
11512       new->pair_relationship = +1;
11513       new->insertlength = (hit3->genomicstart - hit5->genomicend) + querylength5 + querylength3;
11514       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11515       debug10(printf("plus, no overlap: insert length %d = start3 %u - end5 %u + %d + %d\n",
11516 		     new->insertlength,hit3->genomicstart - hit3->chroffset,
11517 		     hit5->genomicend - hit5->chroffset,querylength5,querylength3));
11518 #if 0
11519     } else if (hit5->genomicend > hit3->genomicend + SUBSUMPTION_SLOP) {
11520       /* hit5 subsumes hit3 */
11521       debug10(printf("plus, subsumption %u > %u\n",
11522 		     hit5->genomicend - hit5->chroffset,hit3->genomicend - hit3->chroffset));
11523       new->pair_relationship = 0;
11524       new->insertlength = 0;
11525       new->insertlength_expected_sign = false;
11526 #endif
11527     } else {
11528       new->insertlength = pair_insert_length(&new->pair_relationship,hit5,hit3);
11529       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11530     }
11531 
11532 
11533   } else {
11534     /* Concordant directions on same chromosome (minus) */
11535     debug10(printf("*Concordant on minus strand\n"));
11536     new->dir = -1;
11537 
11538     if (expect_concordant_p == true) {
11539       overreach5p = overreach3p = false;
11540       if (hit5->hittype == SPLICE) {
11541 	debug10(printf("Have splice on 5' end\n"));
11542 	substringN = (Substring_T) List_head(hit5->substrings_Nto1);
11543 	if (Substring_alignstart_trim(substringN) < hit3->genomicend) {
11544 	  substring1 = (Substring_T) List_head(hit5->substrings_1toN);
11545 	  if (Substring_alignend_trim(substring1) > hit3->genomicstart) {
11546 	    overreach5p = true;
11547 	  }
11548 	}
11549       }
11550       if (hit3->hittype == SPLICE) {
11551 	debug10(printf("Have splice on 3' end\n"));
11552 	substring1 = (Substring_T) List_head(hit3->substrings_1toN);
11553 	if (Substring_alignend_trim(substring1) > hit5->genomicstart) {
11554 	  substringN = (Substring_T) List_head(hit3->substrings_Nto1);
11555 	  if (Substring_alignstart_trim(substringN) < hit5->genomicend) {
11556 	    overreach3p = true;
11557 	  }
11558 	}
11559       }
11560 
11561       if (overreach5p == true || overreach3p == true) {
11562 	/* Either overreach */
11563 	debug0(printf("  Returning NULL because of dual overreach\n"));
11564 	Stage3end_free(&hit5); /* This was the copy */
11565 	Stage3end_free(&hit3); /* This was the copy */
11566 	FREE_OUT(new);
11567 	return (Stage3pair_T) NULL;
11568 
11569 #if 0
11570       } else if (overreach5p == true) {
11571 	/* Overreach of hit5 */
11572 	debug9(printf("Overreach of hit5 of type SPLICE.  Removing substring2\n"));
11573 	if (hit5->sensedir == SENSE_FORWARD) {
11574 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit5->substring1),
11575 				      /*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
11576 				      /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11577 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11578 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11579 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11580 				      /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
11581 				      /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11582 	} else if (hit5->sensedir == SENSE_ANTI) {
11583 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11584 				      /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit5->substring1),/*donor*/NULL,
11585 				      /*acceptor*/hit5->substring1,/*distance*/0U,
11586 				      /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11587 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11588 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11589 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11590 				      /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
11591 				      /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11592 	} else {
11593 	  abort();
11594 	}
11595 	Stage3end_free(&hit5);	/* This was the copy */
11596 	hit5 = copy;
11597 
11598       } else if (overreach3p == true) {
11599 	/* Overreach of hit3 */
11600 	debug9(printf("Overreach of hit3 of type SPLICE.  Removing substring1\n"));
11601 	if (hit3->sensedir == SENSE_FORWARD) {
11602 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11603 				      /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit3->substring2),/*donor*/NULL,
11604 				      /*acceptor*/hit3->substring2,/*distance*/0U,
11605 				      /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11606 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11607 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11608 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11609 				      /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
11610 				      /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11611 	} else if (hit3->sensedir == SENSE_ANTI) {
11612 	  copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit3->substring2),
11613 				      /*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
11614 				      /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11615 				      /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11616 				      /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11617 				      /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11618 				      /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
11619 				      /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11620 	} else {
11621 	  abort();
11622 	}
11623 	Stage3end_free(&hit3);	/* This was the copy */
11624 	hit3 = copy;
11625 #endif
11626       }
11627 
11628       /* Try to resolve ambiguity on inside of concordant ends */
11629       debug9(printf("Calling resolve_inside_alts_splice_minus\n"));
11630       resolve_inside_alts_splice_minus(&alts_resolve_5,&alts_resolve_3,
11631 				       &new->alts_status_inside,hit5,hit3,querylength5,querylength3);
11632       if (alts_resolve_5 >= 0) {
11633 	resolve_ambiguity_5(hit5,mismatch_positions_alloc_5,query5_compress_rev,alts_resolve_5);
11634       }
11635       if (alts_resolve_3 >= 0) {
11636 	resolve_ambiguity_3(hit3,mismatch_positions_alloc_3,query3_compress_rev,alts_resolve_3);
11637       }
11638 
11639       debug9(printf("For pair %p (%p and %p), set alts_resolve_5 to be %d and alts_resolve_3 to be %d\n",
11640 		    new,hit5,hit3,alts_resolve_5,alts_resolve_3));
11641     }
11642 
11643     /* Have 3-end..start and 5-end..start */
11644     if (hit3->genomicstart < hit5->genomicend) {
11645       /* No overlap */
11646       new->pair_relationship = -1;
11647       new->insertlength = (hit5->genomicend - hit3->genomicstart) + querylength5 + querylength3;
11648       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11649       debug10(printf("minus, no overlap: insert length %d = end5 %u - start3 %u + %d + %d\n",
11650 		     new->insertlength,hit5->genomicend - hit5->chroffset,
11651 		     hit3->genomicstart - hit3->chroffset,querylength5,querylength3));
11652 #if 0
11653     } else if (hit3->genomicstart > hit5->genomicstart + SUBSUMPTION_SLOP) {
11654       /* hit3 subsumes hit5 */
11655       debug10(printf("minus, subsumption %u > %u\n",
11656 		     hit3->genomicstart - hit3->chroffset,hit5->genomicstart - hit5->chroffset));
11657       new->pair_relationship = 0;
11658       new->insertlength = 0;
11659       new->insertlength_expected_sign = false;
11660 #endif
11661     } else {
11662       new->insertlength = pair_insert_length(&new->pair_relationship,hit5,hit3);
11663       new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11664     }
11665   }
11666 
11667   debug10(printf("\nGot initial insertlength of %d\n",new->insertlength));
11668 
11669   new->hit5 = hit5;
11670   new->hit3 = hit3;
11671 
11672   /* Was new->insertlength <= 0, but this eliminates legitimate overlaps */
11673   /* Was new->insertlength < -pairmax, but this allows overreach */
11674   if (new->insertlength <= 0) {	/* Not possible, since insertlength is unsigned */
11675     /* Not concordant */
11676 #ifdef USE_BINGO
11677     new->absdifflength_bingo_p = false;
11678 #endif
11679 #ifdef USE_ABSDIFFLENGTH
11680     new->absdifflength = (Chrpos_T) -1;
11681 #endif
11682 
11683     if (expect_concordant_p == true) {
11684       debug0(printf("  Returning NULL, because insertlength %u, so not concordant\n",new->insertlength));
11685       Stage3end_free(&hit5);	/* This was the copy */
11686       Stage3end_free(&hit3);	/* This was the copy */
11687       FREE_OUT(new);
11688       return (Stage3pair_T) NULL;
11689     }
11690 
11691   } else {
11692     if (transcriptome_guided_p == true) {
11693       pairmax = (Chrpos_T) -1;
11694     } else if (circularp[hit5->effective_chrnum] == true) {
11695       pairmax = pairmax_circular;
11696     } else {
11697       pairmax = pairmax_linear;
11698     }
11699     if (new->insertlength > pairmax && expect_concordant_p == true) {
11700       debug0(printf("  Returning NULL because insertlength %u > pairmax %d\n",new->insertlength,pairmax));
11701       Stage3end_free(&hit5);	/* This was the copy */
11702       Stage3end_free(&hit3);	/* This was the copy */
11703       FREE_OUT(new);
11704       return (Stage3pair_T) NULL;
11705 
11706     } else {
11707 #ifdef USE_ABSDIFFLENGTH
11708       if (new->insertlength < expected_pairlength) {
11709 	new->absdifflength = expected_pairlength - new->insertlength;
11710       } else {
11711 	new->absdifflength = new->insertlength - expected_pairlength;
11712       }
11713 #endif
11714 #ifdef USE_BINGO
11715       if (new->absdifflength <= pairlength_deviation) {
11716 	new->absdifflength_bingo_p = true;
11717       } else {
11718 	new->absdifflength_bingo_p = false;
11719       }
11720 #endif
11721     }
11722   }
11723 
11724   if (SENSE_CONSISTENT_P(hit5->sensedir_for_concordance,hit3->sensedir_for_concordance)) {
11725     debug0(printf("senses %d and %d are consistent\n",hit5->sensedir_for_concordance,hit3->sensedir_for_concordance));
11726     new->sense_consistent_p = true;
11727 
11728   } else if (expect_concordant_p == true) {
11729     debug0(printf("  Returning NULL, because senses are not consistent\n"));
11730     Stage3end_free(&hit5); 	/* This was the copy */
11731     Stage3end_free(&hit3);	/* This was the copy */
11732     FREE_OUT(new);
11733     return (Stage3pair_T) NULL;
11734 
11735   } else {
11736     debug0(printf("senses are inconsistent, but allowable\n"));
11737     new->sense_consistent_p = false;
11738   }
11739 
11740   /* No longer add scores from hit5 and hit3 */
11741 
11742   /* new->overlap_known_gene_p = false; -- initialized later when resolving multimappers */
11743   /* new->tally = -1L; */
11744 
11745   new->low = (hit5->low < hit3->low) ? hit5->low : hit3->low;
11746   new->high = (hit5->high > hit3->high) ? hit5->high : hit3->high;
11747   debug0(printf("hit5 %u..%u and hit3 %u..%u => %u..%u\n",
11748 		hit5->low,hit5->high,hit3->low,hit3->high,new->low,new->high));
11749 
11750 #if 0
11751   if (new->low > new->high) {
11752     fprintf(stderr,"new->low %u > new->high %u, hit5->chrnum %d\n",
11753 	    new->low - new->chroffset,new->high - new->chroffset,hit5->chrnum);
11754     abort();
11755   }
11756 #endif
11757 
11758   if (hit5->chrnum == 0 || hit3->chrnum == 0) {
11759     new->outerlength = querylength5 + querylength3;
11760   } else {
11761     assert(new->low < new->high);
11762     new->outerlength = new->high - new->low;
11763   }
11764 
11765   if (expect_concordant_p == true) {
11766     hit5_orig->paired_usedp = hit5->paired_usedp = true;
11767     hit3_orig->paired_usedp = hit3->paired_usedp = true;
11768   }
11769 
11770   new->nsplices = hit5->nsplices + hit3->nsplices;
11771 
11772   debug0(printf("Created new pair %p from %p and %p (nmatches_to_trims %d+%d)\n",
11773 		new,hit5,hit3,hit5->refalt_nmatches_to_trims,hit3->refalt_nmatches_to_trims));
11774   debug0(printf("  methods %s and %s\n",Method_string(hit5->method),Method_string(hit3->method)));
11775   debug0(printf("  sensedirs %d and %d\n",hit5->sensedir,hit3->sensedir));
11776   debug0(printf("  chrpos_1toN %u..%u and %u..%u\n",
11777 		hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
11778 		hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset));
11779   debug0(printf("  chrpos_LtoH %u..%u and %u..%u\n",
11780 		hit5->low - hit5->chroffset,hit5->high - hit5->chroffset,
11781 		hit3->low - hit3->chroffset,hit3->high - hit3->chroffset));
11782   debug0(printf("  outerlength %u = %u - %u\n",new->outerlength,new->high,new->low));
11783 
11784   if (hit5->circularpos < 0 && hit3->circularpos < 0) {
11785     new->circularp = false;
11786   } else {
11787     new->circularp = true;
11788   }
11789 
11790   /* Fixing insertlength for circular pairs */
11791   if (new->insertlength > hit5->chrlength) {
11792     new->insertlength -= hit5->chrlength;
11793   }
11794 
11795   if (hit5->circularalias == +1) {
11796     debug0(printf("Unaliasing 5' end\n"));
11797     unalias_circular(hit5);
11798   }
11799 
11800   if (hit3->circularalias == +1) {
11801     debug0(printf("Unaliasing 3' end\n"));
11802     unalias_circular(hit3);
11803   }
11804 
11805   if (remap_transcriptome_p == false) {
11806     /* Do not remap */
11807 
11808   } else if (hit5->transcripts != NULL && hit3->transcripts != NULL) {
11809     /* No need to remap */
11810 
11811   } else if (hit5->transcripts != NULL && hit3->transcripts == NULL) {
11812     debug0(printf("Remapping 3' end to transcriptome to match 5' end at %d:%u..%u\n",
11813 		  hit5->chrnum,hit5->low - hit5->chroffset,hit5->high - hit5->chroffset));
11814     remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit3,genomecomp);
11815     debug0(printf("%s\n",remap_sequence));
11816 
11817     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,hit3->chrnum,
11818 						/*lowbound*/hit3->low - hit3->chroffset,
11819 						/*highbound*/hit3->high - hit3->chroffset,
11820 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
11821       hit3->transcripts = transcripts;
11822     }
11823     FREE(remap_sequence);
11824 
11825   } else if (hit5->transcripts == NULL && hit3->transcripts != NULL) {
11826     debug0(printf("Remapping 5' end to transcriptome to match 3' end at %d:%u..%u\n",
11827 		  hit3->chrnum,hit3->low - hit3->chroffset,hit3->high - hit3->chroffset));
11828 
11829     remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit5,genomecomp);
11830     debug0(printf("%s\n",remap_sequence));
11831     if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,hit5->chrnum,
11832 						/*lowbound*/hit5->low - hit5->chroffset,
11833 						/*highbound*/hit5->high - hit5->chroffset,
11834 						transcript_iit,transcriptomebits,transcriptome)) != NULL) {
11835       hit5->transcripts = transcripts;
11836     }
11837     FREE(remap_sequence);
11838   }
11839 
11840 #if 0
11841   /* Need this in addition to Stage3end_filter_concordant_tr, to
11842      eliminate any inconsistent transcripts */
11843   Transcript_concordance(&new->transcripts5,&new->transcripts3,hit5->transcripts,hit3->transcripts);
11844   debug0(printf("%d transcripts5, %d transcripts3\n",List_length(new->transcripts5),List_length(new->transcripts3)));
11845 #endif
11846 
11847   pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/new);
11848 
11849   /* assert((int) new->insertlength >= 0); */
11850   return new;
11851 }
11852 
11853 
11854 /* Used for eliminating exact duplicates.  Also sorts secondarily by hittype. */
11855 static int
hitpair_sort_cmp(const void * a,const void * b)11856 hitpair_sort_cmp (const void *a, const void *b) {
11857   Stage3pair_T x = * (Stage3pair_T *) a;
11858   Stage3pair_T y = * (Stage3pair_T *) b;
11859 
11860   Univcoord_T x_hit5_high, x_hit5_low, y_hit5_high, y_hit5_low;
11861   Univcoord_T x_hit3_high, x_hit3_low, y_hit3_high, y_hit3_low;
11862   Univcoord_T x_low, x_high, y_low, y_high;
11863 
11864   debug8(printf("  Comparing (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d-%d, score %f+%f\n",
11865 		Pairtype_string(x->pairtype),Method_string(x->hit5->method),
11866 		Method_string(x->hit3->method),x,
11867 		x->hit5->low - x->hit5->chroffset,x->hit5->high - x->hit5->chroffset,
11868 		x->hit3->low - x->hit3->chroffset,x->hit3->high - x->hit3->chroffset,
11869 		x->dir,x->hit5->circularalias,x->hit3->circularalias,
11870 		x->hit5->refalt_nmatches_plus_spliced_trims,x->hit3->refalt_nmatches_plus_spliced_trims,
11871 		x->hit5->refalt_nmatches_to_trims,x->hit3->refalt_nmatches_to_trims,
11872 		amb_length(x->hit5),amb_length(x->hit3),x->hit5->sensedir,x->hit3->sensedir,
11873 		x->hit5->splice_score,x->hit3->splice_score));
11874 
11875   debug8(printf("       with (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d-%d, score %f+%f\n",
11876 		Pairtype_string(y->pairtype),Method_string(y->hit5->method),
11877 		Method_string(y->hit3->method),y,
11878 		y->hit5->low - y->hit5->chroffset,y->hit5->high - y->hit5->chroffset,
11879 		y->hit3->low - y->hit3->chroffset,y->hit3->high - y->hit3->chroffset,
11880 		y->dir,y->hit5->circularalias,y->hit3->circularalias,
11881 		y->hit5->refalt_nmatches_plus_spliced_trims,y->hit3->refalt_nmatches_plus_spliced_trims,
11882 		y->hit5->refalt_nmatches_to_trims,y->hit3->refalt_nmatches_to_trims,
11883 		amb_length(y->hit5),amb_length(y->hit3),y->hit5->sensedir,y->hit3->sensedir,
11884 		y->hit5->splice_score,y->hit3->splice_score));
11885 
11886   x_hit5_low = normalize_coord(x->hit5->low,x->hit5->circularalias,x->hit5->chrlength);
11887   x_hit5_high = normalize_coord(x->hit5->high,x->hit5->circularalias,x->hit5->chrlength);
11888 
11889   x_hit3_low = normalize_coord(x->hit3->low,x->hit3->circularalias,x->hit3->chrlength);
11890   x_hit3_high = normalize_coord(x->hit3->high,x->hit3->circularalias,x->hit3->chrlength);
11891 
11892   x_low = (x_hit5_low < x_hit3_low) ? x_hit5_low : x_hit3_low;
11893   x_high = (x_hit5_high > x_hit3_high) ? x_hit5_high : x_hit3_high;
11894 
11895 
11896   y_hit5_low = normalize_coord(y->hit5->low,y->hit5->circularalias,y->hit5->chrlength);
11897   y_hit5_high = normalize_coord(y->hit5->high,y->hit5->circularalias,y->hit5->chrlength);
11898 
11899   y_hit3_low = normalize_coord(y->hit3->low,y->hit3->circularalias,y->hit3->chrlength);
11900   y_hit3_high = normalize_coord(y->hit3->high,y->hit3->circularalias,y->hit3->chrlength);
11901 
11902   y_low = (y_hit5_low < y_hit3_low) ? y_hit5_low : y_hit3_low;
11903   y_high = (y_hit5_high > y_hit3_high) ? y_hit5_high : y_hit3_high;
11904 
11905 
11906   if (x->dir != 0 && y->dir == 0) {
11907     return -1;
11908   } else if (x->dir == 0 && y->dir != 0) {
11909     return +1;
11910   } else if (x->dir > 0 && y->dir < 0) {
11911     return -1;
11912   } else if (x->dir < 0 && y->dir > 0) {
11913     return +1;
11914 
11915     /* low to high pattern needed for finding overlaps */
11916   } else if (x_low < y_low) {
11917     debug8(printf("Returning -1 for low\n"));
11918     return -1;
11919   } else if (y_low < x_low) {
11920     debug8(printf("Returning +1 for low\n"));
11921     return +1;
11922 
11923   } else if (x_high > y_high) {
11924     debug8(printf("Returning -1 for high\n"));
11925     return -1;
11926   } else if (y_high > x_high) {
11927     debug8(printf("Returning +1 for high\n"));
11928     return +1;
11929 
11930     /* Need to check inside ends to avoid declaring unequal hitpairs equal */
11931   } else if (x_hit5_low < y_hit5_low) {
11932     return -1;
11933   } else if (y_hit5_low < x_hit5_low) {
11934     return +1;
11935 
11936   } else if (x_hit5_high < y_hit5_high) {
11937     return -1;
11938   } else if (y_hit5_high < x_hit5_high) {
11939     return +1;
11940 
11941   } else if (x_hit3_low < y_hit3_low) {
11942     return -1;
11943   } else if (y_hit3_low < x_hit3_low) {
11944     return +1;
11945 
11946   } else if (x_hit3_high < y_hit3_high) {
11947     return -1;
11948   } else if (y_hit3_high < x_hit3_high) {
11949     return +1;
11950 
11951 
11952   } else if (x->hit5->refalt_score_within_trims +
11953 	     x->hit3->refalt_score_within_trims <
11954 	     y->hit5->refalt_score_within_trims +
11955 	     y->hit3->refalt_score_within_trims) {
11956     return -1;
11957   } else if (y->hit5->refalt_score_within_trims +
11958 	     y->hit3->refalt_score_within_trims <
11959 	     x->hit5->refalt_score_within_trims +
11960 	     x->hit3->refalt_score_within_trims) {
11961     return +1;
11962   } else if (x->hit5->refalt_nmatches_plus_spliced_trims +
11963 	     x->hit3->refalt_nmatches_plus_spliced_trims >
11964 	     y->hit5->refalt_nmatches_plus_spliced_trims +
11965 	     y->hit3->refalt_nmatches_plus_spliced_trims) {
11966     return -1;
11967   } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
11968 	     y->hit3->refalt_nmatches_plus_spliced_trims >
11969 	     x->hit5->refalt_nmatches_plus_spliced_trims +
11970 	     x->hit3->refalt_nmatches_plus_spliced_trims) {
11971     return +1;
11972   } else if (x->hit5->ref_nmatches_plus_spliced_trims +
11973 	     x->hit3->ref_nmatches_plus_spliced_trims >
11974 	     y->hit5->ref_nmatches_plus_spliced_trims +
11975 	     y->hit3->ref_nmatches_plus_spliced_trims) {
11976     return -1;
11977   } else if (y->hit5->ref_nmatches_plus_spliced_trims +
11978 	     y->hit3->ref_nmatches_plus_spliced_trims >
11979 	     x->hit5->ref_nmatches_plus_spliced_trims +
11980 	     x->hit3->ref_nmatches_plus_spliced_trims) {
11981     return +1;
11982 
11983   } else if (x->alts_status_inside < y->alts_status_inside) {
11984     return -1;
11985   } else if (y->alts_status_inside < x->alts_status_inside) {
11986     return +1;
11987 
11988   } else if (x->sense_consistent_p == true && y->sense_consistent_p == false) {
11989     debug8(printf(" => loses by sense_consistent_p\n"));
11990     return -1;
11991   } else if (x->sense_consistent_p == false && y->sense_consistent_p == true) {
11992     debug8(printf(" => wins by sense_consistent_p\n"));
11993     return +1;
11994 
11995   } else if (x->hit5->splice_score + x->hit3->splice_score >
11996 	     y->hit5->splice_score + y->hit3->splice_score) {
11997     debug8(printf(" => loses by splice score\n"));
11998     return -1;
11999 
12000   } else if (y->hit5->splice_score + y->hit3->splice_score >
12001 	     x->hit5->splice_score + x->hit3->splice_score) {
12002     debug8(printf(" => wins by splice score\n"));
12003     return +1;
12004 
12005   } else {
12006     debug8(printf(" => identical for sorting purposes\n"));
12007     return 0;
12008   }
12009 }
12010 
12011 
#if 0
/* Disabled code (excluded by the enclosing #if 0).  Three-way
   comparison of two pairs passed directly rather than through
   qsort-style pointers.  Relative to hitpair_sort_cmp, this version
   orders the normalized pair high ascending instead of descending and
   does not consult refalt_score_within_trims or splice_score.  The
   inner #if 0 sections are retired tie-breakers kept for reference. */
static int
hitpair_equiv_cmp (Stage3pair_T x, Stage3pair_T y) {
  Univcoord_T x5_low, x5_high, x3_low, x3_high, x_outer_low, x_outer_high;
  Univcoord_T y5_low, y5_high, y3_low, y3_high, y_outer_low, y_outer_high;

  /* Normalized extents of x: each end, then the span covering both ends */
  x5_low = normalize_coord(x->hit5->low,x->hit5->circularalias,x->hit5->chrlength);
  x5_high = normalize_coord(x->hit5->high,x->hit5->circularalias,x->hit5->chrlength);
  x3_low = normalize_coord(x->hit3->low,x->hit3->circularalias,x->hit3->chrlength);
  x3_high = normalize_coord(x->hit3->high,x->hit3->circularalias,x->hit3->chrlength);

  x_outer_low = x3_low;
  if (x5_low < x3_low) {
    x_outer_low = x5_low;
  }
  x_outer_high = x3_high;
  if (x5_high > x3_high) {
    x_outer_high = x5_high;
  }

  /* Same for y */
  y5_low = normalize_coord(y->hit5->low,y->hit5->circularalias,y->hit5->chrlength);
  y5_high = normalize_coord(y->hit5->high,y->hit5->circularalias,y->hit5->chrlength);
  y3_low = normalize_coord(y->hit3->low,y->hit3->circularalias,y->hit3->chrlength);
  y3_high = normalize_coord(y->hit3->high,y->hit3->circularalias,y->hit3->chrlength);

  y_outer_low = y3_low;
  if (y5_low < y3_low) {
    y_outer_low = y5_low;
  }
  y_outer_high = y3_high;
  if (y5_high > y3_high) {
    y_outer_high = y5_high;
  }

  /* Direction: nonzero before zero, then positive before negative */
  if (x->dir != 0 && y->dir == 0) {
    return -1;
  }
  if (x->dir == 0 && y->dir != 0) {
    return +1;
  }
  if (x->dir > 0 && y->dir < 0) {
    return -1;
  }
  if (x->dir < 0 && y->dir > 0) {
    return +1;
  }

  /* Span of the whole pair, both bounds ascending */
  if (x_outer_low < y_outer_low) {
    return -1;
  }
  if (y_outer_low < x_outer_low) {
    return +1;
  }
  if (x_outer_high < y_outer_high) {
    return -1;
  }
  if (y_outer_high < x_outer_high) {
    return +1;
  }

  /* 5' end */
  if (x5_low < y5_low) {
    return -1;
  }
  if (y5_low < x5_low) {
    return +1;
  }
  if (x5_high < y5_high) {
    return -1;
  }
  if (y5_high < x5_high) {
    return +1;
  }

  /* 3' end */
  if (x3_low < y3_low) {
    return -1;
  }
  if (y3_low < x3_low) {
    return +1;
  }
  if (x3_high < y3_high) {
    return -1;
  }
  if (y3_high < x3_high) {
    return +1;
  }

  /* More combined matches first, refalt counts before ref counts */
  if (x->hit5->refalt_nmatches_plus_spliced_trims + x->hit3->refalt_nmatches_plus_spliced_trims >
      y->hit5->refalt_nmatches_plus_spliced_trims + y->hit3->refalt_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->hit5->refalt_nmatches_plus_spliced_trims + y->hit3->refalt_nmatches_plus_spliced_trims >
      x->hit5->refalt_nmatches_plus_spliced_trims + x->hit3->refalt_nmatches_plus_spliced_trims) {
    return +1;
  }
  if (x->hit5->ref_nmatches_plus_spliced_trims + x->hit3->ref_nmatches_plus_spliced_trims >
      y->hit5->ref_nmatches_plus_spliced_trims + y->hit3->ref_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->hit5->ref_nmatches_plus_spliced_trims + y->hit3->ref_nmatches_plus_spliced_trims >
      x->hit5->ref_nmatches_plus_spliced_trims + x->hit3->ref_nmatches_plus_spliced_trims) {
    return +1;
  }

#if 0
  /* Causes hits to not be recognized as equivalent */
  if (x->nsplices < y->nsplices) {
    return -1;
  }
  if (y->nsplices < x->nsplices) {
    return +1;
  }
#endif

  /* Lower alts_status_inside first */
  if (x->alts_status_inside < y->alts_status_inside) {
    return -1;
  }
  if (y->alts_status_inside < x->alts_status_inside) {
    return +1;
  }

#if 0
  if (x->hit5->start_amb_length + x->hit5->end_amb_length +
      x->hit3->start_amb_length + x->hit3->end_amb_length > 0 &&
      y->hit5->start_amb_length + y->hit5->end_amb_length +
      y->hit3->start_amb_length + y->hit3->end_amb_length == 0) {
    return -1;
  }
  if (y->hit5->start_amb_length + y->hit5->end_amb_length +
      y->hit3->start_amb_length + y->hit3->end_amb_length > 0 &&
      x->hit5->start_amb_length + x->hit5->end_amb_length +
      x->hit3->start_amb_length + x->hit3->end_amb_length == 0) {
    return +1;
  }
#endif

  /* Sense-consistent pairs first */
  if (x->sense_consistent_p == true && y->sense_consistent_p == false) {
    return -1;
  }
  if (x->sense_consistent_p == false && y->sense_consistent_p == true) {
    return +1;
  }

#if 0
  if (x->indel_low < y->indel_low) {
    return -1;
  }
  if (y->indel_low < x->indel_low) {
    return +1;
  }
#endif

#if 0
  if (x->sense_consistent_p == true) {
    /* Used for sorting, but not equiv */
    if ((x->hit5->sensedir_for_concordance != 0 || x->hit3->sensedir_for_concordance != 0) &&
        (y->hit5->sensedir_for_concordance == 0 && y->hit3->sensedir_for_concordance == 0)) {
      return -1;
    }
    if ((y->hit5->sensedir_for_concordance != 0 || y->hit3->sensedir_for_concordance != 0) &&
        (x->hit5->sensedir_for_concordance == 0 && x->hit3->sensedir_for_concordance == 0)) {
      return +1;
    }
    return 0;
  }
#endif

#if 0
  if (x->hit5->sensedir_for_concordance == y->hit5->sensedir_for_concordance &&
      x->hit3->sensedir_for_concordance == y->hit3->sensedir_for_concordance) {
    return 0;
  }
  if (x->hit5->sensedir_for_concordance > y->hit5->sensedir_for_concordance) {
    return +1;
  }
  if (y->hit5->sensedir_for_concordance > x->hit5->sensedir_for_concordance) {
    return -1;
  }
  if (x->hit3->sensedir_for_concordance > y->hit3->sensedir_for_concordance) {
    return +1;
  }
  if (y->hit3->sensedir_for_concordance > x->hit3->sensedir_for_concordance) {
    return -1;
  }
#endif

  return 0;
}
#endif
12170 
12171 
12172 static int
hitpair_position_cmp(const void * a,const void * b)12173 hitpair_position_cmp (const void *a, const void *b) {
12174   Stage3pair_T x = * (Stage3pair_T *) a;
12175   Stage3pair_T y = * (Stage3pair_T *) b;
12176 
12177   if (x->dir < y->dir) {
12178     return -1;
12179   } else if (y->dir < x->dir) {
12180     return +1;
12181   } else if (x->sensedir < y->sensedir) {
12182     return -1;
12183   } else if (y->sensedir < x->sensedir) {
12184     return +1;
12185   } else if (x->low < y->low) {
12186     return -1;
12187   } else if (y->low < x->low) {
12188     return +1;
12189   } else if (x->high > y->high) {
12190     return -1;
12191   } else if (y->high > x->high) {
12192     return +1;
12193   } else {
12194     return 0;
12195   }
12196 }
12197 
12198 
12199 static bool
hitpair_equal(Stage3pair_T x,Stage3pair_T y)12200 hitpair_equal (Stage3pair_T x, Stage3pair_T y) {
12201   List_T p, q;
12202   Substring_T substring_x, substring_y;
12203 
12204   if (x->dir != y->dir) {
12205     return false;		/* Different strands */
12206   } else {
12207     p = x->hit5->substrings_1toN;
12208     q = y->hit5->substrings_1toN;
12209     while (p != NULL && q != NULL) {
12210       substring_x = (Substring_T) p->first;
12211       substring_y = (Substring_T) q->first;
12212       if (Substring_equal(substring_x,substring_y) == false) {
12213 	return false;
12214       }
12215       p = List_next(p);
12216       q = List_next(q);
12217     }
12218     if (p != NULL || q != NULL) {
12219       return false;
12220     }
12221 
12222     p = x->hit3->substrings_1toN;
12223     q = y->hit3->substrings_1toN;
12224     while (p != NULL && q != NULL) {
12225       substring_x = (Substring_T) p->first;
12226       substring_y = (Substring_T) q->first;
12227       if (Substring_equal(substring_x,substring_y) == false) {
12228 	return false;
12229       }
12230       p = List_next(p);
12231       q = List_next(q);
12232     }
12233     if (p != NULL || q != NULL) {
12234       return false;
12235     }
12236 
12237     return true;
12238   }
12239 }
12240 
12241 
12242 static bool
hitpair_overlap_p(Stage3pair_T x,Stage3pair_T y)12243 hitpair_overlap_p (Stage3pair_T x, Stage3pair_T y) {
12244   /* printf("Checking for overlap of %u..%u and %u..%u ",x->low,x->high,y->low,y->high); */
12245   if (x->hit5->chrnum != y->hit5->chrnum) {
12246     /* printf("=> false\n"); */
12247     return false;		/* Different chrnums */
12248   } else if (x->hit3->chrnum != y->hit3->chrnum) {
12249     return false;		/* Different chrnums */
12250   } else if (x->dir != y->dir) {
12251     /* printf("=> false\n"); */
12252     return false;		/* Different strands */
12253   } else if (x->high < y->low) {
12254     /* printf("=> false\n"); */
12255     return false;
12256   } else if (x->low > y->high) {
12257     /* printf("=> false\n"); */
12258     return false;
12259   } else {
12260     /* printf("=> true\n"); */
12261     return true;
12262   }
12263 }
12264 
12265 
12266 static bool
hitpair_subsumption(Stage3pair_T x,Stage3pair_T y)12267 hitpair_subsumption (Stage3pair_T x, Stage3pair_T y) {
12268   if (x->dir != y->dir) {
12269     return false;		/* Different strands */
12270 
12271   } else if (x->sensedir != y->sensedir) {
12272     return false;
12273 
12274   } else if (x->low <= y->low && x->high >= y->high) {
12275     return true;
12276   } else if (y->low <= x->low && y->high >= x->high) {
12277     return true;
12278 
12279     /* Test each end of the pair.  Example: 1586..1512 and 1400..1468 should subsume 1586..1512 and 1564..1617 */
12280   } else if (x->hit5->low <= y->hit5->low && x->hit5->high >= y->hit5->high) {
12281     return true;
12282   } else if (y->hit5->low <= x->hit5->low && y->hit5->high >= x->hit5->high) {
12283     return true;
12284 
12285   } else if (x->hit3->low <= y->hit3->low && x->hit3->high >= y->hit3->high) {
12286     return true;
12287   } else if (y->hit3->low <= x->hit3->low && y->hit3->high >= x->hit3->high) {
12288     return true;
12289 
12290   } else {
12291     return false;
12292   }
12293 }
12294 
12295 
12296 static int
pair_matches_cmp(const void * a,const void * b)12297 pair_matches_cmp (const void *a, const void *b) {
12298   Stage3pair_T x = * (Stage3pair_T *) a;
12299   Stage3pair_T y = * (Stage3pair_T *) b;
12300 
12301   if (x->hit5->refalt_nmatches_plus_spliced_trims +
12302       x->hit3->refalt_nmatches_plus_spliced_trims >
12303       y->hit5->refalt_nmatches_plus_spliced_trims +
12304       y->hit3->refalt_nmatches_plus_spliced_trims) {
12305     return -1;
12306   } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
12307 	     y->hit3->refalt_nmatches_plus_spliced_trims >
12308 	     x->hit5->refalt_nmatches_plus_spliced_trims +
12309 	     x->hit3->refalt_nmatches_plus_spliced_trims) {
12310     return +1;
12311   } else if (x->hit5->ref_nmatches_plus_spliced_trims +
12312 	     x->hit3->ref_nmatches_plus_spliced_trims >
12313 	     y->hit5->ref_nmatches_plus_spliced_trims +
12314 	     y->hit3->ref_nmatches_plus_spliced_trims) {
12315     return -1;
12316   } else if (y->hit5->ref_nmatches_plus_spliced_trims +
12317 	     y->hit3->ref_nmatches_plus_spliced_trims >
12318 	     x->hit5->ref_nmatches_plus_spliced_trims +
12319 	     x->hit3->ref_nmatches_plus_spliced_trims) {
12320     return +1;
12321   } else {
12322     return 0;
12323   }
12324 }
12325 
12326 List_T
Stage3pair_sort_bymatches(List_T hits,Hitlistpool_T hitlistpool)12327 Stage3pair_sort_bymatches (List_T hits, Hitlistpool_T hitlistpool) {
12328   List_T sorted = NULL;
12329   Stage3pair_T *array;
12330   int n, i;
12331 
12332 
12333   if ((n = List_length(hits)) == 0) {
12334     return (List_T) NULL;
12335   } else {
12336 #ifdef USE_ALLOCA_FOR_HITS
12337     array = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
12338     List_fill_array((void **) array,hits);
12339     Hitlist_free(&hits);
12340 #else
12341     array = (Stage3pair_T *) List_to_array(hits,NULL);
12342     Hitlist_free(&hits);
12343 #endif
12344 
12345     qsort(array,n,sizeof(Stage3pair_T),pair_matches_cmp);
12346     for (i = n-1; i >= 0; i--) {
12347       sorted = Hitlist_push(sorted,hitlistpool,(void *) array[i]);
12348     }
12349 #ifdef USE_ALLOCA_FOR_HITS
12350     FREEA(array);
12351 #else
12352     FREE(array);
12353 #endif
12354 
12355     return sorted;
12356   }
12357 }
12358 
12359 
12360 
#if 0
/* Disabled code.  Removes exact duplicates from hitpairlist: the pairs
   are sorted with hitpair_sort_cmp, then every pair that hitpair_equal
   reports as equal to the first pair of its run is freed.  The input
   list cells are released and the survivors are returned in a new list.

   NOTE(review): "hitlistpool" is used in the Hitlist_push call below but
   is neither a parameter nor a local of this function.  Unless a
   file-scope variable of that name exists, re-enabling this code
   requires passing a Hitlistpool_T in -- confirm before enabling. */
List_T
Stage3pair_remove_duplicates_exact (List_T hitpairlist) {
  List_T unique = NULL;
  Stage3pair_T hitpair, *hitpairs;
  int n, i, j;
  bool *eliminate;

  /* n must be assigned before the entry message prints it */
  n = List_length(hitpairlist);
  debug8(printf("Entered Stage3pair_remove_duplicates_exact with %d pairs\n",n));
  if (n == 0) {
    return NULL;
  } else {
#ifdef USE_ALLOCA_FOR_HITS
    eliminate = (bool *) CALLOCA(n,sizeof(bool));
    hitpairs = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
    List_fill_array((void **) hitpairs,hitpairlist);
    Hitlist_free(&hitpairlist);
#else
    eliminate = (bool *) CALLOC(n,sizeof(bool));
    hitpairs = (Stage3pair_T *) List_to_array(hitpairlist,NULL);
    Hitlist_free(&hitpairlist);
#endif
  }

  debug8(printf("Checking for exact duplicates\n"));
  qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);

  debug8(
	 for (i = 0; i < n; i++) {
	   hitpair = hitpairs[i];
	   printf("  Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d (%d to_trims)\n",
		  i,Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
		  Method_string(hitpair->hit3->method),hitpair,
		  hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
		  hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
		  hitpair->dir,hitpair->hit5->circularalias,hitpair->hit3->circularalias,
		  hitpair->nmatches_plus_spliced_trims,hitpair->nmatches_to_trims);
	 }
	 );

  /* Within each run of consecutive pairs equal to hitpairs[i], keep
     hitpairs[i] and mark the rest for elimination */
  i = 0;
  while (i < n) {
    j = i+1;
    while (j < n && hitpair_equal(hitpairs[j],hitpairs[i]) == true) {
      debug8(printf("  %d is identical to %d => eliminating\n",j,i));
      eliminate[j] = true;
      j++;
    }
    i = j;
  }

  /* Walk backward while pushing survivors; free the marked pairs */
  for (i = n-1; i >= 0; i--) {
    hitpair = hitpairs[i];
    if (eliminate[i] == false) {
      unique = Hitlist_push(unique,hitlistpool,(void *) hitpair);
    } else {
      Stage3pair_free(&hitpair);
    }
  }

#ifdef USE_ALLOCA_FOR_HITS
  FREEA(hitpairs);
  FREEA(eliminate);
#else
  FREE(hitpairs);
  FREE(eliminate);
#endif

  debug8(printf("Exited Stage3pair_remove_duplicates_exact with %d pairs\n",List_length(unique)));
  return unique;
}
#endif
12433 
12434 
12435 static int
hitpair_goodness_cmp(bool * equalp,Stage3pair_T hitpair,Stage3pair_T best_hitpair,bool finalp)12436 hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
12437 		      Stage3pair_T best_hitpair, bool finalp) {
12438   double prob1, prob2;
12439   /* Chrpos_T total_querylength, best_total_querylength; */
12440   double zscore, best_zscore;
12441 
12442 #if 0
12443   int hitpair_nmatches, best_hitpair_nmatches;
12444   int max_trim_querystart, max_trim_queryend;
12445   Stage3end_T hit5, besthit5, hit3, besthit3;
12446 
12447   if (hitpair->absdifflength_bingo_p < best_hitpair->absdifflength_bingo_p) {
12448     /* k is worse */
12449     debug8(printf(" => loses by absdifflength (bingo)\n"));
12450     return -1;
12451   } else if (hitpair->absdifflength_bingo_p > best_hitpair->absdifflength_bingo_p) {
12452     /* k is better */
12453     debug8(printf(" => wins by absdifflength (bingo)\n"));
12454     return +1;
12455   }
12456 #endif
12457 
12458 #ifdef PRE_RESOLVE_MULTIMAPPING
12459   if (TALLY_RATIO*Stage3pair_tally(hitpair) < Stage3pair_tally(best_hitpair)) {
12460     /* k is worse */
12461     debug8(printf(" => loses by tally\n"));
12462     return -1;
12463   } else if (Stage3pair_tally(hitpair) > TALLY_RATIO*Stage3pair_tally(best_hitpair)) {
12464     /* k is better */
12465     debug8(printf(" => wins by tally\n"));
12466     return +1;
12467   }
12468 #endif
12469 
12470   *equalp = false;
12471 
12472 #if 0
12473   /* Don't want to use nmatches_to_trims */
12474   /* Previously, we favored ambiguous splices over definitive ones, but
12475      now that we are generating Stage3end_T objects with and without the
12476      end exons, we prefer definitive splices */
12477   if (known_ambiguous_p(hitpair->hit5) == true && known_ambiguous_p(best_hitpair->hit5) == false &&
12478       known_ambiguous_p(hitpair->hit3) == known_ambiguous_p(best_hitpair->hit3) &&
12479       hitpair->insertlength <= best_hitpair->insertlength) {
12480     debug8(printf("Case 1\n"));
12481     return -1;
12482 
12483   } else if (known_ambiguous_p(hitpair->hit5) == false && known_ambiguous_p(best_hitpair->hit5) == true &&
12484 	     known_ambiguous_p(hitpair->hit3) == known_ambiguous_p(best_hitpair->hit3) &&
12485 	     hitpair->insertlength >= best_hitpair->insertlength) {
12486     debug8(printf("Case 2\n"));
12487     return +1;
12488 
12489   } else if (known_ambiguous_p(hitpair->hit3) == true && known_ambiguous_p(best_hitpair->hit3) == false &&
12490 	     known_ambiguous_p(hitpair->hit5) == known_ambiguous_p(best_hitpair->hit5) &&
12491 	     hitpair->insertlength <= best_hitpair->insertlength) {
12492     debug8(printf("Case 3\n"));
12493     return -1;
12494 
12495   } else if (known_ambiguous_p(hitpair->hit3) == false && known_ambiguous_p(best_hitpair->hit3) == true &&
12496 	     known_ambiguous_p(hitpair->hit5) == known_ambiguous_p(best_hitpair->hit5) &&
12497 	     hitpair->insertlength > best_hitpair->insertlength) {
12498     debug8(printf("Case 4\n"));
12499     return +1;
12500   }
12501 #endif
12502 
12503 
12504   if (hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12505       hitpair->hit3->refalt_nmatches_plus_spliced_trims >
12506       best_hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12507       best_hitpair->hit3->refalt_nmatches_plus_spliced_trims + NMATCHES_SLOP) {
12508     /* Significantly more matches */
12509     debug8(printf("More matches (to_trims)\n"));
12510     return +1;
12511   } else if (hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12512 	     hitpair->hit3->refalt_nmatches_plus_spliced_trims <
12513 	     best_hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12514 	     best_hitpair->hit3->refalt_nmatches_plus_spliced_trims - NMATCHES_SLOP) {
12515     /* Fewer matches */
12516     debug8(printf("Fewer matches (to_trims)\n"));
12517     return -1;
12518 
12519 #if 0
12520   } else if ((hitpair->hit5->hittype != TRANSCRIPTOME || hitpair->hit3->hittype != TRANSCRIPTOME) &&
12521 	     (best_hitpair->hit5->hittype == TRANSCRIPTOME || best_hitpair->hit3->hittype == TRANSCRIPTOME)) {
12522     /* k is worse */
12523     debug8(printf(" => loses by transcriptome\n"));
12524     return -1;
12525 
12526   } else if ((hitpair->hit5->hittype == TRANSCRIPTOME || hitpair->hit3->hittype == TRANSCRIPTOME) &&
12527 	     (best_hitpair->hit5->hittype != TRANSCRIPTOME || best_hitpair->hit3->hittype != TRANSCRIPTOME)) {
12528     /* k is better */
12529     debug8(printf(" => wins by transcriptome\n"));
12530     return +1;
12531 #endif
12532 
12533 #if 0
12534   } else if (hitpair->nmatches_plus_spliced_trims < best_hitpair->nmatches_plus_spliced_trims - NMATCHES_SLOP) {
12535     /* k is worse */
12536     debug8(printf(" => loses by nmatches\n"));
12537     return -1;
12538   } else if (hitpair->nmatches_plus_spliced_trims > best_hitpair->nmatches_plus_spliced_trims + NMATCHES_SLOP) {
12539     /* k is better */
12540     debug8(printf(" => wins by nmatches\n"));
12541     return +1;
12542 #endif
12543 
12544 #if 0
12545   } else if (hitpair->nsplices > best_hitpair->nsplices) {
12546     /* k is worse */
12547     debug8(printf(" => loses by nsplices: %d > %d in best\n",hitpair->nsplices,best_hitpair->nsplices));
12548     return -1;
12549   } else if (hitpair->nsplices < best_hitpair->nsplices) {
12550     /* k is better */
12551     debug8(printf(" => wins by nsplices: %d < %d in best\n",hitpair->nsplices,best_hitpair->nsplices));
12552     return +1;
12553 #endif
12554 
12555   } else if (hitpair->alts_status_inside > best_hitpair->alts_status_inside) {
12556     /* k is worse */
12557     debug8(printf(" => loses by alts_status_inside\n"));
12558     return -1;
12559   } else if (hitpair->alts_status_inside < best_hitpair->alts_status_inside) {
12560     /* k is better */
12561     debug8(printf(" => wins by alts_status_inside\n"));
12562     return +1;
12563 
12564 
12565   } else if (hitpair->hit5->hittype > best_hitpair->hit5->hittype &&
12566 	     hitpair->hit3->hittype >= best_hitpair->hit3->hittype) {
12567     /* k is worse */
12568     debug8(printf(" => loses by hittype\n"));
12569     return -1;
12570 
12571   } else if (hitpair->hit5->hittype >= best_hitpair->hit5->hittype &&
12572 	     hitpair->hit3->hittype > best_hitpair->hit3->hittype) {
12573     /* k is worse */
12574     debug8(printf(" => loses by hittype\n"));
12575     return -1;
12576 
12577   } else if (hitpair->hit5->hittype < best_hitpair->hit5->hittype &&
12578 	     hitpair->hit3->hittype <= best_hitpair->hit3->hittype) {
12579     /* k is better */
12580     debug8(printf(" => wins by hittype\n"));
12581     return +1;
12582 
12583   } else if (hitpair->hit5->hittype <= best_hitpair->hit5->hittype &&
12584 	     hitpair->hit3->hittype < best_hitpair->hit3->hittype) {
12585     /* k is better */
12586     debug8(printf(" => wins by hittype\n"));
12587     return +1;
12588 
12589 #if 0
12590   } else if (n_amb_ends(hitpair->hit5) + n_amb_ends(hitpair->hit3) >
12591 	     n_amb_ends(best_hitpair->hit5) + n_amb_ends(best_hitpair->hit3)) {
12592     /* k is worse */
12593     debug8(printf(" => loses by ambiguity\n"));
12594     return -1;
12595 
12596   } else if (n_amb_ends(hitpair->hit5) + n_amb_ends(hitpair->hit3) <
12597 	     n_amb_ends(best_hitpair->hit5) + n_amb_ends(best_hitpair->hit3)) {
12598     /* k is better */
12599     debug8(printf(" => wins by ambiguity\n"));
12600     return +1;
12601 #endif
12602 
12603   } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score >
12604 	     best_hitpair->hit5->splice_score + best_hitpair->hit3->splice_score) {
12605     /* k is worse */
12606     debug8(printf(" => loses by splice score\n"));
12607     return -1;
12608 
12609   } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score >
12610 	     best_hitpair->hit5->splice_score + best_hitpair->hit3->splice_score) {
12611     /* k is better */
12612     debug8(printf(" => wins by splice score\n"));
12613     return +1;
12614 
12615 #if 0
12616   } else if (hitpair->absdifflength < best_hitpair->absdifflength) {
12617     /* k is worse */
12618     debug8(printf(" => loses by absdifflength\n"));
12619     return -1;
12620   } else if (hitpair->absdifflength > best_hitpair->absdifflength) {
12621     /* k is better */
12622     debug8(printf(" => wins by absdifflength\n"));
12623     return +1;
12624 #endif
12625 
12626   } else if (finalp == false) {
12627     debug8(printf("  => indistinguishable\n"));
12628     return 0;
12629 
12630 #ifdef USE_ABSDIFFLENGTH
12631     /* If insert length is within deviation of expected pairlength, favor it */
12632   } else if (best_hitpair->absdifflength <= (Chrpos_T) pairlength_deviation &&
12633 	     hitpair->absdifflength > (Chrpos_T) pairlength_deviation) {
12634     /* k is worse */
12635     debug8(printf(" => loses by absdifflength within deviation %d\n",pairlength_deviation));
12636     return -1;
12637   } else if (hitpair->absdifflength <= (Chrpos_T) pairlength_deviation &&
12638 	     best_hitpair->absdifflength > (Chrpos_T) pairlength_deviation) {
12639     /* k is better */
12640     debug8(printf(" => wins by absdifflength within deviation %d\n",pairlength_deviation));
12641     return +1;
12642 #endif
12643 
12644 #if 0
12645     /* Previously favored longer insert lengths to give more compact
12646        splices.  However, we now accept splices first that give
12647        expected pairlength */
12648   } else if (hitpair->insertlength_expected_sign == -1 && best_hitpair->insertlength_expected_sign == +1) {
12649     /* k is worse */
12650     debug8(printf(" => loses by insertlength_expected_sign\n"));
12651     return -1;
12652   } else if (hitpair->insertlength_expected_sign == +1 && best_hitpair->insertlength_expected_sign == -1) {
12653     /* k is better */
12654     debug8(printf(" => wins by insertlength_expected_sign\n"));
12655     return +1;
12656 #endif
12657 
12658     /* Next we look at splice probability */
12659   } else {
12660     debug8(printf(" => prob"));
12661     prob1 = Stage3end_prob(hitpair->hit5) + Stage3end_prob(hitpair->hit3);
12662     prob2 = Stage3end_prob(best_hitpair->hit5) + Stage3end_prob(best_hitpair->hit3);
12663     if (prob1 + 0.3 < prob2) {
12664       /* k is worse */
12665       debug8(printf(" => loses by dual splice prob %f vs %f\n",prob1,prob2));
12666       return -1;
12667     } else if (prob1 > prob2 + 0.3) {
12668       /* k is better */
12669       debug8(printf(" => wins by dual splice prob %f vs %f\n",prob1,prob2));
12670       return +1;
12671     } else {
12672       debug8(printf(" => neither wins\n"));
12673     }
12674 
12675 
12676 #if 0
12677     /* Overlapping ends worse than separate ends */
12678     total_querylength = (Chrpos_T) (hitpair->hit5->querylength + hitpair->hit3->querylength);
12679     best_total_querylength = (Chrpos_T) (best_hitpair->hit5->querylength + best_hitpair->hit3->querylength);
12680 
12681     if (hitpair->insertlength <= total_querylength && best_hitpair->insertlength > best_total_querylength) {
12682       debug8(printf(" => loses by being overlapping\n"));
12683       return -1;
12684     } else if (hitpair->insertlength > total_querylength && best_hitpair->insertlength <= best_total_querylength) {
12685       debug8(printf(" => wins by being separate\n"));
12686       return +1;
12687 
12688       /* Next, favor shorter outerlengths to give more compact splices or closer pairs */
12689     } else if (hitpair->outerlength > best_hitpair->outerlength + OUTERLENGTH_SLOP) {
12690       /* k is worse */
12691       debug8(printf(" => loses by outerlength\n"));
12692       return -1;
12693     } else if (hitpair->outerlength + OUTERLENGTH_SLOP < best_hitpair->outerlength) {
12694       /* k is better */
12695       debug8(printf(" => wins by outerlength\n"));
12696       return +1;
12697 
12698     } else {
12699 #if 0
12700       if (hitpair->insertlength_expected_sign >= 0 && best_hitpair->insertlength_expected_sign >= 0) {
12701 	/* Both insert lengths are short, so favor shorter insert length */
12702 	debug8(printf(" => short insertlengths"));
12703 	/* Favor shorter insert lengths */
12704 	if (hitpair->insertlength > best_hitpair->insertlength) {
12705 	  /* k is worse */
12706 	  debug8(printf(" => loses by insertlength\n"));
12707 	  return -1;
12708 	} else if (hitpair->insertlength < best_hitpair->insertlength) {
12709 	  /* k is better */
12710 	  debug8(printf(" => wins by insertlength\n"));
12711 	  return +1;
12712 	}
12713       }
12714 #endif
12715 
12716       /* Both insert lengths are long, so favor longer insert length to give more compact splices */
12717       debug8(printf(" => long insertlengths"));
12718       if (hitpair->insertlength < best_hitpair->insertlength) {
12719 	/* k is worse */
12720 	debug8(printf(" => loses by insertlength\n"));
12721 	return -1;
12722       } else if (hitpair->insertlength > best_hitpair->insertlength) {
12723 	/* k is better */
12724 	debug8(printf(" => wins by insertlength\n"));
12725 	return +1;
12726       }
12727 
12728       debug8(printf("  => equal\n"));
12729       *equalp = true;
12730       return 0;
12731     }
12732 #endif
12733 
12734     /* Look at expected pairlength and pairlength deviation */
12735     if (hitpair->insertlength < expected_pairlength) {
12736       zscore = (double) (expected_pairlength - (Chrpos_T) hitpair->insertlength) / (double) pairlength_deviation;
12737     } else {
12738       zscore = (double) ((Chrpos_T) hitpair->insertlength - expected_pairlength) / (double) pairlength_deviation;
12739     }
12740     if (best_hitpair->insertlength < expected_pairlength) {
12741       best_zscore = (double) (expected_pairlength - (Chrpos_T) best_hitpair->insertlength) / (double) pairlength_deviation;
12742     } else {
12743       best_zscore = (double) ((Chrpos_T) best_hitpair->insertlength - expected_pairlength) / (double) pairlength_deviation;
12744     }
12745     debug8(printf("expected_pairlength %u, pairlength_deviation %u\n",expected_pairlength,pairlength_deviation));
12746     debug8(printf("Comparing insertlength %d (z score %f) with best_insertlength %d (zscore %f)\n",
12747 		  hitpair->insertlength,zscore,best_hitpair->insertlength,best_zscore));
12748 
12749     if (zscore > best_zscore + 1.0) {
12750       /* k is worse */
12751       debug8(printf(" => loses by insertlength and zscore\n"));
12752       return -1;
12753     } else if (best_zscore > zscore + 1.0) {
12754       /* k is better */
12755       debug8(printf(" => wins by insertlength and zscore\n"));
12756       return +1;
12757     }
12758 
12759     debug8(printf("  => equal\n"));
12760     *equalp = true;
12761     return 0;
12762   }
12763 }
12764 
12765 
#if 0
/* Disabled code.  Scans hitpairs[k+1..j] and returns true as soon as
   one of them is in a subsumption relation with hitpair_k (per
   hitpair_subsumption) and, per hitpair_goodness_cmp, is either better
   than hitpair_k or reported equal to it.  Returns false when no such
   entry exists. */
static bool
hitpair_bad_superstretch_p (Stage3pair_T hitpair_k, Stage3pair_T *hitpairs, int k, int j,
			    bool finalp) {
  int idx, cmp;
  bool equalp;

  idx = k+1;
  while (idx <= j) {
    if (hitpair_subsumption(hitpair_k,hitpairs[idx]) == true) {
      debug8(printf("Testing %d because stretches over %d",k,idx));
      cmp = hitpair_goodness_cmp(&equalp,hitpairs[idx],hitpair_k,finalp);
      if (cmp > 0 || equalp == true) {
	debug8(printf(" => eliminating\n"));
	return true;
      }
      debug8(printf("\n"));
    }
    idx++;
  }

  return false;
}
#endif
12787 
12788 
12789 /* Recursive, list-based approach */
12790 static List_T
pair_remove_bad_superstretches(bool * keep_p,Stage3pair_T superstretch,List_T list,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool finalp)12791 pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T list,
12792 				Hitlistpool_T hitlistpool, int querylength5, int querylength3,
12793 				bool finalp) {
12794   List_T result = NULL, p, q, r;
12795   Stage3pair_T stage3pair, hitpair;
12796   Chrpos_T best_insertlength, best_outerlength;
12797   int best_nsegments, nsegments;
12798   double max_splice_score, splice_score;
12799   bool equalp;
12800 
12801   *keep_p = true;
12802 
12803   p = list;
12804   while (p != NULL) {
12805     stage3pair = (Stage3pair_T) List_head(p);
12806 
12807     q = List_next(p);
12808     while (q != NULL && hitpair_subsumption(stage3pair,(Stage3pair_T) List_head(q)) == true) {
12809 #ifdef DEBUG8
12810       printf("  This (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d+%d (%d+%d to trims), insertlength %d, alts_status_inside %d, amb_lengths %d and %d\n",
12811 	     Pairtype_string(stage3pair->pairtype),Method_string(stage3pair->hit5->method),
12812 	     Method_string(stage3pair->hit3->method),stage3pair,
12813 	     stage3pair->hit5->low - stage3pair->hit5->chroffset,stage3pair->hit5->high - stage3pair->hit5->chroffset,
12814 	     stage3pair->hit3->low - stage3pair->hit3->chroffset,stage3pair->hit3->high - stage3pair->hit3->chroffset,
12815 	     stage3pair->dir,stage3pair->hit5->refalt_nmatches_plus_spliced_trims,stage3pair->hit3->refalt_nmatches_plus_spliced_trims,
12816 	     stage3pair->hit5->refalt_nmatches_to_trims,stage3pair->hit3->refalt_nmatches_to_trims,
12817 	     stage3pair->insertlength,stage3pair->alts_status_inside,amb_length(stage3pair->hit5),amb_length(stage3pair->hit3));
12818 
12819       hitpair = (Stage3pair_T) List_head(q);
12820       printf("subsumes that (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d+%d (%d+%d to trims), insertlength %d, alts_status_inside %d, amb_lengths %d and %d\n",
12821 	     Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
12822 	     Method_string(hitpair->hit3->method),hitpair,
12823 	     hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12824 	     hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12825 	     hitpair->dir,hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12826 	     hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12827 	     hitpair->insertlength,hitpair->alts_status_inside,amb_length(hitpair->hit5),amb_length(hitpair->hit3));
12828 #endif
12829       q = List_next(q);
12830     }
12831 
12832     if (q == p) {
12833       result = Hitlist_push(result,hitlistpool,(void *) stage3pair);
12834       if (superstretch != NULL &&
12835 	  (hitpair_goodness_cmp(&equalp,stage3pair,superstretch,finalp) > 0 || equalp == true)) {
12836 	*keep_p = false;
12837       }
12838       p = List_next(q);
12839 
12840     } else {
12841       /* Cluster */
12842 
12843       /* (1) Find smallest insert length with slop across loci */
12844       debug8(printf("Finding smallest insertlength\n"));
12845       best_insertlength = (Chrpos_T) -1;
12846       for (r = p; r != q; r = List_next(r)) {
12847 	hitpair = (Stage3pair_T) r->first;
12848 	if (hitpair->insertlength < best_insertlength) {
12849 	  best_insertlength = hitpair->insertlength;
12850 	}
12851       }
12852 
12853       for (r = p; r != q; r = List_next(r)) {
12854 	hitpair = (Stage3pair_T) r->first;
12855 
12856 	if (hitpair->insertlength > best_insertlength + INSERTLENGTH_SLOP) {  /* Initial slop */
12857 	  debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
12858 			hitpair->insertlength,best_insertlength,
12859 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12860 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12861 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12862 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12863 			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
12864 			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
12865 	  Stage3pair_free(&hitpair);
12866 	  r->first = (Stage3pair_T) NULL;
12867 	  *keep_p = false;
12868 
12869 	} else {
12870 	  debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
12871 			hitpair->insertlength,hitpair->outerlength,
12872 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12873 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12874 			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
12875 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12876 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12877 			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
12878 			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
12879 	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair); -- wait for last filtering step */
12880 	}
12881       }
12882 
12883 
12884       /* (2) Find best nsegments and splice score */
12885       debug8(printf("Finding best nsegments and splice score\n"));
12886       best_nsegments = querylength5 + querylength3;
12887       max_splice_score = 0.0;
12888       for (r = p; r != q; r = List_next(r)) {
12889 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12890 	  /* Already eliminated */
12891 	} else if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
12892 	  best_nsegments = nsegments;
12893 	  max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
12894 
12895 	} else if (nsegments == best_nsegments) {
12896 	  if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
12897 	    max_splice_score = splice_score;
12898 	  }
12899 	}
12900       }
12901       debug8(printf("best_nsegments %d, max_splice_score %f\n",best_nsegments,max_splice_score));
12902 
12903       for (r = p; r != q; r = List_next(r)) {
12904 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12905 	  /* Already eliminated */
12906 
12907 	} else if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
12908 	  debug8(printf("Within loci pair (nsegments %d > %d): Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to trims), sensedirs %d and %d, splice scores %f and %f\n",
12909 			nsegments,best_nsegments,
12910 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12911 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12912 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
12913 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12914 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12915 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
12916 	  Stage3pair_free(&hitpair);
12917 	  r->first = (Stage3pair_T) NULL;
12918 	  *keep_p = false;
12919 
12920 	} else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
12921 	  debug8(printf("Within loci pair (splice_score w/slop %f < %f): Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to trims), sensedirs %d and %d, splice scores %f and %f\n",
12922 			hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
12923 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12924 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12925 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
12926 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12927 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
12928 	  Stage3pair_free(&hitpair);
12929 	  r->first = (Stage3pair_T) NULL;
12930 	  *keep_p = false;
12931 
12932 	} else {
12933 	  debug8(printf("Keeping hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
12934 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12935 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12936 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
12937 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
12938 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
12939 	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair);  -- wait until last filtering step */
12940 	}
12941       }
12942 
12943 
12944       /* (3) Find smallest outerlength across loci */
12945       debug8(printf("Finding smallest outerlength"));
12946       best_outerlength = (Chrpos_T) -1;
12947       for (r = p; r != q; r = List_next(r)) {
12948 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12949 	  /* Already eliminated */
12950 	} else if (hitpair->outerlength < best_outerlength) {
12951 	  best_outerlength = hitpair->outerlength;
12952 	}
12953       }
12954       debug8(printf(" => %u\n",best_outerlength));
12955 
12956       for (r = p; r != q; r = List_next(r)) {
12957 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12958 	  /* Already eliminated */
12959 
12960 	} else if (hitpair->outerlength > best_outerlength /*+ OUTERLENGTH_SLOP*/) {  /* No slop for final */
12961 	  debug8(printf("Final (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
12962 			hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
12963 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12964 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12965 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12966 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
12967 	  Stage3pair_free(&hitpair);
12968 	  r->first = (Stage3pair_T) NULL;
12969 	  *keep_p = false;
12970 
12971 	} else {
12972 	  debug8(printf("Final (outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims)\n",
12973 			hitpair->outerlength,
12974 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
12975 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
12976 			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
12977 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
12978 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
12979 	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair); -- wait for last filtering step */
12980 	}
12981       }
12982 
12983 
12984       /* (4) Find smallest insert length with slop across loci */
12985       debug8(printf("Finding smallest insertlength\n"));
12986       best_insertlength = (Chrpos_T) -1;
12987       for (r = p; r != q; r = List_next(r)) {
12988 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12989 	  /* Already eliminated */
12990 	} else if (hitpair->insertlength < best_insertlength) {
12991 	  best_insertlength = hitpair->insertlength;
12992 	}
12993       }
12994       debug8(printf(" => %u\n",best_insertlength));
12995 
12996       for (r = p; r != q; r = List_next(r)) {
12997 	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
12998 	  /* Already eliminated */
12999 
13000 	} else if (hitpair->insertlength > best_insertlength /*+ INSERTLENGTH_SLOP*/) {  /* No slop for final */
13001 	  debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
13002 			hitpair->insertlength,best_insertlength,
13003 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13004 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13005 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13006 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13007 			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
13008 			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
13009 	  Stage3pair_free(&hitpair);
13010 	  r->first = (Stage3pair_T) NULL;
13011 	  *keep_p = false;
13012 
13013 	} else {
13014 	  debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
13015 			hitpair->insertlength,hitpair->outerlength,
13016 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13017 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13018 			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
13019 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13020 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13021 			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
13022 			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
13023 	  result = Hitlist_push(result,hitlistpool,(void *) hitpair);
13024 	  debug8(printf("  result now has length %d\n",List_length(result)));
13025 	}
13026       }
13027 
13028       p = q;
13029     }
13030   }
13031 
13032   Hitlist_free(&list);
13033 
13034   debug8(printf("pair_remove_bad_superstretches returning result of length %d\n",List_length(result)));
13035   return List_reverse(result);
13036 }
13037 
13038 
13039 static List_T
pair_remove_overlaps(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool translocp,bool finalp)13040 pair_remove_overlaps (List_T hitpairlist, Hitlistpool_T hitlistpool,
13041 		      int querylength5, int querylength3,
13042 		      bool translocp, bool finalp) {
13043   List_T unique = NULL;
13044   Stage3pair_T hitpair, parent, *hitpairs;
13045   int nkept, n, i, j;
13046   bool *eliminate;
13047   int *parenti;
13048   bool keep_p;
13049 
13050   n = List_length(hitpairlist);
13051   debug8(printf("  Entering pair_remove_overlaps with %d pairs: %s\n",
13052 		n,finalp == true ? "FINAL" : "not final"));
13053 
13054   if (n <= 1) {
13055     debug8(printf("  Exiting pair_remove_overlaps with %d < 2 pairs\n",n));
13056     return hitpairlist;
13057   } else {
13058 #ifdef USE_ALLOCA_FOR_HITS
13059     eliminate = (bool *) CALLOCA(n,sizeof(bool));
13060     parenti = (int *) CALLOCA(n,sizeof(int));
13061     hitpairs = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
13062     List_fill_array((void **) hitpairs,hitpairlist);
13063     Hitlist_free(&hitpairlist);
13064 #else
13065     eliminate = (bool *) CALLOC(n,sizeof(bool));
13066     parenti = (int *) CALLOC(n,sizeof(int));
13067     hitpairs = (Stage3pair_T *) List_to_array(hitpairlist,NULL);
13068     Hitlist_free(&hitpairlist);
13069 #endif
13070   }
13071 
13072   /* Step 1.  Check for exact duplicates */
13073   debug8(printf("  Step 1.  Checking for exact duplicates\n"));
13074   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);
13075 
13076   debug8(
13077 	 for (i = 0; i < n; i++) {
13078 	   hitpair = hitpairs[i];
13079 	   printf("  Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d and %d.",
13080 		  i,Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
13081 		  Method_string(hitpair->hit3->method),hitpair,
13082 		  hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13083 		  hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13084 		  hitpair->dir,hitpair->hit5->circularalias,hitpair->hit3->circularalias,
13085 		  hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13086 		  hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13087 		  amb_length(hitpair->hit5),amb_length(hitpair->hit3),hitpair->hit5->sensedir,hitpair->hit3->sensedir);
13088 	   if (hitpair->hit5->hittype == TRANSLOC_SPLICE) {
13089 	     printf("  5' TRANSLOC splice probs %f",hitpair->hit5->splice_score);
13090 	   }
13091 	   if (hitpair->hit3->hittype == TRANSLOC_SPLICE) {
13092 	     printf("  3' TRANSLOC splice probs %f",hitpair->hit3->splice_score);
13093 	   }
13094 	   printf("\n");
13095 	 }
13096 	 );
13097 
13098   i = 0;
13099   while (i < n) {
13100     j = i+1;
13101     debug8(printf(" %d,%d",i,j));
13102     while (j < n && hitpair_equal(hitpairs[j],hitpairs[i]) == true) {
13103       debug8(printf("  %d is identical to %d => eliminating\n",j,i));
13104       eliminate[j] = true;
13105       parenti[j] = i;
13106       j++;
13107     }
13108     i = j;
13109   }
13110   debug8(printf("\n"));
13111 
13112   nkept = 0;
13113   for (i = 0; i < n; i++) {
13114     if (eliminate[i] == false) {
13115       nkept++;
13116     }
13117   }
13118   debug8(printf("nkept = %d\n",nkept));
13119 
13120   if (nkept == 0) {
13121     /* All entries eliminated one another, so keep the first one */
13122     debug8(printf("All entries eliminate one another, so keep the first one\n"));
13123     eliminate[0] = false;
13124     nkept = 1;
13125   }
13126 
13127   for (i = n - 1; i >= 0; --i) {
13128     hitpair = hitpairs[i];
13129     if (eliminate[i] == false) {
13130       debug8(printf("  Keeping %s|%s %u..%u|%u..%u, nmatches (trimmed) %d+%d, score %d+%d, (dir = %d)\n",
13131 		    Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
13132 		    hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13133 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13134 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13135 		    hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,hitpair->dir));
13136       unique = Hitlist_push(unique,hitlistpool,(void *) hitpair);
13137 
13138     } else {
13139       debug8(printf("  Eliminating %s|%s %u..%u|%u..%u, nmatches (trimmed) %d+%d, score %d+%d, (dir = %d)\n",
13140 		    Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
13141 		    hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13142 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13143 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13144 		    hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,hitpair->dir));
13145 
13146       parent = hitpairs[parenti[i]];
13147       Stage3pair_transfer_transcripts_one(parent,hitpair);
13148       Stage3pair_free(&hitpair);
13149     }
13150   }
13151 
13152 #ifdef USE_ALLOCA_FOR_HITS
13153   FREEA(hitpairs);
13154   FREEA(eliminate);
13155   FREEA(parenti);
13156 #else
13157   FREE(hitpairs);
13158   FREE(eliminate);
13159   FREE(parenti);
13160 #endif
13161 
13162 
13163   debug8(printf("  Step 2.  Checking for bad superstretches\n"));
13164   if (0 && translocp == true) {
13165     return unique;
13166   } else {
13167     return pair_remove_bad_superstretches(&keep_p,/*superstretch*/NULL,unique,
13168 					  hitlistpool,querylength5,querylength3,finalp);
13169   }
13170 }
13171 
13172 
13173 static int
calc_insertlength_score(Chrpos_T insertlength)13174 calc_insertlength_score (Chrpos_T insertlength) {
13175   if (insertlength > 80000) {
13176     return 2;
13177   } else if (insertlength > 1000) {
13178     return 1;
13179   } else {
13180     return 0;
13181   }
13182 }
13183 
13184 
13185 List_T
Stage3pair_remove_overlaps(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool translocp,bool finalp)13186 Stage3pair_remove_overlaps (List_T hitpairlist, Hitlistpool_T hitlistpool,
13187 			    int querylength5, int querylength3,
13188 			    bool translocp, bool finalp) {
13189   List_T optimal, unique_separate, unique_overlapping,
13190     separate = NULL, overlapping = NULL, p;
13191   Stage3pair_T hitpair_separate, hitpair_overlapping, *hitpairs, hitpair;
13192 
13193   Stage3pair_T *array_separate, *array_overlapping;
13194   Univcoord_T low, high;
13195 
13196   int max_adj_nmatches, score;
13197   int best_nsegments, nsegments;
13198   int best_insertlength_score, insertlength_score;
13199   double max_splice_score, splice_score;
13200   Chrpos_T best_outerlength;
13201 
13202   bool subsumedp, equalp, *eliminate, keptp;
13203   int n_separate, n_overlapping, n, i, j, k;
13204 
13205 
13206   debug8(printf("Entered Stage3pair_remove_overlaps with %d hitpairs\n",List_length(hitpairlist)));
13207   for (p = hitpairlist; p != NULL; p = List_next(p)) {
13208     hitpair = (Stage3pair_T) List_head(p);
13209     if (hitpair->insertlength <= (Chrpos_T) (hitpair->hit5->querylength + hitpair->hit3->querylength)) {
13210       overlapping = Hitlist_push(overlapping,hitlistpool,(void *) hitpair);
13211     } else {
13212       separate = Hitlist_push(separate,hitlistpool,(void *) hitpair);
13213     }
13214   }
13215   Hitlist_free(&hitpairlist);
13216 
13217   debug8(printf("Calling Stage3pair_remove_overlaps for separate pair ends\n"));
13218   unique_separate = pair_remove_overlaps(separate,hitlistpool,querylength5,querylength3,translocp,finalp);
13219 
13220   debug8(printf("Calling Stage3pair_remove_overlaps for overlapping pair ends\n"));
13221   unique_overlapping = pair_remove_overlaps(overlapping,hitlistpool,querylength5,querylength3,translocp,finalp);
13222 
13223   if (unique_overlapping == NULL) {
13224     debug8(printf("Unique overlapping is NULL\n"));
13225     hitpairlist = unique_separate;
13226   } else if (unique_separate == NULL) {
13227     debug8(printf("Unique separate is NULL\n"));
13228     hitpairlist = unique_overlapping;
13229   } else {
13230     debug8(printf("Have both overlapping and separate\n"));
13231     n_overlapping = List_length(unique_overlapping);
13232 #ifdef USE_ALLOCA_FOR_HITS
13233     array_overlapping = (Stage3pair_T *) MALLOCA(n_overlapping * sizeof(Stage3pair_T));
13234     List_fill_array((void **) array_overlapping,unique_overlapping);
13235 #else
13236     array_overlapping = (Stage3pair_T *) List_to_array(unique_overlapping,NULL);
13237 #endif
13238 
13239     n_separate = List_length(unique_separate);
13240 #ifdef USE_ALLOCA_FOR_HITS
13241     array_separate = (Stage3pair_T *) MALLOCA(n_separate * sizeof(Stage3pair_T));
13242     List_fill_array((void **) array_separate,unique_separate);
13243 #else
13244     array_separate = (Stage3pair_T *) List_to_array(unique_separate,NULL);
13245 #endif
13246 
13247     qsort(array_overlapping,n_overlapping,sizeof(Stage3pair_T),hitpair_position_cmp);
13248     qsort(array_separate,n_separate,sizeof(Stage3pair_T),hitpair_position_cmp);
13249 
13250     /* 1.  First, favor overlapping (with smaller insertlengths) */
13251     /* Keep unique_overlapping and filter unique_separate into indep_separate */
13252     Hitlist_free(&unique_separate);
13253     unique_separate = (List_T) NULL;
13254 
13255     i = j = 0;
13256     for (i = 0; i < n_separate; i++) {
13257       hitpair_separate = array_separate[i];
13258       low = hitpair_separate->low;
13259       high = hitpair_separate->high;
13260       while (j >= 0 && array_overlapping[j]->high >= low) {
13261 	j--;
13262       }
13263       j += 1;
13264 
13265       subsumedp = false;
13266       while (j < n_overlapping && subsumedp == false && array_overlapping[j]->low <= high) {
13267 	if (hitpair_goodness_cmp(&equalp,array_overlapping[j],
13268 				 hitpair_separate,finalp) > 0) {
13269 	  debug8(printf("overlapping pair %d better than separate pair %d\n",j,i));
13270 	  subsumedp = hitpair_subsumption(hitpair_separate,array_overlapping[j]);
13271 	  debug8(printf("  checking if separate pair %d subsumes overlapping pair %d => %d\n",
13272 			i,j,subsumedp));
13273 	}
13274 	j++;
13275       }
13276       j -= 1;
13277 
13278       if (subsumedp == true) {
13279 	Stage3pair_free(&hitpair_separate);
13280       } else {
13281         unique_separate = Hitlist_push(unique_separate,hitlistpool,(void *) hitpair_separate);
13282       }
13283     }
13284 
13285 #ifdef USE_ALLOCA_FOR_HITS
13286     FREEA(array_separate);
13287 #else
13288     FREE(array_separate);
13289 #endif
13290 
13291     if ((n_separate = List_length(unique_separate)) == 0) {
13292 #ifdef USE_ALLOCA_FOR_HITS
13293       FREEA(array_overlapping);
13294 #else
13295       FREE(array_overlapping);
13296 #endif
13297       hitpairlist = unique_overlapping;
13298 
13299     } else {
13300 #ifdef USE_ALLOCA_FOR_HITS
13301       array_separate = (Stage3pair_T *) MALLOCA(n_separate * sizeof(Stage3pair_T));
13302       List_fill_array((void **) array_separate,unique_separate);
13303 #else
13304       array_separate = (Stage3pair_T *) List_to_array(unique_separate,NULL);
13305 #endif
13306 
13307       /* 2.  Second, favor separate (with larger insertlengths) */
13308       /* Keep indep_separate and filter unique_overlapping into indep_overlapping */
13309       Hitlist_free(&unique_overlapping);
13310       unique_overlapping = (List_T) NULL;
13311 
13312       i = j = 0;
13313       for (i = 0; i < n_overlapping; i++) {
13314 	hitpair_overlapping = array_overlapping[i];
13315 	low = hitpair_overlapping->low;
13316 	high = hitpair_overlapping->high;
13317 	while (j >= 0 && array_separate[j]->high >= low) {
13318 	  j--;
13319 	}
13320 	j += 1;
13321 
13322 	subsumedp = false;
13323 	while (j < n_separate && subsumedp == false && array_separate[j]->low <= high) {
13324 	  if (hitpair_goodness_cmp(&equalp,array_separate[j],
13325 				   hitpair_overlapping,finalp) > 0) {
13326 	    debug8(printf("separate pair %d better than overlapping pair %d\n",j,i));
13327 	    subsumedp = hitpair_subsumption(array_separate[j],hitpair_overlapping);
13328 	    debug8(printf("  checking if separate pair %d subsumes overlapping pair %d => %d\n",
13329 			  j,i,subsumedp));
13330 	  }
13331 	  j++;
13332 	}
13333 	j -= 1;
13334 
13335 	if (subsumedp == true) {
13336 	  Stage3pair_free(&hitpair_overlapping);
13337 	} else {
13338 	  unique_overlapping = Hitlist_push(unique_overlapping,hitlistpool,(void *) hitpair_overlapping);
13339 	}
13340       }
13341     }
13342 
13343 #ifdef USE_ALLOCA_FOR_HITS
13344     FREEA(array_separate);
13345     FREEA(array_overlapping);
13346 #else
13347     FREE(array_separate);
13348     FREE(array_overlapping);
13349 #endif
13350 
13351     hitpairlist = List_append(unique_overlapping,unique_separate);
13352   }
13353 
13354 
13355   /* Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
13356   /* Same as step 1 of Stage3pair_optimal_score_final */
13357   debug8(printf("  Step 3.  Maximize nmatches adjusted by score (with slop)\n"));
13358   optimal = (List_T) NULL;
13359 
13360   keptp = false;
13361   hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13362   eliminate = (bool *) CALLOC(n,sizeof(bool));
13363   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13364   i = 0;
13365   while (i < n) {
13366     j = i+1;
13367     while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13368       j++;
13369     }
13370     if (j - i > 1) {
13371       debug8(printf("Found a group from %d to %d\n",i,j));
13372       max_adj_nmatches = 0;
13373       for (k = i; k < j; k++) {
13374 	hitpair = hitpairs[k];
13375 	if ((score = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13376 	     - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall) > max_adj_nmatches) {
13377 	  max_adj_nmatches = score;
13378 	}
13379       }
13380       debug8(printf("max_adj_nmatches = %d\n",max_adj_nmatches));
13381 
13382       for (k = i; k < j; k++) {
13383 	hitpair = hitpairs[k];
13384 	if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13385 	    - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall < max_adj_nmatches - ADJ_NMATCHES_SLOP) {
13386 	  debug8(printf("Within loci pair (adj score %d (%d+%d -%d-%d) < %d w/slop): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13387 			hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13388 			- hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,
13389 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13390 			hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,max_adj_nmatches,
13391 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13392 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13393 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13394 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13395 	  eliminate[k] = true;
13396 
13397 	} else {
13398 	  debug8(printf("Within loci pair (adj score %d (%d+%d -%d-%d) == %d w/slop): Keeping hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13399 			hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13400 			- hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,
13401 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13402 			hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,max_adj_nmatches,
13403 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13404 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13405 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13406 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13407 	  keptp = true;
13408 	}
13409       }
13410     }
13411 
13412     i = j;
13413   }
13414 
13415   if (keptp == false) {
13416     optimal = hitpairlist;
13417   } else {
13418     for (k = 0; k < n; k++) {
13419       hitpair = hitpairs[k];
13420       if (eliminate[k] == true) {
13421 	debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13422 		      hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13423 		      hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13424 		      hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13425 		      hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13426 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13427 	Stage3pair_free(&hitpair);
13428 	/* eliminatedp = true; */
13429       } else {
13430 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13431       }
13432     }
13433     Hitlist_free(&hitpairlist);
13434   }
13435   FREE(hitpairs);
13436   FREE(eliminate);
13437   hitpairlist = optimal;
13438 
13439 
13440   /* Eliminate within loci: minimize nsegments and maximize splice score (for approximately equal insertlengths) */
13441   /* Since we have achieved same number of matches, we should minimize nsegments to achieve parsimony */
13442   debug8(printf("  Step 4.  Minimize nsegments and splice score (for approximately equal insertlengths)\n"));
13443   optimal = (List_T) NULL;
13444 
13445   keptp = false;
13446   hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13447   eliminate = (bool *) CALLOC(n,sizeof(bool));
13448   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13449   i = 0;
13450   while (i < n) {
13451     j = i+1;
13452     while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13453       j++;
13454     }
13455     if (j - i > 1) {
13456       debug8(printf("Found a group from %d to %d\n",i,j));
13457       best_nsegments = querylength5 + querylength3;
13458       best_insertlength_score = 99;
13459       max_splice_score = 0.0;
13460       for (k = i; k < j; k++) {
13461 	hitpair = hitpairs[k];
13462 	if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
13463 	  best_nsegments = nsegments;
13464 	  best_insertlength_score = calc_insertlength_score(hitpair->insertlength);
13465 	  max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
13466 
13467 	} else if (nsegments == best_nsegments) {
13468 	  if ((insertlength_score = calc_insertlength_score(hitpair->insertlength)) < best_insertlength_score) {
13469 	    best_insertlength_score = insertlength_score;
13470 	    max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
13471 
13472 	  } else if (insertlength_score == best_insertlength_score) {
13473 	    if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
13474 	      max_splice_score = splice_score;
13475 	    }
13476 	  }
13477 	}
13478       }
13479       debug8(printf("best_nsegments %d, best_insertlength_score %d, max_splice_score %f\n",
13480 		    best_nsegments,best_insertlength_score,max_splice_score));
13481 
13482       for (k = i; k < j; k++) {
13483 	hitpair = hitpairs[k];
13484 	if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
13485 	  debug8(printf("Within loci pair (nsegments %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13486 			nsegments,best_nsegments,
13487 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13488 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13489 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13490 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13491 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13492 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13493 	  eliminate[k] = true;
13494 
13495 	} else if (calc_insertlength_score(hitpair->insertlength) > best_insertlength_score) {
13496 	  debug8(printf("Within loci pair (insertlength score %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13497 			calc_insertlength_score(hitpair->insertlength),best_insertlength_score,
13498 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13499 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13500 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13501 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13502 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13503 	  eliminate[k] = true;
13504 
13505 	} else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
13506 	  debug8(printf("Within loci pair (splice_score w/slop %f < %f): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13507 			hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
13508 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13509 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13510 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13511 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13512 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13513 	  eliminate[k] = true;
13514 
13515 	} else {
13516 	  keptp = true;
13517 	}
13518       }
13519     }
13520 
13521     i = j;
13522   }
13523 
13524   if (keptp == false) {
13525     optimal = hitpairlist;
13526   } else {
13527     for (k = 0; k < n; k++) {
13528       hitpair = hitpairs[k];
13529       if (eliminate[k] == true) {
13530 	debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13531 		      hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13532 		      hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13533 		      hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13534 		      hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13535 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13536 	Stage3pair_free(&hitpair);
13537 	/* eliminatedp = true; */
13538       } else {
13539 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13540       }
13541     }
13542     Hitlist_free(&hitpairlist);
13543   }
13544   FREE(hitpairs);
13545   FREE(eliminate);
13546   hitpairlist = optimal;
13547 
13548 
13549   /* Eliminate within loci: minimize outerlength */
13550   debug8(printf("  Step 5.  Minimize outerlength\n"));
13551   optimal = (List_T) NULL;
13552 
13553   keptp = false;
13554   hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13555   eliminate = (bool *) CALLOC(n,sizeof(bool));
13556   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13557   i = 0;
13558   while (i < n) {
13559     j = i+1;
13560     while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13561       j++;
13562     }
13563     if (j - i > 1) {
13564       debug8(printf("Found a group from %d to %d\n",i,j));
13565       best_outerlength = (Chrpos_T) -1U;
13566       for (k = i; k < j; k++) {
13567 	hitpair = hitpairs[k];
13568 	if (hitpair->outerlength < best_outerlength) {
13569 	  best_outerlength = hitpair->outerlength;
13570 	}
13571       }
13572       debug8(printf("best_outerlength %u\n",best_outerlength));
13573 
13574       for (k = i; k < j; k++) {
13575 	hitpair = hitpairs[k];
13576 	if (hitpair->outerlength > best_outerlength) {
13577 	  debug8(printf("Within loci pair (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13578 			hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
13579 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13580 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13581 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13582 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13583 	  eliminate[k] = true;
13584 
13585 	} else {
13586 	  debug8(printf("Within loci pair (outerlength %u == %u): Keeping hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13587 			hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
13588 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13589 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13590 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13591 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13592 	  keptp = true;
13593 	}
13594       }
13595     }
13596 
13597     i = j;
13598   }
13599 
13600   if (keptp == false) {
13601     optimal = hitpairlist;
13602   } else {
13603     for (k = 0; k < n; k++) {
13604       hitpair = hitpairs[k];
13605       if (eliminate[k] == true) {
13606 	debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13607 		      hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13608 		      hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13609 		      hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13610 		      hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13611 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13612 	Stage3pair_free(&hitpair);
13613 	/* eliminatedp = true; */
13614       } else {
13615 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13616       }
13617     }
13618     Hitlist_free(&hitpairlist);
13619   }
13620   FREE(hitpairs);
13621   FREE(eliminate);
13622   hitpairlist = optimal;
13623 
13624 
13625   return hitpairlist;
13626 }
13627 
13628 
13629 #ifdef PRE_RESOLVE_MULTIMAPPING
13630 List_T
Stage3pair_resolve_multimapping(List_T hitpairs,Hitlistpool_T hitlistpool)13631 Stage3pair_resolve_multimapping (List_T hitpairs, Hitlistpool_T hitlistpool) {
13632   List_T resolve1, resolve2, resolve3, p;
13633   Stage3pair_T hitpair;
13634 
13635   long int best_tally;
13636   double tally_threshold;
13637   bool runlengthp;
13638 
13639 
13640   if (List_length(hitpairs) <= 1) {
13641     return hitpairs;
13642   }
13643 
13644 #if 0
13645   if (genes_iit == NULL) {
13646     resolve1 = hitpairs;
13647   } else {
13648     best_overlap = NO_KNOWN_GENE;
13649     for (p = hitpairs; p != NULL; p = p->rest) {
13650       hitpair = (Stage3pair_T) p->first;
13651       if ((hitpair->gene_overlap = Stage3pair_gene_overlap(hitpair)) > best_overlap) {
13652 	best_overlap = hitpair->gene_overlap;
13653       }
13654     }
13655     if (best_overlap == NO_KNOWN_GENE) {
13656       resolve1 = hitpairs;
13657     } else {
13658       resolve1 = (List_T) NULL;
13659       for (p = hitpairs; p != NULL; p = p->rest) {
13660 	hitpair = (Stage3pair_T) p->first;
13661 	if (hitpair->gene_overlap < best_overlap) {
13662 	  Stage3pair_free(&hitpair);
13663 	} else {
13664 	  resolve1 = Hitlist_push(resolve1,hitlistpool,(void *) hitpair);
13665 	}
13666       }
13667       Hitlist_free(&hitpairs);
13668     }
13669   }
13670 
13671   if (List_length(resolve1) <= 1) {
13672     return resolve1;
13673   }
13674 #else
13675   resolve1 = hitpairs;
13676 #endif
13677 
13678   if (tally_iit == NULL) {
13679     resolve2 = resolve1;
13680   } else {
13681     best_tally = 0L;
13682     for (p = resolve1; p != NULL; p = p->rest) {
13683       hitpair = (Stage3pair_T) p->first;
13684       if ((hitpair->tally = Stage3end_compute_tally(hitpair->hit5) + Stage3end_compute_tally(hitpair->hit3)) > best_tally) {
13685 	best_tally = hitpair->tally;
13686       }
13687     }
13688     if (best_tally == 0L) {
13689       resolve2 = resolve1;
13690     } else {
13691       resolve2 = (List_T) NULL;
13692 #ifdef USE_TALLY_RATIO
13693       tally_threshold = (double) best_tally / TALLY_RATIO;
13694 #else
13695       tally_threshold = 1.0;
13696 #endif
13697       for (p = resolve1; p != NULL; p = p->rest) {
13698 	hitpair = (Stage3pair_T) p->first;
13699 	if ((double) hitpair->tally < tally_threshold) {
13700 	  Stage3pair_free(&hitpair);
13701 	} else {
13702 	  resolve2 = Hitlist_push(resolve2,hitlistpool,(void *) hitpair);
13703 	}
13704       }
13705       Hitlist_free(&resolve1);
13706     }
13707   }
13708 
13709   if (List_length(resolve2) <= 1) {
13710     return resolve2;
13711   }
13712 
13713   if (runlength_iit == NULL) {
13714     resolve3 = resolve2;
13715   } else {
13716     runlengthp = false;
13717     for (p = resolve2; p != NULL; p = p->rest) {
13718       hitpair = (Stage3pair_T) p->first;
13719       if (Stage3end_runlength_p(hitpair->hit5) == true || Stage3end_runlength_p(hitpair->hit3) == true) {
13720 	runlengthp = true;
13721       }
13722     }
13723     if (runlengthp == false) {
13724       resolve3 = resolve2;
13725     } else {
13726       resolve3 = (List_T) NULL;
13727       for (p = resolve2; p != NULL; p = p->rest) {
13728 	hitpair = (Stage3pair_T) p->first;
13729 	if (Stage3end_runlength_p(hitpair->hit5) == false && Stage3end_runlength_p(hitpair->hit3) == false) {
13730 	  Stage3pair_free(&hitpair);
13731 	} else {
13732 	  resolve3 = Hitlist_push(resolve3,hitlistpool,(void *) hitpair);
13733 	}
13734       }
13735       Hitlist_free(&resolve2);
13736     }
13737   }
13738 
13739 
13740   return resolve3;
13741 }
13742 #endif
13743 
13744 
13745 #if 0
13746 /* Eliminates entire pair even if only one end is bad.  Should filter each end, and not each pair */
13747 List_T
13748 Stage3pair_filter (List_T hits, Hitlistpool_T hitlistpool,
13749 		   int max_mismatches_5, int max_mismatches_3,
13750 		   int min_coverage_5, int min_coverage_3) {
13751   List_T newhits = NULL, p;
13752   Stage3end_T hit5, hit3;
13753   Stage3pair_T hitpair;
13754 
13755   /* Previously had option filter_within_trims_p to look at ref_score_overall */
13756   for (p = hits; p != NULL; p = List_next(p)) {
13757     hitpair = (Stage3pair_T) List_head(p);
13758     hit5 = hitpair->hit5;
13759     hit3 = hitpair->hit3;
13760     debug(printf("refalt_score_within_trims is %d and %d\n",hit5->refalt_score_within_trims,hit3->refalt_score_within_trims));
13761 
13762     if (hit5->refalt_score_within_trims > max_mismatches_5 || hit3->refalt_score_within_trims > max_mismatches_3) {
13763       Stage3pair_free(&hitpair);
13764     } else if (hit5->querylength - hit5->trim_querystart - hit5->trim_queryend < min_coverage_5 &&
13765 	       hit3->querylength - hit3->trim_querystart - hit3->trim_queryend < min_coverage_3) {
13766       Stage3pair_free(&hitpair);
13767     } else {
13768       newhits = Hitlist_push(newhits,hitlistpool,(void *) hitpair);
13769     }
13770   }
13771 
13772   Hitlist_free(&hits);
13773   return newhits;
13774 }
13775 #endif
13776 
13777 
13778 Stage3pair_T *
Stage3pair_eval_and_sort(int npaths,int * first_absmq,int * second_absmq,Stage3pair_T * stage3pairarray,char * queryuc_ptr_5,char * queryuc_ptr_3,char * quality_string_5,char * quality_string_3)13779 Stage3pair_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
13780 			  Stage3pair_T *stage3pairarray,
13781 			  char *queryuc_ptr_5, char *queryuc_ptr_3,
13782 			  char *quality_string_5, char *quality_string_3) {
13783   float maxlik, loglik;
13784 
13785   float total, q;
13786   int mapq_score;
13787 
13788   int compute_npaths;
13789   int randomi, i;
13790   Stage3pair_T temp, hitpair;
13791 
13792   if (npaths == 0) {
13793     /* Skip */
13794     *first_absmq = 0;
13795     *second_absmq = 0;
13796 
13797   } else if (npaths == 1) {
13798     hitpair = stage3pairarray[0];
13799     hitpair->mapq_loglik = MAPQ_MAXIMUM_SCORE;
13800     hitpair->mapq_score = MAPQ_max_quality_score(quality_string_5,hitpair->hit5->querylength);
13801     if ((mapq_score = MAPQ_max_quality_score(quality_string_3,hitpair->hit3->querylength)) > stage3pairarray[0]->mapq_score) {
13802       hitpair->mapq_score = mapq_score;
13803     }
13804     hitpair->absmq_score = MAPQ_MAXIMUM_SCORE;
13805 
13806     Stage3end_display_prep(hitpair->hit5,queryuc_ptr_5,/*first_read_p*/true);
13807     Stage3end_display_prep(hitpair->hit3,queryuc_ptr_3,/*first_read_p*/false);
13808 
13809     *first_absmq = hitpair->absmq_score;
13810     *second_absmq = 0;
13811 
13812   } else {
13813 
13814     /* Resolve ambiguities, needed for computing mapq */
13815     for (i = 0; i < npaths; i++) {
13816       hitpair = stage3pairarray[i];
13817       Stage3end_display_prep(hitpair->hit5,queryuc_ptr_5,/*first_read_p*/true);
13818       Stage3end_display_prep(hitpair->hit3,queryuc_ptr_3,/*first_read_p*/false);
13819     }
13820 
13821 
13822     /* Compute mapq_loglik */
13823     for (i = 0; i < npaths; i++) {
13824       hitpair = stage3pairarray[i];
13825       hitpair->mapq_loglik =
13826 	Stage3end_compute_mapq(hitpair->hit5,quality_string_5);
13827       hitpair->mapq_loglik +=
13828 	Stage3end_compute_mapq(hitpair->hit3,quality_string_3);
13829     }
13830 
13831     /* Sort by nmatches, then mapq, and then insert length */
13832     qsort(stage3pairarray,npaths,sizeof(Stage3pair_T),Stage3pair_output_cmp);
13833 
13834     if (want_random_p) {
13835       /* Randomize among best alignments */
13836       i = 1;
13837       while (i < npaths && Stage3pair_output_cmp(&(stage3pairarray[i]),&(stage3pairarray[0])) == 0) {
13838 	i++;
13839       }
13840       if (i > 1) {		/* i is number of ties */
13841 	/* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
13842 	randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
13843 	/* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
13844 	temp = stage3pairarray[0];
13845 	stage3pairarray[0] = stage3pairarray[randomi];
13846 	stage3pairarray[randomi] = temp;
13847       }
13848     }
13849 
13850     /* Enforce monotonicity */
13851     for (i = npaths - 1; i > 0; i--) {
13852       if (stage3pairarray[i-1]->mapq_loglik < stage3pairarray[i]->mapq_loglik) {
13853 	stage3pairarray[i-1]->mapq_loglik = stage3pairarray[i]->mapq_loglik;
13854       }
13855     }
13856     maxlik = stage3pairarray[0]->mapq_loglik;
13857 
13858     /* Subtract maxlik to avoid underflow */
13859     for (i = 0; i < npaths; i++) {
13860       stage3pairarray[i]->mapq_loglik -= maxlik;
13861     }
13862 
13863 #if 0
13864     /* Save on computation if possible */
13865     /* Doesn't work */
13866     if (npaths < maxpaths) {
13867       compute_npaths = npaths;
13868     } else {
13869       compute_npaths = maxpaths;
13870     }
13871     if (compute_npaths < 2) {
13872       compute_npaths = 2;
13873     }
13874 #else
13875     compute_npaths = npaths;
13876 #endif
13877 
13878 
13879     /* Compute absolute mapq */
13880     for (i = 0; i < compute_npaths; i++) {
13881       loglik = stage3pairarray[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
13882       if (loglik < 0.0) {
13883 	loglik = 0.0;
13884       }
13885       stage3pairarray[i]->absmq_score = rint(loglik);
13886     }
13887     *first_absmq = stage3pairarray[0]->absmq_score;
13888     *second_absmq = stage3pairarray[1]->absmq_score;
13889 
13890 
13891     /* Compute Bayesian mapq */
13892     total = 0.0;
13893     for (i = 0; i < npaths; i++) {
13894       total += (stage3pairarray[i]->mapq_loglik = fasterexp(stage3pairarray[i]->mapq_loglik));
13895     }
13896 
13897     /* Obtain posterior probabilities of being true */
13898     for (i = 0; i < compute_npaths; i++) {
13899       stage3pairarray[i]->mapq_loglik /= total;
13900     }
13901 
13902     /* Convert to Phred scores */
13903     for (i = 0; i < compute_npaths; i++) {
13904       if ((q = 1.0 - stage3pairarray[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
13905 	stage3pairarray[i]->mapq_score = 96;
13906       } else {
13907 	stage3pairarray[i]->mapq_score = rint(-10.0 * log10(q));
13908       }
13909     }
13910 
13911 #if 0
13912     /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
13913     if (stage3pairarray[0]->mapq_score >= mapq_unique_score &&
13914 	stage3pairarray[1]->mapq_score < mapq_unique_score) {
13915       for (i = 1; i < *npaths; i++) {
13916 	Stage3pair_free(&(stage3pairarray[i]));
13917       }
13918       *npaths = 1;
13919     }
13920 #endif
13921   }
13922 
13923   return stage3pairarray;
13924 }
13925 
13926 
13927 static List_T
Stage3pair_optimal_score_prefinal(bool * eliminatedp,List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3)13928 Stage3pair_optimal_score_prefinal (bool *eliminatedp, List_T hitpairlist,
13929 				   Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
13930   List_T optimal = NULL, p, q;
13931   Stage3pair_T hitpair;
13932   T hit5, hit3;
13933   Substring_T substring;
13934   Junction_T junction;
13935   int cutoff_level_5, cutoff_level_3, ref_nmismatches;
13936   int n;
13937   int minscore5 = querylength5, minscore3 = querylength3, minscore = querylength5 + querylength3;
13938 #ifdef USE_OPTIMAL_SCORE_BINGO
13939   int minscore_bingo = querylength5 + querylength3;
13940 #endif
13941   int trim_querystart_5 = 0, trim_queryend_5 = 0, trim_querystart_3 = 0, trim_queryend_3 = 0,
13942     trim_querystart_0, trim_queryend_0;
13943 
13944 
13945 #if 0 /* DISTANT_SPLICE_SPECIAL */
13946   bool shortdistance_p = false;
13947 #endif
13948 
13949 
13950   *eliminatedp = false;
13951   n = List_length(hitpairlist);
13952   debug8(printf("\nEntered Stage3pair_optimal_score_prefinal with %d hitpairs\n",n));
13953 
13954   if (n <= 1) {
13955     return hitpairlist;
13956   }
13957 
13958 
13959   /* Use eventrim for comparing alignments.  Previously picked
13960      smallest trims, but now picking largest ones */
13961   for (p = hitpairlist; p != NULL; p = p->rest) {
13962     hitpair = (Stage3pair_T) p->first;
13963     hit5 = hitpair->hit5;
13964     hit3 = hitpair->hit3;
13965 
13966     debug8(printf("hit5 %u..%u method %s, nsegments %d, nindels %d, trim_querystart: %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d, sensedir %d, splice score %f\n",
13967 		  hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,Method_string(hit5->method),
13968 		  hit5->nsegments,hit5->nindels,hit5->trim_querystart,hit5->trim_querystart_splicep ? " (splice)" : "",
13969 		  hit5->trim_queryend,hit5->trim_queryend_splicep ? " (splice)" : "",
13970 		  start_amb_length(hit5),end_amb_length(hit5),hit5->sensedir,hit5->splice_score));
13971 
13972     debug8(printf("hit3 %u..%u method %s, nsegments %d, nindels %d, trim_querystart %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d, sensedir %d, splice score %f\n\n",
13973 		  hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset,Method_string(hit3->method),
13974 		  hit3->nsegments,hit3->nindels,hit3->trim_querystart,hit3->trim_querystart_splicep ? " (splice)" : "",
13975 		  hit3->trim_queryend,hit3->trim_queryend_splicep ? " (splice)" : "",
13976 		  start_amb_length(hit3),end_amb_length(hit3),hit3->sensedir,hit3->splice_score));
13977 
13978     if (hit5->trim_querystart_splicep == true) {
13979       /* Skip */
13980     } else if (hit5->trim_querystart > trim_querystart_5) {
13981       trim_querystart_5 = hit5->trim_querystart;
13982     }
13983     if (hit5->trim_queryend_splicep == true) {
13984       /* Skip */
13985     } else if (hit5->trim_queryend > trim_queryend_5) {
13986       trim_queryend_5 = hit5->trim_queryend;
13987     }
13988 
13989     if (hit3->trim_querystart_splicep == true) {
13990       /* Skip */
13991     } else if (hit3->trim_querystart > trim_querystart_3) {
13992       trim_querystart_3 = hit3->trim_querystart;
13993     }
13994     if (hit3->trim_queryend_splicep == true) {
13995       /* Skip */
13996     } else if (hit3->trim_queryend > trim_queryend_3) {
13997       trim_queryend_3 = hit3->trim_queryend;
13998     }
13999   }
14000 
14001   if (trim_querystart_5 == querylength5) {
14002     trim_querystart_5 = 0;
14003   }
14004   if (trim_queryend_5 == querylength5) {
14005     trim_queryend_5 = 0;
14006   }
14007   if (trim_querystart_3 == querylength3) {
14008     trim_querystart_3 = 0;
14009   }
14010   if (trim_queryend_3 == querylength3) {
14011     trim_queryend_3 = 0;
14012   }
14013 
14014   debug8(printf("overall 5': trim_querystart %d, trim_queryend %d\n",trim_querystart_5,trim_queryend_5));
14015   debug8(printf("overall 3': trim_querystart %d, trim_queryend %d\n",trim_querystart_3,trim_queryend_3));
14016 
14017 
14018   for (p = hitpairlist; p != NULL; p = p->rest) {
14019     hitpair = (Stage3pair_T) p->first;
14020     hit5 = hitpair->hit5;
14021     hit3 = hitpair->hit3;
14022 
14023 #ifdef CONSIDER_ENDS_IN_EVAL
14024     hit5->score_eventrim = hit5->trim_querystart / 8 + hit5->trim_queryend / 8;
14025 #else
14026     hit5->score_eventrim = 0;
14027 #endif
14028 
14029     debug8(printf("score 5' OTHER:"));
14030 
14031     if (trim_querystart_5 + trim_queryend_5 >= querylength5) {
14032       for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
14033 	substring = (Substring_T) List_head(q);
14034 	hit5->score_eventrim += Substring_nmismatches_bothdiff(substring);
14035       }
14036 
14037     } else {
14038       for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
14039 	substring = (Substring_T) List_head(q);
14040 	trim_querystart_0 = trim_querystart_5;
14041 	trim_queryend_0 = trim_queryend_5;
14042 	if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
14043 	  trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
14044 	}
14045 	if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
14046 	  trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
14047 	}
14048 	hit5->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
14049 	debug8(printf("  substring (%d..%d) %d.",trim_querystart_5,trim_queryend_5,
14050 		      Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
14051       }
14052     }
14053 
14054     for (q = hit5->junctions_1toN; q != NULL; q = List_next(q)) {
14055       junction = (Junction_T) List_head(q);
14056       if (Junction_nindels(junction) > 0) {
14057 	hit5->score_eventrim += indel_penalty_middle;
14058 	debug8(printf(" => add %d.",indel_penalty_middle));
14059       }
14060     }
14061 
14062 
14063 #if 0
14064     /* Accept a single indel */
14065 #ifdef SCORE_INDELS_EVENTRIM
14066     if (hit5->hittype == INSERTION || hit5->hittype == DELETION) {
14067       debug8(printf("  indel at %d",hit5->indel_pos));
14068       if (hit5->indel_pos > trim_querystart_5 && hit5->indel_pos < querylength5 - trim_queryend_5) {
14069 	hit5->score_eventrim += indel_penalty_middle;
14070 	debug8(printf(" => add %d.",indel_penalty_middle));
14071       }
14072     }
14073 #endif
14074 #endif
14075     debug8(printf("  RESULT: %d\n",hit5->score_eventrim));
14076 
14077     if (hitpair->hit5->score_eventrim < minscore5) {
14078       minscore5 = hitpair->hit5->score_eventrim;
14079     }
14080 
14081 
14082 #ifdef CONSIDER_ENDS_IN_EVAL
14083     hit3->score_eventrim = hit3->trim_querystart / 8 + hit3->trim_queryend / 8;
14084 #else
14085     hit3->score_eventrim = 0;
14086 #endif
14087 
14088     debug8(printf("score 3' OTHER:"));
14089 
14090     if (trim_querystart_3 + trim_queryend_3 >= querylength3) {
14091       for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
14092 	substring = (Substring_T) List_head(q);
14093 	hit3->score_eventrim += Substring_nmismatches_bothdiff(substring);
14094       }
14095 
14096     } else {
14097       for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
14098 	substring = (Substring_T) List_head(q);
14099 	trim_querystart_0 = trim_querystart_3;
14100 	trim_queryend_0 = trim_queryend_3;
14101 	if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
14102 	  trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
14103 	}
14104 	if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
14105 	  trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
14106 	}
14107 	hit3->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
14108 	debug8(printf("  substring (%d..%d) %d.",trim_querystart_3,trim_queryend_3,
14109 		      Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
14110       }
14111     }
14112 
14113     for (q = hit3->junctions_1toN; q != NULL; q = List_next(q)) {
14114       junction = (Junction_T) List_head(q);
14115       if (Junction_nindels(junction) > 0) {
14116 	hit3->score_eventrim += indel_penalty_middle;
14117 	debug8(printf(" => add %d.",indel_penalty_middle));
14118       }
14119     }
14120 
14121 #if 0
14122     /* Accept a single indel */
14123 #ifdef SCORE_INDELS_EVENTRIM
14124     if (hit3->hittype == INSERTION || hit3->hittype == DELETION) {
14125       debug8(printf("  indel at %d",hit3->indel_pos));
14126       if (hit3->indel_pos > trim_querystart_3 && hit3->indel_pos < querylength3 - trim_queryend_3) {
14127 	hit3->score_eventrim += indel_penalty_middle;
14128 	debug8(printf(" => add %d.",indel_penalty_middle));
14129       }
14130     }
14131 #endif
14132 #endif
14133     debug8(printf("  RESULT: %d\n",hit3->score_eventrim));
14134 
14135     if (hitpair->hit3->score_eventrim < minscore3) {
14136       minscore3 = hitpair->hit3->score_eventrim;
14137     }
14138 
14139 
14140     /* Compute for hitpair */
14141     debug8(printf("hitpair score_eventrim %d = %d + %d\n",
14142 		  hit5->score_eventrim + hit3->score_eventrim,
14143 		  hit5->score_eventrim,hit3->score_eventrim));
14144     hitpair->score_eventrim = hit5->score_eventrim + hit3->score_eventrim;
14145     if (hitpair->score_eventrim < minscore) {
14146       minscore = hitpair->score_eventrim;
14147     }
14148 
14149   }
14150   debug8(printf("MINSCORE: %d\n",minscore));
14151 
14152 
14153   /* Prefinal: Use score_eventrim */
14154   debug8(printf("Stage3pair_optimal_score_prefinal over %d pairs: minscore = %d and %d + subopt:%d\n",
14155 		n,minscore5,minscore3,subopt_levels));
14156 
14157   /* finalp == false.  Add suboptimal_mismatches to each end. */
14158   minscore5 += subopt_levels;
14159   minscore3 += subopt_levels;
14160   cutoff_level_5 = minscore5;
14161   cutoff_level_3 = minscore3;
14162 
14163   for (p = hitpairlist; p != NULL; p = p->rest) {
14164     hitpair = (Stage3pair_T) p->first;
14165 
14166     if (hitpair->hit5->score_eventrim > cutoff_level_5 + SCORE_EVENTRIM_SLOP && hitpair->hit3->score_eventrim > cutoff_level_3 + SCORE_EVENTRIM_SLOP) {
14167       debug8(printf("Prefinal: Eliminating hit pair %p at %u..%u|%u..%u with score_eventrim_5 %d > cutoff_level_5 %d and score_eventrim_3 %d > cutoff_level_3 %d, sensedirs %d and %d, splice scores %f and %f\n",
14168 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14169 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14170 		    hitpair->hit5->score_eventrim,cutoff_level_5,hitpair->hit3->score_eventrim,cutoff_level_3,
14171 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14172       Stage3pair_free(&hitpair);
14173       *eliminatedp = true;
14174 
14175     } else {
14176       debug8(printf("Prefinal: Keeping hit pair %p at %u..%u|%u..%u with score_eventrim_5 %d <= cutoff_level_5 %d or score_eventrim_3 %d <= cutoff_level_3 %d, sensedirs %d and %d, splice scores %f and %f\n",
14177 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14178 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14179 		    hitpair->hit5->score_eventrim,cutoff_level_5,hitpair->hit3->score_eventrim,cutoff_level_3,
14180 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14181       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14182     }
14183   }
14184   Hitlist_free(&hitpairlist);
14185 
14186 
14187 #if 0
14188   /* Filter on nsegments */
14189   if (finalp == true && optimal != NULL) {
14190     hitpairlist = optimal;
14191     optimal = (List_T) NULL;
14192 
14193     hitpair = (Stage3pair_T) hitpairlist->first;
14194     best_nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments;
14195     best_nsegments_5 = hitpair->hit5->nsegments;
14196     best_nsegments_3 = hitpair->hit3->nsegments;
14197 
14198     for (p = hitpairlist; p != NULL; p = p->rest) {
14199       hitpair = (Stage3pair_T) p->first;
14200       if (hitpair->hit5->nsegments + hitpair->hit3->nsegments < best_nsegments) {
14201 	best_nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments;
14202       }
14203       if (hitpair->hit5->nsegments < best_nsegments_5) {
14204 	best_nsegments_5 = hitpair->hit5->nsegments;
14205       }
14206       if (hitpair->hit3->nsegments < best_nsegments_3) {
14207 	best_nsegments_3 = hitpair->hit3->nsegments;
14208       }
14209     }
14210 
14211     for (p = hitpairlist; p != NULL; p = p->rest) {
14212       hitpair = (Stage3pair_T) p->first;
14213       if (hitpair->hit5->nsegments + hitpair->hit3->nsegments > best_nsegments + 2) {
14214 	debug8(printf("Eliminating hit pair %p with nsegments %d+%d, sensedirs %d and %d, splice scores %f and %f\n",
14215 		      hitpair,hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14216 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14217 	Stage3pair_free(&hitpair);
14218 	*eliminatedp = true;
14219       } else {
14220 	debug8(printf("Keeping hit pair %p with nsegments %d+%d, sensedirs %d and %d, splice scores %f and %f\n",
14221 		      hitpair,hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14222 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14223 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14224       }
14225     }
14226 
14227     Hitlist_free(&hitpairlist);
14228   }
14229 #endif
14230 
14231 
14232 #if 0
14233   /* Filter on pairlength */
14234   if (optimal != NULL) {
14235     hitpairlist = optimal;
14236     optimal = (List_T) NULL;
14237 
14238     hitpair = (Stage3pair_T) hitpairlist->first;
14239     best_absdifflength = hitpair->absdifflength;
14240     best_outerlength = hitpair->outerlength;
14241 
14242     for (p = hitpairlist; p != NULL; p = p->rest) {
14243       hitpair = (Stage3pair_T) p->first;
14244       if (hitpair->absdifflength < best_absdifflength) {
14245 	best_absdifflength = hitpair->absdifflength;
14246 	best_outerlength = hitpair->outerlength;
14247       } else if (hitpair->absdifflength > best_absdifflength) {
14248 	/* Skip */
14249       } else if (hitpair->outerlength < best_outerlength) {
14250 	best_outerlength = hitpair->outerlength;
14251       }
14252     }
14253 
14254     for (p = hitpairlist; p != NULL; p = p->rest) {
14255       hitpair = (Stage3pair_T) p->first;
14256       if (hitpair->absdifflength > best_absdifflength) {
14257 	debug8(printf("Eliminating hit pair %p with absdifflength %d\n",hitpair,hitpair->absdifflength));
14258 	Stage3pair_free(&hitpair);
14259 	*eliminatedp = true;
14260       } else if (hitpair->outerlength > best_outerlength + OUTERLENGTH_SLOP) {
14261 	debug8(printf("Eliminating hit pair %p with outerlength %u\n",hitpair,hitpair->outerlength));
14262 	Stage3pair_free(&hitpair);
14263 	*eliminatedp = true;
14264       } else {
14265 	debug8(printf("Keeping hit pair %p with absdifflength %d and outerlength %d\n",
14266 		      hitpair,hitpair->absdifflength,hitpair->outerlength));
14267 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14268       }
14269     }
14270 
14271     Hitlist_free(&hitpairlist);
14272   }
14273 #endif
14274 
14275   debug8(printf("Exiting Stage3pair_optimal_score_prefinal with %d hits\n",List_length(optimal)));
14276   return optimal;
14277 }
14278 
14279 
14280 /* Desired criteria: (A) within locus: (A.1) nsegments within locus,
14281    to get most complete alignment; (A.2) insertlength; and (A.3)
14282    splice_score, to get the correct sensedir.  (B) between loci:
14283    nmatches (and not nmatches_to_trims), to end alignments at the
14284    splice site */
14285 
14286 #if 0
14287 static List_T
14288 Stage3pair_optimal_score_final_old (bool *eliminatedp, List_T hitpairlist,
14289 				    Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
14290   List_T optimal = NULL, p;
14291   Stage3pair_T *hitpairs, hitpair;
14292   int n, i, j, k;
14293   int best_nsegments, nsegments;
14294   int best_insertlength_score, insertlength_score;
14295   int best_nmatches_to_trims, nmatches_to_trims;
14296   double max_splice_score, splice_score;
14297   int max_nmatches = 0, cutoff_level;
14298   /* int trim5_left, trim5_right, trim3_left, trim3_right, min_trim; */
14299   bool *eliminate, keptp;
14300 
14301   /* Relies on Path_solve_from_diagonals to maximize the number of segments at each locus */
14302 
14303   *eliminatedp = false;
14304   n = List_length(hitpairlist);
14305   debug8(printf("\nEntered Stage3pair_optimal_score_final with %d hitpairs\n",n));
14306 
14307   if (n <= 1) {
14308     return hitpairlist;
14309   }
14310 
14311 #ifdef DEBUG8
14312   for (p = hitpairlist; p != NULL; p = p->rest) {
14313     hitpair = (Stage3pair_T) p->first;
14314     printf("%p %p %u..%u|%u..%u methods %s and %s, nsegments %d+%d, nmatches %d+%d (%d+%d to trims), pairlength %u, outerlength %u, sensedirs %d and %d, splice scores %f and %f\n",
14315 	   hitpair->hit5,hitpair->hit3,
14316 	   hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14317 	   hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14318 	   Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
14319 	   hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14320 	   hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14321 	   hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14322 	   hitpair->insertlength,hitpair->outerlength,
14323 	   hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score);
14324   }
14325 #endif
14326 
14327 
14328   /* Prune based on refalt_nmatches_plus_spliced_trims (to get the splice ends) */
14329   max_nmatches = 0;
14330   for (p = hitpairlist; p != NULL; p = p->rest) {
14331     hitpair = (Stage3pair_T) p->first;
14332     if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims > max_nmatches) {
14333       max_nmatches = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims;
14334       assert(max_nmatches <= querylength5 + querylength3);
14335     }
14336   }
14337 
14338   /* May not want to be greedy on cutoff level here.  Might want to raise subopt_levels */
14339   cutoff_level = max_nmatches - subopt_levels;
14340   debug8(printf("(1) refalt cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));
14341 
14342   for (p = hitpairlist; p != NULL; p = p->rest) {
14343     hitpair = (Stage3pair_T) p->first;
14344 
14345     if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
14346       debug8(printf("Final (nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14347 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14348 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14349 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14350 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14351 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14352 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14353       Stage3pair_free(&hitpair);
14354       *eliminatedp = true;
14355 
14356     } else {
14357       debug8(printf("Final (nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14358 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14359 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14360 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14361 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14362 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14363 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14364 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14365       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14366     }
14367   }
14368   Hitlist_free(&hitpairlist);
14369   hitpairlist = optimal;
14370   optimal = (List_T) NULL;
14371 
14372 
14373   /* Prune based on ref_nmatches_plus_spliced_trims (to get the splice ends) */
14374   max_nmatches = 0;
14375   for (p = hitpairlist; p != NULL; p = p->rest) {
14376     hitpair = (Stage3pair_T) p->first;
14377     if (hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims > max_nmatches) {
14378       max_nmatches = hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims;
14379       assert(max_nmatches <= querylength5 + querylength3);
14380     }
14381   }
14382 
14383   /* May not want to be greedy on cutoff level here.  Might want to raise subopt_levels */
14384   cutoff_level = max_nmatches - subopt_levels;
14385   debug8(printf("(2) ref cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));
14386 
14387   for (p = hitpairlist; p != NULL; p = p->rest) {
14388     hitpair = (Stage3pair_T) p->first;
14389 
14390     if (hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
14391       debug8(printf("Final (nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14392 		    hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14393 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14394 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14395 		    hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14396 		    hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14397 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14398       Stage3pair_free(&hitpair);
14399       *eliminatedp = true;
14400 
14401     } else {
14402       debug8(printf("Final (nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14403 		    hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14404 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14405 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14406 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14407 		    hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14408 		    hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14409 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14410       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14411     }
14412   }
14413   Hitlist_free(&hitpairlist);
14414   hitpairlist = optimal;
14415   optimal = (List_T) NULL;
14416 
14417 
14418   /* Prune based on refalt_nmatches_to_trims */
14419   best_nmatches_to_trims = 0;
14420   for (p = hitpairlist; p != NULL; p = p->rest) {
14421     hitpair = (Stage3pair_T) p->first;
14422     if (hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims > best_nmatches_to_trims) {
14423       best_nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims;
14424       assert(best_nmatches_to_trims <= querylength5 + querylength3);
14425     }
14426   }
14427 
14428   cutoff_level = best_nmatches_to_trims - subopt_levels;
14429   debug8(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
14430 
14431   /* Do not allow slop for final */
14432   for (p = hitpairlist; p != NULL; p = p->rest) {
14433     hitpair = (Stage3pair_T) p->first;
14434 
14435     if (hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
14436       debug8(printf("Final (nmatches_to_trims %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14437 		    hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14438 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14439 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14440 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14441 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14442 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14443       Stage3pair_free(&hitpair);
14444       *eliminatedp = true;
14445 
14446     } else {
14447       debug8(printf("Final (nmatches %d (%d ref) to trims >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14448 		    hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims,
14449 		    hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14450 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14451 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14452 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14453 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14454 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14455 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14456       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14457     }
14458   }
14459   Hitlist_free(&hitpairlist);
14460   hitpairlist = optimal;
14461   optimal = (List_T) NULL;
14462 
14463 
14464   /* Eliminate within loci (1): refalt_nmatches_to_trims_only */
14465   keptp = false;
14466   hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
14467   eliminate = (bool *) CALLOC(n,sizeof(bool));
14468   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
14469   i = 0;
14470   while (i < n) {
14471     j = i+1;
14472     while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
14473       j++;
14474     }
14475     if (j - i > 1) {
14476       debug8(printf("Found a group from %d to %d\n",i,j));
14477       best_nmatches_to_trims = 0;
14478       for (k = i; k < j; k++) {
14479 	hitpair = hitpairs[k];
14480 	if ((nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims) > best_nmatches_to_trims) {
14481 	  best_nmatches_to_trims = nmatches_to_trims;
14482 	}
14483       }
14484       debug8(printf("best_nmatches_to_trims %d\n",best_nmatches_to_trims));
14485 
14486       for (k = i; k < j; k++) {
14487 	hitpair = hitpairs[k];
14488 	/* Do not allow slop for final */
14489 	if ((nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims) < best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/) {
14490 	  debug8(printf("Within loci pair (nmatches_to_trims %d < %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14491 			nmatches_to_trims,best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/,
14492 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14493 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14494 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14495 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14496 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14497 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14498 	  eliminate[k] = true;
14499 	} else {
14500 	  keptp = true;
14501 	}
14502       }
14503     }
14504 
14505     i = j;
14506   }
14507 
14508   if (keptp == false) {
14509     optimal = hitpairlist;
14510   } else {
14511     for (k = 0; k < n; k++) {
14512       hitpair = hitpairs[k];
14513       if (eliminate[k] == true) {
14514 	debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14515 		      hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14516 		      hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14517 		      hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14518 		      hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14519 		      hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14520 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14521 	Stage3pair_free(&hitpair);
14522 	*eliminatedp = true;
14523       } else {
14524 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14525       }
14526     }
14527     Hitlist_free(&hitpairlist);
14528   }
14529   FREE(hitpairs);
14530   FREE(eliminate);
14531   hitpairlist = optimal;
14532   optimal = (List_T) NULL;
14533 
14534 
14535   /* Eliminate within loci (2): nsegments and splice score */
14536   keptp = false;
14537   hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
14538   eliminate = (bool *) CALLOC(n,sizeof(bool));
14539   qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
14540   i = 0;
14541   while (i < n) {
14542     j = i+1;
14543     while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
14544       j++;
14545     }
14546     if (j - i > 1) {
14547       debug8(printf("Found a group from %d to %d\n",i,j));
14548       best_nsegments = 0;
14549       best_insertlength_score = 99;
14550       max_splice_score = 0.0;
14551       for (k = i; k < j; k++) {
14552 	hitpair = hitpairs[k];
14553 	if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
14554 	  best_nsegments = nsegments;
14555 	  best_insertlength_score = calc_insertlength_score(hitpair->insertlength);
14556 	  max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
14557 
14558 	} else if (nsegments == best_nsegments) {
14559 	  if ((insertlength_score = calc_insertlength_score(hitpair->insertlength)) < best_insertlength_score) {
14560 	    best_insertlength_score = insertlength_score;
14561 	    max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
14562 
14563 	  } else if (insertlength_score == best_insertlength_score) {
14564 	    if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
14565 	      max_splice_score = splice_score;
14566 	    }
14567 	  }
14568 	}
14569       }
14570       debug8(printf("best_nsegments %d, best_insertlength_score %d, max_splice_score %f\n",
14571 		    best_nsegments,best_insertlength_score,max_splice_score));
14572 
14573       for (k = i; k < j; k++) {
14574 	hitpair = hitpairs[k];
14575 	if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
14576 	  debug8(printf("Within loci pair (nsegments %d < %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14577 			nsegments,best_nsegments,
14578 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14579 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14580 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14581 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14582 			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14583 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14584 	  eliminate[k] = true;
14585 
14586 	} else if (calc_insertlength_score(hitpair->insertlength) > best_insertlength_score) {
14587 	  debug8(printf("Within loci pair (insertlength score %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14588 			calc_insertlength_score(hitpair->insertlength),best_insertlength_score,
14589 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14590 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14591 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14592 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14593 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14594 	  eliminate[k] = true;
14595 
14596 	} else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
14597 	  debug8(printf("Within loci pair (splice_score w/slop %f < %f): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14598 			hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
14599 			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14600 			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14601 			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14602 			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14603 			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14604 	  eliminate[k] = true;
14605 
14606 	} else {
14607 	  keptp = true;
14608 	}
14609       }
14610     }
14611 
14612     i = j;
14613   }
14614 
14615   if (keptp == false) {
14616     optimal = hitpairlist;
14617   } else {
14618     for (k = 0; k < n; k++) {
14619       hitpair = hitpairs[k];
14620       if (eliminate[k] == true) {
14621 	debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14622 		      hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14623 		      hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14624 		      hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14625 		      hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14626 		      hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14627 	Stage3pair_free(&hitpair);
14628 	*eliminatedp = true;
14629       } else {
14630 	optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14631       }
14632     }
14633     Hitlist_free(&hitpairlist);
14634   }
14635   FREE(hitpairs);
14636   FREE(eliminate);
14637   hitpairlist = optimal;
14638   /* optimal = (List_T) NULL; */
14639 
14640 
14641 #if 0
14642   /* Filter on trim amount */
14643   hitpairlist = optimal;
14644   optimal = (List_T) NULL;
14645   min_trim = querylength5 + querylength3;
14646   for (p = hitpairlist; p != NULL; p = p->rest) {
14647     hitpair = (Stage3pair_T) p->first;
14648 
14649     if (hitpair->hit5->trim_querystart_splicep == true) {
14650       /* Skip */
14651       trim5_left = 0;
14652     } else {
14653       trim5_left = hitpair->hit5->trim_querystart;
14654     }
14655     if (hitpair->hit5->trim_queryend_splicep == true) {
14656       /* Skip */
14657       trim5_right = 0;
14658     } else {
14659       trim5_right = hitpair->hit5->trim_queryend;
14660     }
14661 
14662     if (hitpair->hit3->trim_querystart_splicep == true) {
14663       /* Skip */
14664       trim3_left = 0;
14665     } else {
14666       trim3_left = hitpair->hit3->trim_querystart;
14667     }
14668     if (hitpair->hit3->trim_queryend_splicep == true) {
14669       /* Skip */
14670       trim3_right = 0;
14671     } else {
14672       trim3_right = hitpair->hit3->trim_queryend;
14673     }
14674 
14675     if (trim5_left + trim5_right + trim3_left + trim3_right < min_trim) {
14676       min_trim = trim5_left + trim5_right + trim3_left + trim3_right;
14677     }
14678   }
14679 
14680   for (p = hitpairlist; p != NULL; p = p->rest) {
14681     hitpair = (Stage3pair_T) p->first;
14682 
14683     if (hitpair->hit5->trim_querystart_splicep == true) {
14684       /* Skip */
14685       trim5_left = 0;
14686     } else {
14687       trim5_left = hitpair->hit5->trim_querystart;
14688     }
14689     if (hitpair->hit5->trim_queryend_splicep == true) {
14690       /* Skip */
14691       trim5_right = 0;
14692     } else {
14693       trim5_right = hitpair->hit5->trim_queryend;
14694     }
14695 
14696     if (hitpair->hit3->trim_querystart_splicep == true) {
14697       /* Skip */
14698       trim3_left = 0;
14699     } else {
14700       trim3_left = hitpair->hit3->trim_querystart;
14701     }
14702     if (hitpair->hit3->trim_queryend_splicep == true) {
14703       /* Skip */
14704       trim3_right = 0;
14705     } else {
14706       trim3_right = hitpair->hit3->trim_queryend;
14707     }
14708 
14709     if (trim5_left + trim5_right + trim3_left + trim3_right > min_trim) {
14710       debug8(printf("Final (trim): Eliminating hit pair %p at %u..%u|%u..%u for trim %d+%d+%d+%d\n",
14711 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14712 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14713 		    trim5_left,trim5_right,trim3_left,trim3_right));
14714       Stage3pair_free(&hitpair);
14715       *eliminatedp = true;
14716 
14717     } else {
14718       debug8(printf("Final (trim): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) for trim %d+%d+%d+%d\n",
14719 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14720 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14721 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14722 		    trim5_left,trim5_right,trim3_left,trim3_right));
14723       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14724     }
14725   }
14726   Hitlist_free(&hitpairlist);
14727   hitpairlist = optimal;
14728   optimal = (List_T) NULL;
14729 #endif
14730 
14731 
14732 #if 0
14733   /* Not good, especially for homologous chromosomes.  Use insert_length only within loci, not between */
14734   /* Then find smallest insert length and outerlength across loci */
14735   best_insertlength = (Chrpos_T) -1;
14736   for (p = hitpairlist; p != NULL; p = p->rest) {
14737     hitpair = (Stage3pair_T) p->first;
14738     if (hitpair->insertlength < best_insertlength) {
14739       best_insertlength = hitpair->insertlength;
14740     }
14741   }
14742 
14743   for (p = hitpairlist; p != NULL; p = p->rest) {
14744     hitpair = (Stage3pair_T) p->first;
14745 
14746     if (hitpair->insertlength > best_insertlength /*+ INSERTLENGTH_SLOP*/) {  /* No slop for final */
14747       debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14748 		    hitpair->insertlength,best_insertlength,
14749 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14750 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14751 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14752 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14753 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14754       Stage3pair_free(&hitpair);
14755       *eliminatedp = true;
14756 
14757     } else {
14758       debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14759 		    hitpair->insertlength,hitpair->outerlength,
14760 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14761 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14762 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14763 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14764 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14765 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14766       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14767     }
14768   }
14769   Hitlist_free(&hitpairlist);
14770   hitpairlist = optimal;
14771   optimal = (List_T) NULL;
14772 #endif
14773 
14774 #if 0
14775   /* Not good, especially for homologous chromosomes.  Use outerlength only within loci, not between */
14776   /* Finally find smallest outerlength across loci */
14777   best_outerlength = (Chrpos_T) -1;
14778   for (p = hitpairlist; p != NULL; p = p->rest) {
14779     hitpair = (Stage3pair_T) p->first;
14780     if (hitpair->outerlength < best_outerlength) {
14781       best_outerlength = hitpair->outerlength;
14782     }
14783   }
14784 
14785   for (p = hitpairlist; p != NULL; p = p->rest) {
14786     hitpair = (Stage3pair_T) p->first;
14787 
14788     if (hitpair->outerlength > best_outerlength /*+ OUTERLENGTH_SLOP*/) {
14789       debug8(printf("Final (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14790 		    hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
14791 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14792 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14793 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14794 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14795 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14796       Stage3pair_free(&hitpair);
14797       *eliminatedp = true;
14798 
14799     } else {
14800       debug8(printf("Final (outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14801 		    hitpair->outerlength,
14802 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14803 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14804 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14805 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14806 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14807 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14808       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14809     }
14810   }
14811   Hitlist_free(&hitpairlist);
14812   hitpairlist = optimal;
14813   /* optimal = (List_T) NULL; */
14814 #endif
14815 
14816   debug8(printf("Exiting Stage3pair_optimal_score_final with %d hits\n",List_length(hitpairlist)));
14817   return hitpairlist;
14818 }
14819 #endif
14820 
14821 
14822 static List_T
Stage3pair_optimal_score_final(bool * eliminatedp,List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3)14823 Stage3pair_optimal_score_final (bool *eliminatedp, List_T hitpairlist,
14824 				Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
14825   List_T optimal = NULL, p;
14826   Stage3pair_T hitpair;
14827   int n;
14828   int max_adj_nmatches, score;
14829   int best_nmatches_to_trims;
14830   int cutoff_level;
14831   /* int trim5_left, trim5_right, trim3_left, trim3_right, min_trim; */
14832 
14833   /* Relies on Path_solve_from_diagonals to maximize the number of segments at each locus */
14834 
14835   *eliminatedp = false;
14836   n = List_length(hitpairlist);
14837   debug8(printf("\nEntered Stage3pair_optimal_score_final with %d hitpairs\n",n));
14838 
14839   if (n <= 1) {
14840     return hitpairlist;
14841   }
14842 
14843 #ifdef DEBUG8
14844   for (p = hitpairlist; p != NULL; p = p->rest) {
14845     hitpair = (Stage3pair_T) p->first;
14846     printf("%p %p %u..%u|%u..%u methods %s and %s, nsegments %d+%d, nmatches %d+%d (%d+%d to trims), scores %d+%d, pairlength %u, outerlength %u, sensedirs %d and %d, splice scores %f and %f\n",
14847 	   hitpair->hit5,hitpair->hit3,
14848 	   hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14849 	   hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14850 	   Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
14851 	   hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14852 	   hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14853 	   hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14854 	   hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,
14855 	   hitpair->insertlength,hitpair->outerlength,
14856 	   hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score);
14857   }
14858   printf("\n");
14859 #endif
14860 
14861 
14862   /* (1) Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
14863   max_adj_nmatches = 0;
14864   for (p = hitpairlist; p != NULL; p = p->rest) {
14865     hitpair = (Stage3pair_T) p->first;
14866     if ((score = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14867 	 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall) > max_adj_nmatches) {
14868       max_adj_nmatches = score;
14869     }
14870   }
14871 
14872   /* May not want to be greedy on cutoff level here.  Might want to raise subopt_levels */
14873   cutoff_level = max_adj_nmatches - subopt_levels;
14874   debug8(printf("(1) refalt cutoff level %d = max_adj_nmatches %d - subopt_levels %d\n",
14875 		cutoff_level,max_adj_nmatches,subopt_levels));
14876 
14877   for (p = hitpairlist; p != NULL; p = p->rest) {
14878     hitpair = (Stage3pair_T) p->first;
14879 
14880     if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14881 	- hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall < cutoff_level /*- NMATCHES_SLOP*/) {
14882       debug8(printf("Final (adj nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14883 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14884 		    - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,cutoff_level /*- NMATCHES_SLOP*/,
14885 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14886 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14887 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14888 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14889 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14890       Stage3pair_free(&hitpair);
14891       *eliminatedp = true;
14892 
14893     } else {
14894       debug8(printf("Final (adj nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14895 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14896 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14897 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14898 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14899 		    hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14900 		    hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14901 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14902       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14903     }
14904   }
14905   Hitlist_free(&hitpairlist);
14906   hitpairlist = optimal;
14907   optimal = (List_T) NULL;
14908 
14909 
14910   /* (2) Prune based on ref_nmatches_to_trims */
14911   best_nmatches_to_trims = 0;
14912   for (p = hitpairlist; p != NULL; p = p->rest) {
14913     hitpair = (Stage3pair_T) p->first;
14914     if (hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims > best_nmatches_to_trims) {
14915       best_nmatches_to_trims = hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims;
14916       assert(best_nmatches_to_trims <= querylength5 + querylength3);
14917     }
14918   }
14919 
14920   cutoff_level = best_nmatches_to_trims - subopt_levels;
14921   debug8(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
14922 
14923   /* Do not allow slop for final */
14924   for (p = hitpairlist; p != NULL; p = p->rest) {
14925     hitpair = (Stage3pair_T) p->first;
14926 
14927     if (hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
14928       debug8(printf("Final (nmatches_to_trims %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14929 		    hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14930 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14931 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14932 		    hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14933 		    hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14934 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14935       Stage3pair_free(&hitpair);
14936       *eliminatedp = true;
14937 
14938     } else {
14939       debug8(printf("Final (nmatches_to_trims %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14940 		    hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14941 		    hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14942 		    hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14943 		    List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14944 		    hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14945 		    hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14946 		    hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14947       optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14948     }
14949   }
14950   Hitlist_free(&hitpairlist);
14951   hitpairlist = optimal;
14952   /* optimal = (List_T) NULL; */
14953 
14954   /* Shouldn't need to eliminate within loci, since that was done during prefinal */
14955 
14956   debug8(printf("Exiting Stage3pair_optimal_score_final with %d hits\n",List_length(hitpairlist)));
14957   return hitpairlist;
14958 }
14959 
14960 
14961 
14962 List_T
Stage3pair_optimal_score(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool finalp)14963 Stage3pair_optimal_score (List_T hitpairlist, Hitlistpool_T hitlistpool,
14964 			  int querylength5, int querylength3, bool finalp) {
14965   List_T optimal;
14966   bool eliminatedp;
14967 
14968   if (finalp == false) {
14969     optimal = Stage3pair_optimal_score_prefinal(&eliminatedp,hitpairlist,hitlistpool,
14970 						querylength5,querylength3);
14971     while (eliminatedp == true) {
14972       optimal = Stage3pair_optimal_score_prefinal(&eliminatedp,optimal,hitlistpool,
14973 						  querylength5,querylength3);
14974     }
14975 
14976   } else {
14977     optimal = Stage3pair_optimal_score_final(&eliminatedp,hitpairlist,hitlistpool,
14978 					     querylength5,querylength3);
14979     while (eliminatedp == true) {
14980       optimal = Stage3pair_optimal_score_final(&eliminatedp,optimal,hitlistpool,
14981 					       querylength5,querylength3);
14982     }
14983   }
14984 
14985   return optimal;
14986 }
14987 
14988 
#if 0
/* Called when computing GMAP alignment in stage1hr.c */
/* Returns true as soon as some pair in hitpairlist has the same
   sensedir_for_concordance on its 5' and 3' ends; returns false if no
   pair does (including when the list is empty).  Currently compiled
   out. */
bool
Stage3pair_sense_consistent_p (List_T hitpairlist) {
  Stage3pair_T pair;
  List_T cell = hitpairlist;

  while (cell != NULL) {
    pair = (Stage3pair_T) List_head(cell);
    if (pair->hit5->sensedir_for_concordance == pair->hit3->sensedir_for_concordance) {
      return true;
    }
    cell = List_next(cell);
  }

  return false;
}
#endif
15008 
15009 
15010 /* Want to unalias plus and alias minus */
15011 List_T
Stage3end_linearize_5(List_T hitlist)15012 Stage3end_linearize_5 (List_T hitlist) {
15013   T hit;
15014   List_T p;
15015 #ifdef DEBUG12
15016   Chrpos_T chrlength;
15017 #endif
15018 
15019   for (p = hitlist; p != NULL; p = List_next(p)) {
15020     hit = (T) List_head(p);
15021     debug12(chrlength = hit->chrlength);
15022     debug12(printf("Looking at 5' end %u..%u against chrlength %u\n",
15023 		   hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
15024 
15025     if (hit->circularalias == 0) {
15026       /* Skip */
15027 
15028     } else if (hit->circularalias == +1) {
15029       if (hit->plusp == true) {
15030 	unalias_circular(hit);
15031       }
15032 
15033     } else if (hit->circularalias == -1) {
15034       if (hit->plusp == false) {
15035 	alias_circular(hit);
15036       }
15037     }
15038   }
15039 
15040   return hitlist;
15041 }
15042 
15043 
15044 /* Want to alias plus and unalias minus */
15045 List_T
Stage3end_linearize_3(List_T hitlist)15046 Stage3end_linearize_3 (List_T hitlist) {
15047   T hit;
15048   List_T p;
15049 #ifdef DEBUG12
15050   Chrpos_T chrlength;
15051 #endif
15052 
15053   for (p = hitlist; p != NULL; p = List_next(p)) {
15054     hit = (T) List_head(p);
15055     debug12(chrlength = hit->chrlength);
15056     debug12(printf("Looking at 3' end %u..%u against chrlength %u\n",
15057 		   hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
15058 
15059     if (hit->circularalias == 0) {
15060       /* Skip */
15061 
15062     } else if (hit->circularalias == -1) {
15063       if (hit->plusp == true) {
15064 	alias_circular(hit);
15065       }
15066 
15067     } else if (hit->circularalias == +1) {
15068       if (hit->plusp == false) {
15069 	unalias_circular(hit);
15070       }
15071     }
15072   }
15073 
15074   return hitlist;
15075 }
15076 
15077 
15078 
15079 List_T
Stage3pair_remove_circular_alias(List_T hitpairlist,Hitlistpool_T hitlistpool)15080 Stage3pair_remove_circular_alias (List_T hitpairlist, Hitlistpool_T hitlistpool) {
15081   List_T newlist = NULL, p;
15082   Stage3pair_T hitpair;
15083 
15084   debug12(printf("Stage3pair_remove_circular_alias called with %d hitpairs\n",
15085 		 List_length(hitpairlist)));
15086   for (p = hitpairlist; p != NULL; p = p->rest) {
15087     hitpair = (Stage3pair_T) p->first;
15088 
15089 #if 0
15090     /* Not sure if this is necessary */
15091     if (hitpair->hit5->circularalias == +1 && hitpair->hit3->circularalias == +1) {
15092       /* First, try to salvage alias +1 */
15093       unalias_circular(hitpair->hit5);
15094       unalias_circular(hitpair->hit3);
15095     }
15096 #endif
15097 
15098 #if 0
15099     if (hitpair->hit5->plusp == true) {
15100       trim = hitpair->hit5->trim_querystart;
15101     } else {
15102       trim = hitpair->hit3->trim_queryend;
15103     }
15104 #endif
15105 
15106     if (hitpair->low >= hitpair->hit5->chroffset + hitpair->hit5->chrlength) {
15107       /* Both ends in circular alias */
15108       debug12(printf("Both ends in circular alias\n"));
15109       Stage3pair_free(&hitpair);
15110 
15111     } else {
15112       newlist = Hitlist_push(newlist,hitlistpool,(void *) hitpair);
15113     }
15114   }
15115 
15116   Hitlist_free(&hitpairlist);
15117   return newlist;
15118 }
15119 
15120 
15121