1 static char rcsid[] = "$Id: stage3hr.c 223081 2020-09-13 14:21:03Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "stage3hr.h"
7 #include "stage3hrdef.h"
8
9 #include <stdlib.h> /* For qsort */
10 #include <string.h>
11 #include <strings.h>
12 #include <ctype.h> /* For islower */
13 #include <math.h> /* For exp() and log10() */
14 #include "assert.h"
15 #include "mem.h"
16 #include "univcoord.h"
17
18 #include "chrnum.h"
19 #include "complement.h"
20 #include "interval.h"
21 #include "univdiag.h"
22 #include "univdiagdef.h"
23 #include "substring.h"
24 #include "junction.h"
25 #include "genome128_hr.h"
26 #include "mapq.h"
27 #include "cigar.h"
28 #include "comp.h" /* For Stage3end_run_gmap */
29 #include "maxent_hr.h"
30 #include "fastlog.h"
31 #include "transcript.h"
32 #include "kmer-search.h"
33
34
35
36 /* Scores for alts_status_inside */
37 #define ALTS_RESOLVED_BYLENGTH 0
38 #define ALTS_NOT_AMBIGUOUS 1
39
40
41 /* Eliminates distant splices if short splices are found */
42 /* #define DISTANT_SPLICE_SPECIAL 1 */
43
44 #define CONCORDANT_TEXT "concordant"
45 #define PAIRED_TEXT "paired"
46 #define UNPAIRED_TEXT "unpaired"
47
48 #ifdef USE_TALLY_RATIO
49 #define TALLY_RATIO 2.0
50 #endif
51
52 /* #define SUBSUMPTION_SLOP 10 */ /* Should allow for short insert lengths */
53 #define ADJ_NMATCHES_SLOP 2 /* Corresponds to one mismatch, sacrificing for a better splice score */
54 #define NMATCHES_SLOP 6
55 /* #define NMATCHES_TO_TRIMS_SLOP 9 */ /* Looser to allow for different splice options */
56 #define INSERTLENGTH_SLOP 100
57 #define OUTERLENGTH_SLOP 1000
58 #define SPLICE_SCORE_SLOP 0.03
59
60 /* #define MIN_ALIGNMENT_LEN 20 -- Now taken care of by min-coverage */
61
62 #define SPLICED_END_PENALTY 1 /* For long spliced ends. Add to score 1 point per each 8 bp */
63 #define NONSPLICED_END_RESTORE 6 /* For long spliced ends. Reduce score by 6 points per each 8 bp */
64 #define END_BINSIZE 8
65
66 #define SCORE_EVENTRIM_SLOP 2
67 #define SCORE_INDELS_EVENTRIM 1 /* Needed to compare genomic positions with and without indels */
68 #define EVENTRIM_BADINTRON_PENALTY 2
69 #define DO_FINAL 1
70
71
72 #ifdef CHECK_ASSERTIONS
73 #define CHECK_NMISMATCHES 1
74 #endif
75
76
77 #if 0
78 /* This is a bad idea. Better to use nconcordant as a guide to stopping */
79 #define MAX_HITS 100 /* For evaluating concordance */
80 #endif
81
82 /* #define USE_ALLOCA_FOR_HITS 1 -- can lead to stack overflow */
83
84
85 /* Stage3end_new */
86 #ifdef DEBUG0
87 #define debug0(x) x
88 #else
89 #define debug0(x)
90 #endif
91
92 /* Stage3end_filter */
93 #ifdef DEBUG1
94 #define debug1(x) x
95 #else
96 #define debug1(x)
97 #endif
98
99
100 /* transcript-guided alignment */
101 /* May want to turn on debug2 in transcript.c */
102 #ifdef DEBUG2
103 #define debug2(x) x
104 #else
105 #define debug2(x)
106 #endif
107
108 /* Stage3end_T comparisons. Need to modify calls from path-solve.c */
109 #ifdef DEBUG4
110 #define debug4(x) x
111 #else
112 #define debug4(x)
113 #endif
114
115 /* Stage3end_nmatches_substrings */
116 #ifdef DEBUG7
117 #define debug7(x) x
118 #else
119 #define debug7(x)
120 #endif
121
122
123 /* Stage3pair_T comparisons */
124 #ifdef DEBUG8
125 #define debug8(x) x
126 #else
127 #define debug8(x)
128 #endif
129
130
131 /* Resolving insides */
132 #ifdef DEBUG9
133 #define debug9(x) x
134 #else
135 #define debug9(x)
136 #endif
137
138 /* insert length calculation */
139 #ifdef DEBUG10
140 #define debug10(x) x
141 #else
142 #define debug10(x)
143 #endif
144
145 /* circular chromosomes */
146 #ifdef DEBUG12
147 #define debug12(x) x
148 #else
149 #define debug12(x)
150 #endif
151
152 /* substring_gmap */
153 #ifdef DEBUG13
154 #define debug13(x) x
155 #else
156 #define debug13(x)
157 #endif
158
159 /* Stage3_determine_pairtype */
160 #ifdef DEBUG14
161 #define debug14(x) x
162 #else
163 #define debug14(x)
164 #endif
165
166 /* Stage3pair_overlap */
167 #ifdef DEBUG15
168 #define debug15(x) x
169 #else
170 #define debug15(x)
171 #endif
172
173
174
#define MAPQ_MAXIMUM_SCORE 40

/* File-scope settings.  Unless noted otherwise, each variable below is
   assigned in Stage3hr_setup from the argument of the same name with
   an "_in" suffix. */

static bool omit_concordant_uniq_p = false;
static bool omit_concordant_mult_p = false;
static bool filter_within_trims_p = false;

/* static int kmer_search_sizelimit = 100; */

static int subopt_levels;

static bool want_random_p;
static bool transcriptomep;
static bool invert_first_p;
static bool invert_second_p;
static Genome_T genomecomp;
static Genome_T genomebits;
static Genome_T genomebits_alt;

static Univ_IIT_T chromosome_iit;
static int nchromosomes;
static int circular_typeint;

static Genome_T transcriptomebits;
static Transcriptome_T transcriptome;
static Univ_IIT_T transcript_iit;
/* Not assigned by Stage3hr_setup; keeps its initializer unless changed elsewhere */
static bool remap_transcriptome_p = false;

static IIT_T tally_iit;
static int *tally_divint_crosstable;
static IIT_T runlength_iit;
static int *runlength_divint_crosstable;

static Chrpos_T pairmax_linear;
static Chrpos_T pairmax_circular;

static Chrpos_T expected_pairlength;
static Chrpos_T pairlength_deviation;

/* Derived in Stage3hr_setup: expected_pairlength minus pairlength_deviation
   (0 if the deviation exceeds the expected length), plus one deviation,
   and plus ten deviations, respectively */
static Chrpos_T expected_pairlength_low;
static Chrpos_T expected_pairlength_high;
static Chrpos_T expected_pairlength_very_high;

static int localsplicing_penalty;
static int indel_penalty_middle;
static int antistranded_penalty;
static bool favor_multiexon_p;

static int ambig_end_interval;	/* For penalizing large ambiguous ends
				   in GMAP alignments, since such ends
				   should have been found.  Set to 8
				   by Stage3hr_setup. */

static Univcoord_T genomelength;
static bool *circularp;
static bool *altlocp;
static Univcoord_T *alias_starts;
static Univcoord_T *alias_ends;

static char *failedinput_root;
static Outputtype_T output_type;
static bool merge_samechr_p;
static bool method_print_p = false;


/* Probably not good to use in certain genomic regions, unless we also
   use known splicesites with distance information. */
/* But sometimes need to use to get correct mapping */
/* Stage3hr_setup sets this to false when distances_observed_p is true,
   and to true otherwise */
static bool favor_ambiguous_p;
242
243
244 void
Stage3hr_setup(bool transcriptomep_in,bool invert_first_p_in,bool invert_second_p_in,Genome_T genomecomp_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Univ_IIT_T chromosome_iit_in,Univcoord_T genomelength_in,int nchromosomes_in,int circular_typeint_in,Genome_T transcriptomebits_in,Transcriptome_T transcriptome_in,Univ_IIT_T transcript_iit_in,IIT_T tally_iit_in,int * tally_divint_crosstable_in,IIT_T runlength_iit_in,int * runlength_divint_crosstable_in,bool distances_observed_p,Chrpos_T pairmax_linear_in,Chrpos_T pairmax_circular_in,Chrpos_T expected_pairlength_in,Chrpos_T pairlength_deviation_in,int localsplicing_penalty_in,int indel_penalty_middle_in,int antistranded_penalty_in,bool favor_multiexon_p_in,int subopt_levels_in,bool * circularp_in,bool * altlocp_in,Univcoord_T * alias_starts_in,Univcoord_T * alias_ends_in,bool filter_within_trims_p_in,bool omit_concordant_uniq_p_in,bool omit_concordant_mult_p_in,char * failedinput_root_in,Outputtype_T output_type_in,bool merge_samechr_p_in,bool method_print_p_in,bool want_random_p_in)245 Stage3hr_setup (bool transcriptomep_in, bool invert_first_p_in, bool invert_second_p_in,
246 Genome_T genomecomp_in, Genome_T genomebits_in, Genome_T genomebits_alt_in,
247 Univ_IIT_T chromosome_iit_in, Univcoord_T genomelength_in, int nchromosomes_in, int circular_typeint_in,
248
249 Genome_T transcriptomebits_in, Transcriptome_T transcriptome_in, Univ_IIT_T transcript_iit_in,
250
251 IIT_T tally_iit_in, int *tally_divint_crosstable_in,
252 IIT_T runlength_iit_in, int *runlength_divint_crosstable_in,
253 bool distances_observed_p,
254 Chrpos_T pairmax_linear_in, Chrpos_T pairmax_circular_in,
255 Chrpos_T expected_pairlength_in, Chrpos_T pairlength_deviation_in,
256 int localsplicing_penalty_in, int indel_penalty_middle_in,
257 int antistranded_penalty_in, bool favor_multiexon_p_in, int subopt_levels_in,
258 bool *circularp_in, bool *altlocp_in,
259 Univcoord_T *alias_starts_in, Univcoord_T *alias_ends_in,
260 bool filter_within_trims_p_in, bool omit_concordant_uniq_p_in, bool omit_concordant_mult_p_in,
261 char *failedinput_root_in, Outputtype_T output_type_in, bool merge_samechr_p_in,
262 bool method_print_p_in, bool want_random_p_in) {
263
264 transcriptomep = transcriptomep_in;
265 invert_first_p = invert_first_p_in;
266 invert_second_p = invert_second_p_in;
267 genomecomp = genomecomp_in;
268 genomebits = genomebits_in;
269 genomebits_alt = genomebits_alt_in;
270
271 chromosome_iit = chromosome_iit_in;
272 nchromosomes = nchromosomes_in;
273 circular_typeint = circular_typeint_in;
274
275 transcriptomebits = transcriptomebits_in;
276 transcriptome = transcriptome_in;
277 transcript_iit = transcript_iit_in;
278
279 tally_iit = tally_iit_in;
280 tally_divint_crosstable = tally_divint_crosstable_in;
281 runlength_iit = runlength_iit_in;
282 runlength_divint_crosstable = runlength_divint_crosstable_in;
283 localsplicing_penalty = localsplicing_penalty_in;
284 indel_penalty_middle = indel_penalty_middle_in;
285 antistranded_penalty = antistranded_penalty_in;
286 favor_multiexon_p = favor_multiexon_p_in;
287
288 pairmax_linear = pairmax_linear_in;
289 pairmax_circular = pairmax_circular_in;
290 expected_pairlength = expected_pairlength_in;
291 pairlength_deviation = pairlength_deviation_in;
292
293 if (pairlength_deviation > expected_pairlength) {
294 expected_pairlength_low = 0;
295 } else {
296 expected_pairlength_low = expected_pairlength - pairlength_deviation;
297 }
298 expected_pairlength_high = expected_pairlength + pairlength_deviation;
299 expected_pairlength_very_high = expected_pairlength + 10*pairlength_deviation;
300
301 if (distances_observed_p == true) {
302 favor_ambiguous_p = false;
303 } else {
304 favor_ambiguous_p = true;
305 }
306
307 #if 0
308 ambig_end_interval = index1part + (index1interval - 1);
309 #else
310 ambig_end_interval = 8; /* Since GMAP uses 8-mers */
311 #endif
312
313 subopt_levels = subopt_levels_in;
314
315 genomelength = genomelength_in;
316 circularp = circularp_in;
317 altlocp = altlocp_in;
318 alias_starts = alias_starts_in;
319 alias_ends = alias_ends_in;
320
321 failedinput_root = failedinput_root_in;
322
323 filter_within_trims_p = filter_within_trims_p_in;
324 omit_concordant_uniq_p = omit_concordant_uniq_p_in;
325 omit_concordant_mult_p = omit_concordant_mult_p_in;
326
327 output_type = output_type_in;
328 merge_samechr_p = merge_samechr_p_in;
329 method_print_p = method_print_p_in;
330 want_random_p = want_random_p_in;
331
332 return;
333 }
334
335
336
337 #define T Stage3end_T
338
339 Hittype_T
Stage3end_hittype(T this)340 Stage3end_hittype (T this) {
341 return this->hittype;
342 }
343
344 static char *
hittype_string(Hittype_T hittype)345 hittype_string (Hittype_T hittype) {
346 switch (hittype) {
347 case EXACT: return "exact";
348 case SUB: return "sub";
349 case HALFSPLICE_DONOR: return "donor";
350 case HALFSPLICE_ACCEPTOR: return "acceptor";
351 case SPLICE: return "splice";
352 case SAMECHR_SPLICE: return "samechr_splice";
353 case TRANSLOC_SPLICE: return "transloc_splice";
354 case SUBSTRINGS: return "substrings";
355 default: abort();
356 }
357 }
358
359 char *
Stage3end_hittype_string(T this)360 Stage3end_hittype_string (T this) {
361 return hittype_string(this->hittype);
362 }
363
364 Method_T
Stage3end_method(T this)365 Stage3end_method (T this) {
366 return this->method;
367 }
368
369
370 int
Stage3end_genestrand(T this)371 Stage3end_genestrand (T this) {
372 return this->genestrand;
373 }
374
375 bool
Stage3end_transcriptomep(T this)376 Stage3end_transcriptomep (T this) {
377 if (this == NULL) {
378 /* Can happen if we call upon a mate in a halfmapping */
379 return false;
380 } else if (this->method == TR) {
381 return true;
382 } else {
383 return false;
384 }
385 }
386
387 List_T
Stage3end_transcripts(T this)388 Stage3end_transcripts (T this) {
389 return this->transcripts;
390 }
391
392 void
Stage3end_set_transcripts(T this,List_T transcripts)393 Stage3end_set_transcripts (T this, List_T transcripts) {
394 List_free(&this->transcripts);
395 this->transcripts = transcripts;
396 return;
397 }
398
399 List_T
Stage3end_transcripts_other(T this)400 Stage3end_transcripts_other (T this) {
401 return this->transcripts_other;
402 }
403
404
405 #if 0
406 void
407 Stage3end_transfer_transcripts (T dest, List_T sources) {
408 List_T p, q;
409 T source;
410 Transcript_T transcript;
411
412 for (p = sources; p != NULL; p = List_next(p)) {
413 source = (T) List_head(p);
414 debug2(printf("Transferring %d transcripts from %s to %s\n",
415 List_length(source->transcripts),hittype_string(source->hittype),hittype_string(dest->hittype)));
416 for (q = source->transcripts; q != NULL; q = List_next(q)) {
417 transcript = (Transcript_T) List_head(q);
418 if (Transcript_in_list_p(transcript,dest->transcripts) == true) {
419 Transcript_free(&transcript);
420 } else {
421 printf("Pushing onto transcripts %p,",dest->transcripts);
422 dest->transcripts = List_push(dest->transcripts,(void *) transcript);
423 printf(" now %p\n",dest->transcripts);
424 }
425 }
426 List_free(&source->transcripts);
427 debug2(Transcript_print_nums(dest->transcripts));
428 debug2(printf("\n"));
429
430 Stage3end_free(&source);
431 }
432
433 return;
434 }
435 #endif
436
437 #if 0
438 static void
439 Stage3end_transfer_transcripts_other (T dest, List_T sources) {
440 List_T p, q;
441 T source;
442 Transcript_T transcript;
443
444 for (p = sources; p != NULL; p = List_next(p)) {
445 source = (T) List_head(p);
446 for (q = source->transcripts; q != NULL; q = List_next(q)) {
447 transcript = (Transcript_T) List_head(q);
448 if (Transcript_in_list_p(transcript,dest->transcripts_other) == true) {
449 Transcript_free(&transcript);
450 } else {
451 printf("Pushing onto transcripts %p,",dest->transcripts);
452 dest->transcripts_other = List_push(dest->transcripts_other,(void *) transcript);
453 printf(" now %p\n",dest->transcripts);
454 }
455 }
456 List_free(&source->transcripts);
457 Stage3end_free(&source);
458 }
459
460 return;
461 }
462 #endif
463
464
465 static void
Stage3end_transfer_transcripts_one(T dest,T source)466 Stage3end_transfer_transcripts_one (T dest, T source) {
467 List_T q;
468 Transcript_T transcript;
469
470 #ifdef DEBUG2
471 printf("Transferring %d transcripts from %s to %s\n",
472 List_length(source->transcripts),hittype_string(source->hittype),hittype_string(dest->hittype));
473
474 printf("Before:\n");
475 printf("Dest: "); Transcript_print_nums(dest->transcripts); printf("\n");
476 printf("Source: "); Transcript_print_nums(source->transcripts); printf("\n");
477 #endif
478
479 for (q = source->transcripts; q != NULL; q = List_next(q)) {
480 transcript = (Transcript_T) List_head(q);
481 if (Transcript_in_list_p(transcript,dest->transcripts) == true) {
482 Transcript_free(&transcript);
483 } else {
484 dest->transcripts = List_push(dest->transcripts,(void *) transcript);
485 }
486 }
487 List_free(&source->transcripts);
488
489 for (q = source->transcripts_other; q != NULL; q = List_next(q)) {
490 transcript = (Transcript_T) List_head(q);
491 if (Transcript_in_list_p(transcript,dest->transcripts_other) == true) {
492 Transcript_free(&transcript);
493 } else {
494 dest->transcripts_other = List_push(dest->transcripts_other,(void *) transcript);
495 }
496 }
497 List_free(&source->transcripts_other);
498
499 #ifdef DEBUG2
500 printf("After:\n");
501 printf("Dest: "); Transcript_print_nums(dest->transcripts); printf("\n");
502 /* Source lists will be empty */
503 #endif
504
505 return;
506 }
507
508 static void
Stage3pair_transfer_transcripts_one(Stage3pair_T dest,Stage3pair_T source)509 Stage3pair_transfer_transcripts_one (Stage3pair_T dest, Stage3pair_T source) {
510
511 Stage3end_transfer_transcripts_one(dest->hit5,source->hit5);
512 Stage3end_transfer_transcripts_one(dest->hit3,source->hit3);
513
514 return;
515 }
516
517
518 bool
Stage3end_distant_splice_p(T this)519 Stage3end_distant_splice_p (T this) {
520 if (this->distant_splice_p == true) {
521 return true;
522 } else {
523 return false;
524 }
525 }
526
527
528 Chrnum_T
Stage3end_chrnum(T this)529 Stage3end_chrnum (T this) {
530 if (this == NULL) {
531 /* Can happen if we call upon a mate in a halfmapping */
532 return 0;
533 } else {
534 return this->chrnum;
535 }
536 }
537
538 Chrnum_T
Stage3end_effective_chrnum(T this)539 Stage3end_effective_chrnum (T this) {
540 if (this == NULL) {
541 /* Can happen if we call upon a mate in a halfmapping */
542 return 0;
543 } else {
544 return this->effective_chrnum;
545 }
546 }
547
548 Chrnum_T
Stage3end_other_chrnum(T this)549 Stage3end_other_chrnum (T this) {
550 if (this == NULL) {
551 /* Can happen if we call upon a mate in a halfmapping */
552 return 0;
553 } else {
554 return this->other_chrnum;
555 }
556 }
557
558 Univcoord_T
Stage3end_chroffset(T this)559 Stage3end_chroffset (T this) {
560 return this->chroffset;
561 }
562
563 Univcoord_T
Stage3end_chrhigh(T this)564 Stage3end_chrhigh (T this) {
565 return this->chrhigh;
566 }
567
568 Chrpos_T
Stage3end_chrlength(T this)569 Stage3end_chrlength (T this) {
570 if (this == NULL) {
571 /* Can happen if we call upon a mate in a halfmapping */
572 return 0;
573 } else {
574 return this->chrlength;
575 }
576 }
577
578 Chrpos_T
Stage3end_chrpos_low(T this)579 Stage3end_chrpos_low (T this) {
580 return this->low - this->chroffset;
581 }
582
583 Chrpos_T
Stage3end_chrpos_high(T this)584 Stage3end_chrpos_high (T this) {
585 return this->high - this->chroffset;
586 }
587
588
589 Univcoord_T
Stage3end_genomicstart(T this)590 Stage3end_genomicstart (T this) {
591 return this->genomicstart;
592 }
593
594 Univcoord_T
Stage3end_genomicend(T this)595 Stage3end_genomicend (T this) {
596 return this->genomicend;
597 }
598
599 #if 0
600 /* For Goby */
601 int
602 Stage3end_query_alignment_length (T this) {
603 int length = 0;
604 List_T p;
605 Substring_T substring;
606 Junction_T junction;
607
608 for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
609 substring = (Substring_T) List_head(p);
610 length += Substring_match_length(substring);
611 }
612 for (p = this->junctions_LtoH; p != NULL; p = List_next(p)) {
613 junction = (Junction_T) List_head(p);
614 if (Junction_type(junction) == INS_JUNCTION) {
615 length += Junction_nindels(junction);
616 }
617 }
618
619 return length;
620 }
621 #endif
622
623
624 #if 0
625 Chrpos_T
626 Stage3end_genomic_alignment_length (T this) {
627 Chrpos_T length = 0;
628 List_T p;
629 Substring_T substring;
630 Junction_T junction;
631
632 for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
633 substring = (Substring_T) List_head(p);
634 length += Substring_genomic_alignment_length(substring);
635 }
636 for (p = this->junctions_LtoH; p != NULL; p = List_next(p)) {
637 junction = (Junction_T) List_head(p);
638 if (Junction_type(junction) == DEL_JUNCTION) {
639 length += (Chrpos_T) Junction_nindels(junction);
640 }
641 }
642
643 return length;
644 }
645 #endif
646
647
648 #if 0
649 static Substring_T
650 find_substring_low (T this) {
651 Substring_T substring_low;
652 List_T substrings_LtoH, p;
653
654 if (this->plusp == true) {
655 substrings_LtoH = this->substrings_1toN;
656 } else {
657 substrings_LtoH = this->substrings_Nto1;
658 }
659
660 p = substrings_LtoH;
661 substring_low = (Substring_T) List_head(p);
662 if (Substring_has_alts_p(substring_low) == true) {
663 p = List_next(p);
664 substring_low = (Substring_T) List_head(p);
665 }
666
667 return substring_low;
668 }
669 #endif
670
671
672 #if 0
673 static Substring_T
674 find_substring_high (T this) {
675 Substring_T substring_high;
676 List_T substrings_HtoL, p;
677
678 if (this->plusp == true) {
679 substrings_HtoL = this->substrings_Nto1;
680 } else {
681 substrings_HtoL = this->substrings_1toN;
682 }
683
684 p = substrings_HtoL;
685 substring_high = (Substring_T) List_head(p);
686 if (Substring_has_alts_p(substring_high) == true) {
687 p = List_next(p);
688 substring_high = (Substring_T) List_head(p);
689 }
690
691 return substring_high;
692 }
693 #endif
694
695
696 int
Stage3end_mapq_score(T this)697 Stage3end_mapq_score (T this) {
698 return this->mapq_score;
699 }
700
701 int
Stage3end_absmq_score(T this)702 Stage3end_absmq_score (T this) {
703 return this->absmq_score;
704 }
705
706 int
Stage3end_nmismatches_bothdiff(T this)707 Stage3end_nmismatches_bothdiff (T this) {
708 return this->nmismatches_bothdiff;
709 }
710
711 int
Stage3end_nmismatches_refdiff(T this)712 Stage3end_nmismatches_refdiff (T this) {
713 return this->nmismatches_refdiff;
714 }
715
716
717 #if 0
718 Endtype_T
719 Stage3end_start_endtype (T this) {
720 Substring_T substring;
721
722 if (this->plusp == true) {
723 substring = (Substring_T) List_head(this->substrings_1toN);
724 } else {
725 substring = (Substring_T) List_head(this->substrings_Nto1);
726 }
727 return Substring_start_endtype(substring);
728 }
729 #endif
730
731 #if 0
732 Endtype_T
733 Stage3end_end_endtype (T this) {
734 Substring_T substring;
735
736 if (this->plusp == true) {
737 substring = (Substring_T) List_head(this->substrings_Nto1);
738 } else {
739 substring = (Substring_T) List_head(this->substrings_1toN);
740 }
741 return Substring_end_endtype(substring);
742 }
743 #endif
744
745 int
Stage3end_nindels(T this)746 Stage3end_nindels (T this) {
747 return this->nindels;
748 }
749
750 int
Stage3end_querylength(T this)751 Stage3end_querylength (T this) {
752 return this->querylength;
753 }
754
755 bool
Stage3end_plusp(T this)756 Stage3end_plusp (T this) {
757 return this->plusp;
758 }
759
760 bool
Stage3end_paired_usedp(T this)761 Stage3end_paired_usedp (T this) {
762 return this->paired_usedp;
763 }
764
765 int
Stage3end_max_trim(T this)766 Stage3end_max_trim (T this) {
767 if (this->trim_querystart > this->trim_queryend) {
768 return this->trim_querystart;
769 } else {
770 return this->trim_queryend;
771 }
772 }
773
774
775 static int
start_amb_length(T this)776 start_amb_length (T this) {
777 return Substring_start_amb_length((Substring_T) List_head(this->substrings_1toN));
778 }
779
780 static int
end_amb_length(T this)781 end_amb_length (T this) {
782 return Substring_end_amb_length((Substring_T) List_head(this->substrings_Nto1));
783 }
784
785 #if 0
786 static int
787 n_amb_ends (T this) {
788 int n = 0;
789
790 if (start_amb_length(this) > 0) {
791 n++;
792 }
793 if (end_amb_length(this) > 0) {
794 n++;
795 }
796
797 return n;
798 }
799 #endif
800
801
802 #ifdef DEBUG8
803 static int
amb_length(T this)804 amb_length (T this) {
805 return Substring_start_amb_length((Substring_T) List_head(this->substrings_1toN)) +
806 Substring_end_amb_length((Substring_T) List_head(this->substrings_Nto1));
807 }
808 #endif
809
810
811 #if 0
812 /* Two types of ambiguity: known amb (mapped to >1 genomic place) and unknown amb (splice site seen) */
813 static bool
814 known_ambiguous_p (T this) {
815 if (Substring_ambiguous_p((Substring_T) List_head(this->substrings_1toN))) {
816 return true;
817 } else if (Substring_ambiguous_p((Substring_T) List_head(this->substrings_Nto1))) {
818 return true;
819 } else {
820 return false;
821 }
822 }
823 #endif
824
825
826 /* Includes amb and non-amb */
827 int
Stage3end_total_trim(T this)828 Stage3end_total_trim (T this) {
829 return this->trim_querystart + this->trim_queryend;
830 }
831
832
833 int
Stage3end_circularpos(T this)834 Stage3end_circularpos (T this) {
835 return this->circularpos;
836 }
837
838
839 Junction_T
Stage3end_junctionD(T this)840 Stage3end_junctionD (T this) {
841 if (this->sensedir == SENSE_ANTI) {
842 return (Junction_T) List_head(this->junctions_Nto1);
843 } else {
844 return (Junction_T) List_head(this->junctions_1toN);
845 }
846 }
847
848 Junction_T
Stage3end_junctionA(T this)849 Stage3end_junctionA (T this) {
850 if (this->sensedir == SENSE_ANTI) {
851 return (Junction_T) List_head(this->junctions_1toN);
852 } else {
853 return (Junction_T) List_head(this->junctions_Nto1);
854 }
855 }
856
857 List_T
Stage3end_substrings_LtoH(T this)858 Stage3end_substrings_LtoH (T this) {
859 if (this->plusp == true) {
860 return this->substrings_1toN;
861 } else {
862 return this->substrings_Nto1;
863 }
864 }
865
866 List_T
Stage3end_junctions_LtoH(T this)867 Stage3end_junctions_LtoH (T this) {
868 if (this->plusp == true) {
869 return this->junctions_1toN;
870 } else {
871 return this->junctions_Nto1;
872 }
873 }
874
875
876 /* Called only by samprint currently */
877 Substring_T
Stage3end_substring1(T this)878 Stage3end_substring1 (T this) {
879 return (Substring_T) List_head(this->substrings_1toN);
880 }
881
882 /* Called only by samprint currently */
883 Substring_T
Stage3end_substringN(T this)884 Stage3end_substringN (T this) {
885 return (Substring_T) List_head(this->substrings_Nto1);
886 }
887
888
889 Substring_T
Stage3end_substring_for_concordance(T this,bool first_read_p)890 Stage3end_substring_for_concordance (T this, bool first_read_p) {
891 if (first_read_p == true) {
892 return (Substring_T) List_head(this->substrings_Nto1);
893 } else {
894 return (Substring_T) List_head(this->substrings_1toN);
895 }
896 }
897
898 Substring_T
Stage3end_substring_other(T this,bool first_read_p)899 Stage3end_substring_other (T this, bool first_read_p) {
900 if (first_read_p == true) {
901 return (Substring_T) List_head(this->substrings_1toN);
902 } else {
903 return (Substring_T) List_head(this->substrings_Nto1);
904 }
905 }
906
907
908 bool
Stage3end_donor_concordant_p(T this,bool first_read_p)909 Stage3end_donor_concordant_p (T this, bool first_read_p) {
910 if (this->sensedir != SENSE_ANTI) {
911 if (first_read_p == true) {
912 return false;
913 } else {
914 return true;
915 }
916 } else {
917 if (first_read_p == true) {
918 return true;
919 } else {
920 return false;
921 }
922 }
923 }
924
925
926 Substring_T
Stage3end_substring_donor(T this)927 Stage3end_substring_donor (T this) {
928 if (this->sensedir == SENSE_ANTI) {
929 return (Substring_T) List_head(this->substrings_Nto1);
930 } else if (this->sensedir == SENSE_FORWARD) {
931 return (Substring_T) List_head(this->substrings_1toN);
932 } else {
933 fprintf(stderr,"sensedir is SENSE_NULL in Stage3end_substring_donor\n");
934 abort();
935 }
936 }
937
938 Substring_T
Stage3end_substring_acceptor(T this)939 Stage3end_substring_acceptor (T this) {
940 if (this->sensedir == SENSE_ANTI) {
941 return (Substring_T) List_head(this->substrings_1toN);
942 } else if (this->sensedir == SENSE_FORWARD) {
943 return (Substring_T) List_head(this->substrings_Nto1);
944 } else {
945 fprintf(stderr,"sensedir is SENSE_NULL in Stage3end_substring_acceptor\n");
946 abort();
947 }
948 }
949
950 /* Now same as Stage3end_substring_donor */
951 Substring_T
Stage3end_substringD(T this)952 Stage3end_substringD (T this) {
953 if (this->sensedir == SENSE_ANTI) {
954 return (Substring_T) List_head(this->substrings_Nto1);
955 } else {
956 return (Substring_T) List_head(this->substrings_1toN);
957 }
958 }
959
960 /* Now same as Stage3end_substring_acceptor */
961 Substring_T
Stage3end_substringA(T this)962 Stage3end_substringA (T this) {
963 if (this->sensedir == SENSE_ANTI) {
964 return (Substring_T) List_head(this->substrings_1toN);
965 } else {
966 return (Substring_T) List_head(this->substrings_Nto1);
967 }
968 }
969
970
971 Substring_T
Stage3end_substringS(T this)972 Stage3end_substringS (T this) {
973 return (Substring_T) List_head(List_next(this->substrings_1toN));
974 }
975
976
977
978 /* Same logic as in print_substrings in samprint.c to get the first substring for CIGAR or MD string */
979 Substring_T
Stage3end_substring_low(T this,int hardclip_low)980 Stage3end_substring_low (T this, int hardclip_low) {
981 List_T p;
982
983 debug15(printf("Entered Stage3end_substring_low\n"));
984
985 if (this == NULL) {
986 return (Substring_T) NULL;
987
988 } else if (this->plusp == true) {
989 p = this->substrings_1toN; /* substrings_LtoH */
990 if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
991 p = List_next(p);
992 }
993 while (p != NULL && Substring_queryend((Substring_T) List_head(p)) <= hardclip_low) {
994 debug15(printf("Plus: Skipping substring %d..%d against hardclip_low %d\n",
995 Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
996 hardclip_low));
997 p = List_next(p);
998 }
999
1000 if (p == NULL) {
1001 return (Substring_T) NULL;
1002 } else {
1003 debug15(printf("Plus: Returning substring %d..%d against hardclip_low %d\n",
1004 Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1005 hardclip_low));
1006 return (Substring_T) List_head(p);
1007 }
1008
1009 } else {
1010 #ifdef DEBUG15
1011 for (p = this->substrings_LtoH; p != NULL; p = List_next(p)) {
1012 printf("LtoH: %d..%d\n",
1013 Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)));
1014 }
1015 #endif
1016
1017 p = this->substrings_Nto1; /* substrings_LtoH */
1018 if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
1019 p = List_next(p);
1020 }
1021
1022 while (p != NULL && Substring_querystart((Substring_T) List_head(p)) >= this->querylength - hardclip_low) {
1023 debug15(printf("Minus: Skipping substring %d..%d against %d = querylength %d - hardclip_low %d\n",
1024 Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1025 this->querylength - hardclip_low,this->querylength,hardclip_low));
1026 p = List_next(p);
1027 }
1028
1029 if (p == NULL) {
1030 return (Substring_T) NULL;
1031 } else {
1032 debug15(printf("Minus: Returning substring %d..%d against %d = querylength %d - hardclip_low %d\n",
1033 Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
1034 this->querylength - hardclip_low,this->querylength,hardclip_low));
1035 return (Substring_T) List_head(p);
1036 }
1037 }
1038 }
1039
1040
#if 0
/* Disabled code: everything up to the matching #endif is compiled out. */
/* Modified from Stage3end_substring_low */
/* Walks this->substrings_HtoL and returns the first substring that is
   not skipped by the hardclip_high test, or NULL when this is NULL or
   when every substring is skipped.  A leading substring for which
   Substring_has_alts_p is true is stepped over before the test. */
/* NOTE(review): both branches call List_head(p) on substrings_HtoL
   before p is compared against NULL -- confirm the list can never be
   empty if this code is ever re-enabled. */
Substring_T
Stage3end_substring_high (T this, int hardclip_high) {
  List_T p;

  debug15(printf("Entered Stage3end_substring_high\n"));

  if (this == NULL) {
    return (Substring_T) NULL;

  } else if (this->plusp == true) {
    p = this->substrings_HtoL;
    if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
      p = List_next(p);
    }

    /* plusp case: skip substrings whose querystart is at or beyond
       querylength - hardclip_high */
    while (p != NULL && Substring_querystart((Substring_T) List_head(p)) >= this->querylength - hardclip_high) {
      debug15(printf("Plus: Skipping substring %d..%d against %d = querylength %d - hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     this->querylength - hardclip_high,this->querylength,hardclip_high));
      p = List_next(p);
    }

    if (p == NULL) {
      return (Substring_T) NULL;
    } else {
      debug15(printf("Plus: Returning substring %d..%d against %d = querylength %d - hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     this->querylength - hardclip_high,this->querylength,hardclip_high));
      return (Substring_T) List_head(p);
    }

  } else {
#ifdef DEBUG15
    for (p = this->substrings_HtoL; p != NULL; p = List_next(p)) {
      printf("HtoL: %d..%d\n",
	     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)));
    }
#endif

    p = this->substrings_HtoL;
    if (Substring_has_alts_p((Substring_T) List_head(p)) == true) {
      p = List_next(p);
    }

    /* non-plusp case: skip substrings whose queryend is at or below
       hardclip_high */
    while (p != NULL && Substring_queryend((Substring_T) List_head(p)) <= hardclip_high) {
      debug15(printf("Minus: Skipping substring %d..%d against hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     hardclip_high));
      p = List_next(p);
    }

    if (p == NULL) {
      return (Substring_T) NULL;
    } else {
      debug15(printf("Minus: Returning substring %d..%d against hardclip_high %d\n",
		     Substring_querystart((Substring_T) List_head(p)),Substring_queryend((Substring_T) List_head(p)),
		     hardclip_high));
      return (Substring_T) List_head(p);
    }
  }
}
#endif
1105
1106
1107
1108 Substring_T
Stage3end_substring_containing(T this,int querypos)1109 Stage3end_substring_containing (T this, int querypos) {
1110 Substring_T substring;
1111 List_T substrings_LtoH, p;
1112
1113 if (this->plusp == true) {
1114 substrings_LtoH = this->substrings_1toN;
1115 } else {
1116 substrings_LtoH = this->substrings_Nto1;
1117 }
1118
1119 for (p = substrings_LtoH; p != NULL; p = List_next(p)) {
1120 substring = (Substring_T) List_head(p);
1121 if (Substring_contains_p(substring,querypos) == true) {
1122 return substring;
1123 }
1124 }
1125
1126 return (Substring_T) NULL;
1127 }
1128
1129
1130 double
Stage3end_min_evalue(T this)1131 Stage3end_min_evalue (T this) {
1132 double min_evalue = 1000.0, evalue;
1133 Substring_T substring;
1134 List_T p;
1135
1136 for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
1137 substring = (Substring_T) List_head(p);
1138 if ((evalue = Substring_evalue(substring)) < min_evalue) {
1139 min_evalue = evalue;
1140 }
1141 }
1142
1143 return min_evalue;
1144 }
1145
1146
1147 double
Stage3end_chimera_prob(T this)1148 Stage3end_chimera_prob (T this) {
1149 List_T p;
1150 Junction_T junction;
1151
1152 for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
1153 junction = (Junction_T) List_head(p);
1154 if (Junction_type(junction) == CHIMERA_JUNCTION) {
1155 return Junction_prob(junction);
1156 }
1157 }
1158
1159 return 0.0;
1160 }
1161
1162 static double
Stage3end_prob(T this)1163 Stage3end_prob (T this) {
1164 double prob = 0.0;
1165 List_T p;
1166 Junction_T junction;
1167
1168 for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
1169 junction = (Junction_T) List_head(p);
1170 prob += Junction_prob(junction);
1171 }
1172
1173 return prob;
1174 }
1175
1176
1177 /* Should eventually look for substrings adjacent to the chimeric junction */
1178 Univcoord_T
Stage3end_chimera_segmenti_left(T this)1179 Stage3end_chimera_segmenti_left (T this) {
1180 Univcoord_T x_segmenti, x_segmentj;
1181 Substring_T substring_donor, substring_acceptor;
1182
1183 if (this->sensedir == SENSE_ANTI) {
1184 substring_donor = (Substring_T) List_head(this->substrings_Nto1);
1185 substring_acceptor = (Substring_T) List_head(this->substrings_1toN);
1186 } else {
1187 substring_donor = (Substring_T) List_head(this->substrings_1toN);
1188 substring_acceptor = (Substring_T) List_head(this->substrings_Nto1);
1189 }
1190
1191 x_segmenti = Substring_left_genomicseg(substring_donor);
1192 x_segmentj = Substring_left_genomicseg(substring_acceptor);
1193 if (x_segmenti < x_segmentj) {
1194 return x_segmenti;
1195 } else {
1196 return x_segmentj;
1197 }
1198 }
1199
1200 /* Should eventually look for substrings adjacent to the chimeric junction */
1201 Univcoord_T
Stage3end_chimera_segmentj_left(T this)1202 Stage3end_chimera_segmentj_left (T this) {
1203 Univcoord_T x_segmenti, x_segmentj;
1204 Substring_T substring_donor, substring_acceptor;
1205
1206 if (this->sensedir == SENSE_ANTI) {
1207 substring_donor = (Substring_T) List_head(this->substrings_Nto1);
1208 substring_acceptor = (Substring_T) List_head(this->substrings_1toN);
1209 } else {
1210 substring_donor = (Substring_T) List_head(this->substrings_1toN);
1211 substring_acceptor = (Substring_T) List_head(this->substrings_Nto1);
1212 }
1213
1214 x_segmenti = Substring_left_genomicseg(substring_donor);
1215 x_segmentj = Substring_left_genomicseg(substring_acceptor);
1216 if (x_segmenti > x_segmentj) {
1217 return x_segmenti;
1218 } else {
1219 return x_segmentj;
1220 }
1221 }
1222
1223
1224 int
Stage3end_chimera_segmenti_cmp(const void * a,const void * b)1225 Stage3end_chimera_segmenti_cmp (const void *a, const void *b) {
1226 T x = * (T *) a;
1227 T y = * (T *) b;
1228 Univcoord_T x_segmenti, x_segmentj, y_segmenti, y_segmentj, temp;
1229 Substring_T x_substring_donor, x_substring_acceptor,
1230 y_substring_donor, y_substring_acceptor;
1231
1232 if (x->sensedir == SENSE_ANTI) {
1233 x_substring_donor = (Substring_T) List_head(x->substrings_Nto1);
1234 x_substring_acceptor = (Substring_T) List_head(x->substrings_1toN);
1235 } else {
1236 x_substring_donor = (Substring_T) List_head(x->substrings_1toN);
1237 x_substring_acceptor = (Substring_T) List_head(x->substrings_Nto1);
1238 }
1239
1240 if (y->sensedir == SENSE_ANTI) {
1241 y_substring_donor = (Substring_T) List_head(y->substrings_Nto1);
1242 y_substring_acceptor = (Substring_T) List_head(y->substrings_1toN);
1243 } else {
1244 y_substring_donor = (Substring_T) List_head(y->substrings_1toN);
1245 y_substring_acceptor = (Substring_T) List_head(y->substrings_Nto1);
1246 }
1247
1248 x_segmenti = Substring_left_genomicseg(x_substring_donor);
1249 x_segmentj = Substring_left_genomicseg(x_substring_acceptor);
1250 if (x_segmentj < x_segmenti) {
1251 temp = x_segmentj;
1252 x_segmentj = x_segmenti;
1253 x_segmenti = temp;
1254 }
1255
1256 y_segmenti = Substring_left_genomicseg(y_substring_donor);
1257 y_segmentj = Substring_left_genomicseg(y_substring_acceptor);
1258 if (y_segmentj < y_segmenti) {
1259 temp = y_segmentj;
1260 y_segmentj = y_segmenti;
1261 y_segmenti = temp;
1262 }
1263
1264 if (x_segmenti < y_segmenti) {
1265 return -1;
1266 } else if (y_segmenti < x_segmenti) {
1267 return +1;
1268 } else if (x_segmentj > y_segmentj) {
1269 return -1;
1270 } else if (y_segmentj > x_segmentj) {
1271 return +1;
1272 } else {
1273 return 0;
1274 }
1275 }
1276
1277
1278
1279 int
Stage3end_chimera_segmentj_cmp(const void * a,const void * b)1280 Stage3end_chimera_segmentj_cmp (const void *a, const void *b) {
1281 T x = * (T *) a;
1282 T y = * (T *) b;
1283 Univcoord_T x_segmenti, x_segmentj, y_segmenti, y_segmentj, temp;
1284 Substring_T x_substring_donor, x_substring_acceptor,
1285 y_substring_donor, y_substring_acceptor;
1286
1287 if (x->sensedir == SENSE_ANTI) {
1288 x_substring_donor = (Substring_T) List_head(x->substrings_Nto1);
1289 x_substring_acceptor = (Substring_T) List_head(x->substrings_1toN);
1290 } else {
1291 x_substring_donor = (Substring_T) List_head(x->substrings_1toN);
1292 x_substring_acceptor = (Substring_T) List_head(x->substrings_Nto1);
1293 }
1294
1295 if (y->sensedir == SENSE_ANTI) {
1296 y_substring_donor = (Substring_T) List_head(y->substrings_Nto1);
1297 y_substring_acceptor = (Substring_T) List_head(y->substrings_1toN);
1298 } else {
1299 y_substring_donor = (Substring_T) List_head(y->substrings_1toN);
1300 y_substring_acceptor = (Substring_T) List_head(y->substrings_Nto1);
1301 }
1302
1303
1304 x_segmenti = Substring_left_genomicseg(x_substring_donor);
1305 x_segmentj = Substring_left_genomicseg(x_substring_acceptor);
1306 if (x_segmentj < x_segmenti) {
1307 temp = x_segmentj;
1308 x_segmentj = x_segmenti;
1309 x_segmenti = temp;
1310 }
1311
1312 y_segmenti = Substring_left_genomicseg(y_substring_donor);
1313 y_segmentj = Substring_left_genomicseg(y_substring_acceptor);
1314 if (y_segmentj < y_segmenti) {
1315 temp = y_segmentj;
1316 y_segmentj = y_segmenti;
1317 y_segmenti = temp;
1318 }
1319
1320 if (x_segmentj < y_segmentj) {
1321 return -1;
1322 } else if (y_segmentj < x_segmentj) {
1323 return +1;
1324 } else if (x_segmenti > y_segmenti) {
1325 return -1;
1326 } else if (y_segmenti > x_segmenti) {
1327 return +1;
1328 } else {
1329 return 0;
1330 }
1331 }
1332
1333
1334 int
Stage3end_sensedir(T this)1335 Stage3end_sensedir (T this) {
1336 if (this == NULL) {
1337 /* Can happen if we call upon a mate in a halfmapping */
1338 return SENSE_NULL;
1339 } else {
1340 return this->sensedir;
1341 }
1342 }
1343
#if 0
/* Disabled code: compiled out by the enclosing #if 0. */
/* Maps this->sensedir to a signed direction: +1 for SENSE_FORWARD, -1
   for SENSE_ANTI, and SENSE_NULL for a NULL hit or any other sensedir
   value. */
int
Stage3end_cdna_direction (T this) {
  if (this == NULL) {
    return SENSE_NULL;
  } else if (this->sensedir == SENSE_FORWARD) {
    return +1;
  } else if (this->sensedir == SENSE_ANTI) {
    return -1;
  } else {
    return SENSE_NULL;
  }
}
#endif
1358
#if 0
/* Disabled code: compiled out by the enclosing #if 0. */
/* Returns Substring_ambiguous_p for the head of substrings_1toN. */
bool
Stage3end_start_ambiguous_p (T this) {
  Substring_T substring;

  substring = (Substring_T) List_head(this->substrings_1toN);
  return Substring_ambiguous_p(substring);
}
#endif
1368
#if 0
/* Disabled code: compiled out by the enclosing #if 0. */
/* Returns Substring_ambiguous_p for the head of substrings_Nto1. */
bool
Stage3end_end_ambiguous_p (T this) {
  Substring_T substring;

  substring = (Substring_T) List_head(this->substrings_Nto1);
  return Substring_ambiguous_p(substring);
}
#endif
1378
1379 bool
Stage3end_start_has_alts_p(T this)1380 Stage3end_start_has_alts_p (T this) {
1381 Substring_T substring;
1382
1383 substring = (Substring_T) List_head(this->substrings_1toN);
1384 return Substring_has_alts_p(substring);
1385 }
1386
1387 bool
Stage3end_end_has_alts_p(T this)1388 Stage3end_end_has_alts_p (T this) {
1389 Substring_T substring;
1390
1391 substring = (Substring_T) List_head(this->substrings_Nto1);
1392 return Substring_has_alts_p(substring);
1393 }
1394
1395
1396 Univcoord_T *
Stage3end_start_alts_coords(T this)1397 Stage3end_start_alts_coords (T this) {
1398 Substring_T substring;
1399
1400 substring = (Substring_T) List_head(this->substrings_1toN);
1401 if (Substring_has_alts_p(substring) == false) {
1402 return (Univcoord_T *) NULL;
1403 } else {
1404 return Substring_alts_coords(substring);
1405 }
1406 }
1407
1408 Univcoord_T *
Stage3end_end_alts_coords(T this)1409 Stage3end_end_alts_coords (T this) {
1410 Substring_T substring;
1411
1412 substring = (Substring_T) List_head(this->substrings_Nto1);
1413 if (Substring_has_alts_p(substring) == false) {
1414 return (Univcoord_T *) NULL;
1415 } else {
1416 return Substring_alts_coords(substring);
1417 }
1418 }
1419
1420 int
Stage3end_start_alts_ncoords(T this)1421 Stage3end_start_alts_ncoords (T this) {
1422 Substring_T substring;
1423
1424 substring = (Substring_T) List_head(this->substrings_1toN);
1425 if (Substring_has_alts_p(substring) == false) {
1426 return 0;
1427 } else {
1428 return Substring_alts_ncoords(substring);
1429 }
1430 }
1431
1432 int
Stage3end_end_alts_ncoords(T this)1433 Stage3end_end_alts_ncoords (T this) {
1434 Substring_T substring;
1435
1436 substring = (Substring_T) List_head(this->substrings_Nto1);
1437 if (Substring_has_alts_p(substring) == false) {
1438 return 0;
1439 } else {
1440 return Substring_alts_ncoords(substring);
1441 }
1442 }
1443
1444
1445 int
Stage3end_substrings_querystart(T this)1446 Stage3end_substrings_querystart (T this) {
1447 Substring_T substring;
1448
1449 substring = (Substring_T) List_head(this->substrings_1toN);
1450 return Substring_querystart(substring);
1451 }
1452
1453 int
Stage3end_substrings_queryend(T this)1454 Stage3end_substrings_queryend (T this) {
1455 Substring_T substring;
1456
1457 substring = (Substring_T) List_head(this->substrings_Nto1);
1458 return Substring_queryend(substring);
1459 }
1460
1461
1462 int
Stage3end_trimlength(T this)1463 Stage3end_trimlength (T this) {
1464 return this->trim_querystart + this->trim_queryend;
1465 }
1466
1467
1468 void
Stage3end_count_hits(int * npaths_primary,int * npaths_altloc,List_T hits)1469 Stage3end_count_hits (int *npaths_primary, int *npaths_altloc, List_T hits) {
1470 T hit;
1471
1472 *npaths_primary = *npaths_altloc = 0;
1473
1474 while (hits != NULL) {
1475 hit = (T) List_head(hits);
1476 if (altlocp[hit->chrnum] == true) {
1477 *npaths_altloc += 1;
1478 } else {
1479 *npaths_primary += 1;
1480 }
1481 hits = List_next(hits);
1482 }
1483
1484 return;
1485 }
1486
#if 0
/* Disabled code: compiled out by the enclosing #if 0. */
/* Sums Substring_tally over every substring in substrings_1toN.  The
   names tally_iit and tally_divint_crosstable are not declared in this
   block. */
static long int
Stage3end_compute_tally (T this) {
  long int tally = 0L;
  List_T p;
  Substring_T substring;

  for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
    substring = (Substring_T) List_head(p);
    tally += Substring_tally(substring,tally_iit,tally_divint_crosstable);
  }

  return tally;
}
#endif
1502
#if 0
/* Disabled code: compiled out by the enclosing #if 0. */
/* Returns true as soon as Substring_runlength_p is true for any
   substring in substrings_1toN, and false otherwise.  The names
   runlength_iit and runlength_divint_crosstable are not declared in
   this block. */
static bool
Stage3end_runlength_p (T this) {
  List_T p;
  Substring_T substring;

  for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
    substring = (Substring_T) List_head(p);
    if (Substring_runlength_p(substring,runlength_iit,runlength_divint_crosstable) == true) {
      return true;
    }
  }

  return false;
}
#endif
1519
1520
1521 void
Stage3end_free(T * old)1522 Stage3end_free (T *old) {
1523 List_T p;
1524 Substring_T substring;
1525 Junction_T junction;
1526
1527
1528 if (*old != NULL) {
1529 debug0(printf("Freeing Stage3end %p from method %s\n",*old,Method_string((*old)->method)));
1530
1531 if ((*old)->transcripts_other != NULL) {
1532 Transcript_gc(&(*old)->transcripts_other);
1533 }
1534 if ((*old)->transcripts != NULL) {
1535 Transcript_gc(&(*old)->transcripts);
1536 }
1537
1538 for (p = (*old)->substrings_1toN; p != NULL; p = List_next(p)) {
1539 substring = (Substring_T) List_head(p);
1540 Substring_free(&substring);
1541 }
1542 /* List_free(&(*old)->substrings_1toN); -- allocated by Listpool_push */
1543 /* List_free(&(*old)->substrings_Nto1); -- allocated by Listpool_push */
1544 /* List_free(&(*old)->substrings_LtoH); -- allocated by Listpool_push */
1545 /* List_free(&(*old)->substrings_HtoL); -- allocated by Listpool_push */
1546
1547 for (p = (*old)->junctions_1toN; p != NULL; p = List_next(p)) {
1548 junction = (Junction_T) List_head(p);
1549 Junction_free(&junction);
1550 }
1551 /* List_free(&(*old)->junctions_1toN); -- allocated by Listpool_push */
1552 /* List_free(&(*old)->junctions_Nto1); -- allocated by Listpool_push */
1553 /* List_free(&(*old)->junctions_LtoH); -- allocated by Listpool_push */
1554 /* List_free(&(*old)->junctions_HtoL); */
1555
1556 FREE_OUT(*old);
1557 }
1558
1559 return;
1560 }
1561
1562
1563 /* Used for freeing list contents in Concordance_pair_up procedures */
1564 /* Do not free the list itself, though, which was previously freed in
1565 stage1hr.c, and now allocated by Hitlistpool_T */
1566 void
Stage3end_gc(List_T values)1567 Stage3end_gc (List_T values) {
1568 List_T p;
1569 T hit;
1570
1571 for (p = values; p != NULL; p = p->rest) {
1572 if ((hit = (T) p->first) != NULL) {
1573 Stage3end_free(&hit);
1574 }
1575 }
1576 Hitlist_free(&values);
1577 return;
1578 }
1579
1580
1581
1582 bool
Stage3pair_distant_splice_p(Stage3pair_T this)1583 Stage3pair_distant_splice_p (Stage3pair_T this) {
1584 if (this->hit5 != NULL && this->hit5->distant_splice_p == true) {
1585 return true;
1586 } else if (this->hit3 != NULL && this->hit3->distant_splice_p == true) {
1587 return true;
1588 } else {
1589 return false;
1590 }
1591 }
1592
1593
1594 int
Stage3pair_genestrand(Stage3pair_T this)1595 Stage3pair_genestrand (Stage3pair_T this) {
1596 return this->genestrand;
1597 }
1598
1599 Stage3end_T
Stage3pair_hit5(Stage3pair_T this)1600 Stage3pair_hit5 (Stage3pair_T this) {
1601 return this->hit5;
1602 }
1603
1604 Stage3end_T
Stage3pair_hit3(Stage3pair_T this)1605 Stage3pair_hit3 (Stage3pair_T this) {
1606 return this->hit3;
1607 }
1608
1609 int
Stage3pair_mapq_score(Stage3pair_T this)1610 Stage3pair_mapq_score (Stage3pair_T this) {
1611 return this->mapq_score;
1612 }
1613
1614 int
Stage3pair_absmq_score(Stage3pair_T this)1615 Stage3pair_absmq_score (Stage3pair_T this) {
1616 return this->absmq_score;
1617 }
1618
1619 List_T
Stage3pair_transcripts5(Stage3pair_T this)1620 Stage3pair_transcripts5 (Stage3pair_T this) {
1621 return this->hit5->transcripts;
1622 }
1623
1624 List_T
Stage3pair_transcripts3(Stage3pair_T this)1625 Stage3pair_transcripts3 (Stage3pair_T this) {
1626 return this->hit3->transcripts;
1627 }
1628
1629 Chrpos_T
Stage3pair_pairlength(Stage3pair_T this)1630 Stage3pair_pairlength (Stage3pair_T this) {
1631 return this->insertlength;
1632 }
1633
1634 int
Stage3pair_relationship(Stage3pair_T this)1635 Stage3pair_relationship (Stage3pair_T this) {
1636 return this->pair_relationship;
1637 }
1638
1639 int
Stage3pair_total_trim(Stage3pair_T this)1640 Stage3pair_total_trim (Stage3pair_T this) {
1641 return Stage3end_total_trim(this->hit5) + Stage3end_total_trim(this->hit3);
1642 }
1643
1644 int
Stage3pair_max_trim(Stage3pair_T this)1645 Stage3pair_max_trim (Stage3pair_T this) {
1646 int trim5, trim3;
1647 T hit;
1648
1649 #if 0
1650 /* Don't want ambiguous ends for purpose of defining concordant terminals */
1651 trim5 = Stage3end_total_trim(this->hit5);
1652 trim3 = Stage3end_total_trim(this->hit3);
1653 #else
1654 hit = this->hit5;
1655 trim5 = hit->trim_querystart + hit->trim_queryend;
1656 hit = this->hit3;
1657 trim3 = hit->trim_querystart + hit->trim_queryend;
1658 #endif
1659
1660 if (trim5 > trim3) {
1661 return trim5;
1662 } else {
1663 return trim3;
1664 }
1665 }
1666
1667 int
Stage3pair_nmatches_to_trims(int * nmatches5,int * nmatches3,Stage3pair_T this)1668 Stage3pair_nmatches_to_trims (int *nmatches5, int *nmatches3, Stage3pair_T this) {
1669 *nmatches5 = this->hit5->refalt_nmatches_to_trims;
1670 *nmatches3 = this->hit3->refalt_nmatches_to_trims;
1671 return (*nmatches5) + (*nmatches3);
1672 }
1673
1674 int
Stage3pair_ref_nmatches_to_trims(int * nmatches5,int * nmatches3,Stage3pair_T this)1675 Stage3pair_ref_nmatches_to_trims (int *nmatches5, int *nmatches3, Stage3pair_T this) {
1676 *nmatches5 = this->hit5->ref_nmatches_to_trims;
1677 *nmatches3 = this->hit3->ref_nmatches_to_trims;
1678 return (*nmatches5) + (*nmatches3);
1679 }
1680
1681
1682 bool
Stage3pair_concordantp(List_T hitpairs)1683 Stage3pair_concordantp (List_T hitpairs) {
1684 List_T p;
1685 Stage3pair_T hitpair;
1686
1687 for (p = hitpairs; p != NULL; p = List_next(p)) {
1688 hitpair = (Stage3pair_T) List_head(p);
1689 #if 0
1690 /* Not necessary, since we are getting the result after GMAP align pair */
1691 if (Stage3_determine_pairtype(hitpair->hit5,hitpair->hit3,hitpair) == CONCORDANT) {
1692 return true;
1693 }
1694 #else
1695 if (hitpair->pairtype == CONCORDANT) {
1696 return true;
1697 }
1698 #endif
1699 }
1700 return false;
1701 }
1702
1703 void
Stage3pair_count_hits(int * npaths_primary,int * npaths_altloc,List_T hitpairs)1704 Stage3pair_count_hits (int *npaths_primary, int *npaths_altloc, List_T hitpairs) {
1705 Stage3pair_T hitpair;
1706
1707 *npaths_primary = *npaths_altloc = 0;
1708
1709 while (hitpairs != NULL) {
1710 hitpair = (Stage3pair_T) List_head(hitpairs);
1711 if (altlocp[hitpair->hit5->chrnum] == true) {
1712 *npaths_altloc += 1;
1713 } else if (altlocp[hitpair->hit3->chrnum] == true) {
1714 *npaths_altloc += 1;
1715 } else {
1716 *npaths_primary += 1;
1717 }
1718 hitpairs = List_next(hitpairs);
1719 }
1720
1721 return;
1722 }
1723
1724 List_T
Stage3pair_filter_nonconcordant(List_T hitpairs,Hitlistpool_T hitlistpool)1725 Stage3pair_filter_nonconcordant (List_T hitpairs, Hitlistpool_T hitlistpool) {
1726 List_T filtered = NULL, p;
1727 Stage3pair_T hitpair;
1728
1729 for (p = hitpairs; p != NULL; p = List_next(p)) {
1730 hitpair = (Stage3pair_T) List_head(p);
1731 if (hitpair->pairtype != CONCORDANT) {
1732 Stage3pair_free(&hitpair);
1733 } else {
1734 filtered = Hitlist_push(filtered,hitlistpool,(void *) hitpair);
1735 }
1736 }
1737 Hitlist_free(&hitpairs);
1738 return filtered;
1739 }
1740
1741
/* Returns true if ilengths are valid */
/* Walks hit->substrings_1toN in parallel with hit->junctions_1toN,
   looking for the first substring for which
   Substring_overlap_point_trimmed_p(substring,common_genomicpos) is
   true.

   Substrings before the overlapping one contribute their
   Substring_genomic_alignment_length, plus Junction_nindels of the
   paired junction when it is an INS_JUNCTION.  They are added to
   *ilength_low when hit->plusp is true and to *ilength_high otherwise.
   The overlapping substring is split at common_genomicpos between the
   two outputs.  The remaining substrings and insertion junctions are
   added to the other output.

   Returns false when no substring overlaps common_genomicpos.  In that
   case only one output has been written (*ilength_low for plusp,
   *ilength_high otherwise) and the other is left unassigned. */
static bool
find_ilengths (int *ilength_low, int *ilength_high, Stage3end_T hit, Univcoord_T common_genomicpos) {
  List_T p, q;
  Substring_T substring;
  Junction_T junction;


  /* NOTE(review): the debug statement below uses a bare chroffset that
     is not declared in this function, while the other debug statements
     use hit->chroffset -- confirm this compiles when debug15 is
     enabled. */
  debug15(printf("Finding ilengths for common_genomicpos %u\n",(Chrpos_T) (common_genomicpos - chroffset)));
  if (hit->plusp == true) {
#ifdef DEBUG15
    printf("plus.  Checking common genomicpos %llu against\n",common_genomicpos - hit->chroffset);
    for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
      substring = (Substring_T) List_head(p);
      printf("substring %p: %u..%u, trim %d..%d\n",
	     substring,Substring_alignstart_trim(substring) - hit->chroffset,
	     Substring_alignend_trim(substring) - 1U - hit->chroffset,
	     Substring_trim_querystart(substring),Substring_trim_queryend(substring));
    }
    printf("\n");
#endif
    /* Plus: Subtract 1 from alignend */
    *ilength_low = 0;
    /* q is advanced together with p; the q != NULL tests below guard
       against the junction list ending before the substring list */
    for (p = hit->substrings_1toN, q = hit->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
      substring = (Substring_T) List_head(p);
      debug15(printf("substring %p: %u..%u, trim %d..%d\n",substring,
		     Substring_alignstart_trim(substring) - hit->chroffset,
		     Substring_alignend_trim(substring) - 1U - hit->chroffset,
		     Substring_trim_querystart(substring),Substring_trim_queryend(substring)));
      if (Substring_overlap_point_trimmed_p(substring,common_genomicpos) == false) {
	/* Entire substring precedes the common position */
	*ilength_low += Substring_genomic_alignment_length(substring);
	if (q != NULL) {
	  junction = (Junction_T) List_head(q);
	  if (Junction_type(junction) == INS_JUNCTION) {
	    *ilength_low += Junction_nindels(junction);
	  }
	}

      } else {
	/* Split this substring at common_genomicpos; the position itself
	   is counted in both ilength_low and ilength_high */
	*ilength_low += (common_genomicpos - Substring_alignstart_trim(substring) + 1);
	*ilength_high = ((Substring_alignend_trim(substring) - 1) - common_genomicpos + 1);
	/* Remaining substrings all go to ilength_high */
	p = List_next(p);
	while (p != NULL) {
	  substring = (Substring_T) List_head(p);
	  *ilength_high += Substring_genomic_alignment_length(substring);
	  p = List_next(p);
	}
	/* Remaining junctions, starting with the one paired with the
	   overlapping substring, add their insertions to ilength_high */
	while (q != NULL) {
	  junction = (Junction_T) List_head(q);
	  if (Junction_type(junction) == INS_JUNCTION) {
	    *ilength_high += Junction_nindels(junction);
	  }
	  q = List_next(q);
	}
	debug15(printf("Plus: Have ilength_low %d and ilength_high %d\n",*ilength_low,*ilength_high));
	return true;
      }
    }
  } else {
#ifdef DEBUG15
    printf("minus.  Checking common genomicpos %llu against\n",common_genomicpos - hit->chroffset);
    for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
      substring = (Substring_T) List_head(p);
      printf("substring %p: %u..%u, trim %d..%d\n",
	     substring,Substring_alignstart_trim(substring) - hit->chroffset,
	     Substring_alignend_trim(substring) - 1U - hit->chroffset,
	     Substring_trim_querystart(substring),Substring_trim_queryend(substring));
    }
    printf("\n");
#endif
    /* Minus: Subtract 1 from alignstart */
    *ilength_high = 0;
    /* Same walk as the plus case, with the roles of ilength_low and
       ilength_high exchanged */
    for (p = hit->substrings_1toN, q = hit->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
      substring = (Substring_T) List_head(p);
      debug15(printf("substring: %u..%u\n",
		     Substring_alignstart_trim(substring) - 1U - hit->chroffset,
		     Substring_alignend_trim(substring) - hit->chroffset));
      if (Substring_overlap_point_trimmed_p(substring,common_genomicpos) == false) {
	*ilength_high += Substring_genomic_alignment_length(substring);
	if (q != NULL) {
	  junction = (Junction_T) List_head(q);
	  if (Junction_type(junction) == INS_JUNCTION) {
	    *ilength_high += Junction_nindels(junction);
	  }
	}

      } else {
	/* Split this substring at common_genomicpos */
	*ilength_high += ((Substring_alignstart_trim(substring) - 1) - common_genomicpos + 1);
	*ilength_low = (common_genomicpos - (Substring_alignend_trim(substring) /*+ 1*/) + 1);
	/* Remaining substrings all go to ilength_low */
	p = List_next(p);
	while (p != NULL) {
	  substring = (Substring_T) List_head(p);
	  *ilength_low += Substring_genomic_alignment_length(substring);
	  p = List_next(p);
	}
	/* Remaining junctions add their insertions to ilength_low */
	while (q != NULL) {
	  junction = (Junction_T) List_head(q);
	  if (Junction_type(junction) == INS_JUNCTION) {
	    *ilength_low += Junction_nindels(junction);
	  }
	  q = List_next(q);
	}
	debug15(printf("Minus: Have ilength_low %d and ilength_high %d\n",*ilength_low,*ilength_high));
	return true;
      }
    }
  }

  /* No substring overlaps common_genomicpos */
  return false;
}
1852
1853
1854
1855 /* Needed to compute overlap properly. Based on pair_insert_length below, plus code for handling GMAP. */
1856 static Univcoord_T
pair_common_genomicpos(Stage3end_T hit5,Stage3end_T hit3)1857 pair_common_genomicpos (Stage3end_T hit5, Stage3end_T hit3) {
1858 Univcoord_T common_genomicpos;
1859 Univcoord_T start5, end5, start3, end3;
1860 List_T p, q;
1861 Substring_T substring, substring5, substring3;
1862
1863 if (hit5->plusp == true && hit3->plusp == true) {
1864 /* plus/plus */
1865 debug15(printf("Computing overlap using substrings plus/plus\n"));
1866
1867 start5 = hit5->genomicstart + hit5->trim_querystart + start_amb_length(hit5);
1868 end5 = (hit5->genomicend - 1) - hit5->trim_queryend - end_amb_length(hit5);
1869 start3 = hit3->genomicstart + hit3->trim_querystart + start_amb_length(hit3);
1870 end3 = (hit3->genomicend - 1) - hit3->trim_queryend - end_amb_length(hit3);
1871 debug15(printf("hit5 endpoints are %u..%u. hit3 endpoints are %u..%u\n",
1872 start5-hit5->chroffset,end5-hit5->chroffset,start3-hit3->chroffset,end3-hit3->chroffset));
1873
1874 if (end3 < start5) {
1875 /* Case 1 */
1876 return false;
1877 } else if (end5 < start3) {
1878 /* Case 6 */
1879 return false;
1880 } else if (start3 < start5) {
1881 if (end3 < end5) {
1882 /* Case 2: Tails overlap. Go from start5 to end3 */
1883 debug15(printf("plus/plus case 2a: start5 %u\n",start5 - hit5->chroffset));
1884 for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
1885 substring = (Substring_T) List_head(p);
1886 if (Substring_overlap_point_trimmed_p(substring,start5)) {
1887 return start5;
1888 }
1889 }
1890
1891 /* Case 2: Tails overlap. Go from start5 to end3 */
1892 debug15(printf("plus/plus case 2b: end3 %u\n",end3 - hit3->chroffset));
1893 for (p = hit5->substrings_Nto1; p != NULL; p = List_next(p)) {
1894 substring = (Substring_T) List_head(p);
1895 if (Substring_overlap_point_trimmed_p(substring,end3)) {
1896 return end3;
1897 }
1898 }
1899 /* Fall through to general algorithm */
1900
1901 } else {
1902 /* Case 3: hit3 subsumes hit5 */
1903 debug15(printf("plus/plus case 3\n"));
1904 for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
1905 substring = (Substring_T) List_head(p);
1906 if (Substring_overlap_point_trimmed_p(substring,end5)) {
1907 return end5;
1908 }
1909 }
1910 /* Fall through to general algorithm */
1911 }
1912
1913 } else {
1914 if (end3 < end5) {
1915 /* Case 4: hit5 subsumes hit3 */
1916 debug15(printf("plus/plus case 4\n"));
1917 for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
1918 substring = (Substring_T) List_head(p);
1919 if (Substring_overlap_point_trimmed_p(substring,start3)) {
1920 return start3;
1921 }
1922 }
1923 /* Fall through to general algorithm */
1924
1925 } else {
1926 /* Case 5: Based on hit3_trimmed_length */
1927 debug15(printf("plus/plus case 5a\n"));
1928 for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
1929 substring = (Substring_T) List_head(p);
1930 if (Substring_overlap_point_trimmed_p(substring,start3)) {
1931 return start3;
1932 }
1933 }
1934
1935 /* Case 5: Based on hit5_trimmed_length */
1936 debug15(printf("plus/plus case 5b\n"));
1937 for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
1938 substring = (Substring_T) List_head(p);
1939 if (Substring_overlap_point_trimmed_p(substring,end5)) {
1940 return end5;
1941 }
1942 }
1943 /* Fall through to general algorithm */
1944 }
1945 }
1946
1947 /* General algorithm */
1948 debug15(printf("plus/plus general\n"));
1949 for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
1950 substring3 = (Substring_T) List_head(p);
1951 for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
1952 substring5 = (Substring_T) List_head(q);
1953 if ((common_genomicpos = Substring_overlap_segment_trimmed(substring5,substring3)) != 0) {
1954 return common_genomicpos;
1955 }
1956 }
1957 }
1958
1959 return 0;
1960
1961 } else if (hit5->plusp == true && hit3->plusp == false) {
1962 /* plus/minus */
1963 debug15(printf("Computing overlap using substrings plus/minus\n"));
1964 return 0;
1965
1966 #if 0
1967 start5 = hit5->genomicstart + hit5->trim_querystart + start_amb_length(hit5);
1968 end5 = hit5->genomicend - hit5->trim_queryend - end_amb_length(hit5);
1969 start3 = hit3->genomicstart - hit3->trim_querystart - start_amb_length(hit3);
1970 end3 = hit3->genomicend + hit3->trim_queryend + end_amb_length(hit3);
1971
1972 if (start3 < start5) {
1973 /* Case 1 */
1974 return 0;
1975 } else if (end5 < end3) {
1976 /* Case 6 */
1977 return 0;
1978 } else if (end3 < start5) {
1979 if (start3 < end5) {
1980 /* Case 2: Tails overlap. Go from start5 to start3 */
1981 debug15(printf("plus case 2a: start5 %u\n",start5 - hit5->chroffset));
1982 if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
1983 return start5;
1984 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
1985 return start5;
1986 } else if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
1987 return start5;
1988 }
1989
1990 /* Case 2: Tails overlap. Go from start5 to start3 */
1991 debug15(printf("plus case 2b: start3 %u\n",start3 - hit3->chroffset));
1992 if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
1993 return start3;
1994 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
1995 return start3;
1996 } else if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
1997 return start3;
1998 }
1999 /* Fall through to general algorithm */
2000
2001 } else {
2002 /* Case 3: hit3 subsumes hit5 */
2003 debug15(printf("plus case 3\n"));
2004 if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2005 return end5;
2006 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2007 return end5;
2008 } else if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2009 return end5;
2010 }
2011 /* Fall through to general algorithm */
2012 }
2013
2014 } else {
2015 if (start3 < end5) {
2016 /* Case 4: hit5 subsumes hit3 */
2017 debug15(printf("plus case 4\n"));
2018 if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2019 return end3;
2020 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2021 return end3;
2022 } else if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2023 return end3;
2024 }
2025 /* Fall through to general algorithm */
2026
2027 } else {
2028 /* Case 5: Based on hit3_trimmed_length */
2029 debug15(printf("plus case 5a\n"));
2030 if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2031 return end3;
2032 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2033 return end3;
2034 } else if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2035 return end3;
2036 }
2037
2038 /* Case 5: Based on hit5_trimmed_length */
2039 debug15(printf("plus case 5b\n"));
2040 if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2041 return end5;
2042 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2043 return end5;
2044 } else if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2045 return end5;
2046 }
2047 /* Fall through to general algorithm */
2048 }
2049 }
2050
2051 /* General algorithm */
2052 debug15(printf("plus general: hit3->substring1\n"));
2053 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring1)) != 0) {
2054 return common_genomicpos;
2055 } else if (hit5->substring2 != NULL &&
2056 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring1)) != 0) {
2057 return common_genomicpos;
2058 } else if (hit5->substring0 != NULL &&
2059 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring1)) != 0) {
2060 return common_genomicpos;
2061 }
2062
2063 if (hit3->substring2 != NULL) {
2064 debug15(printf("plus general: hit3->substring2\n"));
2065 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring2)) != 0) {
2066 return common_genomicpos;
2067 } else if (hit5->substring2 != NULL &&
2068 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring2)) != 0) {
2069 return common_genomicpos;
2070 } else if (hit5->substring0 != NULL &&
2071 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring2)) != 0) {
2072 return common_genomicpos;
2073 }
2074 }
2075
2076 if (hit3->substring0 != NULL) {
2077 debug15(printf("plus general: hit3->substring0\n"));
2078 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring0)) != 0) {
2079 return common_genomicpos;
2080 } else if (hit5->substring2 != NULL &&
2081 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring0)) != 0) {
2082 return common_genomicpos;
2083 } else if (hit5->substring0 != NULL &&
2084 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring0)) != 0) {
2085 return common_genomicpos;
2086 }
2087 }
2088
2089 return 0U;
2090 #endif
2091
2092 } else if (hit5->plusp == false && hit3->plusp == true) {
2093 /* minus/plus */
2094 debug15(printf("Computing overlap using substrings minus/plus\n"));
2095 return 0;
2096
2097 #if 0
2098 start5 = hit5->genomicstart - hit5->trim_querystart - start_amb_length(hit5);
2099 end5 = hit5->genomicend + hit5->trim_queryend + end_amb_length(hit5);
2100 start3 = hit3->genomicstart + hit3->trim_querystart + start_amb_length(hit3);
2101 end3 = hit3->genomicend - hit3->trim_queryend - end_amb_length(hit3);
2102
2103 if (end3 < end5) {
2104 /* Case 1 */
2105 return 0;
2106 } else if (start5 < start3) {
2107 /* Case 6 */
2108 return 0;
2109 } else if (start3 < end5) {
2110 if (end3 < start5) {
2111 /* Case 2: Tails overlap. Go from end5 to end3 */
2112 debug15(printf("plus case 2a: end5 %u\n",end5 - hit5->chroffset));
2113 if (Substring_overlap_point_trimmed_p(hit3->substring0,end5)) {
2114 return end5;
2115 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,end5)) {
2116 return end5;
2117 } else if (Substring_overlap_point_trimmed_p(hit3->substring2,end5)) {
2118 return end5;
2119 }
2120
2121 /* Case 2: Tails overlap. Go from end5 to end3 */
2122 debug15(printf("plus case 2b: end3 %u\n",end3 - hit3->chroffset));
2123 if (Substring_overlap_point_trimmed_p(hit5->substring2,end3)) {
2124 return end3;
2125 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,end3)) {
2126 return end3;
2127 } else if (Substring_overlap_point_trimmed_p(hit5->substring0,end3)) {
2128 return end3;
2129 }
2130 /* Fall through to general algorithm */
2131
2132 } else {
2133 /* Case 3: hit3 subsumes hit5 */
2134 debug15(printf("plus case 3\n"));
2135 if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
2136 return start5;
2137 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
2138 return start5;
2139 } else if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
2140 return start5;
2141 }
2142 /* Fall through to general algorithm */
2143 }
2144
2145 } else {
2146 if (end3 < start5) {
2147 /* Case 4: hit5 subsumes hit3 */
2148 debug15(printf("plus case 4\n"));
2149 if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
2150 return start3;
2151 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
2152 return start3;
2153 } else if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
2154 return start3;
2155 }
2156 /* Fall through to general algorithm */
2157
2158 } else {
2159 /* Case 5: Based on hit3_trimmed_length */
2160 debug15(printf("plus case 5a\n"));
2161 if (Substring_overlap_point_trimmed_p(hit5->substring0,start3)) {
2162 return start3;
2163 } else if (Substring_overlap_point_trimmed_p(hit5->substring1,start3)) {
2164 return start3;
2165 } else if (Substring_overlap_point_trimmed_p(hit5->substring2,start3)) {
2166 return start3;
2167 }
2168
2169 /* Case 5: Based on hit5_trimmed_length */
2170 debug15(printf("plus case 5b\n"));
2171 if (Substring_overlap_point_trimmed_p(hit3->substring2,start5)) {
2172 return start5;
2173 } else if (Substring_overlap_point_trimmed_p(hit3->substring1,start5)) {
2174 return start5;
2175 } else if (Substring_overlap_point_trimmed_p(hit3->substring0,start5)) {
2176 return start5;
2177 }
2178 /* Fall through to general algorithm */
2179 }
2180 }
2181
2182 /* General algorithm */
2183 debug15(printf("plus general: hit3->substring1\n"));
2184 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring1)) != 0) {
2185 return common_genomicpos;
2186 } else if (hit5->substring2 != NULL &&
2187 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring1)) != 0) {
2188 return common_genomicpos;
2189 } else if (hit5->substring0 != NULL &&
2190 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring1)) != 0) {
2191 return common_genomicpos;
2192 }
2193
2194 if (hit3->substring2 != NULL) {
2195 debug15(printf("plus general: hit3->substring2\n"));
2196 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring2)) != 0) {
2197 return common_genomicpos;
2198 } else if (hit5->substring2 != NULL &&
2199 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring2)) != 0) {
2200 return common_genomicpos;
2201 } else if (hit5->substring0 != NULL &&
2202 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring2)) != 0) {
2203 return common_genomicpos;
2204 }
2205 }
2206
2207 if (hit3->substring0 != NULL) {
2208 debug15(printf("plus general: hit3->substring0\n"));
2209 if ((common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring1,hit3->substring0)) != 0) {
2210 return common_genomicpos;
2211 } else if (hit5->substring2 != NULL &&
2212 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring2,hit3->substring0)) != 0) {
2213 return common_genomicpos;
2214 } else if (hit5->substring0 != NULL &&
2215 (common_genomicpos = Substring_overlap_segment_trimmed(hit5->substring0,hit3->substring0)) != 0) {
2216 return common_genomicpos;
2217 }
2218 }
2219
2220 return 0;
2221 #endif
2222
2223 } else if (hit5->plusp == false && hit3->plusp == false) {
2224 /* minus/minus */
2225 debug15(printf("Computing overlap using substrings minus/minus\n"));
2226
2227 start5 = (hit5->genomicstart - 1) - hit5->trim_querystart /*- start_amb_length(hit5)*/;
2228 end5 = hit5->genomicend + hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2229 start3 = (hit3->genomicstart - 1) - hit3->trim_querystart /*- start_amb_length(hit3)*/;
2230 end3 = hit3->genomicend + hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2231 debug15(printf("hit5 endpoints are %u..%u. hit3 endpoints are %u..%u\n",
2232 start5-hit5->chroffset,end5-hit5->chroffset,start3-hit3->chroffset,end3-hit3->chroffset));
2233
2234 if (end3 > start5) {
2235 /* Case 1 */
2236 return 0;
2237 } else if (end5 > start3) {
2238 /* Case 6 */
2239 return 0;
2240 } else if (start3 > start5) {
2241 if (end3 > end5) {
2242 /* Case 2: Tails overlap. Go from start5 to end3 */
2243 debug15(printf("minus/minus case 2a: start5 %llu (%u)\n",start5,start5 - hit5->chroffset));
2244 for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2245 substring = (Substring_T) List_head(p);
2246 if (Substring_overlap_point_trimmed_p(substring,start5)) {
2247 return start5;
2248 }
2249 }
2250
2251 /* Case 2: Tails overlap. Go from start5 to end3 */
2252 debug15(printf("plus case 2b: end3 %u\n",end3 - hit3->chroffset));
2253 for (p = hit5->substrings_Nto1; p != NULL; p = List_next(p)) {
2254 substring = (Substring_T) List_head(p);
2255 if (Substring_overlap_point_trimmed_p(substring,end3)) {
2256 return end3;
2257 }
2258 }
2259 /* Fall through to general algorithm */
2260
2261 } else {
2262 /* Case 3: hit3 subsumes hit5 */
2263 debug15(printf("minus/minus case 3: end5 %u\n",end5 - hit5->chroffset));
2264 for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2265 substring = (Substring_T) List_head(p);
2266 if (Substring_overlap_point_trimmed_p(substring,end5)) {
2267 return end5;
2268 }
2269 }
2270
2271 /* Fall through to general algorithm */
2272 }
2273
2274 } else {
2275 if (end3 > end5) {
2276 /* Case 4: hit5 subsumes hit3 */
2277 debug15(printf("minus/minus case 4: start3 %u\n",(Chrpos_T) (start3 - hit3->chroffset)));
2278 for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
2279 substring = (Substring_T) List_head(p);
2280 if (Substring_overlap_point_trimmed_p(substring,start3)) {
2281 return start3;
2282 }
2283 }
2284 /* Fall through to general algorithm */
2285
2286 } else {
2287 /* Case 5: Based on hit3_trimmed_length */
2288 debug15(printf("minus case 5a: start3 %u\n",start3 - hit3->chroffset));
2289 for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
2290 substring = (Substring_T) List_head(p);
2291 if (Substring_overlap_point_trimmed_p(substring,start3)) {
2292 return start3;
2293 }
2294 }
2295
2296 /* Case 5: Based on hit5_trimmed_length */
2297 debug15(printf("minus case 5b: end5 %u\n",end5 - hit5->chroffset));
2298 for (p = hit3->substrings_Nto1; p != NULL; p = List_next(p)) {
2299 substring = (Substring_T) List_head(p);
2300 if (Substring_overlap_point_trimmed_p(substring,end5)) {
2301 return end5;
2302 }
2303 }
2304 /* Fall through to general algorithm */
2305 }
2306 }
2307
2308 /* General algorithm */
2309 debug15(printf("minus/minus general\n"));
2310 for (p = hit3->substrings_1toN; p != NULL; p = List_next(p)) {
2311 substring3 = (Substring_T) List_head(p);
2312 for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
2313 substring5 = (Substring_T) List_head(q);
2314 if ((common_genomicpos = Substring_overlap_segment_trimmed(substring5,substring3)) != 0) {
2315 return common_genomicpos;
2316 }
2317 }
2318 }
2319
2320 return 0;
2321
2322 } else {
2323 abort();
2324 return 0;
2325 }
2326 }
2327
2328
2329 static bool
test_hardclips(Univcoord_T * common_genomicpos,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2330 test_hardclips (Univcoord_T *common_genomicpos, int hardclip_low, Stage3end_T hit_low,
2331 int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2332 Substring_T low_substring, high_substring;
2333 int low_querypos, high_querypos;
2334 int low_querylength, high_querylength;
2335 bool plusp;
2336
2337 low_querylength = hit_low->querylength;
2338 high_querylength = hit_high->querylength;
2339
2340 debug15(printf("Entering test_hardclips with hardclip_low %d, hardclip_high %d\n",
2341 hardclip_low,hardclip_high));
2342 debug15(printf("querylength_low %d, querylength_high %d\n",low_querylength,high_querylength));
2343
2344 plusp = Stage3end_plusp(hit_low);
2345
2346 if (plusp == true) {
2347 low_querypos = hardclip_low;
2348 high_querypos = high_querylength /*- 1*/ - hardclip_high;
2349 debug15(printf("Both substrings, plus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2350
2351 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2352 debug15(printf("Fails because low_querypos %d gives a NULL substring\n",low_querypos));
2353 return false;
2354 } else if (Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring) {
2355 debug15(printf("Fails because low_querypos %d - 1 gives substring %p\n",
2356 low_querypos,Stage3end_substring_containing(hit_low,low_querypos-1)));
2357 return false;
2358 } else if (Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring) {
2359 debug15(printf("Fails because low_querypos %d + 1 gives substring %p\n",
2360 low_querypos,Stage3end_substring_containing(hit_low,low_querypos+1)));
2361 return false;
2362 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2363 debug15(printf("Fails because high_querypos %d gives a NULL substring\n",high_querypos));
2364 return false;
2365 } else if (Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring) {
2366 debug15(printf("Fails because high_querypos %d - 1 gives substring %p\n",
2367 high_querypos,Stage3end_substring_containing(hit_high,high_querypos-1)));
2368 return false;
2369 } else if (Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring) {
2370 debug15(printf("Fails because high_querypos %d + 1 gives substring %p\n",
2371 high_querypos,Stage3end_substring_containing(hit_high,high_querypos+1)));
2372 return false;
2373 } else if (Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset) {
2374 debug15(printf("Fails because low chrpos %u != high chrpos %u\n",
2375 Substring_genomicstart(low_substring) + low_querypos - chroffset,
2376 Substring_genomicstart(high_substring) + high_querypos - chroffset));
2377 return false;
2378 } else {
2379 *common_genomicpos = Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2380 debug15(printf("Succeeds with common point %u\n",*common_genomicpos - chroffset));
2381 return true;
2382 }
2383
2384 } else {
2385 low_querypos = low_querylength /*- 1*/ - hardclip_low;
2386 high_querypos = hardclip_high;
2387 debug15(printf("Both substrings, minus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2388
2389 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2390 debug15(printf("Fails because low_querypos %d gives a NULL substring\n",low_querypos));
2391 return false;
2392 } else if (Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring) {
2393 debug15(printf("Fails because low_querypos %d - 1 gives substring %p\n",
2394 low_querypos,Stage3end_substring_containing(hit_low,low_querypos-1)));
2395 return false;
2396 } else if (Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring) {
2397 debug15(printf("Fails because low_querypos %d + 1 gives substring %p\n",
2398 low_querypos,Stage3end_substring_containing(hit_low,low_querypos+1)));
2399 return false;
2400 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2401 debug15(printf("Fails because high_querypos %d gives a NULL substring\n",high_querypos));
2402 return false;
2403 } else if (Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring) {
2404 debug15(printf("Fails because high_querypos %d - 1 gives substring %p\n",
2405 high_querypos,Stage3end_substring_containing(hit_high,high_querypos-1)));
2406 return false;
2407 } else if (Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring) {
2408 debug15(printf("Fails because high_querypos %d + 1 gives substring %p\n",
2409 high_querypos,Stage3end_substring_containing(hit_high,high_querypos+1)));
2410 return false;
2411 } else if ((Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset) {
2412 debug15(printf("Fails because low chrpos %u != high chrpos %u\n",
2413 (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset,
2414 (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset));
2415 return false;
2416 } else {
2417 *common_genomicpos = (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2418 debug15(printf("Succeeds with common point %u\n",*common_genomicpos - chroffset));
2419 return true;
2420 }
2421 }
2422 }
2423
2424
2425
2426 /* Replaces adjust_hardclips in samprint.c */
2427 static Univcoord_T
adjust_hardclips_right(int * shift,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2428 adjust_hardclips_right (int *shift, int hardclip_low, Stage3end_T hit_low,
2429 int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2430 Substring_T low_substring, high_substring;
2431 int low_querypos, high_querypos;
2432 int low_querylength, high_querylength;
2433 Chrpos_T low_chrpos, high_chrpos;
2434 bool plusp;
2435
2436
2437 low_querylength = hit_low->querylength;
2438 high_querylength = hit_high->querylength;
2439
2440 debug15(printf("Entering adjust_hardclips_right with hardclip_low %d, hardclip_high %d\n",
2441 hardclip_low,hardclip_high));
2442 *shift = 1; /* Making an initial move before each while loop */
2443 plusp = Stage3end_plusp(hit_low);
2444
2445 if (plusp == true) {
2446 low_querypos = hardclip_low;
2447 high_querypos = high_querylength /*- 1*/ - hardclip_high;
2448 debug15(printf("Both substrings, plus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2449
2450 low_querypos++;
2451 high_querypos++;
2452 debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2453 while ((low_querypos + 1) < low_querylength && (high_querypos + 1) < high_querylength &&
2454 ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2455 Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2456 Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2457 (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2458 Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2459 Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2460 Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset)) {
2461 (*shift) += 1;
2462 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2463 low_querypos++;
2464 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2465 high_querypos++;
2466 } else {
2467 low_chrpos = Substring_genomicstart(low_substring) + low_querypos - chroffset;
2468 high_chrpos = Substring_genomicstart(high_substring) + high_querypos - chroffset;
2469 if (low_chrpos < high_chrpos) {
2470 debug15(printf("low_chrpos %u < high_chrpos %u, so advancing low_querypos\n",low_chrpos,high_chrpos));
2471 low_querypos++;
2472 } else if (high_chrpos < low_chrpos) {
2473 debug15(printf("high_chrpos %u < low_chrpos %u, so advancing high_querypos\n",high_chrpos,low_chrpos));
2474 high_querypos++;
2475 } else {
2476 low_querypos++;
2477 high_querypos++;
2478 }
2479 }
2480 debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2481 }
2482
2483 if ((low_querypos + 1) >= low_querylength ||
2484 (high_querypos + 1) >= high_querylength ||
2485 (low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2486 Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2487 *shift = 0;
2488 return 0;
2489 } else {
2490 debug15(printf("Returning %u + %d\n",Substring_genomicstart(low_substring) - chroffset,
2491 low_querypos));
2492 assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2493 assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2494 assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2495 assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2496 assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2497 assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2498 return Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2499 }
2500
2501 } else {
2502 low_querypos = low_querylength /*- 1*/ - hardclip_low;
2503 high_querypos = hardclip_high;
2504 debug15(printf("Both substrings, minus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2505
2506 low_querypos--;
2507 high_querypos--;
2508 debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2509 while ((low_querypos - 1) >= 0 && (high_querypos - 1) >= 0 &&
2510 ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2511 Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2512 Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2513 (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2514 Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2515 Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2516 (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset)) {
2517 (*shift) += 1;
2518 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2519 low_querypos--;
2520 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2521 high_querypos--;
2522 } else {
2523 low_chrpos = (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset;
2524 high_chrpos = (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset;
2525 if (low_chrpos < high_chrpos) {
2526 debug15(printf("low_chrpos %u < high_chrpos %u, so decreasing low_querypos\n",low_chrpos,high_chrpos));
2527 low_querypos--;
2528 } else if (high_chrpos < low_chrpos) {
2529 debug15(printf("high_chrpos %u < low_chrpos %u, so decreasing high_querypos\n",high_chrpos,low_chrpos));
2530 high_querypos--;
2531 } else {
2532 low_querypos--;
2533 high_querypos--;
2534 }
2535 }
2536 debug15(printf("right shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2537 }
2538
2539 if ((low_querypos - 1) < 0 ||
2540 (high_querypos - 1) < 0 ||
2541 (low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2542 Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2543 *shift = 0;
2544 return 0;
2545 } else {
2546 debug15(printf("Returning %u - %d\n",Substring_genomicstart(low_substring) - chroffset,
2547 low_querypos));
2548 assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2549 assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2550 assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2551 assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2552 assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2553 assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2554 return (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2555 }
2556 }
2557 }
2558
2559
2560 /* Replaces adjust_hardclips in samprint.c */
2561 static Univcoord_T
adjust_hardclips_left(int * shift,int hardclip_low,Stage3end_T hit_low,int hardclip_high,Stage3end_T hit_high,Univcoord_T chroffset)2562 adjust_hardclips_left (int *shift, int hardclip_low, Stage3end_T hit_low,
2563 int hardclip_high, Stage3end_T hit_high, Univcoord_T chroffset) {
2564 Substring_T low_substring, high_substring;
2565 int low_querypos, high_querypos;
2566 int low_querylength, high_querylength;
2567 Chrpos_T low_chrpos, high_chrpos;
2568 bool plusp;
2569
2570
2571 low_querylength = hit_low->querylength;
2572 high_querylength = hit_high->querylength;
2573
2574 debug15(printf("Entering adjust_hardclips_left with hardclip_low %d, hardclip_high %d\n",
2575 hardclip_low,hardclip_high));
2576 *shift = 1; /* Making an initial move before each while loop */
2577 plusp = Stage3end_plusp(hit_low);
2578
2579 if (plusp == true) {
2580 low_querypos = hardclip_low;
2581 high_querypos = high_querylength /*- 1*/ - hardclip_high;
2582 debug15(printf("Both substrings, plus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2583
2584 low_querypos--;
2585 high_querypos--;
2586 debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2587 while ((low_querypos - 1) >= 0 && (high_querypos - 1) >= 0 &&
2588 ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2589 Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2590 Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2591 (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2592 Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2593 Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2594 Substring_genomicstart(low_substring) + low_querypos - chroffset != Substring_genomicstart(high_substring) + high_querypos - chroffset)) {
2595 (*shift) += 1;
2596 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2597 low_querypos--;
2598 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2599 high_querypos--;
2600 } else {
2601 low_chrpos = Substring_genomicstart(low_substring) + low_querypos - chroffset;
2602 high_chrpos = Substring_genomicstart(high_substring) + high_querypos - chroffset;
2603 if (low_chrpos > high_chrpos) {
2604 debug15(printf("low_chrpos %u > high_chrpos %u, so decreasing low_querypos\n",low_chrpos,high_chrpos));
2605 low_querypos--;
2606 } else if (high_chrpos > low_chrpos) {
2607 debug15(printf("high_chrpos %u > low_chrpos %u, so decreasing high_querypos\n",high_chrpos,low_chrpos));
2608 high_querypos--;
2609 } else {
2610 low_querypos--;
2611 high_querypos--;
2612 }
2613 }
2614 debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2615 }
2616
2617 if ((low_querypos - 1) < 0 || (high_querypos - 1) < 0 ||
2618 (low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2619 Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2620 *shift = 0;
2621 return 0;
2622 } else {
2623 debug15(printf("Returning %u + %d\n",Substring_genomicstart(low_substring) - chroffset,
2624 low_querypos));
2625 assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2626 assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2627 assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2628 assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2629 assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2630 assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2631 return Substring_genomicstart(low_substring) + low_querypos; /* Want univcoord */
2632 }
2633
2634 } else {
2635 low_querypos = low_querylength /*- 1*/ - hardclip_low;
2636 high_querypos = hardclip_high;
2637 debug15(printf("Both substrings, minus. low_querypos %d, high_querypos %d\n",low_querypos,high_querypos));
2638
2639 low_querypos++;
2640 high_querypos++;
2641 debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2642 while ((low_querypos + 1) < low_querylength && (high_querypos + 1) < high_querylength &&
2643 ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2644 Stage3end_substring_containing(hit_low,low_querypos-1) != low_substring ||
2645 Stage3end_substring_containing(hit_low,low_querypos+1) != low_substring ||
2646 (high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL ||
2647 Stage3end_substring_containing(hit_high,high_querypos-1) != high_substring ||
2648 Stage3end_substring_containing(hit_high,high_querypos+1) != high_substring ||
2649 (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset != (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset)) {
2650 (*shift) += 1;
2651 if ((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL) {
2652 low_querypos++;
2653 } else if ((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) == NULL) {
2654 high_querypos++;
2655 } else {
2656 low_chrpos = (Substring_genomicstart(low_substring) - 1) - low_querypos - chroffset;
2657 high_chrpos = (Substring_genomicstart(high_substring) - 1) - high_querypos - chroffset;
2658 if (low_chrpos > high_chrpos) {
2659 debug15(printf("low_chrpos %u > high_chrpos %u, so advancing low_querypos\n",low_chrpos,high_chrpos));
2660 low_querypos++;
2661 } else if (high_chrpos > low_chrpos) {
2662 debug15(printf("high_chrpos %u > low_chrpos %u, so advancing high_querypos\n",high_chrpos,low_chrpos));
2663 high_querypos++;
2664 } else {
2665 low_querypos++;
2666 high_querypos++;
2667 }
2668 }
2669 debug15(printf("left shift %d: Advancing to low_querypos %d and high_querypos %d\n",*shift,low_querypos,high_querypos));
2670 }
2671
2672 if ((low_querypos + 1) >= low_querylength || (high_querypos + 1) >= high_querylength ||
2673 (low_substring = Stage3end_substring_containing(hit_low,low_querypos)) == NULL ||
2674 Stage3end_substring_containing(hit_high,high_querypos) == NULL) {
2675 *shift = 0;
2676 return 0;
2677 } else {
2678 debug15(printf("Returning %u - %d\n",Substring_genomicstart(low_substring) - chroffset,
2679 low_querypos));
2680 assert((low_substring = Stage3end_substring_containing(hit_low,low_querypos)) != NULL);
2681 assert((high_substring = Stage3end_substring_containing(hit_high,high_querypos)) != NULL);
2682 assert(Stage3end_substring_containing(hit_low,low_querypos-1) == low_substring);
2683 assert(Stage3end_substring_containing(hit_low,low_querypos+1) == low_substring);
2684 assert(Stage3end_substring_containing(hit_high,high_querypos-1) == high_substring);
2685 assert(Stage3end_substring_containing(hit_high,high_querypos+1) == high_substring);
2686 return (Substring_genomicstart(low_substring) - 1) - low_querypos; /* Want univcoord */
2687 }
2688 }
2689 }
2690
2691
2692
2693 /* Note: Do not alter this->insertlength, which is used for SAM
2694 output. The insertlength computed here is used only for performing
2695 --clip-overlap or --merge-overlap */
2696 int
Stage3pair_overlap(int * hardclip5_low,int * hardclip5_high,int * hardclip3_low,int * hardclip3_high,Stage3pair_T this)2697 Stage3pair_overlap (int *hardclip5_low, int *hardclip5_high, int *hardclip3_low, int *hardclip3_high, Stage3pair_T this) {
2698 Stage3end_T hit5, hit3;
2699 int clipdir;
2700 int ilength53, ilength35, ilength5_low, ilength5_high, ilength3_low, ilength3_high;
2701 int common_shift, common_left, common_right;
2702 Univcoord_T common_genomicpos, common_genomicpos_right, common_genomicpos_left;
2703 int shift_right, shift_left;
2704 #ifdef DEBUG15
2705 int overlap;
2706 #endif
2707
2708
2709 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2710
2711 hit5 = this->hit5;
2712 hit3 = this->hit3;
2713
2714 debug15(printf("Entered Stage3pair_overlap with hittype %s and %s\n",
2715 hittype_string(hit5->hittype),hittype_string(hit3->hittype)));
2716 if (hit5->hittype == SAMECHR_SPLICE || hit5->hittype == TRANSLOC_SPLICE) {
2717 return 0;
2718 } else if (hit3->hittype == SAMECHR_SPLICE || hit3->hittype == TRANSLOC_SPLICE) {
2719 return 0;
2720 } else if (hit5->plusp != hit3->plusp) {
2721 debug15(printf("The two ends are not on the same strand, so returning 0\n"));
2722 return 0;
2723 } else {
2724 debug15(printf("hit5 trim_querystart %d + amb_start %d, trim_queryend %d + amb_end %d, hit3 trim_querystart %d + amb_start %d, trim_queryend %d + amb_end %d\n",
2725 hit5->trim_querystart,start_amb_length(hit5),hit5->trim_queryend,end_amb_length(hit5),
2726 hit3->trim_querystart,start_amb_length(hit3),hit3->trim_queryend,end_amb_length(hit3)));
2727 if (hit5->plusp == true) {
2728 /* plus */
2729 #if 0
2730 hit5_trimmed_length = hit5->querylength - hit5->trim_querystart - hit5->trim_queryend - start_amb_length(hit5) - end_amb_length(hit5);
2731 hit3_trimmed_length = hit3->querylength - hit3->trim_querystart - hit3->trim_queryend - start_amb_length(hit3) - end_amb_length(hit3);
2732 totallength = hit5_trimmed_length + hit3_trimmed_length;
2733 debug15(printf("totallength = %d, hit5 trimmed length = %d, hit3 trimmed length = %d\n",
2734 totallength,hit5_trimmed_length,hit3_trimmed_length));
2735 debug15(printf("original insertlength: %d, trim+amb5: %d..%d, trim+amb3: %d..%d\n",
2736 this->insertlength,hit5->trim_querystart + start_amb_length(hit5),
2737 hit5->trim_queryend + end_amb_length(hit5),hit3->trim_querystart + start_amb_length(hit3),
2738 hit3->trim_queryend + end_amb_length(hit3)));
2739 #endif
2740
2741 if ((common_genomicpos = pair_common_genomicpos(hit5,hit3)) == 0) {
2742 debug15(printf("Cannot determine a common point, so returning 0\n"));
2743 return 0;
2744
2745 } else if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2746 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2747 debug15(printf("Cannot determine ilengths, so returning 0\n"));
2748 return 0;
2749
2750 } else {
2751 debug15(printf("Inclusive: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2752 debug15(printf("ilength53 is %d, ilength 35 is %d\n",ilength5_low + ilength3_high - 1,ilength3_low + ilength5_high - 1));
2753
2754 common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
2755 common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
2756 if (common_right > common_left) {
2757 common_shift = common_right/2 - (common_left - 1)/2;
2758 debug15(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
2759 common_shift,common_right,common_left));
2760 assert(ilength5_low > 0);
2761 assert(ilength3_low > 0);
2762 ilength5_low -= 1;
2763 ilength3_low -= 1;
2764 } else {
2765 common_shift = (common_right - 1)/2 - common_left/2;
2766 debug15(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
2767 common_shift,common_right,common_left));
2768 assert(ilength5_high > 0);
2769 assert(ilength3_high > 0);
2770 ilength5_high -= 1;
2771 ilength3_high -= 1;
2772 }
2773 debug15(printf("Exclusive: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2774
2775
2776 if ((ilength53 = ilength5_low + ilength3_high) >= (ilength35 = ilength3_low + ilength5_high)) {
2777 /* Use >=, not >, so we favor clipping heads over clipping tails in case of a tie */
2778 debug15(printf("plus, ilength53 is longer. Clipping heads.\n"));
2779 debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
2780 common_left+common_right-1,common_left,common_right));
2781 clipdir = +1;
2782
2783 /* Want to clip 5 high and 3 low */
2784 *hardclip5_high = ilength5_high - common_shift;
2785 *hardclip3_low = ilength3_low + common_shift;
2786 debug15(printf("Overlap clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2787 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2788 *hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2789 *hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2790 debug15(printf("Ambig clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2791 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2792
2793 if (common_shift != 0) {
2794 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == true) {
2795 /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
2796 } else {
2797 common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
2798 common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
2799 debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
2800 if (shift_right == 0 && shift_left == 0) {
2801 /* Try original position without a shift */
2802 *hardclip5_high = ilength5_high /*- common_shift*/;
2803 *hardclip3_low = ilength3_low /*+ common_shift*/;
2804 *hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2805 *hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2806 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
2807 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2808 return 0;
2809 }
2810 } else if (shift_left == 0) {
2811 common_genomicpos = common_genomicpos_right;
2812 } else if (shift_right == 0) {
2813 common_genomicpos = common_genomicpos_left;
2814 } else if (shift_right <= shift_left) {
2815 common_genomicpos = common_genomicpos_right;
2816 } else {
2817 common_genomicpos = common_genomicpos_left;
2818 }
2819 }
2820
2821 debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
2822 /* Recompute hardclips */
2823 if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2824 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2825 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2826 return 0;
2827 } else if (ilength3_low > ilength5_high) {
2828 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2829 assert(ilength3_low > 0);
2830 ilength3_low -= 1;
2831 } else {
2832 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2833 assert(ilength5_high > 0);
2834 ilength5_high -= 1;
2835 }
2836 debug15(printf("Even: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2837
2838 *hardclip5_high = ilength5_high /*- common_shift*/;
2839 *hardclip3_low = ilength3_low /*+ common_shift*/;
2840 debug15(printf("Initial computation of clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2841 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2842
2843 *hardclip5_high += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
2844 *hardclip3_low += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
2845 debug15(printf("Recomputed clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2846 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2847 }
2848
2849 #if 0
2850 if (*hardclip5_high < 0) {
2851 *hardclip5_high = 0;
2852 }
2853 if (*hardclip3_low < 0) {
2854 *hardclip3_low = 0;
2855 }
2856 debug15(printf("Positive clip for ilength53 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2857 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2858 #endif
2859
2860 } else {
2861 debug15(printf("plus, ilength35 is longer. Clipping tails.\n"));
2862 debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
2863 common_left+common_right-1,common_left,common_right));
2864 clipdir = -1;
2865
2866 /* Want to clip 5 low and 3 high */
2867 *hardclip5_low = ilength5_low + common_shift;
2868 *hardclip3_high = ilength3_high - common_shift;
2869 debug15(printf("Overlap clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2870 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2871 *hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2872 *hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2873 debug15(printf("Ambig clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2874 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2875
2876 if (common_shift != 0) {
2877 if (test_hardclips(&common_genomicpos,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset) == true) {
2878 /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
2879 } else {
2880 common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
2881 common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
2882 debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
2883 if (shift_right == 0 && shift_left == 0) {
2884 /* Try original position without a shift */
2885 *hardclip5_low = ilength5_low /*+ common_shift*/;
2886 *hardclip3_high = ilength3_high /*- common_shift*/;
2887 *hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2888 *hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2889 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
2890 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2891 return 0;
2892 }
2893 } else if (shift_left == 0) {
2894 common_genomicpos = common_genomicpos_right;
2895 } else if (shift_right == 0) {
2896 common_genomicpos = common_genomicpos_left;
2897 } else if (shift_right <= shift_left) {
2898 common_genomicpos = common_genomicpos_right;
2899 } else {
2900 common_genomicpos = common_genomicpos_left;
2901 }
2902 }
2903
2904 debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
2905 /* Recompute hardclips */
2906 if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2907 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2908 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
2909 return 0;
2910 } else if (ilength5_low > ilength3_high) {
2911 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2912 assert(ilength5_low > 0);
2913 ilength5_low -= 1;
2914 } else {
2915 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2916 assert(ilength3_high > 0);
2917 ilength3_high -= 1;
2918 }
2919 debug15(printf("Even: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2920
2921 *hardclip5_low = ilength5_low /*+ common_shift*/;
2922 *hardclip3_high = ilength3_high /*- common_shift*/;
2923 debug15(printf("Initial computation of clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2924 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2925
2926 *hardclip5_low += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
2927 *hardclip3_high += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
2928 debug15(printf("Recomputed clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2929 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2930 }
2931
2932 #if 0
2933 if (*hardclip5_low < 0) {
2934 *hardclip5_low = 0;
2935 }
2936 if (*hardclip3_high < 0) {
2937 *hardclip3_high = 0;
2938 }
2939 debug15(printf("Positive clip for ilength35 plus is hardclip5 %d..%d and hardclip3 %d..%d\n",
2940 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
2941 #endif
2942 }
2943
2944 debug15(printf("returning clipdir %d\n",clipdir));
2945 return clipdir;
2946 }
2947
2948 } else {
2949 /* minus */
2950 #if 0
2951 hit5_trimmed_length = hit5->querylength - hit5->trim_querystart - hit5->trim_queryend - start_amb_length(hit5) - end_amb_length(hit5);
2952 hit3_trimmed_length = hit3->querylength - hit3->trim_querystart - hit3->trim_queryend - start_amb_length(hit3) - end_amb_length(hit3);
2953 totallength = hit5_trimmed_length + hit3_trimmed_length;
2954 debug15(printf("totallength = %d, hit5 trimmed length = %d, hit3 trimmed length = %d\n",
2955 totallength,hit5_trimmed_length,hit3_trimmed_length));
2956 debug15(printf("original insertlength: %d, trim+amb5: %d..%d, trim+amb3: %d..%d\n",
2957 this->insertlength,hit5->trim_querystart + start_amb_length(hit5),
2958 hit5->trim_queryend + hit5->end_amb_length,hit3->trim_querystart + start_amb_length(hit3),
2959 hit3->trim_queryend + hit3->end_amb_length));
2960 #endif
2961
2962 if ((common_genomicpos = pair_common_genomicpos(hit5,hit3)) == 0) {
2963 debug15(printf("Cannot determine a common point, so returning 0\n"));
2964 return 0;
2965
2966 } else if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
2967 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
2968 debug15(printf("Cannot determine ilengths, so returning 0\n"));
2969 return 0;
2970
2971 } else {
2972 debug15(printf("Inclusive: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2973 debug15(printf("ilength53lh is %d, ilength35lh is %d\n",ilength5_low + ilength3_high - 1,ilength3_low + ilength5_high - 1));
2974
2975 common_left = (ilength5_low < ilength3_low) ? ilength5_low : ilength3_low;
2976 common_right = (ilength5_high < ilength3_high) ? ilength5_high : ilength3_high;
2977 if (common_right > common_left) {
2978 common_shift = common_right/2 - (common_left - 1)/2;
2979 debug15(printf("Common shift is %d = common_right %d/2 - (common_left %d - 1)/2\n",
2980 common_shift,common_right,common_left));
2981 assert(ilength5_low > 0);
2982 assert(ilength3_low > 0);
2983 ilength5_low -= 1;
2984 ilength3_low -= 1;
2985 } else {
2986 common_shift = (common_right - 1)/2 - common_left/2;
2987 debug15(printf("Common shift is %d = (common_right %d - 1)/2 - common_left %d/2\n",
2988 common_shift,common_right,common_left));
2989 assert(ilength5_high > 0);
2990 assert(ilength3_high > 0);
2991 ilength5_high -= 1;
2992 ilength3_high -= 1;
2993 }
2994 debug15(printf("Exclusive: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
2995
2996 if ((ilength53 = ilength5_low + ilength3_high) > (ilength35 = ilength3_low + ilength5_high)) {
2997 /* Use >, not >=, so we favor clipping heads over clipping tails in case of a tie */
2998 debug15(printf("minus, ilength53 is longer. Clipping tails.\n"));
2999 debug15(overlap = common_left + common_right - 1);
3000 debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
3001 overlap,common_left,common_right));
3002 clipdir = +1;
3003
3004
3005 /* Want to clip 5 high and 3 low */
3006 *hardclip5_high = ilength5_high - common_shift;
3007 *hardclip3_low = ilength3_low + common_shift;
3008 debug15(printf("Overlap clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3009 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3010 *hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3011 *hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3012 debug15(printf("Ambig clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3013 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3014
3015 if (common_shift != 0) {
3016 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == true) {
3017 /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
3018 } else {
3019 common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
3020 common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset);
3021 debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
3022 if (shift_right == 0 && shift_left == 0) {
3023 /* Try original position without a shift */
3024 *hardclip5_high = ilength5_high /*- common_shift*/;
3025 *hardclip3_low = ilength3_low /*+ common_shift*/;
3026 *hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3027 *hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3028 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
3029 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3030 return 0;
3031 }
3032 } else if (shift_left == 0) {
3033 common_genomicpos = common_genomicpos_right;
3034 } else if (shift_right == 0) {
3035 common_genomicpos = common_genomicpos_left;
3036 } else if (shift_right <= shift_left) {
3037 common_genomicpos = common_genomicpos_right;
3038 } else {
3039 common_genomicpos = common_genomicpos_left;
3040 }
3041 }
3042
3043 debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
3044 /* Recompute hardclips */
3045 if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
3046 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
3047 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3048 return 0;
3049 } else if (ilength3_low > ilength5_high) {
3050 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3051 assert(ilength3_low > 0);
3052 ilength3_low -= 1;
3053 } else {
3054 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3055 assert(ilength5_high > 0);
3056 ilength5_high -= 1;
3057 }
3058 debug15(printf("Even: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3059
3060 *hardclip5_high = ilength5_high /*- common_shift*/;
3061 *hardclip3_low = ilength3_low /*+ common_shift*/;
3062 debug15(printf("Initial computation of clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3063 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3064
3065 *hardclip5_high += hit5->trim_querystart /*+ start_amb_length(hit5)*/;
3066 *hardclip3_low += hit3->trim_queryend /*+ end_amb_length(hit3)*/;
3067 debug15(printf("Recomputed clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3068 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3069 }
3070
3071 #if 0
3072 if (*hardclip5_high < 0) {
3073 *hardclip5_high = 0;
3074 }
3075 if (*hardclip3_low < 0) {
3076 *hardclip3_low = 0;
3077 }
3078 debug15(printf("Positive clip for ilength53 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3079 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3080 #endif
3081
3082 } else {
3083 debug15(printf("minus, ilength35 is longer. Clipping heads.\n"));
3084 debug15(overlap = common_left + common_right - 1);
3085 debug15(printf("Overlap is %d = common_left %d + common_right %d - 1\n",
3086 overlap,common_left,common_right));
3087 clipdir = -1;
3088
3089 /* Want to clip 5 low and 3 high */
3090 *hardclip5_low = ilength5_low + common_shift;
3091 *hardclip3_high = ilength3_high - common_shift;
3092 debug15(printf("Overlap clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3093 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3094 *hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3095 *hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3096 debug15(printf("Ambig clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3097 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3098
3099 if (common_shift != 0) {
3100 if (test_hardclips(&common_genomicpos,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset) == true) {
3101 /* No adjustment needed, but need to recompute ilengths for shifted common_genomicpos */
3102 } else {
3103 common_genomicpos_right = adjust_hardclips_right(&shift_right,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
3104 common_genomicpos_left = adjust_hardclips_left(&shift_left,*hardclip5_low,hit5,*hardclip3_high,hit3,hit3->chroffset);
3105 debug15(printf("shift_right %d, shift_left %d\n",shift_right,shift_left));
3106 if (shift_right == 0 && shift_left == 0) {
3107 /* Try original position without a shift */
3108 *hardclip5_low = ilength5_low /*+ common_shift*/;
3109 *hardclip3_high = ilength3_high /*- common_shift*/;
3110 *hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3111 *hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3112 if (test_hardclips(&common_genomicpos,*hardclip3_low,hit3,*hardclip5_high,hit5,hit3->chroffset) == false) {
3113 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3114 return 0;
3115 }
3116 } else if (shift_left == 0) {
3117 common_genomicpos = common_genomicpos_right;
3118 } else if (shift_right == 0) {
3119 common_genomicpos = common_genomicpos_left;
3120 } else if (shift_right <= shift_left) {
3121 common_genomicpos = common_genomicpos_right;
3122 } else {
3123 common_genomicpos = common_genomicpos_left;
3124 }
3125 }
3126
3127 debug15(printf("New common point is %u\n",common_genomicpos - hit3->chroffset));
3128 /* Recompute hardclips */
3129 if (find_ilengths(&ilength5_low,&ilength5_high,hit5,common_genomicpos) == false ||
3130 find_ilengths(&ilength3_low,&ilength3_high,hit3,common_genomicpos) == false) {
3131 *hardclip5_low = *hardclip5_high = *hardclip3_low = *hardclip3_high = 0;
3132 return 0;
3133 } else if (ilength5_low > ilength3_high) {
3134 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3135 assert(ilength5_low > 0);
3136 ilength5_low -= 1;
3137 } else {
3138 debug15(printf("Uneven: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3139 assert(ilength3_high > 0);
3140 ilength3_high -= 1;
3141 }
3142 debug15(printf("Even: ilengths5: %d|%d. ilengths3: %d|%d\n",ilength5_low,ilength5_high,ilength3_low,ilength3_high));
3143
3144 *hardclip5_low = ilength5_low /*+ common_shift*/;
3145 *hardclip3_high = ilength3_high /*- common_shift*/;
3146 debug15(printf("Initial computation of clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3147 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3148
3149 *hardclip5_low += hit5->trim_queryend /*+ end_amb_length(hit5)*/;
3150 *hardclip3_high += hit3->trim_querystart /*+ start_amb_length(hit3)*/;
3151 debug15(printf("Recomputed clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3152 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3153 }
3154
3155 #if 0
3156 if (*hardclip5_low < 0) {
3157 *hardclip5_low = 0;
3158 }
3159 if (*hardclip3_high < 0) {
3160 *hardclip3_high = 0;
3161 }
3162 debug15(printf("Positive clip for ilength35 minus is hardclip5 %d..%d and hardclip3 %d..%d\n",
3163 *hardclip5_low,*hardclip5_high,*hardclip3_low,*hardclip3_high));
3164 #endif
3165 }
3166 }
3167
3168 debug15(printf("returning clipdir %d\n",clipdir));
3169 return clipdir;
3170 }
3171 }
3172 }
3173
3174
3175 void
Stage3pair_free(Stage3pair_T * old)3176 Stage3pair_free (Stage3pair_T *old) {
3177 debug0(printf("Freeing pair %p with hits %p and %p\n",*old,(*old)->hit5,(*old)->hit3));
3178 assert((*old)->hit3 != NULL);
3179 debug0(printf("Freeing end3 at %p\n",(*old)->hit3));
3180 Stage3end_free(&(*old)->hit3);
3181
3182 assert((*old)->hit5 != NULL);
3183 debug0(printf("Freeing end5 at %p\n",(*old)->hit5));
3184 Stage3end_free(&(*old)->hit5);
3185
3186 FREE_OUT(*old);
3187 return;
3188 }
3189
3190
3191
/* Disabled code (compiled out by #if 0).  Returns the tally for a
   pair: 0 when tally_iit is NULL, the cached this->tally when it is
   non-negative, and otherwise the sum of Stage3end_compute_tally over
   both ends, which is stored in this->tally before being returned. */
#if 0
static long int
Stage3pair_tally (Stage3pair_T this) {

  if (tally_iit == NULL) {
    return 0L;
  } else if (this->tally >= 0) {
    return this->tally;
  } else {
    this->tally = Stage3end_compute_tally(this->hit5) + Stage3end_compute_tally(this->hit3);
    return this->tally;
  }
}
#endif
3206
3207
/* Lookup table, indexed by character code, initialized from
   COMPLEMENT_LC.  Used by make_complement_inplace to translate each
   character of a sequence (presumably to its complementary
   nucleotide -- see the definition of COMPLEMENT_LC). */
static char complCode[128] = COMPLEMENT_LC;
3209
/* Disabled code (compiled out by #if 0).  Writes into complement the
   characters of sequence[0..length-1] in reverse order, each mapped
   through complCode, then stores a terminating '\0' at
   complement[length].  Returns complement, which the caller must have
   sized to hold length+1 characters. */
#if 0
static char *
make_complement_buffered (char *complement, char *sequence, unsigned int length) {
  int i, j;

  /* complement = (char *) CALLOC_OUT(length+1,sizeof(char)); */
  for (i = length-1, j = 0; i >= 0; i--, j++) {
    complement[j] = complCode[(int) sequence[i]];
  }
  complement[length] = '\0';
  return complement;
}
#endif
3223
3224 static char *
make_complement_inplace(char * sequence,unsigned int length)3225 make_complement_inplace (char *sequence, unsigned int length) {
3226 char temp;
3227 unsigned int i, j;
3228
3229 for (i = 0, j = length-1; i < length/2; i++, j--) {
3230 temp = complCode[(int) sequence[i]];
3231 sequence[i] = complCode[(int) sequence[j]];
3232 sequence[j] = temp;
3233 }
3234 if (i == j) {
3235 sequence[i] = complCode[(int) sequence[i]];
3236 }
3237
3238 return sequence;
3239 }
3240
3241 char *
Stage3end_substrings_genomic_sequence(int * seqlength,T this,Genome_T genome)3242 Stage3end_substrings_genomic_sequence (int *seqlength, T this, Genome_T genome) {
3243 char *gbuffer;
3244 List_T p, q;
3245 Substring_T substring;
3246 Junction_T junction;
3247 int querypos, querystart, queryend, querylength, substring_length;
3248
3249 *seqlength = 0;
3250 for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
3251 substring = (Substring_T) List_head(p);
3252 #ifdef NO_SOFT_CLIPS
3253 querystart = Substring_querystart_orig(substring);
3254 queryend = Substring_queryend_orig(substring);
3255 #else
3256 querystart = Substring_querystart(substring);
3257 queryend = Substring_queryend(substring);
3258 #endif
3259 *seqlength += queryend - querystart;
3260 }
3261 for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
3262 junction = (Junction_T) List_head(p);
3263 if (Junction_type(junction) == DEL_JUNCTION) {
3264 *seqlength += Junction_nindels(junction);
3265 }
3266 }
3267
3268 gbuffer = (char *) MALLOC((*seqlength+1) * sizeof(char));
3269 if (this->plusp == true) {
3270 /* Build from querystart to queryend, so we don't wipe out sequence with terminating \0 character */
3271 querypos = 0;
3272 for (p = this->substrings_1toN, q = this->junctions_1toN; p != NULL; p = List_next(p), q = List_next(q)) {
3273 substring = (Substring_T) List_head(p);
3274 #ifdef NO_SOFT_CLIPS
3275 querystart = Substring_querystart_orig(substring);
3276 queryend = Substring_queryend_orig(substring);
3277 #else
3278 querystart = Substring_querystart(substring);
3279 queryend = Substring_queryend(substring);
3280 #endif
3281 substring_length = queryend - querystart;
3282 Genome_fill_buffer_simple(genome,Substring_left(substring) + querystart,
3283 substring_length,&(gbuffer[querypos]));
3284 querypos += substring_length;
3285
3286 if (q != NULL) {
3287 junction = (Junction_T) List_head(q);
3288 if (Junction_type(junction) == DEL_JUNCTION) {
3289 substring_length = Junction_nindels(junction);
3290 Genome_fill_buffer_simple(genome,Junction_deletionpos(junction),
3291 substring_length,&(gbuffer[querypos]));
3292 querypos += substring_length;
3293 }
3294 }
3295 }
3296
3297 return gbuffer;
3298
3299 } else {
3300 /* Build from queryend to querystart, so we don't wipe out sequence with terminating \0 character */
3301 querypos = 0;
3302 querylength = this->querylength;
3303 for (p = this->substrings_Nto1, q = this->junctions_Nto1; p != NULL; p = List_next(p), q = List_next(q)) {
3304 substring = (Substring_T) List_head(p);
3305 #ifdef NO_SOFT_CLIPS
3306 querystart = Substring_querystart_orig(substring);
3307 queryend = Substring_queryend_orig(substring);
3308 #else
3309 querystart = Substring_querystart(substring);
3310 queryend = Substring_queryend(substring);
3311 #endif
3312 substring_length = queryend - querystart;
3313 Genome_fill_buffer_simple(genome,Substring_left(substring) + (querylength - queryend),
3314 substring_length,&(gbuffer[querypos]));
3315 querypos += substring_length;
3316
3317 if (q != NULL) {
3318 junction = (Junction_T) List_head(q);
3319 if (Junction_type(junction) == DEL_JUNCTION) {
3320 substring_length = Junction_nindels(junction);
3321 Genome_fill_buffer_simple(genome,Junction_deletionpos(junction),
3322 substring_length,&(gbuffer[querypos]));
3323 querypos += substring_length;
3324 }
3325 }
3326 }
3327
3328 return make_complement_inplace(gbuffer,*seqlength);
3329 }
3330 }
3331
3332
/* Exception descriptor carrying the message "Substring invalid during copy" */
const Except_T Copy_Substring = { "Substring invalid during copy" };
3334
3335 static T
Stage3end_copy(T old,Listpool_T listpool)3336 Stage3end_copy (T old, Listpool_T listpool) {
3337 T new = (T) MALLOC_OUT(sizeof(*new));
3338 List_T p;
3339 Substring_T old_substring, new_substring;
3340 Junction_T old_junction, new_junction;
3341
3342 debug0(printf("*****Copying Stage3end %p -> %p of type %s\n",
3343 old,new,hittype_string(old->hittype)));
3344
3345 new->hittype = old->hittype;
3346 new->method = old->method;
3347 new->level = old->level;
3348
3349 new->querylength = old->querylength;
3350 new->querylength_adj = old->querylength_adj;
3351
3352 new->transcripts = Transcript_copy_list(old->transcripts);
3353 new->transcripts_other = Transcript_copy_list(old->transcripts_other);
3354
3355 new->substrings_1toN = (List_T) NULL;
3356 new->substrings_Nto1 = (List_T) NULL;
3357
3358 new->junctions_1toN = (List_T) NULL;
3359 new->junctions_Nto1 = (List_T) NULL;
3360
3361 for (p = old->substrings_1toN; p != NULL; p = List_next(p)) {
3362 old_substring = (Substring_T) List_head(p);
3363 new_substring = Substring_copy(old_substring);
3364 new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) new_substring);
3365 }
3366
3367 for (p = old->junctions_1toN; p != NULL; p = List_next(p)) {
3368 old_junction = (Junction_T) List_head(p);
3369 new_junction = Junction_copy(old_junction);
3370 new->junctions_1toN = Listpool_push(new->junctions_1toN,listpool,(void *) new_junction);
3371 }
3372
3373 new->substrings_Nto1 = Listpool_copy(new->substrings_1toN,listpool); /* Before reversal of 1toN */
3374 new->junctions_Nto1 = Listpool_copy(new->junctions_1toN,listpool); /* Before reversal of 1toN */
3375
3376 /* Reversals to handle builds of 1toN */
3377 new->substrings_1toN = List_reverse(new->substrings_1toN);
3378 new->junctions_1toN = List_reverse(new->junctions_1toN);
3379
3380
3381 new->trim_querystart = old->trim_querystart;
3382 new->trim_queryend = old->trim_queryend;
3383 new->mandatory_trim_querystart = old->mandatory_trim_querystart;
3384 new->mandatory_trim_queryend = old->mandatory_trim_queryend;
3385 new->trim_querystart_splicep = old->trim_querystart_splicep;
3386 new->trim_queryend_splicep = old->trim_queryend_splicep;
3387
3388 new->querystart_chrbound = old->querystart_chrbound;
3389 new->queryend_chrbound = old->queryend_chrbound;
3390
3391 new->genomicstart = old->genomicstart;
3392 new->genomicend = old->genomicend;
3393
3394 new->low = old->low;
3395 new->high = old->high;
3396 new->genomiclength = old->genomiclength;
3397 new->guided_insertlength = old->guided_insertlength;
3398
3399 new->distant_splice_p = old->distant_splice_p;
3400 new->chrnum = old->chrnum;
3401 new->effective_chrnum = old->effective_chrnum;
3402 new->other_chrnum = old->other_chrnum;
3403 new->chroffset = old->chroffset;
3404 new->chrhigh = old->chrhigh;
3405 new->chrlength = old->chrlength;
3406 new->plusp = old->plusp;
3407 new->genestrand = old->genestrand;
3408
3409 new->sensedir = old->sensedir;
3410 new->sensedir_for_concordance = old->sensedir_for_concordance;
3411
3412 new->nsplices = old->nsplices;
3413 new->splice_score = old->splice_score;
3414 new->nindels = old->nindels;
3415
3416 new->nmismatches_bothdiff = old->nmismatches_bothdiff;
3417 new->nmismatches_refdiff = old->nmismatches_refdiff;
3418 new->nsegments = old->nsegments;
3419
3420 new->refalt_nmatches_to_trims = old->refalt_nmatches_to_trims;
3421 new->ref_nmatches_to_trims = old->ref_nmatches_to_trims;
3422
3423 new->ref_score_overall = old->ref_score_overall;
3424 new->refalt_score_overall = old->refalt_score_overall;
3425 new->refalt_score_within_trims = old->refalt_score_within_trims;
3426
3427 new->refalt_nmatches_plus_spliced_trims = old->refalt_nmatches_plus_spliced_trims;
3428 new->ref_nmatches_plus_spliced_trims = old->ref_nmatches_plus_spliced_trims;
3429
3430 new->paired_usedp = old->paired_usedp;
3431
3432 new->circularalias = old->circularalias;
3433 new->circularpos = old->circularpos;
3434 new->altlocp = old->altlocp;
3435 debug12(printf("Copying circularpos of %d from hit %p to hit %p\n",new->circularpos,old,new));
3436
3437 new->score_eventrim = old->score_eventrim;
3438 new->mapq_loglik = old->mapq_loglik;
3439 new->mapq_score = old->mapq_score;
3440 new->absmq_score = old->absmq_score;
3441
3442 /* Actually, the assertion is excluded only for the JOIN hittype */
3443 assert(new->hittype == SPLICE || Substring_querystart(List_head(new->substrings_1toN)) <= Substring_querystart(List_head(new->substrings_Nto1)));
3444
3445 return new;
3446 }
3447
3448
3449 static int
compute_circularpos(int * circularalias,T hit)3450 compute_circularpos (int *circularalias, T hit) {
3451 int circularpos;
3452 List_T substrings_LtoH, p;
3453 Substring_T substring;
3454
3455
3456 debug12(printf("Computing circularpos on hit at %u..%u, plusp %d, with trim left %d and trim right %d\n",
3457 hit->low - hit->chroffset,hit->high - hit->chroffset,
3458 hit->plusp,hit->trim_querystart,hit->trim_queryend));
3459 if (circularp[hit->chrnum] == false) {
3460 debug12(printf("Chromosome #%d is not circular\n",hit->chrnum));
3461 /* This also handles hit->chrnum == 0, where translocation cannot be circular */
3462 *circularalias = 0;
3463 return -1;
3464
3465 } else if (hit->low - hit->chroffset >= hit->chrlength) {
3466 /* All of read after trimming is in high part. Previously
3467 checked hit->high against hit->chrhigh, for circularalias of
3468 +2, but that should be fixed now */
3469
3470 debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3471 debug12(printf("All of read after trimming %u..%u is in high part\n",
3472 hit->low - hit->chroffset,hit->high - hit->chroffset));
3473 *circularalias = +1; /* All of read is in second copy */
3474 debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3475 return -1;
3476
3477 } else if (hit->high - hit->chroffset < hit->chrlength) {
3478 /* All of read after trimming is in low part. Previously
3479 checked hit->low against hit->chroffset for circularalias of
3480 -2, but that should be fixed now */
3481
3482 debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3483 debug12(printf("All of read after trimming %u..%u is in low part\n",
3484 hit->low - hit->chroffset,hit->high - hit->chroffset));
3485 *circularalias = -1; /* All of read is in first copy */
3486 debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3487 return -1;
3488
3489 } else {
3490 *circularalias = 0; /* Straddling middle */
3491 if (hit->plusp == true) {
3492 substrings_LtoH = hit->substrings_1toN;
3493 } else {
3494 substrings_LtoH = hit->substrings_Nto1;
3495 }
3496
3497 debug12(printf("Circular chromosome of length %u\n",hit->chrlength));
3498 for (p = substrings_LtoH; p != NULL; p = List_next(p)) {
3499 substring = (Substring_T) List_head(p);
3500 if ((circularpos = Substring_circularpos(substring)) > 0) {
3501 debug12(printf("For hit %p, returning circularpos %d from substring (plus)\n",hit,circularpos));
3502 return circularpos;
3503 }
3504 }
3505 debug12(printf("For hit %p, pair circularpos is -1, circularalias is %d\n",hit,*circularalias));
3506 return -1;
3507 }
3508 }
3509
3510
3511 /* Modified from Stage3end_new_precomputed for a single substring */
3512 T
Stage3end_new_terminal(int * found_score_overall,int * found_score_within_trims,Substring_T substring_in,int querylength,bool gplusp,int genestrand,int sensedir,Listpool_T listpool,Method_T method,int level)3513 Stage3end_new_terminal (int *found_score_overall, int *found_score_within_trims,
3514 Substring_T substring_in, int querylength,
3515 bool gplusp, int genestrand, int sensedir, Listpool_T listpool,
3516 Method_T method, int level) {
3517 T new;
3518
3519 Substring_T substring;
3520 Chrnum_T chrnum;
3521 Univcoord_T chroffset, chrhigh;
3522 Chrpos_T chrlength;
3523
3524 Univcoord_T genomicstart, genomicend;
3525 List_T substrings;
3526 List_T p;
3527 int adj = 0;
3528
3529
3530 substring = Substring_copy(substring_in); /* Always make a copy of the input substring */
3531 chrnum = Substring_chrnum(substring);
3532 chroffset = Substring_chroffset(substring);
3533 chrhigh = Substring_chrhigh(substring);
3534 chrlength = Substring_chrlength(substring);
3535
3536 debug0(printf("Entered Stage3end_new_terminal, method %s, with chrnum %d, query %d..%d\n",
3537 Method_string(method),chrnum,Substring_querystart(substring),Substring_queryend(substring)));
3538
3539 new = (T) MALLOC_OUT(sizeof(*new));
3540 new->hittype = SUBSTRINGS;
3541 new->method = method;
3542 new->level = level;
3543
3544 new->querylength = querylength;
3545 new->querylength_adj = querylength + adj;
3546
3547 /* Caller must not free these lists */
3548 new->transcripts = (List_T) NULL;
3549 new->transcripts_other = (List_T) NULL;
3550
3551 /* Unlike Stage3end_new_substrings, where substrings and junctions
3552 are in opposite orders, substrings and junctions here are in the
3553 same order. */
3554
3555 substrings = Listpool_push(NULL,listpool,(void *) substring);
3556 new->substrings_1toN = substrings;
3557 new->substrings_Nto1 = Listpool_copy(substrings,listpool);
3558 /* Do not use substrings after this */
3559
3560 new->junctions_1toN = new->junctions_Nto1 = (List_T) NULL;
3561 /* There is no junctions_HtoL field */
3562 /* Do not use junctions after this */
3563
3564 #if 0
3565 /* No need to reverse for a single substring */
3566 if (gplusp == true) {
3567 /* Substrings, head to tail, are query low to high and genome low to high */
3568 new->substrings_HtoL = List_reverse(new->substrings_HtoL);
3569 } else {
3570 /* Substrings, head to tail, are query low to high and genome high to low */
3571 new->substrings_LtoH = List_reverse(new->substrings_LtoH);
3572 new->junctions_LtoH = List_reverse(new->junctions_LtoH);
3573 }
3574 #endif
3575
3576 #ifdef DEBUG0
3577 printf("NEW SUBSTRING\n");
3578 printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",Substring_querystart(substring),Substring_queryend(substring),
3579 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3580 Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
3581 printf("\n");
3582 #endif
3583
3584 new->trim_querystart = Substring_trim_querystart(substring);
3585 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring);
3586 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring);
3587 new->trim_queryend = Substring_trim_queryend(substring);
3588 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substring);
3589 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substring);
3590 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
3591 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
3592
3593 new->querystart_chrbound = Substring_querystart_chrbound(substring);
3594 new->queryend_chrbound = Substring_queryend_chrbound(substring);
3595 if (new->trim_querystart > new->querystart_chrbound) {
3596 new->querystart_chrbound = new->trim_querystart;
3597 }
3598 if (querylength - new->trim_queryend < new->queryend_chrbound) {
3599 new->queryend_chrbound = querylength - new->trim_queryend;
3600 }
3601 assert(new->querystart_chrbound < new->queryend_chrbound);
3602 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
3603
3604
3605 genomicstart = Substring_genomicstart(substring);
3606 genomicend = Substring_genomicend(substring);
3607 new->genomicstart = genomicstart;
3608 new->genomicend = genomicend;
3609
3610 if (gplusp == true) {
3611 new->low = genomicstart + new->querystart_chrbound;
3612 new->high = genomicend - (querylength - new->queryend_chrbound);
3613 new->genomiclength = genomicend - genomicstart;
3614 } else {
3615 new->low = genomicend + (querylength - new->queryend_chrbound);
3616 new->high = genomicstart - new->querystart_chrbound;
3617 new->genomiclength = genomicstart - genomicend;
3618 }
3619 assert(new->low < new->high);
3620 debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
3621
3622 new->guided_insertlength = 0U;
3623
3624 new->distant_splice_p = false;
3625 new->chrnum = new->effective_chrnum = chrnum;
3626 new->other_chrnum = 0;
3627 new->chroffset = chroffset;
3628 new->chrhigh = chrhigh;
3629 new->chrlength = chrlength;
3630 new->plusp = gplusp;
3631 new->genestrand = genestrand;
3632
3633 new->sensedir_for_concordance = new->sensedir = sensedir;
3634
3635 new->nsplices = 0;
3636 new->splice_score = 0.0;
3637 new->nindels = 0;
3638
3639 new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(substring); /* Trimmed */
3640 new->nmismatches_refdiff = Substring_nmismatches_refdiff(substring);
3641 new->nsegments = List_length(new->substrings_1toN);
3642
3643
3644 new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
3645 /* Note: Cannot use substrings variable here. Need to use new->substrings_1toN */
3646 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3647 substring = (Substring_T) List_head(p);
3648 new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
3649 new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
3650 }
3651 debug0(printf("**Setting nmatches_to_trims to be %d\n",new->refalt_nmatches_to_trims));
3652
3653 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_amb_length(substring);
3654 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_amb_length(substring);
3655 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
3656
3657 /* Used for global comparisons */
3658 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
3659 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
3660 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
3661 if (Substring_trim_querystart_splicep(substring) == false) {
3662 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring)/END_BINSIZE);
3663 } else {
3664 new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring)/END_BINSIZE);
3665 }
3666 if (Substring_trim_queryend_splicep(substring) == false) {
3667 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substring))/END_BINSIZE);
3668 } else {
3669 new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substring))/END_BINSIZE);
3670 }
3671 /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
3672
3673 if (chrlength < (Univcoord_T) querylength) {
3674 new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
3675 new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
3676 new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
3677 }
3678 assert(new->refalt_score_within_trims >= 0);
3679
3680
3681 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
3682 if (new->refalt_score_overall < *found_score_overall) {
3683 *found_score_overall = new->refalt_score_overall;
3684 }
3685 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
3686 if (new->refalt_score_within_trims < *found_score_within_trims) {
3687 *found_score_within_trims = new->refalt_score_within_trims;
3688 }
3689
3690
3691 /* new->penalties = 0; */
3692
3693 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
3694 /* new->tally = -1L; */
3695
3696 new->paired_usedp = false;
3697
3698 /* new->query_splicepos = -1; */
3699 new->circularpos = compute_circularpos(&new->circularalias,new);
3700
3701 if ((new->altlocp = altlocp[chrnum]) == false) {
3702 debug0(printf("*****Method %s: Stage3end_new_terminal returning primary %p at %u..%u\n\n",
3703 Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset));
3704 return new;
3705
3706 } else {
3707 debug0(printf("*****Method %s: Stage3end_new_terminal returning altloc %p at %u..%u\n\n",
3708 Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset));
3709 return new;
3710 }
3711 }
3712
3713
3714
3715 /* Called only by kmer-search.c */
3716 T
Stage3end_new_precomputed(int * found_score_overall,int * found_score_within_trims,int nmismatches_bothdiff,int nmismatches_refdiff,List_T substrings,List_T junctions,List_T transcripts,List_T transcripts_other,int querylength,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool gplusp,int genestrand,int sensedir,Listpool_T listpool,Method_T method,int level)3717 Stage3end_new_precomputed (int *found_score_overall, int *found_score_within_trims,
3718 int nmismatches_bothdiff, int nmismatches_refdiff,
3719 List_T substrings, List_T junctions, List_T transcripts, List_T transcripts_other,
3720 int querylength, Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
3721 bool gplusp, int genestrand, int sensedir, Listpool_T listpool, Method_T method, int level) {
3722 T new;
3723
3724 Univcoord_T genomicstart, genomicend;
3725 Substring_T substring, substring1, substringN;
3726 Junction_T junction;
3727 List_T p;
3728 int adj = 0;
3729 int nsites;
3730 double prob_total;
3731
3732
3733 #ifdef DEBUG0
3734 printf("Entered Stage3end_new_precomputed, method %s, with gplusp %d\n",Method_string(method),gplusp);
3735 printf("%d substrings\n",List_length(substrings));
3736 printf("%d junctions\n",List_length(junctions));
3737 #endif
3738 assert(List_length(substrings) == List_length(junctions) + 1);
3739
3740 new = (T) MALLOC_OUT(sizeof(*new));
3741 new->hittype = SUBSTRINGS;
3742 new->method = method;
3743 new->level = level;
3744
3745 new->querylength = querylength;
3746 new->querylength_adj = querylength + adj;
3747
3748 /* Caller must not free these lists */
3749 new->transcripts = transcripts;
3750 new->transcripts_other = transcripts_other;
3751
3752 /* Unlike Stage3end_new_substrings, where substrings and junctions
3753 are in opposite orders, substrings and junctions here are in the
3754 same order. */
3755
3756 new->substrings_1toN = substrings;
3757 new->substrings_Nto1 = List_reverse(Listpool_copy(substrings,listpool));
3758 new->junctions_1toN = junctions;
3759 new->junctions_Nto1 = List_reverse(Listpool_copy(junctions,listpool));
3760
3761
3762 /* There is no junctions_HtoL field */
3763
3764 #if 0
3765 if (gplusp == true) {
3766 /* Substrings, head to tail, are query low to high and genome low to high */
3767 new->substrings_LtoH = Listpool_copy(new->substrings_1toN,listpool);
3768 new->substrings_HtoL = Listpool_copy(new->substrings_Nto1,listpool);
3769 new->junctions_LtoH = Listpool_copy(new->junctions_1toN,listpool);
3770 /* new->junctions_HtoL = Listpool_copy(new->junctions_Nto1,listpool); */
3771 } else {
3772 /* Substrings, head to tail, are query low to high and genome high to low */
3773 new->substrings_LtoH = Listpool_copy(new->substrings_Nto1,listpool);
3774 new->substrings_HtoL = Listpool_copy(new->substrings_1toN,listpool);
3775 new->junctions_LtoH = Listpool_copy(new->junctions_Nto1,listpool);
3776 /* new->junctions_HtoL = Listpool_copy(new->junctions_1toN,listpool); */
3777 }
3778 #endif
3779 /* Do not use substrings after this */
3780 /* Do not use junctions after this */
3781
3782
3783
3784 #ifdef DEBUG0
3785 printf("NEW SUBSTRINGS (query order)\n");
3786 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3787 substring = List_head(p);
3788 if (Substring_ambiguous_p(substring) == true) {
3789 printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",Substring_querystart(substring),Substring_queryend(substring),
3790 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3791 Substring_nmatches_to_trims(substring),Substring_amb_length(substring),Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring)
3792 );
3793 } else {
3794 printf("%d..%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",Substring_querystart(substring),Substring_queryend(substring),
3795 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
3796 Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
3797 }
3798 }
3799 printf("\n");
3800
3801 printf("NEW JUNCTIONS (query order)\n");
3802 for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
3803 junction = List_head(p);
3804 printf("splice distance %u, nindels %d\n",Junction_splice_distance(junction),Junction_nindels(junction));
3805 }
3806 printf("\n");
3807 #endif
3808
3809
3810 substring1 = (Substring_T) List_head(new->substrings_1toN);
3811 substringN = (Substring_T) List_head(new->substrings_Nto1);
3812
3813 new->trim_querystart = Substring_trim_querystart(substring1);
3814 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
3815 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
3816 new->trim_queryend = Substring_trim_queryend(substringN);
3817 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
3818 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
3819 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
3820 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
3821
3822 new->querystart_chrbound = Substring_querystart_chrbound(substring1);
3823 new->queryend_chrbound = Substring_queryend_chrbound(substringN);
3824 if (new->trim_querystart > new->querystart_chrbound) {
3825 new->querystart_chrbound = new->trim_querystart;
3826 }
3827 if (querylength - new->trim_queryend < new->queryend_chrbound) {
3828 new->queryend_chrbound = querylength - new->trim_queryend;
3829 }
3830 assert(new->querystart_chrbound < new->queryend_chrbound);
3831 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
3832
3833
3834 genomicstart = Substring_genomicstart(substring1);
3835 genomicend = Substring_genomicend(substringN);
3836 new->genomicstart = genomicstart;
3837 new->genomicend = genomicend;
3838
3839 if (gplusp == true) {
3840 new->low = genomicstart + new->querystart_chrbound;
3841 new->high = genomicend - (querylength - new->queryend_chrbound);
3842 new->genomiclength = genomicend - genomicstart;
3843 } else {
3844 new->low = genomicend + (querylength - new->queryend_chrbound);
3845 new->high = genomicstart - new->querystart_chrbound;
3846 new->genomiclength = genomicstart - genomicend;
3847 }
3848 assert(new->low < new->high);
3849 debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
3850
3851 new->guided_insertlength = 0U;
3852
3853 new->distant_splice_p = false;
3854 new->chrnum = new->effective_chrnum = chrnum;
3855 new->other_chrnum = 0;
3856 new->chroffset = chroffset;
3857 new->chrhigh = chrhigh;
3858 new->chrlength = chrlength;
3859 new->plusp = gplusp;
3860 new->genestrand = genestrand;
3861
3862
3863 prob_total = 0.0;
3864 nsites = 0;
3865 new->nsplices = 0;
3866 for (p = junctions; p != NULL; p = List_next(p)) {
3867 junction = (Junction_T) List_head(p);
3868 if (Junction_type(junction) == SPLICE_JUNCTION) {
3869 prob_total += Junction_splice_score(junction);
3870 nsites += 2;
3871 new->nsplices += 1;
3872 }
3873 }
3874 if (nsites == 0) {
3875 new->splice_score = 0.0;
3876 } else {
3877 new->splice_score = prob_total / (double) nsites;
3878 }
3879 debug0(printf("SPLICE SCORE: %f\n",new->splice_score));
3880 new->nindels = 0;
3881
3882
3883 new->nmismatches_bothdiff = nmismatches_bothdiff; /* Trimmed */
3884 new->nmismatches_refdiff = nmismatches_refdiff;
3885 new->nsegments = List_length(new->substrings_1toN);
3886
3887 /* new->nmatches_to_trims = querylength_trimmed - nmismatches_whole; */
3888 new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
3889 /* Note: Cannot use substrings variable here. Need to use new->substrings_1toN */
3890 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
3891 substring = (Substring_T) List_head(p);
3892 new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
3893 new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
3894 }
3895 debug0(printf("**Setting nmatches_to_trims to be %d\n",new->ref_nmatches_to_trims));
3896
3897 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_start_amb_length(substring) + Substring_end_amb_length(substring);
3898 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_start_amb_length(substring) + Substring_end_amb_length(substring);
3899 for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
3900 junction = List_head(p);
3901 new->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
3902 new->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
3903 }
3904 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
3905
3906 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
3907 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
3908 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
3909 if (Substring_trim_querystart_splicep(substring1) == false) {
3910 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
3911 } else {
3912 new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
3913 }
3914 if (Substring_trim_queryend_splicep(substringN) == false) {
3915 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substringN))/END_BINSIZE);
3916 } else {
3917 new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substringN))/END_BINSIZE);
3918 }
3919 /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
3920
3921 if (chrlength < (Univcoord_T) querylength) {
3922 new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
3923 new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
3924 new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
3925 }
3926 assert(new->refalt_score_within_trims >= 0);
3927
3928
3929 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
3930 if (new->refalt_score_overall < *found_score_overall) {
3931 *found_score_overall = new->refalt_score_overall;
3932 }
3933 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
3934 if (new->refalt_score_within_trims < *found_score_within_trims) {
3935 *found_score_within_trims = new->refalt_score_within_trims;
3936 }
3937
3938
3939 /* new->penalties = 0; */
3940
3941 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
3942 /* new->tally = -1L; */
3943
3944 new->sensedir_for_concordance = new->sensedir = sensedir;
3945
3946 new->paired_usedp = false;
3947
3948 /* new->query_splicepos = -1; */
3949 new->circularpos = compute_circularpos(&new->circularalias,new);
3950
3951 if ((new->altlocp = altlocp[chrnum]) == false) {
3952 debug0(printf("*****Method %s: Stage3end_new_precomputed returning primary %p at %u..%u with splice_score %f\n\n",
3953 Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->splice_score));
3954 return new;
3955
3956 } else {
3957 debug0(printf("*****Method %s: Stage3end_new_precomputed returning altloc %p at %u..%u with splice_score %f\n\n",
3958 Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->splice_score));
3959 return new;
3960 }
3961 }
3962
3963
3964 int
Stage3end_nmatches_substrings(int * ref_nmatches,Intlist_T endpoints,Univcoordlist_T univdiagonals,Intlist_T nmismatches_list,Intlist_T ref_nmismatches_list,List_T junctions,int querylength,Compress_T query_compress,Substring_T qend_alts,Substring_T qstart_alts,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice5p_in,bool splice3p_in,Listpool_T listpool)3965 Stage3end_nmatches_substrings (int *ref_nmatches, Intlist_T endpoints, Univcoordlist_T univdiagonals,
3966 Intlist_T nmismatches_list, Intlist_T ref_nmismatches_list, List_T junctions,
3967 int querylength, Compress_T query_compress,
3968 Substring_T qend_alts, Substring_T qstart_alts, bool plusp, int genestrand,
3969 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
3970 bool splice5p_in, bool splice3p_in, Listpool_T listpool) {
3971 int nmatches, substring_nmatches, substring_ref_nmatches;
3972 int qstart, qend;
3973 Univcoord_T univdiagonal, left;
3974 Intlist_T r, x, y;
3975 Univcoordlist_T q;
3976 Junction_T junction;
3977 #ifdef MAKE_JUNCTION
3978 Junction_T qstart_junction = NULL, qend_junction = NULL;
3979 double donor_prob, acceptor_prob;
3980 #endif
3981 List_T newjunctions, p, j;
3982 bool splice5p, splice3p;
3983 int adj0; /* deletions - insertions */
3984 int nmismatches, ref_nmismatches, indel_score = 0, nindels = 0;
3985 int nindelbreaks, n_large_indels;
3986 /* double donor_prob, acceptor_prob; */
3987
3988
3989 debug0(printf("Entered Stage3end_nmatches_substrings with %s, plusp %d, splice5p %d, splice3p %d\n",
3990 Intlist_to_string(endpoints),plusp,splice5p_in,splice3p_in));
3991 nmatches = 0;
3992 *ref_nmatches = 0;
3993
3994 #ifdef DEBUG7
3995 printf("Entered Stage3end_nmatches_substrings, at univdiagonal %u [%u], with chrnum #%d, plusp %d, and endpoints %s\n",
3996 Univcoordlist_head(univdiagonals),Univcoordlist_head(univdiagonals) - chroffset,chrnum,plusp,Intlist_to_string(endpoints));
3997 printf("There are %d endpoints, %d univdiagonals, %d nmismatches, and %d junctions\n",
3998 Intlist_length(endpoints),Univcoordlist_length(univdiagonals),Intlist_length(nmismatches_list),List_length(junctions));
3999 if (qstart_alts != NULL) {
4000 printf("qstart_alts at %d..%d\n",Substring_querystart(qstart_alts),Substring_queryend(qstart_alts));
4001 }
4002 if (qend_alts != NULL) {
4003 printf("qend_alts at %d..%d\n",Substring_querystart(qend_alts),Substring_queryend(qend_alts));
4004 }
4005 printf("Endpoints: %s\n",Intlist_to_string(endpoints));
4006 printf("Univdiagonals: %s\n",Univcoordlist_to_string_offset(univdiagonals,chroffset));
4007 printf("Mismatches: %s\n",Intlist_to_string(nmismatches_list));
4008 printf("Ref mismatches: %s\n",Intlist_to_string(ref_nmismatches_list));
4009 #endif
4010
4011 assert(Univcoordlist_length(univdiagonals) == Intlist_length(endpoints) - 1);
4012 assert(Intlist_length(nmismatches_list) == Intlist_length(endpoints) - 1);
4013 assert(Intlist_length(ref_nmismatches_list) == Intlist_length(endpoints) - 1);
4014 assert(List_length(junctions) == Intlist_length(endpoints) - 2);
4015
4016
4017 newjunctions = Listpool_copy(junctions,listpool);
4018
4019
4020 #ifdef DEBUG0
4021 for (p = junctions; p != NULL; p = List_next(p)) {
4022 Junction_print((Junction_T) List_head(p));
4023 }
4024 printf("\n");
4025 #endif
4026
4027
4028 qstart = Intlist_head(endpoints);
4029 nmismatches = Intlist_head(nmismatches_list);
4030 ref_nmismatches = Intlist_head(ref_nmismatches_list);
4031
4032 if (plusp == true) {
4033 j = newjunctions; /* Put here before we handle querystart_alts */
4034 if (qstart_alts != NULL) {
4035 debug7(printf("Adding %d matches for qstart_alts\n",Substring_nmatches(qstart_alts)));
4036 nmatches += Substring_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4037 *ref_nmatches += Substring_ref_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4038 #ifdef MAKE_JUNCTION
4039 donor_prob = Substring_amb_donor_prob(qstart_alts);
4040 acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4041 qstart_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4042 newjunctions = Listpool_push(newjunctions,listpool,(void *) qstart_junction);
4043 #else
4044 newjunctions = Listpool_push(newjunctions,listpool,(void *) NULL);
4045 #endif
4046 splice5p = false;
4047 } else {
4048 splice5p = splice5p_in;
4049 }
4050
4051 /* Add qpos to get alignstart/alignend */
4052 for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4053 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4054 qend = Intlist_head(r);
4055 nmismatches = Intlist_head(x);
4056 ref_nmismatches = Intlist_head(y);
4057 univdiagonal = Univcoordlist_head(q);
4058 left = univdiagonal - (Univcoord_T) querylength;
4059 debug0(printf("Stage3end_nmatches_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4060 qstart,qend,univdiagonal,univdiagonal - chroffset));
4061
4062 /* genomicstart = left; */
4063 /* genomicend = left + querylength; */
4064 /* alignstart = genomicstart + qstart; */
4065 /* alignend = genomicstart + qend; */
4066
4067 if (nmismatches >= 0 && ref_nmismatches >= 0) {
4068 debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",univdiagonal - chroffset,qstart,qend));
4069 debug7(printf("%d vs %d\n",nmismatches,
4070 Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4071 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand)));
4072 #ifdef CHECK_NMISMATCHES
4073 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4074 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand));
4075 #endif
4076 } else {
4077 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4078 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
4079 Intlist_head_set(x,nmismatches); /* Save for Stage3end_new_substrings */
4080 Intlist_head_set(y,ref_nmismatches); /* Save for Stage3end_new_substrings */
4081 debug7(printf("%d (%d ref) mismatches from genome over querypos %d..%d\n",
4082 nmismatches,ref_nmismatches,qstart,qend));
4083 }
4084 if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4085 splice3p = false;
4086 } else {
4087 splice3p = splice3p_in;
4088 }
4089
4090 if (splice5p == false && splice3p == false) {
4091 /* Could potentially check here if qstart < qend, but relying upon caller to use endpoints_acceptable_p */
4092 debug7(printf("Shortcut computes matches of %d = (%d - %d) - nmismatches %d\n",
4093 (qend-qstart)-nmismatches,qend,qstart,nmismatches));
4094 nmatches += (qend - qstart) - nmismatches;
4095 *ref_nmatches += (qend - qstart) - ref_nmismatches;
4096 } else {
4097 substring_nmatches =
4098 Substring_compute_nmatches(&substring_ref_nmatches,left,/*querystart*/qstart,/*queryend*/qend,querylength,
4099 /*plusp*/true,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4100 /*splice_querystart_p*/splice5p,/*splice_queryend_p*/splice3p,/*chrnum_fixed_p*/true);
4101 if (substring_nmatches < 0) {
4102 /* Don't know how to fix junctions */
4103 debug0(printf("Poor substring (plus) for %d..%d, so returning -1 from Stage3end_nmatches_substrings\n",
4104 qstart,qend));
4105 *ref_nmatches = -1;
4106 return -1;
4107 } else {
4108 debug7(printf("Substring_compute_nmatches returns nmatches %d over querypos %d..%d\n",
4109 substring_nmatches,qstart,qend));
4110 nmatches += substring_nmatches;
4111 *ref_nmatches += substring_ref_nmatches;
4112 }
4113 }
4114
4115 /* Prepare for next iteration */
4116 qstart = qend;
4117 if (j != NULL) {
4118 if ((junction = (Junction_T) List_head(j)) == NULL) {
4119 /* qstart_junction */
4120 } else if ((adj0 = Junction_adj(junction)) != 0) {
4121 /* adj += adj0; */
4122 indel_score += indel_penalty_middle;
4123 nindels += Junction_nindels(junction);
4124 if (adj0 < 0) {
4125 debug7(printf("Adjusting qstart %d up by %d\n",qstart,-adj0));
4126 qstart -= adj0; /* Insertion */
4127 }
4128 }
4129 }
4130 splice5p = false;
4131 }
4132
4133 } else {
4134 j = newjunctions; /* Put here before we handle querystart_alts */
4135 if (qstart_alts != NULL) {
4136 debug7(printf("Adding %d matches for qstart_alts\n",Substring_nmatches(qstart_alts)));
4137 nmatches += Substring_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4138 *ref_nmatches += Substring_ref_nmatches(qstart_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4139 #ifdef MAKE_JUNCTION
4140 donor_prob = Substring_amb_donor_prob(qstart_alts);
4141 acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4142 qstart_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4143 /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4144 newjunctions = Listpool_push(newjunctions,listpool,(void *) qstart_junction);
4145 #else
4146 newjunctions = Listpool_push(newjunctions,listpool,(void *) NULL);
4147 #endif
4148 splice5p = false;
4149 } else {
4150 splice5p = splice5p_in;
4151 }
4152
4153 /* Subtract querypos to get alignstart/alignend */
4154 for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4155 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4156 qend = Intlist_head(r);
4157 nmismatches = Intlist_head(x);
4158 ref_nmismatches = Intlist_head(y);
4159 univdiagonal = Univcoordlist_head(q);
4160 left = univdiagonal - (Univcoord_T) querylength;
4161 debug0(printf("Stage3end_nmatches_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4162 qstart,qend,univdiagonal,univdiagonal - chroffset));
4163
4164 /* genomicend = left; */
4165 /* genomicstart = left + querylength; */
4166 /* genomicend_adj = genomicend - adj; */
4167 /* genomicstart_adj = genomicend - adj; */
4168 /* alignstart = genomicstart - (querylength - qend); */
4169 /* alignend = genomicstart - (querylength - qstart); */
4170
4171 if (nmismatches >= 0 && ref_nmismatches >= 0) {
4172 #ifdef CHECK_NMISMATCHES
4173 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4174 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand));
4175 #endif
4176 } else {
4177 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4178 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
4179 Intlist_head_set(x,nmismatches); /* Save for Stage3end_new_substrings */
4180 Intlist_head_set(y,ref_nmismatches); /* Save for Stage3end_new_substrings */
4181 debug7(printf("%d (%d ref) mismatches from genome over querypos %d..%d\n",
4182 nmismatches,ref_nmismatches,querylength - qend,querylength - qstart));
4183 }
4184 if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4185 splice3p = false;
4186 } else {
4187 splice3p = splice3p_in;
4188 }
4189
4190 if (splice5p == false && splice3p == false) {
4191 /* Could potentially check here if qstart < qend, but relying upon caller to use endpoints_acceptable_p */
4192 debug7(printf("Shortcut computes matches of %d = (%d - %d) - nmismatches %d\n",
4193 (qend-qstart)-nmismatches,querylength - qstart,querylength - qend,nmismatches));
4194 nmatches += (qend - qstart) - nmismatches;
4195 *ref_nmatches += (qend - qstart) - ref_nmismatches;
4196 } else {
4197 substring_nmatches =
4198 Substring_compute_nmatches(&substring_ref_nmatches,left,/*querystart*/querylength - qend,
4199 /*queryend*/querylength - qstart,querylength,
4200 /*plusp*/false,genestrand,query_compress,
4201 chrnum,chroffset,chrhigh,chrlength,/*splice_querystart_p*/splice3p,
4202 /*splice_queryend_p*/splice5p,/*chrnum_fixed_p*/true);
4203 if (substring_nmatches < 0) {
4204 /* Don't know how to fix junctions */
4205 debug0(printf("Poor substring (minus) for querypos %d..%d, so returning -1 from Stage3end_new_substrings\n",
4206 querylength - qend,querylength - qstart));
4207 *ref_nmatches = -1;
4208 return -1;
4209 } else {
4210 debug7(printf("Substring_compute_nmatches returns nmatches %d over querypos %d..%d\n",
4211 substring_nmatches,querylength - qend,querylength - qstart));
4212 nmatches += substring_nmatches;
4213 *ref_nmatches += substring_ref_nmatches;
4214 }
4215 }
4216
4217 /* Prepare for next iteration */
4218 qstart = qend;
4219 if (j != NULL) {
4220 if ((junction = (Junction_T) List_head(j)) == NULL) {
4221 /* qstart_junction */
4222 } else if ((adj0 = Junction_adj(junction)) != 0) {
4223 /* adj += adj0; */
4224 indel_score += indel_penalty_middle;
4225 nindels += Junction_nindels(junction);
4226 if (adj0 < 0) {
4227 debug7(printf("Adjusting qstart %d up by %d\n",qstart,-adj0));
4228 qstart -= adj0; /* Insertion */
4229 }
4230 }
4231 }
4232 splice5p = false;
4233 }
4234 }
4235
4236 if (qend_alts != NULL) {
4237 debug7(printf("Adding %d matches for qend_alts\n",Substring_nmatches(qend_alts)));
4238 nmatches += Substring_nmatches(qend_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4239 *ref_nmatches += Substring_ref_nmatches(qend_alts); /* Not nmatches_to_trims, which is 0 for alts_substring */
4240 #ifdef MAKE_JUNCTION
4241 newjunctions = List_reverse(newjunctions);
4242 donor_prob = Substring_amb_donor_prob(qend_alts);
4243 acceptor_prob = Substring_amb_acceptor_prob(qend_alts);
4244 qend_junction = Junction_new_ambig_splice(orig_sensedir,donor_prob,acceptor_prob);
4245 /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4246 newjunctions = Listpool_push(newjunctions,listpool,(void *) qend_junction);
4247 newjunctions = List_reverse(newjunctions);
4248 #endif
4249 }
4250
4251
4252 nindelbreaks = 0;
4253 n_large_indels = 0;
4254
4255 for (p = newjunctions; p != NULL; p = List_next(p)) {
4256 junction = (Junction_T) List_head(p);
4257 /* CHIMERA_JUNCTION not possible */
4258 if (junction == NULL) {
4259 /* qstart_junction */
4260 } else if (Junction_type(junction) == SPLICE_JUNCTION) {
4261 /* No indel breaks. ? Add penalty for bad splice probs */
4262
4263 } else if (Junction_type(junction) == INS_JUNCTION) {
4264 nindelbreaks++;
4265 if (Junction_nindels(junction) > 6) {
4266 n_large_indels++;
4267 }
4268 } else if (Junction_type(junction) == DEL_JUNCTION) {
4269 nindelbreaks++;
4270 if (Junction_nindels(junction) > 6) {
4271 n_large_indels++;
4272 }
4273 }
4274 }
4275
4276 #if 0
4277 nmatches = nmatches - nindelbreaks*indel_penalty_middle - n_large_indels*3;
4278 for (p = newjunctions; p != NULL; p = List_next(p)) {
4279 if ((junction = List_head(p)) != NULL) {
4280 nmatches += Junction_ninserts(junction);
4281 }
4282 }
4283 #endif
4284
4285
4286 #ifdef MAKE_JUNCTION
4287 Junction_free(&qstart_junction);
4288 Junction_free(&qend_junction);
4289 #endif
4290
4291 debug7(printf("Stage3end_nmatches_substrings returning %d matches\n",nmatches));
4292 /* List_free(&newjunctions); -- allocated by Listpool_push */
4293
4294 assert(nmatches <= querylength);
4295 return nmatches;
4296 }
4297
4298
4299
4300 /* endpoints are all in the qstart/qend convention.  When creating Substring_T objects, convert them to
4301 querystart/queryend: used as-is for plus; querylength - qend and querylength - qstart for minus */
4302 /* Three actions at each end: extend, chop, or compute_trim */
4303 T
Stage3end_new_substrings(int * found_score_overall,int * found_score_within_trims,Intlist_T endpoints,Univcoordlist_T univdiagonals,Intlist_T nmismatches_list,Intlist_T ref_nmismatches_list,List_T junctions,int querylength,Compress_T query_compress,Substring_T qend_alts,Substring_T qstart_alts,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice5p_in,Splicetype_T splicetype5,double ambig_prob_5,bool splice3p_in,Splicetype_T splicetype3,double ambig_prob_3,Listpool_T listpool,Method_T method,int level)4304 Stage3end_new_substrings (int *found_score_overall, int *found_score_within_trims,
4305 Intlist_T endpoints, Univcoordlist_T univdiagonals,
4306 Intlist_T nmismatches_list, Intlist_T ref_nmismatches_list, List_T junctions,
4307 int querylength, Compress_T query_compress,
4308 Substring_T qend_alts, Substring_T qstart_alts,
4309 bool plusp, int genestrand, int sensedir,
4310 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
4311 bool splice5p_in, Splicetype_T splicetype5, double ambig_prob_5,
4312 bool splice3p_in, Splicetype_T splicetype3, double ambig_prob_3,
4313 Listpool_T listpool, Method_T method, int level) {
4314 T new;
4315
4316 Univcoord_T genomicstart, genomicend;
4317 int querylength_trimmed = 0;
4318 int qstart, qend, queryspan;
4319 Univcoord_T univdiagonal, left;
4320 Intlist_T r, x, y;
4321 Univcoordlist_T q;
4322 Substring_T substring, substring1, substringN;
4323 Junction_T junction;
4324 List_T substrings_HtoL, substrings_LtoH, junctions_LtoH;
4325 List_T substrings = NULL, p, j;
4326 List_T newjunctions;
4327 bool splice5p, splice3p, passp;
4328 int adj = 0, adj0; /* deletions - insertions */
4329 int nmismatches, ref_nmismatches, indel_score = 0, nindels = 0;
4330 int nmismatches_bothdiff = 0, nmismatches_refdiff = 0;
4331 int new_sensedir;
4332 bool contradictionp;
4333 int nsites, nindelbreaks, n_large_indels;
4334 double prob_total, donor_prob, acceptor_prob;
4335
4336
4337 debug7(printf("Entered Stage3end_new_substrings, method %s, with %s, plusp %d, splice5p %d, splice3p %d\n",
4338 Method_string(method),Intlist_to_string(endpoints),plusp,splice5p_in,splice3p_in));
4339
4340 #ifdef DEBUG0
4341 printf("Entered Stage3end_new_substrings, method %s, at univdiagonal %u [%u], with chrnum #%d, plusp %d, sensedir %d, and endpoints %s\n",
4342 Method_string(method),Univcoordlist_head(univdiagonals),Univcoordlist_head(univdiagonals) - chroffset,chrnum,plusp,sensedir,Intlist_to_string(endpoints));
4343 printf("There are %d endpoints, %d univdiagonals, %d nmismatches, and %d junctions\n",
4344 Intlist_length(endpoints),Univcoordlist_length(univdiagonals),Intlist_length(nmismatches_list),List_length(junctions));
4345 if (qstart_alts != NULL) {
4346 printf("qstart_alts at %d..%d. ",Substring_querystart(qstart_alts),Substring_queryend(qstart_alts));
4347 Substring_print_alts_coords(qstart_alts);
4348 printf("\n");
4349 }
4350 if (qend_alts != NULL) {
4351 printf("qend_alts at %d..%d. ",Substring_querystart(qend_alts),Substring_queryend(qend_alts));
4352 Substring_print_alts_coords(qend_alts);
4353 printf("\n");
4354 }
4355 printf("Endpoints: %s\n",Intlist_to_string(endpoints));
4356 printf("Univdiagonals: %s\n",Univcoordlist_to_string_offset(univdiagonals,chroffset));
4357 printf("Mismatches: %s\n",Intlist_to_string(nmismatches_list));
4358 printf("Ref mismatches: %s\n",Intlist_to_string(ref_nmismatches_list));
4359 #endif
4360
4361 assert(Univcoordlist_length(univdiagonals) == Intlist_length(endpoints) - 1);
4362 assert(Intlist_length(nmismatches_list) == Intlist_length(endpoints) - 1);
4363 assert(Intlist_length(ref_nmismatches_list) == Intlist_length(endpoints) - 1);
4364 assert(List_length(junctions) == Intlist_length(endpoints) - 2);
4365
4366
4367 newjunctions = Junction_copy_list(junctions,listpool);
4368
4369 #ifdef DEBUG0
4370 for (p = newjunctions; p != NULL; p = List_next(p)) {
4371 Junction_print((Junction_T) List_head(p));
4372 }
4373 printf("\n");
4374 #endif
4375
4376 qstart = Intlist_head(endpoints);
4377 nmismatches = Intlist_head(nmismatches_list);
4378 ref_nmismatches = Intlist_head(ref_nmismatches_list);
4379
4380 if (plusp == true) {
4381 j = newjunctions; /* Put here before we handle qstart_alts */
4382 if (qstart_alts != NULL) {
4383 substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qstart_alts));
4384 donor_prob = Substring_amb_donor_prob(qstart_alts);
4385 acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4386 junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4387 newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4388 splice5p = false;
4389 } else {
4390 splice5p = splice5p_in;
4391 }
4392
4393 /* Add qpos to get alignstart/alignend */
4394 for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4395 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4396 qend = Intlist_head(r);
4397 nmismatches = Intlist_head(x);
4398 ref_nmismatches = Intlist_head(y);
4399 univdiagonal = Univcoordlist_head(q);
4400 left = univdiagonal - (Univcoord_T) querylength;
4401 debug0(printf("Stage3end_new_substrings: qstart %d..qend %d at univdiagonal %u [%u}\n",
4402 qstart,qend,univdiagonal,univdiagonal - chroffset));
4403
4404 /* genomicstart = left; */
4405 /* genomicend = left + querylength; */
4406 /* alignstart = genomicstart + qstart; */
4407 /* alignend = genomicstart + queryend; */
4408
4409 if (nmismatches >= 0 && ref_nmismatches >= 0) {
4410 debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",univdiagonal - chroffset,qstart,qend));
4411 debug7(printf("%d vs %d\n",nmismatches,
4412 Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4413 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand)));
4414 #ifdef CHECK_NMISMATCHES
4415 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4416 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand));
4417 #endif
4418 } else {
4419 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4420 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
4421 }
4422 if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4423 splice3p = false;
4424 } else {
4425 splice3p = splice3p_in;
4426 }
4427
4428 if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/qstart,/*queryend*/qend,querylength,
4429 /*plusp*/true,genestrand,query_compress,
4430 chrnum,chroffset,chrhigh,chrlength,
4431 /*splice_querystart_p*/splice5p,/*splicetype_querystart*/splicetype5,
4432 /*ambig_prob_querystart*/ambig_prob_5,
4433 /*splice_queryend_p*/splice3p,/*splicetype_queryend*/splicetype3,
4434 /*ambig_prob_queryend*/ambig_prob_3,sensedir)) == NULL) {
4435 /* Don't know how to fix junctions */
4436 debug0(printf("Poor substring (plus) for %d..%d, so returning NULL from Stage3end_new_substrings\n",
4437 qstart,qend));
4438 for (p = substrings; p != NULL; p = List_next(p)) {
4439 substring = (Substring_T) List_head(p);
4440 if (substring == qstart_alts) {
4441 /* qstart_alts freed by calling procedure. Need to free junction created for querystart_alts. */
4442 /* junctions = List_pop(junctions,(void **) &junction); */
4443 /* Junction_free(&junction); */
4444 } else {
4445 Substring_free(&substring);
4446 }
4447 }
4448 /* List_free(&substrings); -- allocated by Listpool_push */
4449 debug0(printf("Stage3end_new_substrings returning NULL\n"));
4450 Junction_list_gc(&newjunctions);
4451 return (T) NULL;
4452
4453 } else {
4454 debug7(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4455 Substring_nmismatches_bothdiff(substring),
4456 Substring_nmatches(substring),Substring_ambiguous_p(substring),
4457 Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4458
4459 debug0(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4460 Substring_nmismatches_bothdiff(substring),
4461 Substring_nmatches(substring),Substring_ambiguous_p(substring),
4462 Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4463 substrings = Listpool_push(substrings,listpool,(void *) substring);
4464 nmismatches_bothdiff += Substring_nmismatches_bothdiff(substring);
4465 nmismatches_refdiff += Substring_nmismatches_refdiff(substring);
4466 querylength_trimmed += Substring_match_length(substring);
4467 }
4468
4469 /* Prepare for next iteration */
4470 qstart = qend;
4471 if (j != NULL) {
4472 junction = (Junction_T) List_head(j);
4473 if ((adj0 = Junction_adj(junction)) != 0) {
4474 adj += adj0;
4475 indel_score += indel_penalty_middle;
4476 nindels += Junction_nindels(junction);
4477 if (adj0 < 0) {
4478 qstart -= adj0; /* Insertion */
4479 }
4480 }
4481 }
4482 splice5p = false;
4483 }
4484
4485 } else {
4486 j = newjunctions; /* Put here before we handle querystart_alts */
4487 if (qstart_alts != NULL) {
4488 substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qstart_alts));
4489 donor_prob = Substring_amb_donor_prob(qstart_alts);
4490 acceptor_prob = Substring_amb_acceptor_prob(qstart_alts);
4491 junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4492 /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4493 newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4494 splice5p = false;
4495 } else {
4496 splice5p = splice5p_in;
4497 }
4498
4499 /* Subtract qpos to get alignstart/alignend */
4500 for (q = univdiagonals, x = nmismatches_list, y = ref_nmismatches_list, r = Intlist_next(endpoints); q != NULL;
4501 q = Univcoordlist_next(q), x = Intlist_next(x), y = Intlist_next(y), r = Intlist_next(r), j = List_next(j)) {
4502 qend = Intlist_head(r);
4503 nmismatches = Intlist_head(x);
4504 ref_nmismatches = Intlist_head(y);
4505 univdiagonal = Univcoordlist_head(q);
4506 left = univdiagonal - (Univcoord_T) querylength;
4507 debug0(printf("Stage3end_new_substrings: qstart %d..qend %d at univdiagonal %u [%u]\n",
4508 qstart,qend,univdiagonal,univdiagonal - chroffset));
4509
4510 /* genomicend = left; */
4511 /* genomicstart = left + querylength; */
4512 /* genomicend_adj = genomicend - adj; */
4513 /* genomicstart_adj = genomicend - adj; */
4514 /* alignstart = genomicstart - (querylength - qend); */
4515 /* alignend = genomicstart - (querylength - qstart); */
4516
4517 if (nmismatches >= 0 && ref_nmismatches >= 0) {
4518 #ifdef CHECK_NMISMATCHES
4519 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4520 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand));
4521 #endif
4522 } else {
4523 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
4524 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
4525 }
4526 if (Univcoordlist_next(q) != NULL || qend_alts != NULL) {
4527 splice3p = false;
4528 } else {
4529 splice3p = splice3p_in;
4530 }
4531
4532 if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/querylength - qend,
4533 /*queryend*/querylength - qstart,querylength,
4534 /*plusp*/false,genestrand,query_compress,
4535 chrnum,chroffset,chrhigh,chrlength,
4536 /*splice_querystart_p*/splice3p,/*splicetype_querystart*/splicetype3,
4537 /*ambig_prob_querystart*/ambig_prob_3,
4538 /*splice_queryend_p*/splice5p,/*splicetype_queryend*/splicetype5,
4539 /*ambig_prob_queryend*/ambig_prob_5,sensedir)) == NULL) {
4540 /* Don't know how to fix junctions */
4541 debug0(printf("Poor substring (minus) for %d..%d, so returning NULL from Stage3end_new_substrings\n",
4542 querylength - qend,querylength - qstart));
4543 for (p = substrings; p != NULL; p = List_next(p)) {
4544 substring = (Substring_T) List_head(p);
4545 if (substring == qstart_alts) {
4546 /* querystart_alts freed by calling procedure. Need to free junction created for querystart_alts. */
4547 /* junctions = List_pop(junctions,(void **) &junction); */
4548 /* Junction_free(&junction); */
4549 } else {
4550 Substring_free(&substring);
4551 }
4552 }
4553 /* List_free(&substrings); -- allocated by Listpool_push */
4554
4555 debug0(printf("Stage3end_new_substrings returning NULL\n"));
4556 Junction_list_gc(&newjunctions);
4557 return (T) NULL;
4558
4559 } else {
4560 debug7(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4561 Substring_nmismatches_bothdiff(substring),
4562 Substring_nmatches(substring),Substring_ambiguous_p(substring),
4563 Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4564
4565 debug0(printf("Substring_new returns nmismatches %d, nmatches %d, ambp %d, amb %d over querypos %d..%d\n",
4566 Substring_nmismatches_bothdiff(substring),
4567 Substring_nmatches(substring),Substring_ambiguous_p(substring),
4568 Substring_amb_length(substring),Substring_querystart(substring),Substring_queryend(substring)));
4569 substrings = Listpool_push(substrings,listpool,(void *) substring);
4570 nmismatches_bothdiff += Substring_nmismatches_bothdiff(substring);
4571 nmismatches_refdiff += Substring_nmismatches_refdiff(substring);
4572 querylength_trimmed += Substring_match_length(substring);
4573 }
4574
4575 /* Prepare for next iteration */
4576 qstart = qend;
4577 if (j != NULL) {
4578 junction = (Junction_T) List_head(j);
4579 if ((adj0 = Junction_adj(junction)) != 0) {
4580 adj += adj0;
4581 indel_score += indel_penalty_middle;
4582 nindels += Junction_nindels(junction);
4583 if (adj0 < 0) {
4584 qstart -= adj0; /* Insertion */
4585 }
4586 }
4587 }
4588 splice5p = false;
4589 }
4590 }
4591
4592 if (qend_alts != NULL) {
4593 substrings = Listpool_push(substrings,listpool,(void *) Substring_copy(qend_alts));
4594 newjunctions = List_reverse(newjunctions);
4595 donor_prob = Substring_amb_donor_prob(qend_alts);
4596 acceptor_prob = Substring_amb_acceptor_prob(qend_alts);
4597 junction = Junction_new_ambig_splice(sensedir,donor_prob,acceptor_prob);
4598 /* printf("Creating junction with donor_prob %f and acceptor_prob %f\n",donor_prob,acceptor_prob); */
4599 newjunctions = Listpool_push(newjunctions,listpool,(void *) junction);
4600 newjunctions = List_reverse(newjunctions);
4601 }
4602
4603 #ifdef DEBUG0
4604 printf("NEW JUNCTIONS\n");
4605 for (p = newjunctions; p != NULL; p = List_next(p)) {
4606 Junction_print(List_head(p));
4607 }
4608 printf("\n");
4609 #endif
4610
4611
4612 if (plusp == true) {
4613 substring1 = List_last_value(substrings);
4614 substringN = List_head(substrings);
4615 } else {
4616 substring1 = List_head(substrings);
4617 substringN = List_last_value(substrings);
4618 }
4619
4620 debug0(printf("Trim left: %d. Trim right: %d\n",
4621 Substring_trim_querystart(substring1),Substring_trim_queryend(substringN)));
4622
4623 passp = true;
4624 if (Substring_chrnum(substring1) != Substring_chrnum(substringN)) {
4625 debug0(printf("ABORTING BECAUSE SUBSTRINGS HAVE DIFFERENT CHRNUMS: %d AND %d\n",
4626 Substring_chrnum(substring1),Substring_chrnum(substringN)));
4627 passp = false;
4628
4629 } else if (circularp[chrnum] == true && plusp == true && Substring_alignend_trim(substringN) - Substring_alignstart_trim(substring1) >= chrlength) {
4630 debug0(printf("ABORTING BECAUSE CIRCULAR CHROMOSOME CHRLENGTH %u AND ALIGNMENT %u..%u\n",
4631 chrlength,Substring_alignstart_trim(substring1),Substring_alignend_trim(substringN)));
4632 passp = false;
4633
4634 } else if (circularp[chrnum] == true && plusp == false && Substring_alignstart_trim(substring1) - Substring_alignend_trim(substringN) >= chrlength) {
4635 debug0(printf("ABORTING BECAUSE CIRCULAR CHROMOSOME CHRLENGTH %u AND ALIGNMENT %u..%u\n",
4636 chrlength,Substring_alignstart_trim(substring1),Substring_alignend_trim(substringN)));
4637 passp = false;
4638
4639 } else if ((queryspan = Substring_queryend(substringN) - Substring_querystart(substring1)) == querylength) {
4640 /* Allow short queries to match completely */
4641
4642 #if 0
4643 } else if (queryspan < MIN_ALIGNMENT_LEN) {
4644 debug0(printf("ABORTING BECAUSE QUERYSPAN %d < MIN_ALIGNMENT_LEN %d\n",
4645 queryspan,MIN_ALIGNMENT_LEN));
4646 passp = false;
4647 #endif
4648
4649 }
4650
4651
4652 if (passp == false) {
4653 for (p = substrings; p != NULL; p = List_next(p)) {
4654 substring = (Substring_T) List_head(p);
4655 if (substring == qstart_alts || substring == qend_alts) {
4656 /* qstart_alts and qend_alts freed by calling procedure */
4657 } else {
4658 Substring_free(&substring);
4659 }
4660 }
4661 /* List_free(&substrings); -- allocated by Listpool_push */
4662
4663 debug0(printf("Stage3end_new_substrings returning NULL\n"));
4664 Junction_list_gc(&newjunctions);
4665 return (T) NULL;
4666 }
4667
4668
4669 new = (T) MALLOC_OUT(sizeof(*new));
4670 new->hittype = SUBSTRINGS;
4671 new->method = method;
4672 new->level = level;
4673
4674 new->transcripts = (List_T) NULL;
4675 new->transcripts_other = (List_T) NULL;
4676
4677 new->querylength = querylength;
4678 new->querylength_adj = querylength + adj;
4679
4680 /* Note differences between substrings and junctions. Substrings
4681 were pushed onto lists above, and junctions were created by the
4682 caller, so they are originally in opposite orders */
4683 substrings_HtoL = substrings;
4684 substrings_LtoH = List_reverse(Listpool_copy(substrings,listpool));
4685 junctions_LtoH = newjunctions;
4686
4687 if (plusp == true) {
4688 new->substrings_1toN = substrings_LtoH;
4689 new->substrings_Nto1 = substrings_HtoL;
4690
4691 new->junctions_1toN = junctions_LtoH;
4692 new->junctions_Nto1 = List_reverse(Listpool_copy(junctions_LtoH,listpool));
4693
4694 } else {
4695 new->substrings_1toN = substrings_HtoL;
4696 new->substrings_Nto1 = substrings_LtoH;
4697
4698 new->junctions_1toN = List_reverse(Listpool_copy(junctions_LtoH,listpool));
4699 new->junctions_Nto1 = junctions_LtoH;
4700 }
4701
4702
4703 #ifdef DEBUG0
4704 printf("NEW SUBSTRINGS\n");
4705 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4706 substring = List_head(p);
4707 if (Substring_has_alts_p(substring) == true) {
4708 printf("%d..%d\t#%d\talts\tmatches_to_trims: %d\tamb:%d\t%d common_prob:%f alts:",
4709 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4710 Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
4711 Substring_alts_ncoords(substring),Substring_alts_common_prob(substring));
4712 Substring_print_alts_coords(substring);
4713 printf("\n");
4714
4715 } else if (Substring_ambiguous_p(substring) == true) {
4716 printf("%d..%d\t#%d\t%u..%u\tambig\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
4717 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4718 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),
4719 Substring_nmismatches_bothdiff(substring),Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
4720 Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
4721 } else {
4722 printf("%d..%d\t#%d\t%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
4723 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
4724 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),
4725 Substring_nmismatches_bothdiff(substring),Substring_nmatches_to_trims(substring),
4726 Substring_amb_length(substring));
4727 }
4728 }
4729 printf("\n");
4730 #endif
4731
4732
4733 substring1 = (Substring_T) List_head(new->substrings_1toN);
4734 substringN = (Substring_T) List_head(new->substrings_Nto1);
4735
4736 new->trim_querystart = Substring_trim_querystart(substring1);
4737 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
4738 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
4739 new->trim_queryend = Substring_trim_queryend(substringN);
4740 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
4741 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
4742 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
4743 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
4744
4745 new->querystart_chrbound = Substring_querystart_chrbound(substring1);
4746 new->queryend_chrbound = Substring_queryend_chrbound(substringN);
4747 if (new->trim_querystart > new->querystart_chrbound) {
4748 new->querystart_chrbound = new->trim_querystart;
4749 }
4750 if (querylength - new->trim_queryend < new->queryend_chrbound) {
4751 new->queryend_chrbound = querylength - new->trim_queryend;
4752 }
4753 assert(new->querystart_chrbound < new->queryend_chrbound);
4754 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
4755
4756
4757 genomicstart = Substring_genomicstart(substring1);
4758 genomicend = Substring_genomicend(substringN);
4759 new->genomicstart = genomicstart;
4760 new->genomicend = genomicend;
4761
4762 if (plusp == true) {
4763 new->low = genomicstart + new->querystart_chrbound;
4764 new->high = genomicend - (querylength - new->queryend_chrbound);
4765 new->genomiclength = genomicend - genomicstart;
4766 } else {
4767 new->low = genomicend + (querylength - new->queryend_chrbound);
4768 new->high = genomicstart - new->querystart_chrbound;
4769 new->genomiclength = genomicstart - genomicend;
4770 }
4771 assert(new->low < new->high);
4772 debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
4773
4774 new->guided_insertlength = 0U;
4775
4776 new->distant_splice_p = false;
4777 new->chrnum = new->effective_chrnum = chrnum;
4778 new->other_chrnum = 0;
4779 new->chroffset = chroffset;
4780 new->chrhigh = chrhigh;
4781 new->chrlength = chrlength;
4782 new->plusp = plusp;
4783 new->genestrand = genestrand;
4784
4785 if (sensedir != SENSE_NULL) {
4786 debug0(printf("sensedir is %d (original)\n",sensedir));
4787 new->sensedir = sensedir;
4788 } else {
4789 new->sensedir = SENSE_NULL;
4790 contradictionp = false;
4791 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4792 substring = (Substring_T) List_head(p);
4793 debug0(printf("substring has sensedir %d\n",Substring_sensedir(substring)));
4794 if (Substring_sensedir(substring) == SENSE_NULL) {
4795 /* Ignore */
4796 } else if (new_sensedir == SENSE_NULL) {
4797 new_sensedir = Substring_sensedir(substring);
4798 } else if (Substring_sensedir(substring) != new_sensedir) {
4799 contradictionp = true;
4800 }
4801 }
4802
4803 for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
4804 junction = (Junction_T) List_head(p);
4805 debug0(printf("junction has sensedir %d\n",Junction_sensedir(junction)));
4806 if (Junction_sensedir(junction) == SENSE_NULL) {
4807 /* Ignore. Probably an indel. */
4808 } else if (new_sensedir == SENSE_NULL) {
4809 new_sensedir = Junction_sensedir(junction);
4810 } else if (Junction_sensedir(junction) != new_sensedir) {
4811 contradictionp = true;
4812 }
4813 }
4814
4815 if (contradictionp == true) {
4816 debug0(printf("CONTRADICTION IN SENSEDIR\n"));
4817 new->sensedir = SENSE_NULL;
4818 } else {
4819 debug0(printf("sensedir is %d\n",new_sensedir));
4820 new->sensedir = new_sensedir;
4821 }
4822 }
4823 new->sensedir_for_concordance = new->sensedir;
4824
4825 prob_total = 0.0;
4826 nsites = 0;
4827 if (splice5p_in == true) {
4828 prob_total += ambig_prob_5;
4829 nsites++;
4830 }
4831 if (splice3p_in == true) {
4832 prob_total += ambig_prob_3;
4833 nsites++;
4834 }
4835
4836 new->nsplices = 0;
4837 for (p = newjunctions; p != NULL; p = List_next(p)) {
4838 junction = (Junction_T) List_head(p);
4839 if (Junction_type(junction) == SPLICE_JUNCTION) {
4840 prob_total += Junction_splice_score(junction);
4841 nsites += 2;
4842 new->nsplices += 1;
4843 }
4844 }
4845 if (nsites == 0) {
4846 new->splice_score = 0.0;
4847 } else {
4848 new->splice_score = prob_total / (double) nsites;
4849 }
4850 debug0(printf("SPLICE SCORE: %f\n",new->splice_score));
4851
4852
4853 nindelbreaks = 0;
4854 n_large_indels = 0;
4855 for (p = newjunctions; p != NULL; p = List_next(p)) {
4856 junction = (Junction_T) List_head(p);
4857 /* CHIMERA_JUNCTION not possible */
4858 if (Junction_type(junction) == INS_JUNCTION) {
4859 nindelbreaks++;
4860 if (Junction_nindels(junction) > 6) {
4861 n_large_indels++;
4862 }
4863 } else if (Junction_type(junction) == DEL_JUNCTION) {
4864 nindelbreaks++;
4865 if (Junction_nindels(junction) > 6) {
4866 n_large_indels++;
4867 }
4868 }
4869 }
4870
4871
4872 /* nmismatches_bothdiff is computed after trimming */
4873 new->nindels = nindels;
4874 new->nmismatches_bothdiff = nmismatches_bothdiff; /* Trimmed */
4875 new->nmismatches_refdiff = nmismatches_refdiff;
4876 new->nsegments = List_length(new->substrings_1toN);
4877
4878
4879 new->refalt_nmatches_to_trims = new->ref_nmatches_to_trims = 0;
4880 /* Note: Cannot use substrings variable here. Need to use new->substrings_1toN */
4881 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
4882 substring = (Substring_T) List_head(p);
4883 new->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
4884 new->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
4885 }
4886 debug0(printf("Setting nmatches_to_trims to be %d\n",new->refalt_nmatches_to_trims));
4887
4888 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(substringN);
4889 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(substringN);
4890 debug0(printf("Setting nmatches_plus_spliced_trims to be %d = %d + %d + %d\n",
4891 new->ref_nmatches_plus_spliced_trims,new->ref_nmatches_to_trims,
4892 Substring_start_amb_length(substring1),Substring_end_amb_length(substringN)));
4893
4894 for (p = new->junctions_1toN; p != NULL; p = List_next(p)) {
4895 junction = List_head(p);
4896 new->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
4897 new->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
4898 }
4899 assert(new->refalt_nmatches_plus_spliced_trims >= 0);
4900 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
4901
4902 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
4903 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
4904 /* Needed to make -m flag work properly. Generally improves alignments */
4905 #if 1
4906 new->ref_score_overall += indel_score; /* -nindels was an attempt to compensate for missing matches */
4907 new->refalt_score_overall += indel_score; /* -nindels was an attempt to compensate for missing matches */
4908 #endif
4909 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
4910 if (Substring_trim_querystart_splicep(substring1) == false) {
4911 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
4912 } else {
4913 new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
4914 }
4915 if (Substring_trim_queryend_splicep(substringN) == false) {
4916 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substringN))/END_BINSIZE);
4917 } else {
4918 new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substringN))/END_BINSIZE);
4919 }
4920 /* was Substring_start_amb_length(substring1)/AMB_PENALTY - Substring_end_amb_length(substringN)/AMB_PENALTY, but doesn't work for DNA-seq */
4921
4922 if (chrlength < (Univcoord_T) querylength) {
4923 new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
4924 new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
4925 new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
4926 }
4927 assert(new->refalt_score_within_trims >= 0);
4928
4929
4930 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
4931 if (new->refalt_score_overall < *found_score_overall) {
4932 *found_score_overall = new->refalt_score_overall;
4933 }
4934 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
4935 if (new->refalt_score_within_trims < *found_score_within_trims) {
4936 *found_score_within_trims = new->refalt_score_within_trims;
4937 }
4938
4939
4940 /* new->penalties = 0; */
4941
4942 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
4943 /* new->tally = -1L; */
4944
4945 new->paired_usedp = false;
4946
4947 /* new->query_splicepos = -1; */
4948 new->circularpos = compute_circularpos(&new->circularalias,new);
4949
4950 debug0(printf("%d substrings\n",List_length(new->substrings_1toN)));
4951 debug0(printf("%d junctions\n",List_length(new->junctions_1toN)));
4952 assert(List_length(new->substrings_1toN) == List_length(new->junctions_1toN) + 1);
4953
4954
4955 /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
4956
4957 debug7(printf("Stage3end_new_substrings returning %d matches_plus_spliced_trims\n",
4958 new->refalt_nmatches_plus_spliced_trims));
4959
4960 if (new->circularpos >= 0) {
4961 new->altlocp = false;
4962 debug0(printf("*****Method %s: Stage3end_new_substrings returning circular %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4963 Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4964 new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4965 return new;
4966
4967 } else if ((new->altlocp = altlocp[chrnum]) == false) {
4968 debug0(printf("*****Method %s: Stage3end_new_substrings returning primary %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4969 Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4970 new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4971 return new;
4972
4973 } else {
4974 debug0(printf("*****Method %s: Stage3end_new_substrings returning altloc %p from Stage3end_new_substrings with score %d within trims, %d overall (found_score %d), nmatches %d, sensedir %d, splice score %f\n\n",
4975 Method_string(method),new,new->refalt_score_within_trims,new->refalt_score_overall,*found_score_within_trims,
4976 new->refalt_nmatches_plus_spliced_trims,new->sensedir,new->splice_score));
4977 return new;
4978 }
4979 }
4980
4981
/* Clamped coordinate arithmetic.  Every parameter is parenthesized in
   the expansion so that compound arguments (a ternary, a comparison,
   an expression with lower-precedence operators) are grouped as the
   caller wrote them.  Arguments can still be evaluated more than once,
   so they must not have side effects. */

/* Yields x + plusterm, unless that sum is >= highbound, in which case
   the result is highbound - 1 */
#define add_bounded(x,plusterm,highbound) (((x) + (plusterm) >= (highbound)) ? ((highbound) - 1) : ((x) + (plusterm)))

/* Yields x - minusterm, unless x < lowbound + minusterm, in which case
   the result is lowbound.  The test is phrased as an addition, so the
   subtraction is performed only when its result is at least lowbound. */
#define subtract_bounded(x,minusterm,lowbound) (((x) < (lowbound) + (minusterm)) ? (lowbound) : ((x) - (minusterm)))
4984
4985
4986 T
Stage3end_new_substitution(int * found_score_overall,int * found_score_within_trims,Univcoord_T univdiagonal,int pos5,int pos3,int querylength,int * mismatch_positions_alloc,Compress_T query_compress,bool plusp,int genestrand,int sensedir,int nmismatches_allowed,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Listpool_T listpool,Method_T method,int level)4987 Stage3end_new_substitution (int *found_score_overall, int *found_score_within_trims,
4988 Univcoord_T univdiagonal, int pos5, int pos3, int querylength,
4989 int *mismatch_positions_alloc, Compress_T query_compress,
4990 bool plusp, int genestrand, int sensedir, int nmismatches_allowed,
4991 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
4992 Chrpos_T chrlength, Listpool_T listpool, Method_T method, int level) {
4993 T new;
4994 Univcoord_T left;
4995 Substring_T substring;
4996 int qstart, qend, nmismatches, ref_nmismatches;
4997 bool splice_querystart_p, splice_queryend_p;
4998 Splicetype_T splicetype_querystart, splicetype_queryend;
4999 double ambig_prob_querystart, ambig_prob_queryend;
5000
5001
5002 debug0(printf("Entered Stage3end_new_substitution, method %s, sensedir %d at univdiagonal %u [%u] and chrhigh %u\n",
5003 Method_string(method),sensedir,univdiagonal,univdiagonal - chroffset,chrhigh));
5004
5005 left = univdiagonal - (Univcoord_T) querylength;
5006
5007 if (plusp == true) {
5008 splice_querystart_p = Substring_qstart_trim(&qstart,&splicetype_querystart,&ambig_prob_querystart,
5009 univdiagonal,pos3,querylength,plusp,genestrand,
5010 mismatch_positions_alloc,query_compress,chroffset,sensedir);
5011 splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
5012 univdiagonal,pos5,querylength,plusp,genestrand,
5013 mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
5014
5015 debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
5016 debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
5017
5018 if (qstart < 0 || qend < 0) {
5019 debug0(printf("Returning NULL\n"));
5020 return (T) NULL;
5021
5022 } else if (qend <= qstart) {
5023 /* Otherwise, calling Genome_count_mismatches_substring will not be defined */
5024 debug0(printf("Returning NULL\n"));
5025 return (T) NULL;
5026
5027 } else {
5028 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
5029 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,genestrand);
5030 if (nmismatches > nmismatches_allowed) {
5031 debug0(printf("Returning NULL\n"));
5032 return (T) NULL;
5033
5034 } else if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/qstart,/*queryend*/qend,
5035 querylength,/*plusp*/true,genestrand,query_compress,
5036 chrnum,chroffset,chrhigh,chrlength,
5037 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
5038 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
5039 sensedir)) == NULL) {
5040 debug0(printf("Returning NULL\n"));
5041 return (T) NULL;
5042 }
5043 }
5044
5045 } else {
5046 /* trim_querystart and trim_queryend Genome_count_mismatches_substring are flipped, but not for Substring_new */
5047 splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
5048 univdiagonal,pos5,querylength,plusp,genestrand,
5049 mismatch_positions_alloc,query_compress,chroffset,chrhigh,sensedir);
5050 splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
5051 univdiagonal,pos3,querylength,plusp,genestrand,
5052 mismatch_positions_alloc,query_compress,chroffset,sensedir);
5053
5054 debug0(printf("Trimming querystart yields splicep %d, qstart %d, prob %f\n",splice_querystart_p,qstart,ambig_prob_querystart));
5055 debug0(printf("Trimming queryend yields splicep %d, qend %d, prob %f\n",splice_queryend_p,qend,ambig_prob_queryend));
5056
5057 if (qstart < 0 || qend < 0) {
5058 debug0(printf("Returning NULL\n"));
5059 return (T) NULL;
5060
5061 } else if (qend <= qstart) {
5062 /* Otherwise, calling Genome_count_mismatches_substring will not be defined */
5063 debug0(printf("Returning NULL\n"));
5064 return (T) NULL;
5065
5066 } else {
5067 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
5068 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,genestrand);
5069
5070 if (nmismatches > nmismatches_allowed) {
5071 debug0(printf("Returning NULL\n"));
5072 return (T) NULL;
5073
5074 } else if ((substring = Substring_new(nmismatches,ref_nmismatches,left,/*querystart*/querylength - qend,/*queryend*/querylength - qstart,
5075 querylength,/*plusp*/false,genestrand,query_compress,
5076 chrnum,chroffset,chrhigh,chrlength,
5077 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
5078 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
5079 sensedir)) == NULL) {
5080 debug0(printf("Returning NULL\n"));
5081 return (T) NULL;
5082 }
5083 }
5084 }
5085
5086 new = (T) MALLOC_OUT(sizeof(*new));
5087 debug0(printf("Stage3end_new_substitution %p: univdiagonal %llu, chrnum %d, nmismatches %d\n",
5088 new,(unsigned long long) univdiagonal,Substring_chrnum(substring),nmismatches));
5089
5090 new->substrings_1toN = Listpool_push(NULL,listpool,(void *) substring);
5091 new->substrings_Nto1 = Listpool_push(NULL,listpool,(void *) substring);
5092
5093 new->junctions_1toN = (List_T) NULL;
5094 new->junctions_Nto1 = (List_T) NULL;
5095
5096 #if 0
5097 if (plusp) {
5098 new->trim_querystart = qstart;
5099 new->trim_queryend = querylength - qend;
5100 } else {
5101 new->trim_querystart = querylength - qend;
5102 new->trim_queryend = qstart;
5103 }
5104 new->trim_querystart_splicep = splice_querystart_p;
5105 new->trim_queryend_splicep = splice_queryend_p;
5106 #else
5107 new->trim_querystart = Substring_trim_querystart(substring);
5108 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring);
5109 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring);
5110 new->trim_queryend = Substring_trim_queryend(substring);
5111 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substring);
5112 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substring);
5113 #endif
5114 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5115 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5116
5117 new->querystart_chrbound = Substring_querystart_chrbound(substring);
5118 new->queryend_chrbound = Substring_queryend_chrbound(substring);
5119 if (new->trim_querystart > new->querystart_chrbound) {
5120 new->querystart_chrbound = new->trim_querystart;
5121 }
5122 if (querylength - new->trim_queryend < new->queryend_chrbound) {
5123 new->queryend_chrbound = querylength - new->trim_queryend;
5124 }
5125 assert(new->querystart_chrbound < new->queryend_chrbound);
5126 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5127
5128
5129 new->transcripts = (List_T) NULL;
5130 new->transcripts_other = (List_T) NULL;
5131
5132 new->querylength_adj = new->querylength = querylength;
5133 if (plusp == true) {
5134 new->genomicstart = left;
5135 new->genomicend = left + (Univcoord_T) querylength;
5136 new->low = new->genomicstart + (Univcoord_T) new->querystart_chrbound;
5137 new->high = new->genomicend - (Univcoord_T) (querylength - new->queryend_chrbound);
5138 } else {
5139 new->genomicend = left;
5140 new->genomicstart = left + (Univcoord_T) querylength;
5141 new->low = new->genomicend + (Univcoord_T) (querylength - new->queryend_chrbound);
5142 new->high = new->genomicstart - (Univcoord_T) new->querystart_chrbound;
5143 }
5144 assert(new->low < new->high);
5145 debug0(printf("low %u, high %u\n",new->low - chroffset,new->high - chroffset));
5146
5147 new->genomiclength = querylength;
5148
5149 new->guided_insertlength = 0U;
5150
5151 #if 0
5152 if (nmismatches == 0) {
5153 /* Proper hittype needed so we can eliminate identical hits */
5154 new->hittype = EXACT;
5155 } else {
5156 new->hittype = SUB;
5157 }
5158 #else
5159 new->hittype = SUB;
5160 #endif
5161 new->method = method;
5162 new->level = level;
5163
5164
5165 /* Note: It is possible that Substring_new has assigned a new chrnum, different from the one given */
5166 new->distant_splice_p = false;
5167 new->chrnum = new->effective_chrnum = Substring_chrnum(substring);
5168 new->other_chrnum = 0;
5169 new->chroffset = Substring_chroffset(substring);
5170 new->chrhigh = Substring_chrhigh(substring);
5171 new->chrlength = Substring_chrlength(substring);
5172 new->plusp = plusp;
5173 new->genestrand = genestrand;
5174
5175 new->sensedir_for_concordance = new->sensedir = sensedir;
5176
5177 #if 0
5178 new->mapq_loglik = Substring_mapq_loglik(substring);
5179 new->mapq_score = 0;
5180 new->absmq_score = 0;
5181 #endif
5182
5183 new->nindels = 0;
5184 new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(substring);
5185 new->nmismatches_refdiff = Substring_nmismatches_refdiff(substring);
5186 new->nsegments = 1;
5187
5188
5189 new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(substring);
5190 new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(substring);
5191 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims + Substring_amb_length(substring);
5192 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + Substring_amb_length(substring);
5193 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5194
5195 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5196 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5197 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims;
5198 if (Substring_trim_querystart_splicep(substring) == false) {
5199 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring)/END_BINSIZE);
5200 } else {
5201 new->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring)/END_BINSIZE);
5202 }
5203 if (Substring_trim_queryend_splicep(substring) == false) {
5204 new->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((querylength - Substring_queryend(substring))/END_BINSIZE);
5205 } else {
5206 new->refalt_score_within_trims += SPLICED_END_PENALTY*((querylength - Substring_queryend(substring))/END_BINSIZE);
5207 }
5208 /* was Substring_amb_length(substring)/AMB_PENALTY, but doesn't work for DNA-seq */
5209
5210 if (chrlength < (Univcoord_T) querylength) {
5211 new->ref_score_overall -= ((Univcoord_T) querylength - chrlength);
5212 new->refalt_score_overall -= ((Univcoord_T) querylength - chrlength);
5213 new->refalt_score_within_trims -= ((Univcoord_T) querylength - chrlength);
5214 }
5215 assert(new->refalt_score_within_trims >= 0);
5216
5217
5218 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5219 if (new->refalt_score_overall < *found_score_overall) {
5220 *found_score_overall = new->refalt_score_overall;
5221 }
5222 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5223 if (new->refalt_score_within_trims < *found_score_within_trims) {
5224 *found_score_within_trims = new->refalt_score_within_trims;
5225 }
5226
5227
5228 /* new->penalties = 0; */
5229
5230 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5231 /* new->tally = -1L; */
5232
5233 new->nsplices = 0;
5234 if (splice_querystart_p == true && splice_queryend_p == true) {
5235 new->splice_score = (ambig_prob_querystart + ambig_prob_queryend)/2.0;
5236 } else if (splice_querystart_p == true) {
5237 new->splice_score = ambig_prob_querystart;
5238 } else if (splice_queryend_p == true) {
5239 new->splice_score = ambig_prob_queryend;
5240 } else {
5241 new->splice_score = 0.0;
5242 }
5243
5244 new->paired_usedp = false;
5245
5246 /* new->query_splicepos = -1; */
5247 new->circularpos = compute_circularpos(&new->circularalias,new);
5248
5249 debug0(printf("*****Method %s: Stage3end_new_substitution returning %p at %u..%u with nmatches_to_trims %d and amb length %d+%d\n\n",
5250 Method_string(method),new,new->genomicstart - chroffset,new->genomicend - chroffset,new->ref_nmatches_to_trims,
5251 start_amb_length(new),end_amb_length(new)));
5252
5253 /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5254
5255 if (new->circularpos >= 0) {
5256 new->altlocp = false;
5257 return new;
5258
5259 } else if ((new->altlocp = altlocp[chrnum]) == false) {
5260 return new;
5261
5262 } else {
5263 return new;
5264 }
5265 }
5266
5267
5268
5269 /* Previously allowed donor or acceptor to be NULL, when we performed Splice_group_by_segment */
5270 /* Previously new->substring1 was donor and new->substring2 was acceptor */
5271 /* TODO: Modify a Stage3end_new_splice to take two Stage3end_T parts, somewhat like a Stage3pair_T */
5272 T
Stage3end_new_splice(int * found_score_overall,int * found_score_within_trims,Substring_T donor,Substring_T acceptor,Chrpos_T distance,bool shortdistancep,int querylength,bool copy_donor_p,bool copy_acceptor_p,bool first_read_p,int orig_sensedir,Listpool_T listpool,Method_T method,int level)5273 Stage3end_new_splice (int *found_score_overall, int *found_score_within_trims,
5274 Substring_T donor, Substring_T acceptor,
5275 Chrpos_T distance, bool shortdistancep, int querylength,
5276 bool copy_donor_p, bool copy_acceptor_p, bool first_read_p, int orig_sensedir,
5277 Listpool_T listpool, Method_T method, int level) {
5278 T new;
5279 Substring_T substring_for_concordance; /* always the inner substring */
5280 Substring_T substring_other; /* the outer substring */
5281 Substring_T substring1, substringN;
5282 Junction_T junction;
5283
5284 List_T transcripts;
5285 char *remap_sequence;
5286 int remap_seqlength;
5287 double donor_prob, acceptor_prob;
5288 #ifdef DEBUG0
5289 Substring_T substring;
5290 List_T p;
5291 #endif
5292
5293
5294 if (Substring_nmatches_to_trims(donor) < 15 ||
5295 Substring_nmatches_to_trims(acceptor) < 15) {
5296 /* Not enough evidence to find each end of the translocation */
5297 return (T) NULL;
5298 } else {
5299 new = (T) MALLOC_OUT(sizeof(*new));
5300 }
5301
5302 donor_prob = Substring_siteD_prob(donor);
5303 acceptor_prob = Substring_siteA_prob(acceptor);
5304
5305 debug0(printf("Stage3end_new_splice, method %s: %p with first_read_p %d, sensedir %d, donor substring %p and acceptor substring %p, donor_prob %f and acceptor_prob %f\n",
5306 Method_string(method),new,first_read_p,orig_sensedir,donor,acceptor,donor_prob,acceptor_prob));
5307
5308 #if 0
5309 assert(Substring_match_length_orig(donor) + Substring_match_length_orig(acceptor) + amb_length == querylength);
5310 #endif
5311
5312 new->querylength_adj = new->querylength = querylength;
5313
5314 new->nindels = 0;
5315
5316 new->transcripts = (List_T) NULL;
5317 new->transcripts_other = (List_T) NULL;
5318
5319 new->splice_score = donor_prob + acceptor_prob;
5320
5321 new->method = method;
5322 new->level = level;
5323
5324 if (shortdistancep == true) {
5325 new->distant_splice_p = false;
5326
5327 new->hittype = SPLICE;
5328 new->genestrand = Substring_genestrand(donor);
5329 new->chrnum = Substring_chrnum(donor);
5330 new->chroffset = Substring_chroffset(donor);
5331 new->chrhigh = Substring_chrhigh(donor);
5332 new->chrlength = Substring_chrlength(donor);
5333
5334 assert(Substring_plusp(donor) == Substring_plusp(acceptor));
5335 assert(SENSE_CONSISTENT_P(Substring_sensedir(donor),Substring_sensedir(acceptor)));
5336
5337 } else {
5338 new->distant_splice_p = true;
5339
5340 if (Substring_chrnum(donor) == Substring_chrnum(acceptor) &&
5341 Substring_plusp(donor) == Substring_plusp(acceptor) &&
5342 SENSE_CONSISTENT_P(Substring_sensedir(donor),Substring_sensedir(acceptor))) {
5343 new->genestrand = Substring_genestrand(donor);
5344 new->hittype = SAMECHR_SPLICE;
5345 new->chrnum = Substring_chrnum(donor);
5346 new->chroffset = Substring_chroffset(donor);
5347 new->chrhigh = Substring_chrhigh(donor);
5348 new->chrlength = Substring_chrlength(donor);
5349 } else {
5350 new->hittype = TRANSLOC_SPLICE;
5351 new->genestrand = 0;
5352 new->chrnum = 0;
5353 new->chroffset = 0;
5354 new->chrhigh = 0;
5355 new->chrlength = 0;
5356 }
5357 }
5358
5359 /* printf("Making splice with shortdistancep = %d, donor chrnum %d, and acceptor chrnum %d => chrnum %d\n",
5360 shortdistancep,Substring_chrnum(donor),Substring_chrnum(acceptor),new->chrnum); */
5361
5362 new->guided_insertlength = 0U;
5363 new->nsegments = 2;
5364 new->nsplices = 1;
5365
5366 /* Define substrings and junctions */
5367 if (new->chrnum != 0) {
5368 new->sensedir = orig_sensedir;
5369 junction = Junction_new_splice(distance,orig_sensedir,donor_prob,acceptor_prob);
5370
5371 } else if (Substring_querystart(donor) < Substring_querystart(acceptor)) {
5372 /* Translocation, sense */
5373 new->sensedir = SENSE_FORWARD;
5374 junction = Junction_new_chimera(/*sensedir:SENSE_FORWARD,*/donor_prob,acceptor_prob);
5375
5376 } else {
5377 /* Translocation, antisense */
5378 new->sensedir = SENSE_ANTI;
5379 junction = Junction_new_chimera(/*sensedir:SENSE_ANTI,*/donor_prob,acceptor_prob);
5380 }
5381 new->sensedir_for_concordance = new->sensedir;
5382
5383 debug0(printf("donor querypos %d..%d\n",Substring_querystart(donor),Substring_queryend(donor)));
5384 debug0(printf("acceptor querypos %d..%d\n",Substring_querystart(acceptor),Substring_queryend(acceptor)));
5385 debug0(printf("sensedir %d\n",new->sensedir));
5386
5387
5388 /* new->junctions_LtoH = Listpool_push(NULL,listpool,(void *) junction); */
5389 /* new->junctions_HtoL = Listpool_push(NULL,listpool,(void *) junction); */
5390 new->junctions_1toN = Listpool_push(NULL,listpool,(void *) junction);
5391 new->junctions_Nto1 = Listpool_push(NULL,listpool,(void *) junction);
5392
5393 donor = copy_donor_p ? Substring_copy(donor) : donor;
5394 acceptor = copy_acceptor_p ? Substring_copy(acceptor) : acceptor;
5395 if (new->sensedir != SENSE_ANTI) {
5396 /* SENSE_FORWARD or SENSE_NULL */
5397 /* Order is donor (substring1), acceptor (substring2) */
5398 new->substrings_1toN = Listpool_push(NULL,listpool,(void *) acceptor);
5399 new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) donor);
5400 } else {
5401 /* SENSE_ANTI */
5402 /* Order is acceptor (substring1), donor (substring2) */
5403 new->substrings_1toN = Listpool_push(NULL,listpool,(void *) donor);
5404 new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) acceptor);
5405 }
5406 new->substrings_Nto1 = List_reverse(Listpool_copy(new->substrings_1toN,listpool));
5407 assert(Substring_querystart(List_head(new->substrings_1toN)) < Substring_querystart(List_head(new->substrings_Nto1)));
5408 /* Done assigning substrings */
5409
5410
5411 substring1 = (Substring_T) List_head(new->substrings_1toN);
5412 substringN = (Substring_T) List_head(new->substrings_Nto1);
5413
5414 new->trim_querystart = Substring_trim_querystart(substring1);
5415 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
5416 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
5417 new->trim_queryend = Substring_trim_queryend(substringN);
5418 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
5419 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
5420 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5421 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5422
5423 new->querystart_chrbound = Substring_querystart_chrbound(substring1);
5424 new->queryend_chrbound = Substring_queryend_chrbound(substringN);
5425 if (new->trim_querystart > new->querystart_chrbound) {
5426 new->querystart_chrbound = new->trim_querystart;
5427 }
5428 if (querylength - new->trim_queryend < new->queryend_chrbound) {
5429 new->queryend_chrbound = querylength - new->trim_queryend;
5430 }
5431 assert(new->querystart_chrbound < new->queryend_chrbound);
5432 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5433
5434
5435 if (new->chrnum != 0) {
5436 /* Ordinary splice. No need to distinguish effective_chrnum and other_chrnum */
5437 substring_for_concordance = substring_other = (Substring_T) NULL;
5438 new->effective_chrnum = new->chrnum;
5439 new->other_chrnum = 0;
5440
5441 /* Define coordinates as usual */
5442 new->genomicstart = Substring_genomicstart(substring1);
5443 new->genomicend = Substring_genomicend(substringN);
5444 new->plusp = Substring_plusp(substring1);
5445
5446 } else {
5447 /* Translocation. Concordant substring is the inner one */
5448 if (first_read_p == true) {
5449 substring_for_concordance = substringN; /* (Substring_T) List_head(new->substrings_Nto1); */
5450 substring_other = substring1; /* (Substring_T) List_head(new->substrings_1toN); */
5451 debug0(printf("Since first read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5452 } else {
5453 substring_for_concordance = substring1; /* (Substring_T) List_head(new->substrings_1toN); */
5454 substring_other = substringN; /* (Substring_T) List_head(new->substrings_Nto1); */
5455 debug0(printf("Since second read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5456 }
5457
5458 new->effective_chrnum = Substring_chrnum(substring_for_concordance);
5459 new->other_chrnum = Substring_chrnum(substring_other);
5460
5461 /* Define coordinates based on substring for concordance */
5462 new->genomicstart = Substring_genomicstart(substring_for_concordance);
5463 new->genomicend = Substring_genomicend(substring_for_concordance);
5464
5465 /* This plusp is somewhat artificial, based on substring_for_concordance,
5466 but it defines order of substrings_LtoH */
5467 new->plusp = Substring_plusp(substring_for_concordance);
5468 }
5469
5470 #ifdef DEBUG0
5471 printf("NEW SUBSTRINGS (query order)\n");
5472 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
5473 substring = List_head(p);
5474 if (Substring_ambiguous_p(substring) == true) {
5475 printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
5476 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5477 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5478 Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
5479 Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
5480 } else {
5481 printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
5482 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5483 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5484 Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
5485 }
5486 }
5487 printf("\n");
5488 #endif
5489
5490
5491 /* genomicstart and genomicend could be reversed for a scramble */
5492 if (new->genomicstart < new->genomicend) {
5493 new->low = new->genomicstart + new->querystart_chrbound;
5494 new->high = new->genomicend - (querylength - new->queryend_chrbound);
5495 new->genomiclength = new->genomicend - new->genomicstart;
5496 } else {
5497 new->low = new->genomicend + (querylength - new->queryend_chrbound);
5498 new->high = new->genomicstart - new->querystart_chrbound;
5499 new->genomiclength = new->genomicstart - new->genomicend;
5500 }
5501 /* assert(new->low < new->high); */
5502 debug0(printf("low %u, high %u\n",new->low,new->high));
5503
5504 debug0(printf(" hittype is %s, plusp %d, genomicpos %u..%u\n",
5505 hittype_string(new->hittype),new->plusp,new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
5506
5507
5508 new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(donor) + Substring_nmismatches_bothdiff(acceptor);
5509 new->nmismatches_refdiff = Substring_nmismatches_refdiff(donor) + Substring_nmismatches_refdiff(acceptor);
5510
5511 new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(donor) + Substring_nmatches_to_trims(acceptor);
5512 new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(donor) + Substring_ref_nmatches_to_trims(acceptor);
5513 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims;
5514 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims;
5515 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5516
5517 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5518 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5519 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims; /* Should not have any trims at the ends */
5520 if (new->chrlength == 0) {
5521 /* Cannot compare querylength with chrlength, which is 0 */
5522 } else if (new->chrlength < (Univcoord_T) querylength) {
5523 new->ref_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5524 new->refalt_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5525 new->refalt_score_within_trims -= ((Univcoord_T) querylength - new->chrlength);
5526 }
5527 assert(new->refalt_score_within_trims >= 0);
5528
5529 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5530 if (new->refalt_score_overall < *found_score_overall) {
5531 *found_score_overall = new->refalt_score_overall;
5532 }
5533 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5534 if (new->refalt_score_within_trims < *found_score_within_trims) {
5535 *found_score_within_trims = new->refalt_score_within_trims;
5536 }
5537
5538 debug0(printf("New splice has donor %d + acceptor %d matches, sensedir %d\n",
5539 Substring_nmatches(donor),Substring_nmatches(acceptor),new->sensedir));
5540
5541 /* new->penalties = splicing_penalty; */
5542
5543 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5544 /* new->tally = -1L; */
5545
5546 #if 0
5547 new->mapq_score = 0;
5548 new->absmq_score = 0;
5549 #endif
5550
5551 new->paired_usedp = false;
5552
5553 #if 0
5554 if (new->sensedir != SENSE_ANTI) {
5555 assert(Substring_queryend(donor) == Substring_querystart(acceptor));
5556 /* new->query_splicepos = Substring_queryend(donor); */
5557 } else {
5558 assert(Substring_queryend(acceptor) == Substring_querystart(donor));
5559 /* new->query_splicepos = Substring_queryend(acceptor); */
5560 }
5561 assert(new->query_splicepos > 0 && new->query_splicepos < querylength - 1);
5562 #endif
5563
5564 new->circularpos = compute_circularpos(&new->circularalias,new);
5565 /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5566
5567 if (new->circularpos >= 0) {
5568 new->altlocp = false;
5569 } else if ((new->altlocp = altlocp[new->chrnum]) == false) {
5570 } else {
5571 }
5572
5573 if (transcriptomep == true && remap_transcriptome_p == true && substring_for_concordance != NULL) {
5574 /* Remap substring_for_concordance */
5575 remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_for_concordance,genomecomp);
5576 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->effective_chrnum,
5577 Substring_chrpos_low(substring_for_concordance),
5578 Substring_chrpos_high(substring_for_concordance),
5579 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5580 new->transcripts = transcripts;
5581 }
5582 FREE(remap_sequence);
5583
5584 /* Remap substring_other */
5585 remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_other,genomecomp);
5586 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->other_chrnum,
5587 Substring_chrpos_low(substring_other),
5588 Substring_chrpos_high(substring_other),
5589 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5590 new->transcripts_other = transcripts;
5591 }
5592 FREE(remap_sequence);
5593 }
5594
5595 debug0(printf("*****Method %s: Returning new splice %p at genomic %u..%u, donor %p (%u => %u), acceptor %p (%u => %u), score %d\n\n",
5596 Method_string(method),new,new->genomicstart - new->chroffset,new->genomicend - new->chroffset,donor,
5597 donor == NULL ? 0 : Substring_left_genomicseg(donor),
5598 donor == NULL ? 0 : Substring_splicecoord_D(donor),
5599 acceptor,acceptor == NULL ? 0 : Substring_left_genomicseg(acceptor),
5600 acceptor == NULL ? 0 : Substring_splicecoord_A(acceptor),new->refalt_score_within_trims));
5601 debug0(printf("sensedir %d\n",new->sensedir));
5602 return new;
5603 }
5604
5605
5606 T
Stage3end_new_distant(int * found_score_overall,int * found_score_within_trims,Substring_T startfrag,Substring_T endfrag,int splice_pos,int nmismatches1,int nmismatches2,double prob1,double prob2,int sensedir_distant_guess,Chrpos_T distance,bool shortdistancep,int querylength,bool first_read_p,Listpool_T listpool,int level)5607 Stage3end_new_distant (int *found_score_overall, int *found_score_within_trims,
5608 Substring_T startfrag, Substring_T endfrag, int splice_pos,
5609 int nmismatches1, int nmismatches2,
5610 double prob1, double prob2, int sensedir_distant_guess,
5611 Chrpos_T distance, bool shortdistancep, int querylength,
5612 bool first_read_p, Listpool_T listpool, int level) {
5613 T new;
5614 Substring_T substring_for_concordance; /* always the inner substring */
5615 Substring_T substring_other; /* the outer substring */
5616 Substring_T substring1, substringN;
5617 Substring_T donor, acceptor;
5618 Junction_T junction;
5619
5620 List_T transcripts;
5621 char *remap_sequence;
5622 int remap_seqlength;
5623 #ifdef DEBUG0
5624 Substring_T substring;
5625 List_T p;
5626 #endif
5627
5628 new = (T) MALLOC_OUT(sizeof(*new));
5629
5630 debug0(printf("Stage3end_new_distant: %p with first_read_p %d, shortdistancep %d, sensedir guessed to be %d\n",
5631 new,first_read_p,shortdistancep,sensedir_distant_guess));
5632
5633 new->querylength_adj = new->querylength = querylength;
5634
5635 new->nindels = 0;
5636
5637 new->transcripts = (List_T) NULL;
5638 new->transcripts_other = (List_T) NULL;
5639
5640 new->splice_score = 0.0;
5641
5642 new->method = DISTANT_DNA;
5643 new->level = level;
5644
5645 debug0(printf("chrnum: %d and %d, plusp: %d and %d, sensedir: %d and %d\n",
5646 Substring_chrnum(startfrag),Substring_chrnum(endfrag),
5647 Substring_plusp(startfrag),Substring_plusp(endfrag),
5648 Substring_sensedir(startfrag),Substring_sensedir(endfrag)));
5649
5650 if (shortdistancep == true) {
5651 new->distant_splice_p = false;
5652
5653 new->hittype = SPLICE;
5654 new->genestrand = Substring_genestrand(startfrag);
5655 new->chrnum = Substring_chrnum(startfrag);
5656 new->chroffset = Substring_chroffset(startfrag);
5657 new->chrhigh = Substring_chrhigh(startfrag);
5658 new->chrlength = Substring_chrlength(startfrag);
5659
5660 assert(Substring_plusp(startfrag) == Substring_plusp(endfrag));
5661 assert(SENSE_CONSISTENT_P(Substring_sensedir(startfrag),Substring_sensedir(endfrag)));
5662
5663 } else {
5664 new->distant_splice_p = true;
5665
5666 new->hittype = TRANSLOC_SPLICE;
5667 new->genestrand = 0;
5668 new->chrnum = 0;
5669 new->chroffset = 0;
5670 new->chrhigh = 0;
5671 new->chrlength = 0;
5672 }
5673
5674 /* printf("Making splice with shortdistancep = %d, startfrag chrnum %d, and endfrag chrnum %d => chrnum %d\n",
5675 shortdistancep,Substring_chrnum(startfrag),Substring_chrnum(endfrag),new->chrnum); */
5676
5677 new->guided_insertlength = 0U;
5678 new->nsegments = 2;
5679 new->nsplices = 1;
5680
5681 /* Trim startfrag and endfrag at splice_pos */
5682 startfrag = Substring_trim_startfrag(nmismatches1,/*old*/startfrag,/*new_queryend*/splice_pos);
5683 endfrag = Substring_trim_endfrag(nmismatches2,/*old*/endfrag,/*new_querystart*/splice_pos);
5684
5685 /* Define substrings and junctions */
5686 new->sensedir_for_concordance = sensedir_distant_guess; /* was SENSE_NULL */
5687 new->sensedir = sensedir_distant_guess;
5688 if (sensedir_distant_guess != SENSE_ANTI) {
5689 /* Order is donor (substring1), acceptor (substring2) */
5690 donor = startfrag;
5691 Substring_label_donor(donor,splice_pos,prob1,sensedir_distant_guess);
5692
5693 acceptor = endfrag;
5694 Substring_label_acceptor(acceptor,splice_pos,prob2,sensedir_distant_guess);
5695
5696 new->substrings_1toN = Listpool_push(NULL,listpool,(void *) acceptor);
5697 new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) donor);
5698
5699 } else {
5700 /* Order is acceptor (substring1), donor (substring2) */
5701 acceptor = startfrag;
5702 Substring_label_acceptor(acceptor,splice_pos,prob1,sensedir_distant_guess);
5703
5704 donor = endfrag;
5705 Substring_label_donor(donor,splice_pos,prob2,sensedir_distant_guess);
5706
5707 new->substrings_1toN = Listpool_push(NULL,listpool,(void *) donor);
5708 new->substrings_1toN = Listpool_push(new->substrings_1toN,listpool,(void *) acceptor);
5709 }
5710
5711 if (shortdistancep == true) {
5712 junction = Junction_new_splice(distance,sensedir_distant_guess,Substring_siteD_prob(donor),Substring_siteA_prob(acceptor));
5713 } else {
5714 junction = Junction_new_chimera(/*sensedir_distant_guess,*/Substring_siteD_prob(donor),Substring_siteA_prob(acceptor));
5715 }
5716
5717 /* new->junctions_LtoH = Listpool_push(NULL,listpool,(void *) junction); */
5718 /* new->junctions_HtoL = Listpool_push(NULL,listpool,(void *) junction); */
5719 new->junctions_1toN = Listpool_push(NULL,listpool,(void *) junction);
5720 new->junctions_Nto1 = Listpool_push(NULL,listpool,(void *) junction);
5721
5722 new->substrings_Nto1 = List_reverse(Listpool_copy(new->substrings_1toN,listpool));
5723 assert(Substring_querystart(List_head(new->substrings_1toN)) < Substring_querystart(List_head(new->substrings_Nto1)));
5724 /* Done assigning substrings */
5725
5726
5727 substring1 = (Substring_T) List_head(new->substrings_1toN);
5728 substringN = (Substring_T) List_head(new->substrings_Nto1);
5729
5730 new->trim_querystart = Substring_trim_querystart(substring1);
5731 new->mandatory_trim_querystart = Substring_mandatory_trim_querystart(substring1);
5732 new->trim_querystart_splicep = Substring_trim_querystart_splicep(substring1);
5733 new->trim_queryend = Substring_trim_queryend(substringN);
5734 new->mandatory_trim_queryend = Substring_mandatory_trim_queryend(substringN);
5735 new->trim_queryend_splicep = Substring_trim_queryend_splicep(substringN);
5736 debug0(printf(" trim on left: %d (splicep %d)\n",new->trim_querystart,new->trim_querystart_splicep));
5737 debug0(printf(" trim on right: %d (splicep %d)\n",new->trim_queryend,new->trim_queryend_splicep));
5738
5739 new->querystart_chrbound = Substring_querystart_chrbound(substring1);
5740 new->queryend_chrbound = Substring_queryend_chrbound(substringN);
5741 if (new->trim_querystart > new->querystart_chrbound) {
5742 new->querystart_chrbound = new->trim_querystart;
5743 }
5744 if (querylength - new->trim_queryend < new->queryend_chrbound) {
5745 new->queryend_chrbound = querylength - new->trim_queryend;
5746 }
5747 assert(new->querystart_chrbound < new->queryend_chrbound);
5748 debug0(printf("querystart_chrbound %d, queryend_chrbound %d\n",new->querystart_chrbound,new->queryend_chrbound));
5749
5750
5751 /* Translocation. Concordant substring is the inner one */
5752 if (first_read_p == true) {
5753 substring_for_concordance = substringN; /* (Substring_T) List_head(new->substrings_Nto1); */
5754 substring_other = substring1; /* (Substring_T) List_head(new->substrings_1toN); */
5755 debug0(printf("Since first read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5756 } else {
5757 substring_for_concordance = substring1; /* (Substring_T) List_head(new->substrings_1toN); */
5758 substring_other = substringN; /* (Substring_T) List_head(new->substrings_Nto1); */
5759 debug0(printf("Since second read, substring for concordance is at chr %d\n",Substring_chrnum(substring_for_concordance)));
5760 }
5761
5762 new->effective_chrnum = Substring_chrnum(substring_for_concordance);
5763 new->other_chrnum = Substring_chrnum(substring_other);
5764
5765 /* Define coordinates based on substring for concordance */
5766 new->genomicstart = Substring_genomicstart(substring_for_concordance);
5767 new->genomicend = Substring_genomicend(substring_for_concordance);
5768
5769 /* This plusp is somewhat artificial, based on substring_for_concordance,
5770 but it defines order of substrings_LtoH */
5771 new->plusp = Substring_plusp(substring_for_concordance);
5772
5773 #ifdef DEBUG0
5774 printf("NEW SUBSTRINGS (query order)\n");
5775 for (p = new->substrings_1toN; p != NULL; p = List_next(p)) {
5776 substring = List_head(p);
5777 if (Substring_ambiguous_p(substring) == true) {
5778 printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\tprobs:%f and %f\n",
5779 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5780 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5781 Substring_nmatches_to_trims(substring),Substring_amb_length(substring),
5782 Substring_amb_donor_prob(substring),Substring_amb_acceptor_prob(substring));
5783 } else {
5784 printf("%d..%d\t%d:%u..%u\tmismatches:%d\tmatches_to_trims:%d\tamb:%d\n",
5785 Substring_querystart(substring),Substring_queryend(substring),Substring_chrnum(substring),
5786 Substring_alignstart_trim_chr(substring),Substring_alignend_trim_chr(substring),Substring_nmismatches_bothdiff(substring),
5787 Substring_nmatches_to_trims(substring),Substring_amb_length(substring));
5788 }
5789 }
5790 printf("\n");
5791 #endif
5792
5793 /* genomicstart and genomicend could be reversed for a scramble */
5794 if (new->genomicstart < new->genomicend) {
5795 new->low = new->genomicstart + new->querystart_chrbound;
5796 new->high = new->genomicend - (querylength - new->queryend_chrbound);
5797 new->genomiclength = new->genomicend - new->genomicstart;
5798 } else {
5799 new->low = new->genomicend + (querylength - new->queryend_chrbound);
5800 new->high = new->genomicstart - new->querystart_chrbound;
5801 new->genomiclength = new->genomicstart - new->genomicend;
5802 }
5803 /* assert(new->low < new->high); */
5804 debug0(printf("low %u, high %u\n",new->low,new->high));
5805
5806 debug0(printf(" hittype is %s, plusp %d, genomicpos %u..%u\n",
5807 hittype_string(new->hittype),new->plusp,new->genomicstart - new->chroffset,new->genomicend - new->chroffset));
5808
5809 new->nmismatches_bothdiff = Substring_nmismatches_bothdiff(startfrag) + Substring_nmismatches_bothdiff(endfrag);
5810 new->nmismatches_refdiff = Substring_nmismatches_refdiff(startfrag) + Substring_nmismatches_refdiff(endfrag);
5811
5812 new->refalt_nmatches_to_trims = Substring_nmatches_to_trims(startfrag) + Substring_nmatches_to_trims(endfrag);
5813 new->ref_nmatches_to_trims = Substring_ref_nmatches_to_trims(startfrag) + Substring_ref_nmatches_to_trims(endfrag);
5814 new->refalt_nmatches_plus_spliced_trims = new->refalt_nmatches_to_trims;
5815 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims;
5816 assert(new->refalt_nmatches_plus_spliced_trims <= querylength);
5817
5818 new->ref_score_overall = querylength - new->ref_nmatches_to_trims;
5819 new->refalt_score_overall = querylength - new->refalt_nmatches_to_trims;
5820 new->refalt_score_within_trims = querylength - new->refalt_nmatches_plus_spliced_trims; /* Should not have any trims at the ends */
5821 if (new->chrlength == 0) {
5822 /* Cannot compare querylength with chrlength, which is 0 */
5823 } else if (new->chrlength < (Univcoord_T) querylength) {
5824 new->ref_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5825 new->refalt_score_overall -= ((Univcoord_T) querylength - new->chrlength);
5826 new->refalt_score_within_trims -= ((Univcoord_T) querylength - new->chrlength);
5827 }
5828 assert(new->refalt_score_within_trims >= 0);
5829
5830 /* found_score_overall does not compensate for spliced ends, so gives motivation to find distant splicing */
5831 if (new->refalt_score_overall < *found_score_overall) {
5832 *found_score_overall = new->refalt_score_overall;
5833 }
5834 /* found_score_within_trims does compensate for spliced trims, and guides how much further alignment is necessary */
5835 if (new->refalt_score_within_trims < *found_score_within_trims) {
5836 *found_score_within_trims = new->refalt_score_within_trims;
5837 }
5838
5839 debug0(printf("New distant has startfrag %d + endfrag %d matches, sensedir %d, score %d overall and %d within trims\n",
5840 Substring_nmatches(startfrag),Substring_nmatches(endfrag),new->sensedir,
5841 new->refalt_score_overall,new->refalt_score_within_trims));
5842
5843 /* new->penalties = splicing_penalty; */
5844
5845 /* new->gene_overlap = NO_KNOWN_GENE; -- initialized later when resolving multimappers */
5846 /* new->tally = -1L; */
5847
5848 #if 0
5849 new->mapq_score = 0;
5850 new->absmq_score = 0;
5851 #endif
5852
5853 new->paired_usedp = false;
5854 /* new->query_splicepos = splice_pos; */
5855
5856 new->circularpos = compute_circularpos(&new->circularalias,new);
5857 /* Previously checked for (new->circularalias == +2 || new->circularalias == -2) */
5858
5859 if (new->circularpos >= 0) {
5860 new->altlocp = false;
5861 } else if ((new->altlocp = altlocp[new->chrnum]) == false) {
5862 } else {
5863 }
5864
5865 if (transcriptomep == true && remap_transcriptome_p == true && substring_for_concordance != NULL) {
5866 /* Remap substring_for_concordance */
5867 remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_for_concordance,genomecomp);
5868 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->effective_chrnum,
5869 Substring_chrpos_low(substring_for_concordance),
5870 Substring_chrpos_high(substring_for_concordance),
5871 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5872 new->transcripts = transcripts;
5873 }
5874 FREE(remap_sequence);
5875
5876 /* Remap substring_other */
5877 remap_sequence = Substring_genomic_sequence(&remap_seqlength,substring_other,genomecomp);
5878 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,new->other_chrnum,
5879 Substring_chrpos_low(substring_other),
5880 Substring_chrpos_high(substring_other),
5881 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
5882 new->transcripts_other = transcripts;
5883 }
5884 FREE(remap_sequence);
5885 }
5886
5887 debug0(printf("*****Method distant: Returning new distant %p at genomic %u..%u, startfrag %p (%u => ), endfrag %p (%u => ), score %d\n\n",
5888 new,new->genomicstart - new->chroffset,new->genomicend - new->chroffset,
5889 startfrag,Substring_left_genomicseg(startfrag),endfrag,Substring_left_genomicseg(endfrag),
5890 new->refalt_score_within_trims));
5891 return new;
5892 }
5893
5894
5895 static int
Stage3end_output_cmp(const void * a,const void * b)5896 Stage3end_output_cmp (const void *a, const void *b) {
5897 T x = * (T *) a;
5898 T y = * (T *) b;
5899
5900 if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
5901 return -1;
5902 } else if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
5903 return +1;
5904 } else if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
5905 return -1;
5906 } else if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
5907 return +1;
5908 } else if (x->mapq_loglik > y->mapq_loglik) {
5909 return -1;
5910 } else if (y->mapq_loglik > x->mapq_loglik) {
5911 return +1;
5912 } else if (x->distant_splice_p == false && y->distant_splice_p == true) {
5913 return -1;
5914 } else if (y->distant_splice_p == false && x->distant_splice_p == true) {
5915 return +1;
5916 } else if (x->guided_insertlength > 0 && y->guided_insertlength == 0) {
5917 return -1;
5918 } else if (y->guided_insertlength > 0 && x->guided_insertlength == 0) {
5919 return +1;
5920 } else if (x->guided_insertlength < y->guided_insertlength) {
5921 return -1;
5922 } else if (y->guided_insertlength < x->guided_insertlength) {
5923 return +1;
5924 } else if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
5925 return -1;
5926 } else if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
5927 return +1;
5928
5929 /* This genomic ordering will be undone if want_random_p is true */
5930 } else if (x->genomicstart < y->genomicstart) {
5931 return -1;
5932 } else if (y->genomicstart < x->genomicstart) {
5933 return +1;
5934
5935 } else if (x->genomicend < y->genomicend) {
5936 return -1;
5937 } else if (y->genomicend < x->genomicend) {
5938 return +1;
5939
5940 } else if (x->plusp == true && y->plusp == false) {
5941 return -1;
5942 } else if (x->plusp == false && y->plusp == true) {
5943 return +1;
5944
5945 } else if (x->hittype < y->hittype) {
5946 return -1;
5947 } else if (y->hittype < x->hittype) {
5948 return +1;
5949
5950 } else {
5951 return 0;
5952 }
5953 }
5954
5955
5956 static int
Stage3pair_output_cmp(const void * a,const void * b)5957 Stage3pair_output_cmp (const void *a, const void *b) {
5958 Stage3pair_T x = * (Stage3pair_T *) a;
5959 Stage3pair_T y = * (Stage3pair_T *) b;
5960
5961 #ifdef USE_BINGO
5962 if (x->absdifflength_bingo_p == true && y->absdifflength_bingo_p == false) {
5963 return -1;
5964 } else if (y->absdifflength_bingo_p == true && x->absdifflength_bingo_p == false) {
5965 return +1;
5966 }
5967 #endif
5968
5969 if (x->hit5->refalt_nmatches_plus_spliced_trims +
5970 x->hit3->refalt_nmatches_plus_spliced_trims >
5971 y->hit5->refalt_nmatches_plus_spliced_trims +
5972 y->hit3->refalt_nmatches_plus_spliced_trims) {
5973 return -1;
5974 } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
5975 y->hit3->refalt_nmatches_plus_spliced_trims >
5976 x->hit5->refalt_nmatches_plus_spliced_trims +
5977 x->hit3->refalt_nmatches_plus_spliced_trims) {
5978 return +1;
5979 } else if (x->hit5->ref_nmatches_plus_spliced_trims +
5980 x->hit3->ref_nmatches_plus_spliced_trims >
5981 y->hit5->ref_nmatches_plus_spliced_trims +
5982 y->hit3->ref_nmatches_plus_spliced_trims) {
5983 return -1;
5984 } else if (y->hit5->ref_nmatches_plus_spliced_trims +
5985 y->hit3->ref_nmatches_plus_spliced_trims >
5986 x->hit5->ref_nmatches_plus_spliced_trims +
5987 x->hit3->ref_nmatches_plus_spliced_trims) {
5988 return +1;
5989 } else if (x->mapq_loglik > y->mapq_loglik) {
5990 return -1;
5991 } else if (y->mapq_loglik > x->mapq_loglik) {
5992 return +1;
5993 } else if (x->insertlength > 0 && y->insertlength == 0) {
5994 return -1;
5995 } else if (y->insertlength > 0 && x->insertlength == 0) {
5996 return +1;
5997 } else if (x->insertlength < y->insertlength) {
5998 return -1;
5999 } else if (y->insertlength < x->insertlength) {
6000 return +1;
6001 } else if (x->hit5->refalt_score_within_trims +
6002 x->hit3->refalt_score_within_trims <
6003 y->hit5->refalt_score_within_trims +
6004 y->hit3->refalt_score_within_trims) {
6005 return -1;
6006 } else if (y->hit5->refalt_score_within_trims +
6007 y->hit3->refalt_score_within_trims <
6008 x->hit5->refalt_score_within_trims +
6009 x->hit3->refalt_score_within_trims) {
6010 return +1;
6011
6012 /* This genomic ordering will be undone if want_random_p is true */
6013 } else if (x->low < y->low) {
6014 return -1;
6015 } else if (y->low < x->low) {
6016 return +1;
6017
6018 } else if (x->high < y->high) {
6019 return -1;
6020 } else if (y->high < x->high) {
6021 return +1;
6022
6023 } else {
6024 return 0;
6025 }
6026 }
6027
6028
6029
6030 static float
Stage3end_compute_mapq(Stage3end_T this,char * quality_string)6031 Stage3end_compute_mapq (Stage3end_T this, char *quality_string) {
6032 List_T p;
6033 Substring_T substring;
6034
6035 if (this == NULL) {
6036 return 0.0;
6037
6038 } else {
6039 this->mapq_loglik = 0.0;
6040 for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
6041 substring = (Substring_T) List_head(p);
6042 this->mapq_loglik += Substring_compute_mapq(substring,quality_string);
6043 }
6044 }
6045
6046 return this->mapq_loglik;
6047 }
6048
6049
6050
6051 static void
Stage3end_display_prep(Stage3end_T this,char * queryuc_ptr,bool first_read_p)6052 Stage3end_display_prep (Stage3end_T this, char *queryuc_ptr, bool first_read_p) {
6053 List_T p, q;
6054 Substring_T substring;
6055 Junction_T pre_junction, post_junction;
6056 Junctiontype_T type;
6057 int extraleft, extraright;
6058 bool sam_print_xt_p = false;
6059 /* int type; */
6060 /* int extralow, extrahigh; */
6061
6062 if (this != NULL) {
6063 if (output_type == SAM_OUTPUT) {
6064 if (this->hittype == TRANSLOC_SPLICE ||
6065 (this->hittype == SAMECHR_SPLICE && merge_samechr_p == false)) {
6066 /* This is the condition in samprint to print the XT field, which needs the splice information */
6067 sam_print_xt_p = true;
6068 }
6069 }
6070
6071 debug0(printf("Doing a display prep of end %p\n",this));
6072
6073 this->nmismatches_refdiff = 0;
6074
6075 /* First segments */
6076 /* For operations on substrings, proceed in 1toN order, not LtoH order */
6077 substring = (Substring_T) List_head(this->substrings_1toN);
6078 if (output_type == STD_OUTPUT) {
6079 extraleft = Substring_querystart(substring); /* terminal start */
6080 } else {
6081 extraleft = 0;
6082 }
6083
6084 if (List_length(this->substrings_1toN) == 1) {
6085 post_junction = (Junction_T) NULL;
6086 if (output_type == STD_OUTPUT) {
6087 extraright = this->querylength - Substring_queryend(substring); /* terminal end */
6088 } else {
6089 extraright = 0;
6090 }
6091 } else {
6092 post_junction = (Junction_T) List_head(this->junctions_1toN);
6093 /* Junction_print(post_junction); */
6094
6095 if (output_type == M8_OUTPUT) {
6096 extraright = 0;
6097 } else if ((type = Junction_type(post_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6098 extraright = 2;
6099 } else if (output_type == SAM_OUTPUT) {
6100 extraright = 0;
6101 } else if (type == SPLICE_JUNCTION) {
6102 extraright = 2;
6103 } else if (first_read_p == true && type == DEL_JUNCTION) {
6104 extraright = Junction_nindels(post_junction);
6105 } else {
6106 extraright = 0;
6107 }
6108 }
6109
6110 if (Substring_has_alts_p(substring) == true) {
6111 /* Skip */
6112 } else {
6113 this->nmismatches_refdiff +=
6114 Substring_display_prep(substring,queryuc_ptr,this->querylength,
6115 extraleft,extraright,genomecomp);
6116 }
6117
6118 assert(List_length(this->substrings_1toN) == List_length(this->junctions_1toN) + 1);
6119 if ((p = List_next(this->substrings_1toN)) == NULL) {
6120 /* No middle segments */
6121 } else {
6122 for (q = List_next(this->junctions_1toN); q != NULL; p = List_next(p), q = List_next(q)) {
6123 /* Middle segments */
6124 pre_junction = post_junction;
6125 post_junction = List_head(q);
6126
6127 /* Junction_print(pre_junction); */
6128 /* Junction_print(post_junction); */
6129
6130 if (output_type == M8_OUTPUT) {
6131 extraleft = 0;
6132 } else if ((type = Junction_type(pre_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6133 extraleft = 2;
6134 } else if (output_type == SAM_OUTPUT) {
6135 extraleft = 0;
6136 } else if (type == SPLICE_JUNCTION) {
6137 extraleft = 2;
6138 } else if (first_read_p == false && type == DEL_JUNCTION) {
6139 extraleft = Junction_nindels(pre_junction);
6140 } else {
6141 extraleft = 0;
6142 }
6143
6144 if (output_type == M8_OUTPUT) {
6145 extraright = 0;
6146 } else if ((type = Junction_type(post_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6147 extraright = 2;
6148 } else if (output_type == SAM_OUTPUT) {
6149 extraright = 0;
6150 } else if (type == SPLICE_JUNCTION) {
6151 extraright = 2;
6152 } else if (first_read_p == true && type == DEL_JUNCTION) {
6153 extraright = Junction_nindels(post_junction);
6154 } else {
6155 extraright = 0;
6156 }
6157
6158 substring = (Substring_T) List_head(p);
6159 if (Substring_has_alts_p(substring) == true) {
6160 /* Skip */
6161 } else {
6162 this->nmismatches_refdiff +=
6163 Substring_display_prep(substring,queryuc_ptr,this->querylength,
6164 extraleft,extraright,genomecomp);
6165 }
6166 }
6167
6168 /* Last segment */
6169 pre_junction = post_junction;
6170 /* Junction_print(pre_junction); */
6171
6172 if (output_type == M8_OUTPUT) {
6173 extraleft = 0;
6174 } else if ((type = Junction_type(pre_junction)) == CHIMERA_JUNCTION || sam_print_xt_p == true) {
6175 extraleft = 2;
6176 } else if (output_type == SAM_OUTPUT) {
6177 extraleft = 0;
6178 } else if (type == SPLICE_JUNCTION) {
6179 extraleft = 2;
6180 } else if (first_read_p == false && type == DEL_JUNCTION) {
6181 extraleft = Junction_nindels(pre_junction);
6182 } else {
6183 extraleft = 0;
6184 }
6185
6186 substring = (Substring_T) List_head(p);
6187 if (output_type == STD_OUTPUT) {
6188 extraright = this->querylength - Substring_queryend(substring);
6189 } else {
6190 extraright = 0;
6191 }
6192
6193 if (Substring_has_alts_p(substring) == true) {
6194 /* Skip */
6195 } else {
6196 this->nmismatches_refdiff +=
6197 Substring_display_prep(substring,queryuc_ptr,this->querylength,
6198 extraleft,extraright,genomecomp);
6199 }
6200 }
6201 }
6202
6203 return;
6204 }
6205
6206
6207 List_T
Stage3end_filter(List_T hits,Hitlistpool_T hitlistpool,int max_mismatches_refalt,int max_mismatches_ref,int min_coverage)6208 Stage3end_filter (List_T hits, Hitlistpool_T hitlistpool,
6209 int max_mismatches_refalt, int max_mismatches_ref, int min_coverage) {
6210 List_T newhits = NULL, p;
6211 Stage3end_T hit;
6212
6213 debug1(printf("Entered Stage3end_filter with max_mismatches_refalt %d, max_mismatches_ref %d, and min_coverage %d\n",
6214 max_mismatches_refalt,max_mismatches_ref,min_coverage));
6215
6216 if (filter_within_trims_p == false) {
6217 /* Generally want overall mismatches for DNA-seq, so use refalt_score_overall */
6218 for (p = hits; p != NULL; p = List_next(p)) {
6219 hit = (Stage3end_T) List_head(p);
6220 debug1(printf("DNA-seq: Comparing refalt score %d against max_mismatches_refalt %d, ref %d against %d, and coverage %d against min_coverage %d\n",
6221 hit->refalt_score_overall,max_mismatches_refalt,
6222 hit->ref_score_overall,max_mismatches_ref,
6223 hit->querylength - hit->trim_querystart - hit->trim_queryend,min_coverage));
6224 debug1(printf("Coverage is querylength %d - trim_querystart %d - trim_queryend %d + mandatory %d + mandatory %d\n",
6225 hit->querylength,hit->trim_querystart,hit->trim_queryend,hit->mandatory_trim_querystart,hit->mandatory_trim_queryend));
6226
6227 if (hit->refalt_score_overall > max_mismatches_refalt) {
6228 debug1(printf(" => FREE\n"));
6229 Stage3end_free(&hit);
6230 } else if (hit->ref_score_overall > max_mismatches_ref) {
6231 debug1(printf(" => FREE\n"));
6232 Stage3end_free(&hit);
6233 } else if (hit->querylength - hit->trim_querystart - hit->trim_queryend + hit->mandatory_trim_querystart + hit->mandatory_trim_queryend < min_coverage) {
6234 debug1(printf(" => FREE\n"));
6235 Stage3end_free(&hit);
6236 } else {
6237 debug1(printf(" => KEEP\n"));
6238 newhits = Hitlist_push(newhits,hitlistpool,(void *) hit);
6239 }
6240 }
6241
6242 } else {
6243 /* Generally expect trims for RNA-seq, so use refalt_score_within_trims */
6244 for (p = hits; p != NULL; p = List_next(p)) {
6245 hit = (Stage3end_T) List_head(p);
6246 debug1(printf("RNA-seq: Comparing refalt score %d against max_mismatches_refalt %d, and coverage %d against min_coverage %d\n",
6247 hit->refalt_score_within_trims,max_mismatches_refalt,hit->querylength - hit->trim_querystart - hit->trim_queryend,min_coverage));
6248 debug1(printf("Coverage is querylength %d - trim_querystart %d - trim_queryend %d + mandatory %d + mandatory %d\n",
6249 hit->querylength,hit->trim_querystart,hit->trim_queryend,hit->mandatory_trim_querystart,hit->mandatory_trim_queryend));
6250
6251 if (hit->refalt_score_within_trims > max_mismatches_refalt) {
6252 debug1(printf(" => FREE\n"));
6253 Stage3end_free(&hit);
6254 } else if (hit->querylength - hit->trim_querystart - hit->trim_queryend + hit->mandatory_trim_querystart + hit->mandatory_trim_queryend < min_coverage) {
6255 debug1(printf(" => FREE\n"));
6256 Stage3end_free(&hit);
6257 } else {
6258 debug1(printf(" => KEEP\n"));
6259 newhits = Hitlist_push(newhits,hitlistpool,(void *) hit);
6260 }
6261 }
6262 }
6263
6264
6265 Hitlist_free(&hits);
6266 return newhits;
6267 }
6268
6269
6270
6271
6272 Stage3end_T *
Stage3end_eval_and_sort(int npaths,int * first_absmq,int * second_absmq,Stage3end_T * stage3array,char * queryuc_ptr,char * quality_string,bool displayp)6273 Stage3end_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
6274 Stage3end_T *stage3array, char *queryuc_ptr, char *quality_string,
6275 bool displayp) {
6276 float maxlik, loglik;
6277 float total, q; /* For Bayesian mapq calculation */
6278 int compute_npaths;
6279
6280 int randomi, i;
6281 Stage3end_T temp, hit;
6282
6283 if (npaths == 0) {
6284 /* Skip */
6285 *first_absmq = 0;
6286 *second_absmq = 0;
6287
6288 } else if (npaths == 1) {
6289 hit = stage3array[0];
6290 hit->mapq_loglik = MAPQ_MAXIMUM_SCORE;
6291 hit->mapq_score = MAPQ_max_quality_score(quality_string,hit->querylength);
6292 hit->absmq_score = MAPQ_MAXIMUM_SCORE;
6293
6294 if (displayp == true) {
6295 Stage3end_display_prep(hit,queryuc_ptr,/*first_read_p*/true);
6296 }
6297 *first_absmq = hit->absmq_score;
6298 *second_absmq = 0;
6299
6300 } else {
6301 /* Compute mapq_loglik */
6302 for (i = 0; i < npaths; i++) {
6303 Stage3end_compute_mapq(stage3array[i],quality_string);
6304 }
6305
6306 /* Sort by nmatches, then mapq */
6307 qsort(stage3array,npaths,sizeof(Stage3end_T),Stage3end_output_cmp);
6308
6309 if (want_random_p) {
6310 /* Randomize among best alignments */
6311 i = 1;
6312 while (i < npaths && Stage3end_output_cmp(&(stage3array[i]),&(stage3array[0])) == 0) {
6313 i++;
6314 }
6315 if (i > 1) { /* i is number of ties */
6316 /* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
6317 randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
6318 /* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
6319 temp = stage3array[0];
6320 stage3array[0] = stage3array[randomi];
6321 stage3array[randomi] = temp;
6322 }
6323 }
6324
6325 /* Enforce monotonicity */
6326 for (i = npaths - 1; i > 0; i--) {
6327 if (stage3array[i-1]->mapq_loglik < stage3array[i]->mapq_loglik) {
6328 stage3array[i-1]->mapq_loglik = stage3array[i]->mapq_loglik;
6329 }
6330 }
6331 maxlik = stage3array[0]->mapq_loglik;
6332
6333 /* Subtract maxlik to avoid underflow */
6334 for (i = 0; i < npaths; i++) {
6335 stage3array[i]->mapq_loglik -= maxlik;
6336 }
6337
6338 #if 0
6339 /* Save on computation if possible */
6340 /* Not possible, since we are going to select randomly from among all npaths */
6341 if (npaths < maxpaths) {
6342 compute_npaths = npaths;
6343 } else {
6344 compute_npaths = maxpaths;
6345 }
6346 if (compute_npaths < 2) {
6347 compute_npaths = 2;
6348 }
6349 #else
6350 compute_npaths = npaths;
6351 #endif
6352
6353 /* Compute absolute mapq */
6354 for (i = 0; i < compute_npaths; i++) {
6355 loglik = stage3array[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
6356 if (loglik < 0.0) {
6357 loglik = 0.0;
6358 }
6359 stage3array[i]->absmq_score = rint(loglik);
6360 }
6361 *first_absmq = stage3array[0]->absmq_score;
6362 *second_absmq = stage3array[1]->absmq_score;
6363
6364
6365 /* Compute Bayesian mapq */
6366 total = 0.0;
6367 for (i = 0; i < npaths; i++) {
6368 total += (stage3array[i]->mapq_loglik = fasterexp(stage3array[i]->mapq_loglik));
6369 }
6370
6371 /* Obtain posterior probabilities of being true */
6372 for (i = 0; i < compute_npaths; i++) {
6373 stage3array[i]->mapq_loglik /= total;
6374 }
6375
6376 /* Convert to Phred scores */
6377 for (i = 0; i < compute_npaths; i++) {
6378 if ((q = 1.0 - stage3array[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
6379 stage3array[i]->mapq_score = 96;
6380 } else {
6381 stage3array[i]->mapq_score = rint(-10.0 * log10(q));
6382 }
6383 }
6384
6385 if (displayp == true) {
6386 /* Prepare for display */
6387 for (i = 0; i < compute_npaths; i++) {
6388 Stage3end_display_prep(stage3array[i],queryuc_ptr,/*first_read_p*/true);
6389 }
6390 }
6391
6392 #if 0
6393 /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
6394 if (stage3array[0]->mapq_score >= mapq_unique_score &&
6395 stage3array[1]->mapq_score < mapq_unique_score) {
6396 for (i = 1; i < *npaths; i++) {
6397 Stage3end_free(&(stage3array[i]));
6398 }
6399 *npaths = 1;
6400 }
6401 #endif
6402 }
6403
6404 return stage3array;
6405 }
6406
6407
6408 static int
insertlength_expected(Chrpos_T insertlength)6409 insertlength_expected (Chrpos_T insertlength) {
6410 if (insertlength < expected_pairlength_low) {
6411 return -1;
6412 } else if (insertlength > expected_pairlength_very_high) {
6413 return -1;
6414 } else if (insertlength > expected_pairlength_high) {
6415 return 0;
6416 } else {
6417 return +1;
6418 }
6419 }
6420
6421
6422 /* For concordant ends */
6423 static Chrpos_T
pair_insert_length(int * pair_relationship,Stage3end_T hit5,Stage3end_T hit3)6424 pair_insert_length (int *pair_relationship, Stage3end_T hit5, Stage3end_T hit3) {
6425 List_T p, q;
6426 Substring_T substring5, substring3;
6427
6428 if (hit5->plusp != hit3->plusp) {
6429 debug10(printf("pair_insert_length: hit5->plusp %d != hit3->plusp %d, so returning 0\n",
6430 hit5->plusp,hit3->plusp));
6431 *pair_relationship = 0;
6432 return 0;
6433 }
6434
6435 if (hit5->chrnum != 0 && hit3->chrnum != 0) {
6436 for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
6437 substring3 = (Substring_T) List_head(q);
6438 for (p = hit5->substrings_1toN; p != NULL; p = List_next(p)) {
6439 substring5 = (Substring_T) List_head(p);
6440 if (Substring_overlap_p(substring5,substring3)) {
6441 debug10(printf("Calling Substring_insert_length on %d..%d and %d..%d\n",
6442 Substring_querystart(substring5),Substring_queryend(substring5),
6443 Substring_querystart(substring3),Substring_queryend(substring3)));
6444 return Substring_insert_length(&(*pair_relationship),substring5,substring3);
6445 }
6446 }
6447 }
6448 }
6449
6450 /* No overlap found between any combination of substrings */
6451 if (hit5->plusp == true) {
6452 if (hit5->genomicend > hit3->genomicstart + hit5->querylength + hit3->querylength) {
6453 debug10(printf("pair_insert_length: no overlap found, and %u - %u + %d + %d < 0, so returning 0\n",
6454 hit3->genomicstart - hit3->chroffset,hit5->genomicend - hit5->chroffset,
6455 hit5->querylength,hit3->querylength));
6456 *pair_relationship = 0;
6457 return 0;
6458 } else {
6459 debug10(printf("pair_insert_length: no overlap found, so returning %u - %u + %d + %d\n",
6460 hit3->genomicstart - hit3->chroffset,hit5->genomicend - hit5->chroffset,
6461 hit5->querylength,hit3->querylength));
6462 }
6463 *pair_relationship = +1;
6464 return hit3->genomicstart - hit5->genomicend + hit5->querylength + hit3->querylength;
6465
6466 } else {
6467 if (hit3->genomicstart > hit5->genomicend + hit5->querylength + hit3->querylength) {
6468 debug10(printf("pair_insert_length: no overlap found, and %u - %u + %d + %d < 0, so returning 0\n",
6469 hit5->genomicend - hit5->chroffset,hit3->genomicstart - hit3->chroffset,
6470 hit5->querylength,hit3->querylength));
6471 *pair_relationship = 0;
6472 return 0;
6473 } else {
6474 debug10(printf("pair_insert_length: no overlap found, so returning %u - %u + %d + %d\n",
6475 hit5->genomicend - hit5->chroffset,hit3->genomicstart - hit3->chroffset,
6476 hit5->querylength,hit3->querylength));
6477 *pair_relationship = -1;
6478 return hit5->genomicend - hit3->genomicstart + hit5->querylength + hit3->querylength;
6479 }
6480 }
6481 }
6482
6483
6484
6485 /* For unpaired ends */
6486 static Chrpos_T
pair_insert_length_unpaired(Stage3end_T hit5,Stage3end_T hit3)6487 pair_insert_length_unpaired (Stage3end_T hit5, Stage3end_T hit3) {
6488
6489 if (hit5->effective_chrnum != hit3->effective_chrnum) {
6490 debug10(printf("pair_insert_length: hit5->plusp %d != hit3->plusp %d, so returning 0\n",
6491 hit5->plusp,hit3->plusp));
6492 return 0;
6493 } else if (hit5->distant_splice_p == true) {
6494 return 0;
6495 } else if (hit3->distant_splice_p == true) {
6496 return 0;
6497 } else if (hit5->high < hit3->low) {
6498 /* was hit3->low - hit5->high + hit5->querylength + hit3->querylength; */
6499 return hit3->genomicstart - hit5->genomicstart;
6500 } else if (hit3->high < hit5->low) {
6501 /* was hit5->low - hit3->high + hit5->querylength + hit3->querylength; */
6502 return hit5->genomicstart - hit3->genomicstart;
6503 } else {
6504 return hit5->querylength + hit3->querylength;
6505 }
6506 }
6507
6508
6509 Stage3end_T *
Stage3end_eval_and_sort_guided(int npaths,int * first_absmq,int * second_absmq,Stage3end_T guide,Stage3end_T * stage3array,char * queryuc_ptr,char * quality_string,bool displayp)6510 Stage3end_eval_and_sort_guided (int npaths, int *first_absmq, int *second_absmq, Stage3end_T guide,
6511 Stage3end_T *stage3array, char *queryuc_ptr, char *quality_string,
6512 bool displayp) {
6513 float maxlik, loglik;
6514 float total, q; /* For Bayesian mapq calculation */
6515 int compute_npaths;
6516
6517 int randomi, i;
6518 Stage3end_T temp, hit;
6519
6520 if (npaths == 0) {
6521 /* Skip */
6522 *first_absmq = 0;
6523 *second_absmq = 0;
6524
6525 } else if (npaths == 1) {
6526 hit = stage3array[0];
6527 hit->mapq_loglik = MAPQ_MAXIMUM_SCORE;
6528 hit->mapq_score = MAPQ_max_quality_score(quality_string,hit->querylength);
6529 hit->absmq_score = MAPQ_MAXIMUM_SCORE;
6530
6531 if (displayp == true) {
6532 Stage3end_display_prep(hit,queryuc_ptr,/*first_read_p*/true);
6533 }
6534 *first_absmq = hit->absmq_score;
6535 *second_absmq = 0;
6536
6537 } else {
6538 /* Compute mapq_loglik */
6539 for (i = 0; i < npaths; i++) {
6540 Stage3end_compute_mapq(stage3array[i],quality_string);
6541 }
6542
6543 /* Compute insert_length relative to guide. This is the only change from the unguided procedure. */
6544 for (i = 0; i < npaths; i++) {
6545 stage3array[i]->guided_insertlength = pair_insert_length_unpaired(stage3array[i],guide);
6546 }
6547
6548 /* Sort by nmatches, then mapq */
6549 qsort(stage3array,npaths,sizeof(Stage3end_T),Stage3end_output_cmp);
6550
6551 if (want_random_p) {
6552 /* Randomize among best alignments */
6553 i = 1;
6554 while (i < npaths && Stage3end_output_cmp(&(stage3array[i]),&(stage3array[0])) == 0) {
6555 i++;
6556 }
6557 if (i > 1) { /* i is number of ties */
6558 /* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
6559 randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
6560 /* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
6561 temp = stage3array[0];
6562 stage3array[0] = stage3array[randomi];
6563 stage3array[randomi] = temp;
6564 }
6565 }
6566
6567 /* Enforce monotonicity */
6568 for (i = npaths - 1; i > 0; i--) {
6569 if (stage3array[i-1]->mapq_loglik < stage3array[i]->mapq_loglik) {
6570 stage3array[i-1]->mapq_loglik = stage3array[i]->mapq_loglik;
6571 }
6572 }
6573 maxlik = stage3array[0]->mapq_loglik;
6574
6575 /* Subtract maxlik to avoid underflow */
6576 for (i = 0; i < npaths; i++) {
6577 stage3array[i]->mapq_loglik -= maxlik;
6578 }
6579
6580 #if 0
6581 /* Save on computation if possible */
6582 /* Not possible, since we are going to select randomly from among all paths */
6583 if (npaths < maxpaths) {
6584 compute_npaths = npaths;
6585 } else {
6586 compute_npaths = maxpaths;
6587 }
6588 if (compute_npaths < 2) {
6589 compute_npaths = 2;
6590 }
6591 #else
6592 compute_npaths = npaths;
6593 #endif
6594
6595 /* Compute absolute mapq */
6596 for (i = 0; i < compute_npaths; i++) {
6597 loglik = stage3array[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
6598 if (loglik < 0.0) {
6599 loglik = 0.0;
6600 }
6601 stage3array[i]->absmq_score = rint(loglik);
6602 }
6603 *first_absmq = stage3array[0]->absmq_score;
6604 *second_absmq = stage3array[1]->absmq_score;
6605
6606
6607 /* Compute Bayesian mapq */
6608 total = 0.0;
6609 for (i = 0; i < npaths; i++) {
6610 total += (stage3array[i]->mapq_loglik = fasterexp(stage3array[i]->mapq_loglik));
6611 }
6612
6613 /* Obtain posterior probabilities of being true */
6614 for (i = 0; i < compute_npaths; i++) {
6615 stage3array[i]->mapq_loglik /= total;
6616 }
6617
6618 /* Convert to Phred scores */
6619 for (i = 0; i < compute_npaths; i++) {
6620 if ((q = 1.0 - stage3array[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
6621 stage3array[i]->mapq_score = 96;
6622 } else {
6623 stage3array[i]->mapq_score = rint(-10.0 * log10(q));
6624 }
6625 }
6626
6627 if (displayp == true) {
6628 /* Prepare for display */
6629 for (i = 0; i < compute_npaths; i++) {
6630 Stage3end_display_prep(stage3array[i],queryuc_ptr,/*first_read_p*/true);
6631 }
6632 }
6633
6634 #if 0
6635 /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
6636 if (stage3array[0]->mapq_score >= mapq_unique_score &&
6637 stage3array[1]->mapq_score < mapq_unique_score) {
6638 for (i = 1; i < *npaths; i++) {
6639 Stage3end_free(&(stage3array[i]));
6640 }
6641 *npaths = 1;
6642 }
6643 #endif
6644 }
6645
6646 return stage3array;
6647 }
6648
6649
6650 /* Note: single-end terminals can be present with non-terminals when
6651 paired-end reads are searched for concordance, which can accumulate
6652 terminal alignments */
6653
6654 /* Pre-final: max (max-terminal, min-other)
6655 Final: max (min-terminal, max-GMAP, min-other) */
6656
6657
6658 static List_T
Stage3end_optimal_score_prefinal(bool * eliminatedp,List_T hitlist,Hitlistpool_T hitlistpool,int querylength)6659 Stage3end_optimal_score_prefinal (bool *eliminatedp, List_T hitlist,
6660 Hitlistpool_T hitlistpool, int querylength) {
6661 List_T optimal = NULL, p, q;
6662 T hit;
6663 Substring_T substring;
6664 Junction_T junction;
6665 int n;
6666 int cutoff_level, ref_nmismatches;
6667 int minscore = querylength;
6668 int trim_querystart = 0, trim_queryend = 0, trim_querystart_0, trim_queryend_0;
6669
6670
6671 #ifdef DISTANT_SPLICE_SPECIAL
6672 bool shortdistance_p = false;
6673 #endif
6674
6675
6676 *eliminatedp = false;
6677 n = List_length(hitlist);
6678 debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));
6679
6680 if (n <= 1) {
6681 return hitlist;
6682 }
6683
6684 /* Use eventrim for comparing alignments. Previously picked
6685 smallest trims, but now picking largest ones */
6686 for (p = hitlist; p != NULL; p = p->rest) {
6687 hit = (T) p->first;
6688
6689 debug4(printf("hit %u..%u method %s, nsegments %d, nindels %d, trim_querystart: %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d. sensedir %d\n",
6690 hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,Method_string(hit->method),
6691 hit->nsegments,hit->nindels,hit->trim_querystart,hit->trim_querystart_splicep ? " (splice)" : "",
6692 hit->trim_queryend,hit->trim_queryend_splicep ? " (splice)" : "",
6693 start_amb_length(hit),end_amb_length(hit),hit->sensedir));
6694
6695 if (hit->trim_querystart_splicep == true) {
6696 /* Skip */
6697 } else if (hit->trim_querystart > trim_querystart) {
6698 trim_querystart = hit->trim_querystart;
6699 }
6700 if (hit->trim_queryend_splicep == true) {
6701 /* Skip */
6702 } else if (hit->trim_queryend > trim_queryend) {
6703 trim_queryend = hit->trim_queryend;
6704 }
6705 }
6706
6707 if (trim_querystart == querylength) {
6708 trim_querystart = 0;
6709 }
6710 if (trim_queryend == querylength) {
6711 trim_queryend = 0;
6712 }
6713 debug4(printf("trim_querystart: %d, trim_queryend %d\n",trim_querystart,trim_queryend));
6714
6715 for (p = hitlist; p != NULL; p = p->rest) {
6716 hit = (T) p->first;
6717
6718 #ifdef CONSIDER_ENDS_IN_EVAL
6719 hit->score_eventrim = hit->trim_querystart / 8 + hit->trim_queryend / 8;
6720 #else
6721 hit->score_eventrim = 0;
6722 #endif
6723
6724 debug4(printf("score OTHER:"));
6725
6726 if (trim_querystart + trim_queryend >= querylength) {
6727 for (q = hit->substrings_1toN; q != NULL; q = List_next(q)) {
6728 substring = (Substring_T) List_head(q);
6729 hit->score_eventrim += Substring_nmismatches_bothdiff(substring);
6730 }
6731
6732 } else {
6733 for (q = hit->substrings_1toN; q != NULL; q = List_next(q)) {
6734 substring = (Substring_T) List_head(q);
6735 trim_querystart_0 = trim_querystart;
6736 trim_queryend_0 = trim_queryend;
6737 if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
6738 trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
6739 }
6740 if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
6741 trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
6742 }
6743 hit->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
6744 debug4(printf(" substring (%d..%d) %d.",trim_querystart,trim_queryend,
6745 Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
6746 }
6747 }
6748
6749 for (q = hit->junctions_1toN; q != NULL; q = List_next(q)) {
6750 junction = (Junction_T) List_head(q);
6751 if (Junction_nindels(junction) > 0) {
6752 hit->score_eventrim += indel_penalty_middle;
6753 debug4(printf(" => add %d.",indel_penalty_middle));
6754 }
6755 }
6756
6757
6758 #if 0
6759 /* Accept a single indel */
6760 #ifdef SCORE_INDELS_EVENTRIM
6761 if (hit->hittype == INSERTION || hit->hittype == DELETION) {
6762 debugee(printf(" indel at %d",hit->indel_pos));
6763 if (hit->indel_pos > trim_querystart && hit->indel_pos < querylength - trim_queryend) {
6764 hit->score_eventrim += indel_penalty_middle;
6765 debug4(printf(" => add %d.",indel_penalty_middle));
6766 }
6767 }
6768 #endif
6769 #endif
6770 debug4(printf(" RESULT: %d\n",hit->score_eventrim));
6771
6772 if (hit->score_eventrim < minscore) {
6773 minscore = hit->score_eventrim;
6774 }
6775 }
6776 debug4(printf("MINSCORE: %d\n",minscore));
6777
6778
6779 /* Prefinal: Use score_eventrim */
6780 debug4(printf("Stage3end_optimal_score over %d hits: minscore = %d + subopt:%d\n",
6781 n,minscore,subopt_levels));
6782 minscore += subopt_levels;
6783 cutoff_level = minscore;
6784
6785 for (p = hitlist; p != NULL; p = p->rest) {
6786 hit = (T) p->first;
6787
6788 if (hit->score_eventrim > cutoff_level + SCORE_EVENTRIM_SLOP) {
6789 debug4(printf("Prefinal: Eliminating hit %p at %u..%u with score_eventrim %d > cutoff_level %d\n",
6790 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
6791 hit->score_eventrim,cutoff_level));
6792 Stage3end_free(&hit);
6793 *eliminatedp = true;
6794
6795 } else {
6796 debug4(printf("Prefinal: Keeping hit %p at %u..%u with score_eventrim %d <= cutoff_level %d\n",
6797 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
6798 hit->score_eventrim,cutoff_level));
6799 optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
6800 }
6801 }
6802 Hitlist_free(&hitlist);
6803
6804
6805 #if 0
6806 /* Filter on nsegments */
6807 if (finalp == true && optimal != NULL) {
6808 hitlist = optimal;
6809 optimal = (List_T) NULL;
6810
6811 hit = (T) hitlist->first;
6812 best_nsegments = hit->nsegments;
6813
6814 for (p = hitlist; p != NULL; p = p->rest) {
6815 hit = (T) p->first;
6816 if (hit->nsegments < best_nsegments) {
6817 best_nsegments = hit->nsegments;
6818 }
6819 }
6820
6821 for (p = hitlist; p != NULL; p = p->rest) {
6822 hit = (T) p->first;
6823 if (hit->nsegments > best_nsegments + 2) {
6824 debug4(printf("Eliminating a hit with nsegments %d\n",hit->nsegments));
6825 Stage3end_free(&hit);
6826 *eliminatedp = true;
6827 } else {
6828 debug4(printf("Keeping a hit with nsegments %d, nindels %d\n",hit->nsegments,hit->nindels));
6829 optimal = Hitlist_push(optimal,hitlitpool,(void *) hit);
6830 }
6831 }
6832
6833 Hitlist_free(&hitlist);
6834 }
6835 #endif
6836
6837 debug4(printf("hitlist now has %d entries\n",List_length(optimal)));
6838 return optimal;
6839 }
6840
6841
6842 static int
hit_position_cmp(const void * a,const void * b)6843 hit_position_cmp (const void *a, const void *b) {
6844 T x = * (T *) a;
6845 T y = * (T *) b;
6846
6847 if (x->plusp < y->plusp) {
6848 return -1;
6849 } else if (y->plusp < x->plusp) {
6850 return +1;
6851 } else if (x->low < y->low) {
6852 return -1;
6853 } else if (y->low < x->low) {
6854 return +1;
6855 } else if (x->high > y->high) {
6856 return -1;
6857 } else if (y->high > x->high) {
6858 return +1;
6859 } else {
6860 return 0;
6861 }
6862 }
6863
6864 static bool
hit_equal(Stage3end_T x,Stage3end_T y)6865 hit_equal (Stage3end_T x, Stage3end_T y) {
6866 List_T p, q;
6867 Substring_T substring_x, substring_y;
6868
6869 if (x->plusp != y->plusp) {
6870 return false; /* Different strands */
6871 } else {
6872 p = x->substrings_1toN;
6873 q = y->substrings_1toN;
6874 while (p != NULL && q != NULL) {
6875 substring_x = (Substring_T) p->first;
6876 substring_y = (Substring_T) q->first;
6877 if (Substring_equal(substring_x,substring_y) == false) {
6878 return false;
6879 }
6880 p = List_next(p);
6881 q = List_next(q);
6882 }
6883 if (p != NULL || q != NULL) {
6884 return false;
6885 }
6886
6887 return true;
6888 }
6889 }
6890
6891
6892 static bool
hit_overlap_p(T x,T y)6893 hit_overlap_p (T x, T y) {
6894 if (x->chrnum != y->chrnum) {
6895 return false; /* Different chrnums */
6896 } else if (x->plusp != y->plusp) {
6897 return false; /* Different strands */
6898 } else if (x->high < y->low) {
6899 return false;
6900 } else if (x->low > y->high) {
6901 return false;
6902 } else {
6903 return true;
6904 }
6905 }
6906
#if 0
/* Disabled (compiled out) former version of the final-stage hit filter.

   Takes a list of hits and prunes it in successive stages, freeing each
   eliminated hit with Stage3end_free and setting *eliminatedp to true
   whenever a hit is removed:
     1. global cutoff on refalt_nmatches_plus_spliced_trims
     2. global cutoff on ref_nmatches_plus_spliced_trims
     3. global cutoff on refalt_nmatches_to_trims
     4. within groups of overlapping hits: refalt_nmatches_to_trims
     5. within groups of overlapping hits: nsegments, then splice_score
   Each global cutoff is (maximum over the list) - subopt_levels.
   A list of 0 or 1 hits is returned unchanged.  Returns the surviving list. */
static List_T
Stage3end_optimal_score_final_old (bool *eliminatedp, List_T hitlist, Hitlistpool_T hitlistpool,
                                   int querylength) {
  List_T optimal = NULL, p;
  T *hits, hit;
  int n, i, j, k;
  int best_nsegments;
  int best_nmatches_to_trims;
  double max_splice_score;
  int max_nmatches = 0, cutoff_level;
  /* int trim_querystart, trim_queryend, min_trim; */
  bool *eliminate, keptp;


  *eliminatedp = false;
  n = List_length(hitlist);
  debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));

  if (n <= 1) {
    return hitlist;
  }

#ifdef DEBUG4
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    printf("%p %u..%u method %s, score_eventrim %d, nmatches %d (%d to_trims)\n",
           hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
           Method_string(hit->method),hit->score_eventrim,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims);
  }
#endif

  /* Prune based on refalt_nmatches_plus_spliced_trims (to get the splice ends) */
  max_nmatches = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->refalt_nmatches_plus_spliced_trims > max_nmatches) {
      max_nmatches = hit->refalt_nmatches_plus_spliced_trims;
      assert(max_nmatches <= querylength);
    }
  }

  cutoff_level = max_nmatches - subopt_levels;
  debug4(printf("(1) refalt cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));

  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->refalt_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
      debug4(printf("Final (nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
                    hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
                    hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  /* Release the old list cells and continue with the survivors */
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Prune based on ref_nmatches_plus_spliced_trims (to get the splice ends) */
  max_nmatches = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->ref_nmatches_plus_spliced_trims > max_nmatches) {
      max_nmatches = hit->ref_nmatches_plus_spliced_trims;
      assert(max_nmatches <= querylength);
    }
  }

  /* May not want to be greedy on cutoff level here.  Might want to raise subopt_levels */
  cutoff_level = max_nmatches - subopt_levels;
  debug4(printf("(2) ref cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));

  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->ref_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
      debug4(printf("Final (nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
                    hit->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
                    hit->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Prune based on nmatches_to_trims */
  best_nmatches_to_trims = 0;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (Stage3end_T) p->first;
    if (hit->refalt_nmatches_to_trims > best_nmatches_to_trims) {
      best_nmatches_to_trims = hit->refalt_nmatches_to_trims;
      assert(best_nmatches_to_trims <= querylength);
    }
  }

  cutoff_level = best_nmatches_to_trims - subopt_levels;
  debug4(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));

  /* Do not allow slop for final */
  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (Stage3end_T) p->first;

    if (hit->refalt_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
      debug4(printf("Final (nmatches_to_trims %d < %d): Eliminating hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) < cutoff_level %d\n",
                    hit->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final (nmatches_to_trims %d >= %d): Keeping hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) >= cutoff_level %d\n",
                    hit->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Eliminate within loci (1): refalt_nmatches_to_trims only */
  keptp = false;
  hits = (T *) List_to_array_n(&n,hitlist);
  eliminate = (bool *) CALLOC(n,sizeof(bool));
  qsort(hits,n,sizeof(T),hit_position_cmp);
  /* Scan the position-sorted array; hits[i..j-1] is a run of hits that
     each overlap hits[i] */
  i = 0;
  while (i < n) {
    j = i+1;
    while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
      j++;
    }
    if (j - i > 1) {
      debug4(printf("Found a group from %d to %d\n",i,j));
      best_nmatches_to_trims = 0;
      for (k = i; k < j; k++) {
        hit = hits[k];
        if (hit->refalt_nmatches_to_trims > best_nmatches_to_trims) {
          best_nmatches_to_trims = hit->refalt_nmatches_to_trims;
        }
      }
      debug4(printf("best_nmatches_to_trims %d\n",best_nmatches_to_trims));

      for (k = i; k < j; k++) {
        hit = hits[k];
        /* Do not allow slop for final */
        if (hit->refalt_nmatches_to_trims < best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/) {
          debug4(printf("Within loci end (nmatches_to_trims): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
                        hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
                        hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
          eliminate[k] = true;
        } else {
          keptp = true;
        }
      }
    }

    i = j;
  }

  /* keptp is set only inside multi-hit groups; when it stays false the
     list is passed through untouched */
  if (keptp == false) {
    optimal = hitlist;
  } else {
    for (k = 0; k < n; k++) {
      hit = hits[k];
      if (eliminate[k] == true) {
        debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
                      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
                      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
        Stage3end_free(&hit);
        *eliminatedp = true;
      } else {
        optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
      }
    }
    Hitlist_free(&hitlist);
  }
  FREE(hits);
  FREE(eliminate);
  hitlist = optimal;
  optimal = (List_T) NULL;


  /* Eliminate within loci (2): nsegments and splice score */
  keptp = false;
  hits = (T *) List_to_array_n(&n,hitlist);
  eliminate = (bool *) CALLOC(n,sizeof(bool));
  qsort(hits,n,sizeof(T),hit_position_cmp);
  i = 0;
  while (i < n) {
    j = i+1;
    while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
      j++;
    }
    if (j - i > 1) {
      debug4(printf("Found a group from %d to %d\n",i,j));
      /* Find the fewest segments in the group, and the best splice score
         among the hits having that many segments */
      best_nsegments = querylength;
      max_splice_score = 0.0;
      for (k = i; k < j; k++) {
        hit = hits[k];
        if (hit->nsegments < best_nsegments) {
          best_nsegments = hit->nsegments;
          max_splice_score = hit->splice_score;

        } else if (hit->nsegments == best_nsegments) {
          if (hit->splice_score > max_splice_score) {
            max_splice_score = hit->splice_score;
          }
        }
      }
      debug8(printf("best_nsegments %d, max_splice_score %f\n",
                    best_nsegments,max_splice_score));

      for (k = i; k < j; k++) {
        hit = hits[k];
        if (hit->nsegments > best_nsegments) {
          debug4(printf("Within loci end (nsegments %d > %d): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
                        hit->nsegments,best_nsegments,
                        hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
                        hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
          eliminate[k] = true;

        } else if (hit->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
          debug4(printf("Within loci end (splice score w/slop %f < %f): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
                        hit->splice_score,max_splice_score,
                        hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
                        hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
          eliminate[k] = true;

        } else {
          keptp = true;
        }
      }
    }

    i = j;
  }

  if (keptp == false) {
    optimal = hitlist;
  } else {
    for (k = 0; k < n; k++) {
      hit = hits[k];
      if (eliminate[k] == true) {
        debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), splice_score %f\n",
                      hit,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->nsegments,
                      hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->splice_score));
        Stage3end_free(&hit);
        *eliminatedp = true;
      } else {
        optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
      }
    }
    Hitlist_free(&hitlist);
  }
  FREE(hits);
  FREE(eliminate);
  hitlist = optimal;
  /* optimal = (List_T) NULL; */

#if 0
  /* Filter on trim amount */
  /* Separately disabled stage: keeps only hits whose total non-splice
     trim equals the minimum over the list */
  optimal = (List_T) NULL;
  min_trim = querylength;
  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    if (hit->trim_querystart_splicep == true) {
      /* Skip */
      trim_querystart = 0;
    } else {
      trim_querystart = hit->trim_querystart;
    }
    if (hit->trim_queryend_splicep == true) {
      /* Skip */
      trim_queryend = 0;
    } else {
      trim_queryend = hit->trim_queryend;
    }

    if (trim_querystart + trim_queryend < min_trim) {
      min_trim = trim_querystart + trim_queryend;
    }
  }

  for (p = hitlist; p != NULL; p = p->rest) {
    hit = (T) p->first;
    if (hit->trim_querystart_splicep == true) {
      /* Skip */
      trim_querystart = 0;
    } else {
      trim_querystart = hit->trim_querystart;
    }
    if (hit->trim_queryend_splicep == true) {
      /* Skip */
      trim_queryend = 0;
    } else {
      trim_queryend = hit->trim_queryend;
    }

    if (trim_querystart + trim_queryend > min_trim) {
      debug4(printf("Final: Eliminating hit %p at %u..%u with trim %d + %d > min_trim %d\n",
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    trim_querystart,trim_queryend,min_trim));
      Stage3end_free(&hit);
      *eliminatedp = true;

    } else {
      debug4(printf("Final: Keeping hit %p at %u..%u with trim %d + %d == min_trim %d\n",
                    hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
                    trim_querystart,trim_queryend,min_trim));
      optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);
#endif


  debug4(printf("Exiting Stage3end_optimal_score_final with %d hits\n",List_length(hitlist)));
  return hitlist;
}
#endif
7252
7253
7254 static List_T
Stage3end_optimal_score_final(bool * eliminatedp,List_T hitlist,Hitlistpool_T hitlistpool,int querylength)7255 Stage3end_optimal_score_final (bool *eliminatedp, List_T hitlist, Hitlistpool_T hitlistpool,
7256 int querylength) {
7257 List_T optimal = NULL, p;
7258 T hit;
7259 int n;
7260 int max_adj_nmatches, score;
7261 int best_nmatches_to_trims;
7262 int cutoff_level;
7263 /* int trim_querystart, trim_queryend, min_trim; */
7264
7265
7266 *eliminatedp = false;
7267 n = List_length(hitlist);
7268 debug4(printf("\nEntered Stage3end_optimal_score with %d hits\n",n));
7269
7270 if (n <= 1) {
7271 return hitlist;
7272 }
7273
7274 #ifdef DEBUG4
7275 for (p = hitlist; p != NULL; p = p->rest) {
7276 hit = (Stage3end_T) p->first;
7277 printf("%p %u..%u method %s, score_eventrim %d, nmatches %d (%d to_trims), refalt score %d\n",
7278 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7279 Method_string(hit->method),hit->score_eventrim,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
7280 hit->refalt_score_overall);
7281 }
7282 printf("\n");
7283 #endif
7284
7285 /* (1) Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
7286 max_adj_nmatches = 0;
7287 for (p = hitlist; p != NULL; p = p->rest) {
7288 hit = (Stage3end_T) p->first;
7289 if ((score = hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall) > max_adj_nmatches) {
7290 max_adj_nmatches = score;
7291 }
7292 }
7293
7294 cutoff_level = max_adj_nmatches - subopt_levels;
7295 debug4(printf("(1) refalt cutoff level %d = max_adj_nmatches %d - subopt_levels %d\n",
7296 cutoff_level,max_adj_nmatches,subopt_levels));
7297
7298 for (p = hitlist; p != NULL; p = List_next(p)) {
7299 hit = (Stage3end_T) p->first;
7300
7301 if (hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall < cutoff_level /*- NMATCHES_SLOP*/) {
7302 debug4(printf("Final (adj nmatches %d < %d): Eliminating hit %p at %u..%u with nmatches %d (%d to_trims) < cutoff_level %d\n",
7303 hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_within_trims,cutoff_level /*- NMATCHES_SLOP*/,
7304 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7305 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
7306 Stage3end_free(&hit);
7307 *eliminatedp = true;
7308
7309 } else {
7310 debug4(printf("Final (nmatches %d >= %d): Keeping hit %p at %u..%u with nmatches %d (%d to_trims) >= cutoff_level %d\n",
7311 hit->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
7312 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7313 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,cutoff_level));
7314 optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
7315 }
7316 }
7317 Hitlist_free(&hitlist);
7318 hitlist = optimal;
7319 optimal = (List_T) NULL;
7320
7321
7322 /* (2) Prune based on ref_nmatches_to_trims */
7323 best_nmatches_to_trims = 0;
7324 for (p = hitlist; p != NULL; p = p->rest) {
7325 hit = (Stage3end_T) p->first;
7326 if (hit->ref_nmatches_to_trims > best_nmatches_to_trims) {
7327 best_nmatches_to_trims = hit->ref_nmatches_to_trims;
7328 assert(best_nmatches_to_trims <= querylength);
7329 }
7330 }
7331
7332 cutoff_level = best_nmatches_to_trims - subopt_levels;
7333 debug4(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
7334
7335 /* Do not allow slop for final */
7336 for (p = hitlist; p != NULL; p = List_next(p)) {
7337 hit = (Stage3end_T) p->first;
7338
7339 if (hit->ref_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
7340 debug4(printf("Final (nmatches_to_trims %d < %d): Eliminating hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) < cutoff_level %d\n",
7341 hit->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
7342 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7343 hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
7344 Stage3end_free(&hit);
7345 *eliminatedp = true;
7346
7347 } else {
7348 debug4(printf("Final (nmatches_to_trims %d >= %d): Keeping hit %p at %u..%u with nmatches_to_trims %d (%d to_trims) >= cutoff_level %d\n",
7349 hit->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
7350 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
7351 hit->ref_nmatches_plus_spliced_trims,hit->ref_nmatches_to_trims,cutoff_level));
7352 optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
7353 }
7354 }
7355 Hitlist_free(&hitlist);
7356 hitlist = optimal;
7357 /* optimal = (List_T) NULL; */
7358
7359 /* Shouldn't need to eliminate within loci, since that was done during prefinal */
7360
7361 debug4(printf("Exiting Stage3end_optimal_score_final with %d hits\n",List_length(hitlist)));
7362 return hitlist;
7363 }
7364
7365
7366
7367 List_T
Stage3end_optimal_score(List_T hitlist,Hitlistpool_T hitlistpool,int querylength,bool finalp)7368 Stage3end_optimal_score (List_T hitlist, Hitlistpool_T hitlistpool, int querylength, bool finalp) {
7369 List_T optimal;
7370 bool eliminatedp;
7371
7372 if (finalp == false) {
7373 optimal = Stage3end_optimal_score_prefinal(&eliminatedp,hitlist,hitlistpool,querylength);
7374 while (eliminatedp == true) {
7375 optimal = Stage3end_optimal_score_prefinal(&eliminatedp,optimal,hitlistpool,querylength);
7376 }
7377
7378 } else {
7379 optimal = Stage3end_optimal_score_final(&eliminatedp,hitlist,hitlistpool,querylength);
7380 while (eliminatedp == true) {
7381 optimal = Stage3end_optimal_score_final(&eliminatedp,optimal,hitlistpool,querylength);
7382 }
7383 }
7384
7385 return optimal;
7386 }
7387
7388
7389 static void
unalias_circular(T hit)7390 unalias_circular (T hit) {
7391 Chrpos_T chrlength = hit->chrlength;
7392 List_T p;
7393 Substring_T substring;
7394
7395 assert(hit->circularalias == +1);
7396 debug12(printf("Calling unalias_circular on substrings\n"));
7397 for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
7398 substring = (Substring_T) List_head(p);
7399 Substring_unalias_circular(substring);
7400 }
7401
7402 /* Doesn't fix hitpair->low and hitpair->high */
7403 hit->genomicstart -= chrlength;
7404 hit->genomicend -= chrlength;
7405 hit->low -= chrlength;
7406 hit->high -= chrlength;
7407
7408 hit->circularalias = -1;
7409
7410 return;
7411 }
7412
7413
7414 #if 0
7415 List_T
7416 Stage3end_unalias_circular (List_T hitlist) {
7417 List_T p;
7418 T hit;
7419
7420 for (p = hitlist; p != NULL; p = p->rest) {
7421 hit = (T) p->first;
7422 if (hit->circularalias == +1) {
7423 unalias_circular(hit);
7424 }
7425 }
7426
7427 return hitlist;
7428 }
7429 #endif
7430
7431 List_T
Stage3end_remove_circular_alias(List_T hitlist,Hitlistpool_T hitlistpool)7432 Stage3end_remove_circular_alias (List_T hitlist, Hitlistpool_T hitlistpool) {
7433 List_T newlist = NULL, p;
7434 T hit;
7435
7436 debug12(printf("Calling Stage3end_remove_circular_alias on %d hits\n",List_length(hitlist)));
7437 for (p = hitlist; p != NULL; p = p->rest) {
7438 hit = (T) p->first;
7439
7440 if (hit->circularalias == +1) {
7441 /* First, try to salvage alias +1 */
7442 unalias_circular(hit);
7443 }
7444
7445 if (hit->chrnum == 0) {
7446 /* Translocation */
7447 newlist = Hitlist_push(newlist,hitlistpool,(void *) hit);
7448
7449 } else if (hit->low - hit->chroffset >= hit->chrlength) {
7450 /* All in circular alias */
7451 debug12(printf("Freeing hit because all is in circular alias\n"));
7452 Stage3end_free(&hit);
7453
7454 } else {
7455 newlist = Hitlist_push(newlist,hitlistpool,(void *) hit);
7456 }
7457 }
7458
7459 Hitlist_free(&hitlist);
7460 return newlist;
7461 }
7462
7463
7464 #if 0
7465 int
7466 Stage3end_noptimal (List_T hitlist, int querylength) {
7467 int noptimal;
7468 List_T p;
7469 T hit;
7470 int minscore = querylength;
7471
7472 noptimal = 0;
7473 for (p = hitlist; p != NULL; p = p->rest) {
7474 hit = (T) p->first;
7475 if (hit->score < minscore) {
7476 minscore = hit->score;
7477 noptimal = 0;
7478 }
7479 if (hit->score == minscore) {
7480 noptimal++;
7481 }
7482 }
7483
7484 return noptimal;
7485 }
7486 #endif
7487
7488
7489 static Univcoord_T
normalize_coord(Univcoord_T orig,int circularalias,Chrpos_T chrlength)7490 normalize_coord (Univcoord_T orig, int circularalias, Chrpos_T chrlength) {
7491 if (circularalias == +1) {
7492 return orig - chrlength;
7493 } else {
7494 return orig;
7495 }
7496 }
7497
7498
7499
7500 static int
duplicate_sort_cmp(const void * a,const void * b)7501 duplicate_sort_cmp (const void *a, const void *b) {
7502 int cmp;
7503 T x = * (T *) a;
7504 T y = * (T *) b;
7505 Univcoord_T x_genomicstart, y_genomicstart;
7506 Univcoord_T x_genomicend, y_genomicend;
7507 List_T p, q;
7508 Substring_T x_substring, y_substring;
7509
7510 if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
7511 if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
7512 alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
7513 /* The primary regions overlap */
7514 return 0;
7515 } else if (alias_starts[x->chrnum] < alias_starts[y->chrnum]) {
7516 return -1;
7517 } else if (alias_starts[y->chrnum] < alias_starts[x->chrnum]) {
7518 return +1;
7519 } else if (alias_ends[x->chrnum] < alias_ends[y->chrnum]) {
7520 return -1;
7521 } else if (alias_ends[y->chrnum] < alias_ends[x->chrnum]) {
7522 return +1;
7523 } else {
7524 return 0;
7525 }
7526
7527 } else if (altlocp[x->chrnum] == true) {
7528 if (y->genomicend >= alias_starts[x->chrnum] &&
7529 y->genomicstart <= alias_ends[x->chrnum]) {
7530 /* y overlaps with the primary region for x */
7531 return +1; /* Put primary region first */
7532 }
7533 /* Don't overlap, so fall through to rest of procedure */
7534
7535 } else if (altlocp[y->chrnum] == true) {
7536 if (alias_ends[y->chrnum] >= x->genomicstart &&
7537 alias_starts[y->chrnum] <= x->genomicend) {
7538 /* x overlaps with the primary region for y */
7539 return -1; /* Put primary region first */
7540 }
7541 /* Don't overlap, so fall through to rest of procedure */
7542 }
7543
7544
7545 x_genomicstart = normalize_coord(x->genomicstart,x->circularalias,x->chrlength);
7546 x_genomicend = normalize_coord(x->genomicend,x->circularalias,x->chrlength);
7547
7548 y_genomicstart = normalize_coord(y->genomicstart,y->circularalias,y->chrlength);
7549 y_genomicend = normalize_coord(y->genomicend,y->circularalias,y->chrlength);
7550
7551
7552 if (x_genomicstart < y_genomicstart) {
7553 return -1;
7554 } else if (x_genomicstart > y_genomicstart) {
7555 return +1;
7556 } else if (x->hittype < y->hittype) {
7557 return -1;
7558 } else if (x->hittype > y->hittype) {
7559 return +1;
7560 } else if (x_genomicend < y_genomicend) {
7561 return -1;
7562 } else if (x_genomicend > y_genomicend) {
7563 return +1;
7564
7565 /* sensedir is relevant for transcriptome-guided alignment, with overlapping genes */
7566 } else if (x->sensedir > y->sensedir) {
7567 return -1;
7568 } else if (y->sensedir > x->sensedir) {
7569 return +1;
7570
7571 } else {
7572 for (p = x->substrings_1toN, q = y->substrings_1toN; p != NULL && q != NULL; p = List_next(p), q = List_next(q)) {
7573 x_substring = (Substring_T) List_head(p);
7574 y_substring = (Substring_T) List_head(q);
7575 if ((cmp = Substring_compare(x_substring,y_substring,x->circularalias,y->circularalias,x->chrlength,y->chrlength)) != 0) {
7576 return cmp;
7577 }
7578 }
7579 if (p == NULL && q != NULL) {
7580 return -1;
7581 } else if (p != NULL && q == NULL) {
7582 return +1;
7583 }
7584
7585 #if 0
7586 /* Need to change to search on junctions */
7587 if (x->indel_low < y->indel_low) {
7588 return -1;
7589 } else if (y->indel_low < x->indel_low) {
7590 return +1;
7591 }
7592 #endif
7593
7594 return 0;
7595 }
7596 }
7597
7598 /* Same as duplicate_sort_cmp, except for indel_low */
7599 static int
duplicate_equiv_cmp(const void * a,const void * b)7600 duplicate_equiv_cmp (const void *a, const void *b) {
7601 int cmp;
7602 T x = * (T *) a;
7603 T y = * (T *) b;
7604 List_T p, q;
7605 Substring_T x_substring, y_substring;
7606
7607 Univcoord_T x_genomicstart, x_genomicend, y_genomicstart, y_genomicend;
7608
7609 if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
7610 if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
7611 alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
7612 /* The primary regions overlap */
7613 return 0;
7614 }
7615
7616 } else if (altlocp[x->chrnum] == true) {
7617 if (y->genomicend >= alias_starts[x->chrnum] &&
7618 y->genomicstart <= alias_ends[x->chrnum]) {
7619 /* y overlaps with the primary region for x */
7620 return 0;
7621 }
7622
7623 } else if (altlocp[y->chrnum] == true) {
7624 if (alias_ends[y->chrnum] >= x->genomicstart &&
7625 alias_starts[y->chrnum] <= x->genomicend) {
7626 /* x overlaps with the primary region for y */
7627 return 0;
7628 }
7629 }
7630
7631 x_genomicstart = normalize_coord(x->genomicstart,x->circularalias,x->chrlength);
7632 x_genomicend = normalize_coord(x->genomicend,x->circularalias,x->chrlength);
7633
7634 y_genomicstart = normalize_coord(y->genomicstart,y->circularalias,y->chrlength);
7635 y_genomicend = normalize_coord(y->genomicend,y->circularalias,y->chrlength);
7636
7637 if (x_genomicstart < y_genomicstart) {
7638 return -1;
7639 } else if (x_genomicstart > y_genomicstart) {
7640 return +1;
7641 #if 0
7642 } else if (x->hittype < y->hittype) {
7643 return -1;
7644 } else if (x->hittype > y->hittype) {
7645 return +1;
7646 #endif
7647 } else if (x_genomicend < y_genomicend) {
7648 return -1;
7649 } else if (x_genomicend > y_genomicend) {
7650 return +1;
7651
7652 /* sensedir is relevant for transcriptome-guided alignment, with overlapping genes */
7653 } else if (x->sensedir > y->sensedir) {
7654 return -1;
7655 } else if (y->sensedir > x->sensedir) {
7656 return +1;
7657
7658 } else {
7659 for (p = x->substrings_1toN, q = y->substrings_1toN; p != NULL && q != NULL; p = List_next(p), q = List_next(q)) {
7660 x_substring = (Substring_T) List_head(p);
7661 y_substring = (Substring_T) List_head(q);
7662 if ((cmp = Substring_compare(x_substring,y_substring,x->circularalias,y->circularalias,x->chrlength,y->chrlength)) != 0) {
7663 return cmp;
7664 }
7665 }
7666 if (p == NULL && q != NULL) {
7667 return -1;
7668 } else if (p != NULL && q == NULL) {
7669 return +1;
7670 } else {
7671 return 0;
7672 }
7673 }
7674 }
7675
7676
7677 #if defined(DEBUG0) || defined(DEBUG4)
7678 static void
Stage3end_print_substrings(Stage3end_T hit)7679 Stage3end_print_substrings (Stage3end_T hit) {
7680 List_T p;
7681 Substring_T substring;
7682
7683 for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
7684 if ((substring = (Substring_T) List_head(p)) == NULL) {
7685 printf("NA ");
7686 } else {
7687 printf("#%d:%llu..%llu ",
7688 Substring_chrnum(substring),
7689 (unsigned long long) Substring_alignstart_trim(substring),
7690 (unsigned long long) Substring_alignend_trim(substring));
7691 }
7692 }
7693 return;
7694 }
7695 #endif
7696
7697
7698 const Except_T Duplicate_Pairing = { "Duplicates both seen in pairing" };
7699
7700 List_T
Stage3end_remove_duplicates(List_T hitlist,Hitlistpool_T hitlistpool)7701 Stage3end_remove_duplicates (List_T hitlist, Hitlistpool_T hitlistpool) {
7702 #ifdef DEBUG4
7703 List_T p;
7704 #endif
7705 T x, y, *hits;
7706 int n, usedi, i, j, k;
7707 bool *eliminate, eliminatep;
7708
7709 debug4(printf("Entered Stage3end_remove_duplicates with %d hits\n",List_length(hitlist)));
7710 if ((n = List_length(hitlist)) == 0) {
7711 return (List_T) NULL;
7712 } else {
7713 #ifdef USE_ALLOCA_FOR_HITS
7714 eliminate = (bool *) CALLOCA(n,sizeof(bool));
7715 hits = (T *) MALLOCA(n * sizeof(T));
7716 List_fill_array((void **) hits,hitlist); /* hitlist is a return value */
7717 #else
7718 eliminate = (bool *) CALLOC(n,sizeof(bool));
7719 hits = (T *) List_to_array(hitlist,NULL);
7720 #endif
7721 }
7722
7723
7724 /* By equivalence */
7725 debug4(printf("Stage3end_remove_duplicates: checking %d hits by equivalence class\n",n));
7726 qsort(hits,n,sizeof(T),duplicate_sort_cmp);
7727
7728 debug4(
7729 for (i = 0; i < n; i++) {
7730 x = hits[i];
7731 printf(" Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d, sense %d ",
7732 i,Method_string(x->method),x,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7733 x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,x->sensedir);
7734 Stage3end_print_substrings(x);
7735 if (x->transcripts != NULL) {
7736 Transcript_print_list(x->transcripts);
7737 }
7738 printf("\n");
7739 }
7740 );
7741
7742 eliminatep = false;
7743 i = 0;
7744 while (i < n) {
7745 j = i+1;
7746 while (j < n && duplicate_equiv_cmp(&(hits[j]),&(hits[i])) == 0) {
7747 j++;
7748 }
7749
7750 if (j > i+1) {
7751 debug4(printf("Equivalence class #%d through #%d. ",i,j-1));
7752
7753 x = hits[i];
7754 if (x->paired_usedp == true) {
7755 usedi = i;
7756 } else {
7757 usedi = -1;
7758 }
7759
7760 for (k = i+1; k < j; k++) {
7761 y = hits[k];
7762 if (y->paired_usedp == true) {
7763 if (usedi >= 0) {
7764 debug4(printf(" #%d equivalent to #%d and both used (%p and %p)\n",k,usedi,hits[k],hits[usedi]));
7765 #if 0
7766 /* This doesn't matter anymore. Example from NM_001033853:
7767 TTGCCCTTGGTCACCCCGATGACGTCGATCATCTCATCCTGCCCAAACACTTGGTTCACAGGTACCTGCTGCTCA
7768 AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
7769 fprintf(stderr,"Duplicates of Stage3end_T both seen\n");
7770 #if 0
7771 /* No longer providing queryseq1 and queryseq2 */
7772 Shortread_print_query_pairedend_fasta(stderr,queryseq1,queryseq2,
7773 /*invert_first_p*/false,/*invert_second_p*/true);
7774 #endif
7775 Except_raise(&Duplicate_Pairing, __FILE__, __LINE__);
7776 #endif
7777 } else {
7778 usedi = k;
7779 }
7780 }
7781 }
7782
7783 if (usedi < 0) {
7784 debug4(printf("None used yet so eliminating #%d through #%d\n",i+1,j-1));
7785 for (k = i+1; k < j; k++) {
7786 y = hits[k];
7787 if (y->transcripts != NULL) {
7788 x->transcripts = List_append(y->transcripts,x->transcripts);
7789 y->transcripts = (List_T) NULL;
7790 }
7791 eliminate[k] = true;
7792 eliminatep = true;
7793 }
7794 } else {
7795 debug4(printf("One used already so eliminating all but #%d\n",usedi));
7796 for (k = i; k < j; k++) {
7797 if (k != usedi) {
7798 y = hits[k];
7799 if (y->transcripts != NULL) {
7800 x->transcripts = List_append(y->transcripts,x->transcripts);
7801 y->transcripts = (List_T) NULL;
7802 }
7803 eliminate[k] = true;
7804 eliminatep = true;
7805 }
7806 }
7807 }
7808 }
7809
7810 i = j;
7811 }
7812
7813
7814 #if 0
7815 nkept = 0;
7816 for (i = 0; i < n; i++) {
7817 if (eliminate[i] == false) {
7818 nkept++;
7819 }
7820 }
7821 if (nkept == 0) {
7822 /* All entries eliminated one another, so keep the first one */
7823 eliminate[0] = false;
7824 }
7825 #endif
7826
7827 if (eliminatep == false) {
7828 debug4(printf("No eliminations, so hitlist is unchanged\n"));
7829 } else {
7830 Hitlist_free(&hitlist);
7831 for (i = n-1; i >= 0; i--) {
7832 x = hits[i];
7833 if (eliminate[i] == false) {
7834 #ifdef DEBUG4
7835 printf(" Keeping #%d at chr #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
7836 i,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7837 x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
7838 Stage3end_print_substrings(x);
7839 if (x->transcripts != NULL) {
7840 Transcript_print_nums(x->transcripts);
7841 }
7842 printf("\n");
7843 #endif
7844 hitlist = Hitlist_push(hitlist,hitlistpool,(void *) x);
7845
7846 } else {
7847 #ifdef DEBUG4
7848 printf(" Eliminating #%d at chr #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
7849 i,x->chrnum,x->low - x->chroffset,x->high - x->chroffset,
7850 x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
7851 Stage3end_print_substrings(x);
7852 if (x->transcripts != NULL) {
7853 Transcript_print_nums(x->transcripts);
7854 }
7855 printf("\n");
7856 #endif
7857 Stage3end_free(&x);
7858 }
7859 }
7860 }
7861
7862 #ifdef USE_ALLOCA_FOR_HITS
7863 FREEA(hits);
7864 FREEA(eliminate);
7865 #else
7866 FREE(hits);
7867 FREE(eliminate);
7868 #endif
7869
7870 #ifdef DEBUG4
7871 for (p = hitlist, i = 0; p != NULL; p = p->rest, i++) {
7872 x = (T) p->first;
7873 printf(" Final %d: #%d:%u..%u (plusp = %d, sensedir = %d) ",
7874 i,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp,x->sensedir);
7875 Stage3end_print_substrings(x);
7876 if (x->transcripts != NULL) {
7877 Transcript_print_nums(x->transcripts);
7878 }
7879 printf("\n");
7880 }
7881 #endif
7882
7883 debug4(printf("Exited Stage3end_remove_duplicates with %d hits\n",List_length(hitlist)));
7884 return hitlist;
7885 }
7886
7887
7888
7889 T *
Stage3end_remove_duplicates_array(int * nunique,List_T * duplicates,T * hits,int nhits,Hitlistpool_T hitlistpool)7890 Stage3end_remove_duplicates_array (int *nunique, List_T *duplicates, T *hits, int nhits,
7891 Hitlistpool_T hitlistpool) {
7892 T *unique, *out, x, y;
7893 int usedi, i, j, k;
7894 bool *eliminate, eliminatep;
7895
7896 debug4(printf("Entered Stage3end_remove_duplicates_array with %d hits\n",nhits));
7897 if (nhits == 0) {
7898 *nunique = 0;
7899 return (T *) NULL;
7900
7901 } else {
7902 eliminate = (bool *) CALLOC(nhits,sizeof(bool));
7903 }
7904
7905
7906 /* By equivalence */
7907 debug4(printf("Stage3end_remove_duplicates_array: checking %d hits by equivalence class\n",nhits));
7908 qsort(hits,nhits,sizeof(T),duplicate_sort_cmp);
7909
7910 debug4(
7911 for (i = 0; i < nhits; i++) {
7912 x = hits[i];
7913 printf(" Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d, sense %d ",
7914 i,Method_string(x->method),x,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
7915 x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,x->sensedir);
7916 Stage3end_print_substrings(x);
7917 if (x->transcripts != NULL) {
7918 Transcript_print_list(x->transcripts);
7919 }
7920 printf("\n");
7921 }
7922 );
7923
7924 eliminatep = false;
7925 i = 0;
7926 while (i < nhits) {
7927 j = i+1;
7928 while (j < nhits && duplicate_equiv_cmp(&(hits[j]),&(hits[i])) == 0) {
7929 j++;
7930 }
7931
7932 if (j > i+1) {
7933 debug4(printf("Equivalence class #%d through #%d. ",i,j-1));
7934
7935 x = hits[i];
7936 if (x->paired_usedp == true) {
7937 usedi = i;
7938 } else {
7939 usedi = -1;
7940 }
7941
7942 for (k = i+1; k < j; k++) {
7943 y = hits[k];
7944 if (y->paired_usedp == true) {
7945 if (usedi >= 0) {
7946 debug4(printf(" #%d equivalent to #%d and both used (%p and %p)\n",k,usedi,hits[k],hits[usedi]));
7947 #if 0
7948 /* This doesn't matter anymore. Example from NM_001033853:
7949 TTGCCCTTGGTCACCCCGATGACGTCGATCATCTCATCCTGCCCAAACACTTGGTTCACAGGTACCTGCTGCTCA
7950 AGTGATGAATCCAAGAGGCGTTTCTATAAGAATTGGCATAAATCTAAGAAGAAGGCCCACCTGATGGAGATCCAG */
7951 fprintf(stderr,"Duplicates of Stage3end_T both seen\n");
7952 #if 0
7953 /* No longer providing queryseq1 and queryseq2 */
7954 Shortread_print_query_pairedend_fasta(stderr,queryseq1,queryseq2,
7955 /*invert_first_p*/false,/*invert_second_p*/true);
7956 #endif
7957 Except_raise(&Duplicate_Pairing, __FILE__, __LINE__);
7958 #endif
7959 } else {
7960 usedi = k;
7961 }
7962 }
7963 }
7964
7965 if (usedi < 0) {
7966 debug4(printf("None used yet so eliminating #%d through #%d\n",i+1,j-1));
7967 for (k = i+1; k < j; k++) {
7968 y = hits[k];
7969 if (y->transcripts != NULL) {
7970 x->transcripts = List_append(y->transcripts,x->transcripts);
7971 y->transcripts = (List_T) NULL;
7972 }
7973 eliminate[k] = true;
7974 eliminatep = true;
7975 }
7976 } else {
7977 debug4(printf("One used already so eliminating all but #%d\n",usedi));
7978 for (k = i; k < j; k++) {
7979 if (k != usedi) {
7980 y = hits[k];
7981 if (y->transcripts != NULL) {
7982 x->transcripts = List_append(y->transcripts,x->transcripts);
7983 y->transcripts = (List_T) NULL;
7984 }
7985 eliminate[k] = true;
7986 eliminatep = true;
7987 }
7988 }
7989 }
7990 }
7991
7992 i = j;
7993 }
7994
7995
7996 #if 0
7997 nkept = 0;
7998 for (i = 0; i < nhits; i++) {
7999 if (eliminate[i] == false) {
8000 nkept++;
8001 }
8002 }
8003 if (nkept == 0) {
8004 /* All entries eliminated one another, so keep the first one */
8005 eliminate[0] = false;
8006 }
8007 #endif
8008
8009 if (eliminatep == false) {
8010 debug4(printf("No eliminations, so hits are unchanged\n"));
8011 unique = hits;
8012 *nunique = nhits;
8013
8014 } else {
8015 /* Caller needs (*nunique)+1, but since we are guaranteed to have one elimination, nhits will suffice */
8016 out = unique = (T *) MALLOC(nhits*sizeof(T));
8017
8018 for (i = nhits-1; i >= 0; i--) {
8019 x = hits[i];
8020 if (eliminate[i] == false) {
8021 #ifdef DEBUG4
8022 printf(" Keeping #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
8023 x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
8024 x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
8025 Stage3end_print_substrings(x);
8026 if (x->transcripts != NULL) {
8027 Transcript_print_nums(x->transcripts);
8028 }
8029 printf("\n");
8030 #endif
8031 *out++ = x;
8032
8033 } else {
8034 #ifdef DEBUG4
8035 printf(" Eliminating #%d:%u..%u, score %d, nmatches %d (nindels %d, chrnum %d) (plusp = %d, sensedir = %d) ",
8036 x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,
8037 x->refalt_score_within_trims,x->refalt_nmatches_plus_spliced_trims,x->nindels,x->chrnum,x->plusp,x->sensedir);
8038 Stage3end_print_substrings(x);
8039 if (x->transcripts != NULL) {
8040 Transcript_print_nums(x->transcripts);
8041 }
8042 printf("\n");
8043 #endif
8044 /* Stage3end_free(&x); -- Cannot free, because newladder and ladder might share this hit */
8045 *duplicates = Hitlist_push(*duplicates,hitlistpool,(void *) x);
8046 }
8047 }
8048
8049 *nunique = out - unique;
8050 FREE(hits);
8051 }
8052
8053 FREE(eliminate);
8054
8055 #ifdef DEBUG4
8056 for (i = 0; i < *nunique; i++) {
8057 x = unique[i];
8058 printf(" Final %d: #%d:%u..%u (plusp = %d, sensedir = %d) ",
8059 i,x->chrnum,x->genomicstart - x->chroffset,x->genomicend - x->chroffset,x->plusp,x->sensedir);
8060 Stage3end_print_substrings(x);
8061 if (x->transcripts != NULL) {
8062 Transcript_print_nums(x->transcripts);
8063 }
8064 printf("\n");
8065 }
8066 #endif
8067
8068 debug4(printf("Exited Stage3end_remove_duplicates_array with %d hits\n",*nunique));
8069 return unique;
8070 }
8071
8072
8073
8074 #if 0
8075 static bool
8076 extra_ambiguous_ends_p (List_T substrings) {
8077 int nambiguous;
8078 List_T p;
8079
8080 p = substrings;
8081 nambiguous = 0;
8082 while (Substring_ambiguous_p((Substring_T) List_head(p)) == true) {
8083 p = List_next(p);
8084 nambiguous += 1;
8085 }
8086 if (nambiguous > 1) {
8087 return true;
8088 }
8089
8090 substrings = List_reverse(substrings);
8091
8092 p = substrings;
8093 nambiguous = 0;
8094 while (Substring_ambiguous_p((Substring_T) List_head(p)) == true) {
8095 p = List_next(p);
8096 nambiguous += 1;
8097 }
8098
8099 substrings = List_reverse(substrings);
8100
8101 if (nambiguous > 1) {
8102 return true;
8103 } else {
8104 return false;
8105 }
8106 }
8107 #endif
8108
8109
8110 #if 0
8111 List_T
8112 Stage3end_reject_trimlengths (List_T hits, Hitlistpool_T hitlistpool) {
8113 List_T filtered = NULL, p;
8114 T hit;
8115
8116 for (p = hits; p != NULL; p = p->rest) {
8117 hit = (T) p->first;
8118 if (hit->trim_querystart + hit->trim_queryend >= reject_trimlength) {
8119 Stage3end_free(&hit);
8120 } else {
8121 filtered = Hitlist_push(filtered,hitlistpool,(void *) hit);
8122 }
8123 }
8124
8125 Hitlist_free(&hits);
8126 return filtered;
8127 }
8128 #endif
8129
8130
8131 /* Used for eliminating exact duplicates. Also sorts secondarily by hittype. */
8132 static int
hit_sort_cmp(const void * a,const void * b)8133 hit_sort_cmp (const void *a, const void *b) {
8134 Stage3end_T x = * (Stage3end_T *) a;
8135 Stage3end_T y = * (Stage3end_T *) b;
8136
8137 debug4(printf("Comparing %s: #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d with %s: #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d\n",
8138 Method_string(x->method),x->chrnum,x->genomicstart-x->chroffset,x->genomicend-x->chroffset,
8139 x->circularalias,x->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,x->refalt_score_within_trims,
8140 Method_string(y->method),y->chrnum,y->genomicstart-y->chroffset,y->genomicend-y->chroffset,
8141 y->circularalias,y->refalt_nmatches_plus_spliced_trims,x->refalt_nmatches_to_trims,y->refalt_score_within_trims));
8142
8143 if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
8144 if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
8145 alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
8146 /* The primary regions overlap */
8147 return 0;
8148 } else if (alias_starts[x->chrnum] < alias_starts[y->chrnum]) {
8149 return -1;
8150 } else if (alias_starts[y->chrnum] < alias_starts[x->chrnum]) {
8151 return +1;
8152 } else if (alias_ends[x->chrnum] < alias_ends[y->chrnum]) {
8153 return -1;
8154 } else if (alias_ends[y->chrnum] < alias_ends[x->chrnum]) {
8155 return +1;
8156 } else {
8157 return 0;
8158 }
8159
8160 } else if (altlocp[x->chrnum] == true) {
8161 if (y->genomicend >= alias_starts[x->chrnum] &&
8162 y->genomicstart <= alias_ends[x->chrnum]) {
8163 /* y overlaps with the primary region for x */
8164 return +1; /* Put primary region first */
8165 }
8166 /* Don't overlap, so fall through to rest of procedure */
8167
8168 } else if (altlocp[y->chrnum] == true) {
8169 if (alias_ends[y->chrnum] >= x->genomicstart &&
8170 alias_starts[y->chrnum] <= x->genomicend) {
8171 /* x overlaps with the primary region for y */
8172 return -1; /* Put primary region first */
8173 }
8174 /* Don't overlap, so fall through to rest of procedure */
8175 }
8176
8177
8178 if (x->plusp > y->plusp) {
8179 return -1;
8180 } else if (y->plusp > x->plusp) {
8181 return +1;
8182
8183
8184 } else if (x->low < y->low) {
8185 debug4(printf("Returning -1 for low\n"));
8186 return -1;
8187 } else if (y->low < x->low) {
8188 debug4(printf("Returning +1 for low\n"));
8189 return +1;
8190
8191 } else if (x->high < y->high) {
8192 debug4(printf("Returning -1 for high\n"));
8193 return -1;
8194 } else if (y->high < x->high) {
8195 debug4(printf("Returning +1 for high\n"));
8196 return +1;
8197
8198
8199 } else if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
8200 return -1;
8201 } else if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
8202 return +1;
8203 } else if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
8204 return -1;
8205 } else if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
8206 return +1;
8207 } else if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
8208 return -1;
8209 } else if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
8210 return +1;
8211
8212 /* Prioritize last method used */
8213 } else if (x->method > y->method) {
8214 return -1;
8215 } else if (y->method > x->method) {
8216 return +1;
8217
8218 } else if (x->altlocp < y->altlocp) {
8219 return -1;
8220 } else if (y->altlocp < x->altlocp) {
8221 return +1;
8222
8223
8224 } else if (x->sensedir != 0 && y->sensedir == 0) {
8225 return -1;
8226 } else if (y->sensedir != 0 && x->sensedir == 0) {
8227 return +1;
8228
8229 } else if (x->splice_score > y->splice_score) {
8230 debug4(printf(" => loses by splice score\n"));
8231 return -1;
8232 } else if (y->splice_score > x->splice_score) {
8233 debug4(printf(" => wins by splice score\n"));
8234 return +1;
8235
8236 } else {
8237 debug4(printf("Returning 0 for equivalent\n"));
8238 return 0;
8239 }
8240 }
8241
8242
8243
#if 0
/* Disabled code: everything up to the matching #endif is compiled out. */
/* Three-way comparison of hits x and y, returning -1, 0, or +1.  A result
   of 0 is returned either when an alternate-locus primary region
   overlaps the other hit, or when every enabled criterion below ties. */
/* Same as hit_sort_cmp, except for hittype, nmatches_to_trims, and indel_low */
static int
hit_equiv_cmp (Stage3end_T x, Stage3end_T y) {

  /* Alternate-locus handling: overlap with a primary region makes the
     two hits compare as equivalent.  Without overlap, control falls
     through to the ordinary criteria. */
  if (altlocp[x->chrnum] == true && altlocp[y->chrnum] == true) {
    if (alias_ends[y->chrnum] >= alias_starts[x->chrnum] &&
	alias_starts[y->chrnum] <= alias_ends[x->chrnum]) {
      /* The primary regions overlap */
      return 0;
    }

  } else if (altlocp[x->chrnum] == true) {
    if (y->genomicend >= alias_starts[x->chrnum] &&
	y->genomicstart <= alias_ends[x->chrnum]) {
      /* y overlaps with the primary region for x */
      return 0;
    }

  } else if (altlocp[y->chrnum] == true) {
    if (alias_ends[y->chrnum] >= x->genomicstart &&
	alias_starts[y->chrnum] <= x->genomicend) {
      /* x overlaps with the primary region for y */
      return 0;
    }
  }

  /* Larger plusp first, then smaller low first */
  if (x->plusp > y->plusp) {
    return -1;
  } else if (y->plusp > x->plusp) {
    return +1;
  } else if (x->low < y->low) {
    return -1;
  } else if (y->low < x->low) {
    return +1;
  /* NOTE(review): high is ordered in the opposite direction from low
     (x->high < y->high yields +1, so the larger high comes first) --
     confirm this is intended before re-enabling this function. */
  } else if (x->high < y->high) {
    return +1;
  } else if (y->high < x->high) {
    return -1;

  /* Lower score first, then more matches first (refalt before ref) */
  } else if (x->refalt_score_within_trims < y->refalt_score_within_trims) {
    return -1;
  } else if (y->refalt_score_within_trims < x->refalt_score_within_trims) {
    return +1;
  } else if (x->refalt_nmatches_plus_spliced_trims > y->refalt_nmatches_plus_spliced_trims) {
    return -1;
  } else if (y->refalt_nmatches_plus_spliced_trims > x->refalt_nmatches_plus_spliced_trims) {
    return +1;
  } else if (x->ref_nmatches_plus_spliced_trims > y->ref_nmatches_plus_spliced_trims) {
    return -1;
  } else if (y->ref_nmatches_plus_spliced_trims > x->ref_nmatches_plus_spliced_trims) {
    return +1;

  /* The nested #if 0 sections below are criteria that were individually
     switched off; each one is a fragment of this else-if chain. */
#if 0
    /* Causes hits to not be recognized as equivalent */
  } else if (x->nsplices < y->nsplices) {
    return -1;
  } else if (y->nsplices < x->nsplices) {
    return +1;
#endif

#if 0
  } else if (y->start_amb_length + y->end_amb_length == 0 &&
	     x->start_amb_length + x->end_amb_length > 0) {
    return -1;
  } else if (x->start_amb_length + x->end_amb_length == 0 &&
	     y->start_amb_length + y->end_amb_length > 0) {
    return +1;
#endif

#if 0
  } else if (x->indel_low < y->indel_low) {
    return -1;
  } else if (y->indel_low < x->indel_low) {
    return +1;
#endif

#if 0
    /* Used for sorting but not equiv */
  } else if (x->sensedir != 0 && y->sensedir == 0) {
    return -1;
  } else if (y->sensedir != 0 && x->sensedir == 0) {
    return +1;
#endif

#if 0
  } else if (x->sensedir == y->sensedir) {
    return 0;
  } else if (x->sensedir > y->sensedir) {
    return +1;
  } else if (y->sensedir > x->sensedir) {
    return -1;
#endif

  /* Last criterion: higher splice_score first */
  } else if (x->splice_score > y->splice_score) {
    debug4(printf(" => loses by splice score\n"));
    return -1;

  } else if (y->splice_score > x->splice_score) {
    debug4(printf(" => wins by splice score\n"));
    return +1;

  } else {
    debug4(printf(" => identical for sorting purposes\n"));
    return 0;
  }
}
#endif
8352
8353
8354 int
Stage3end_hit_goodness_cmp(bool * equalp,Stage3end_T hit,Stage3end_T best_hit,bool finalp)8355 Stage3end_hit_goodness_cmp (bool *equalp, Stage3end_T hit,
8356 Stage3end_T best_hit, bool finalp) {
8357 double prob1, prob2;
8358
8359 #ifdef PRE_RESOLVE_MULTIMAPPING
8360 if (Stage3end_tally(x) > TALLY_RATIO*Stage3end_tally(y)) {
8361 debug4(printf(" #%d overlaps #%d and tally %ld > %f*%ld, so marking %d for elimination\n",
8362 i,j,x->tally,TALLY_RATIO,y->tally,j));
8363 eliminate[j] = true;
8364 } else if (Stage3end_tally(y) > TALLY_RATIO*Stage3end_tally(x)) {
8365 debug4(printf(" #%d overlaps #%d and tally %f*%ld < %ld, so marking %d for elimination\n",
8366 i,j,TALLY_RATIO,x->tally,y->tally,i));
8367 eliminate[i] = true;
8368 }
8369 #endif
8370
8371 *equalp = false;
8372
8373 #if 0
8374 /* Don't want to use nmatches_to_trims */
8375 /* Favors definitive splices over ambiguous ones (by using nmatches_to_trims) */
8376 if (known_ambiguous_p(hit) == true && known_ambiguous_p(best_hit) == false) {
8377 return -1;
8378 } else if (known_ambiguous_p(hit) == false && known_ambiguous_p(best_hit) == true) {
8379 return +1;
8380 }
8381 #endif
8382
8383 if (hit->refalt_nmatches_plus_spliced_trims > best_hit->refalt_nmatches_plus_spliced_trims + NMATCHES_SLOP) {
8384 /* Significantly more matches */
8385 debug4(printf("More matches (to_trims)\n"));
8386 return +1;
8387 } else if (hit->refalt_nmatches_plus_spliced_trims < best_hit->refalt_nmatches_plus_spliced_trims - NMATCHES_SLOP) {
8388 /* Significantly fewer matches */
8389 debug4(printf("Fewer matches (to_trims)\n"));
8390 return -1;
8391
8392 #if 0
8393 } else if (hit->nsplices > best_hit->nsplices) {
8394 debug4(printf(" => loses by nsplices: %d > %d in best\n",hit->nsplices,best_hit->nsplices));
8395 return -1;
8396 } else if (hit->nsplices < best_hit->nsplices) {
8397 debug4(printf(" => wins by nsplices: %d < %d in best\n",hit->nsplices,best_hit->nsplices));
8398 return +1;
8399 #endif
8400
8401 } else if (hit->hittype > best_hit->hittype) {
8402 debug4(printf(" => loses by hittype\n"));
8403 return -1;
8404 } else if (hit->hittype < best_hit->hittype) {
8405 debug4(printf(" => wins by hittype\n"));
8406 return +1;
8407
8408 #if 0
8409 } else if (start_amb_length(hit) + end_amb_length(hit) > 0 &&
8410 start_amb_length(best_hit) + end_amb_length(best_hit) == 0) {
8411 debug4(printf(" => loses by ambiguity\n"));
8412 return -1;
8413 } else if (start_amb_length(hit) + end_amb_length(hit) == 0 &&
8414 start_amb_length(best_hit) + end_amb_length(best_hit) > 0) {
8415 debug4(printf(" => wins by ambiguity\n"));
8416 return +1;
8417 #endif
8418
8419 } else if (hit->nindels > best_hit->nindels) {
8420 debug4(printf(" => loses by nindels\n"));
8421 return -1;
8422 } else if (hit->nindels < best_hit->nindels) {
8423 debug4(printf(" => wins by nindels\n"));
8424 return +1;
8425
8426 } else if (hit->distant_splice_p == true && best_hit->distant_splice_p == false) {
8427 debug4(printf(" => loses because distant splice\n"));
8428 return -1;
8429 } else if (hit->distant_splice_p == false && best_hit->distant_splice_p == true) {
8430 debug4(printf(" => wins because not distant splice\n"));
8431 return +1;
8432
8433 } else if (finalp == false) {
8434 debug4(printf(" => indistinguishable\n"));
8435 return 0;
8436
8437 } else if (hit->hittype == TRANSLOC_SPLICE && best_hit->hittype == TRANSLOC_SPLICE) {
8438 prob1 = hit->splice_score;
8439 prob2 = best_hit->splice_score;
8440
8441 if (prob1 < prob2) {
8442 debug4(printf(" => loses by TRANSLOC_SPLICE splice prob %f vs %f\n",prob1,prob2));
8443 return -1;
8444 } else if (prob1 > prob2) {
8445 debug4(printf(" => wins by TRANSLOC_SPLICE splice prob %f vs %f\n",prob1,prob2));
8446 return +1;
8447 } else {
8448 debug4(printf(" => equal\n"));
8449 *equalp = true;
8450 return 0;
8451 }
8452
8453 } else {
8454 prob1 = Stage3end_prob(hit);
8455 prob2 = Stage3end_prob(best_hit);
8456 if (prob1 < prob2) {
8457 debug4(printf(" => loses by splice prob %f vs %f\n",prob1,prob2));
8458 return -1;
8459 } else if (prob1 > prob2) {
8460 debug4(printf(" => wins by splice prob %f vs %f\n",prob1,prob2));
8461 return +1;
8462 }
8463
8464 if (hit->genomiclength > best_hit->genomiclength) {
8465 debug4(printf(" => loses by genomiclength: %u > %u\n",
8466 hit->genomiclength,best_hit->genomiclength));
8467 return -1;
8468 } else if (hit->genomiclength < best_hit->genomiclength) {
8469 debug4(printf(" => wins by genomiclength: %u < %u\n",
8470 hit->genomiclength,best_hit->genomiclength));
8471 return +1;
8472
8473 } else {
8474 debug4(printf(" => equal\n"));
8475 *equalp = true;
8476 return 0;
8477 }
8478 }
8479 }
8480
8481
8482 /* Not clear how to handle altloc */
8483 static bool
hit_subsumption(Stage3end_T x,Stage3end_T y)8484 hit_subsumption (Stage3end_T x, Stage3end_T y) {
8485 if (x->chrnum != y->chrnum) {
8486 /* Previously true for straddles, but then corrected that issue */
8487 /* Now potentially true for lefts below 0 */
8488 return false;
8489 } else if (x->plusp != y->plusp) {
8490 return false; /* Different strands */
8491 } else if (x->low <= y->low && x->high >= y->high) {
8492 return true;
8493 } else if (y->low <= x->low && y->high >= x->high) {
8494 return true;
8495 } else {
8496 return false;
8497 }
8498 }
8499
8500 /* Not clear how to handle altloc */
8501 static bool
hit_endpoint_equivp(Stage3end_T x,Stage3end_T y)8502 hit_endpoint_equivp (Stage3end_T x, Stage3end_T y) {
8503 if (x->plusp != y->plusp) {
8504 return false; /* Different strands */
8505 } else if (x->genomicstart != y->genomicstart) {
8506 return false;
8507 } else if (x->genomicend != y->genomicend) {
8508 return false;
8509 } else {
8510 return true;
8511 }
8512 }
8513
8514
8515 static bool
hit_bad_superstretch_p(Stage3end_T hit_k,Stage3end_T * hits,int k,int j,bool finalp)8516 hit_bad_superstretch_p (Stage3end_T hit_k, Stage3end_T *hits, int k, int j, bool finalp) {
8517 int a;
8518 bool equalp;
8519
8520 for (a = k+1; a <= j; a++) {
8521 if (hit_subsumption(hit_k,hits[a]) == true) {
8522 debug4(printf("Testing %d because stretches over %d",k,a));
8523 if (Stage3end_hit_goodness_cmp(&equalp,hits[a],hit_k,finalp) > 0 || equalp == true) {
8524 debug4(printf(" => eliminating\n"));
8525 return true;
8526 }
8527 debug4(printf("\n"));
8528 }
8529 }
8530 return false;
8531 }
8532
8533
8534 static List_T
remove_overlaps_distant(List_T hitlist,Hitlistpool_T hitlistpool)8535 remove_overlaps_distant (List_T hitlist, Hitlistpool_T hitlistpool) {
8536 List_T unique = NULL;
8537 T best_hit, hit, *hits;
8538 int cmp;
8539 int n, i, j, k, besti;
8540 bool *eliminate, equalp;
8541 #ifdef PRE_RESOLVE_MULTIMAPPING
8542 long int best_tally;
8543 #endif
8544
8545 if ((n = List_length(hitlist)) == 0) {
8546 return (List_T) NULL;
8547 } else {
8548 #ifdef USE_ALLOCA_FOR_HITS
8549 eliminate = (bool *) CALLOCA(n,sizeof(bool));
8550 hits = (T *) MALLOCA(n * sizeof(T));
8551 List_fill_array((void **) hits,hitlist);
8552 Hitlist_free(&hitlist);
8553 #else
8554 eliminate = (bool *) CALLOC(n,sizeof(bool));
8555 hits = (T *) List_to_array(hitlist,NULL);
8556 Hitlist_free(&hitlist);
8557 #endif
8558 }
8559
8560 debug4(printf("Step 0. Checking for duplicates among distant\n"));
8561 qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
8562
8563 /* Find clusters from left */
8564 i = 0;
8565 while (i < n) {
8566 j = i;
8567 while (j+1 < n && hit_endpoint_equivp(hits[i],hits[j+1]) == true) {
8568 j = j+1;
8569 }
8570
8571 if (j > i) {
8572 debug4(printf("Cluster from %d up through %d\n",i,j));
8573
8574 best_hit = hits[i];
8575 besti = i;
8576 debug4(printf("Assume best is %d\n",besti));
8577
8578 for (k = i+1; k <= j; k++) {
8579 cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,/*finalp*/true);
8580 debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
8581 if (cmp > 0) {
8582 best_hit = hits[k];
8583 besti = k;
8584 debug4(printf("Best is now %d\n",besti));
8585 }
8586 }
8587
8588 for (k = i; k <= j; k++) {
8589 if (k == besti) {
8590 /* Skip */
8591 } else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,/*finalp*/true) < 0 || equalp == true) {
8592 debug4(printf(" Eliminating hit %d from left, because beaten by %d\n",k,besti));
8593 eliminate[k] = true;
8594 }
8595 }
8596 }
8597
8598 i = j+1;
8599 }
8600
8601 for (i = n-1; i >= 0; i--) {
8602 hit = hits[i];
8603 if (eliminate[i] == false) {
8604 unique = Hitlist_push(unique,hitlistpool,(void *) hit);
8605 } else if (hit->paired_usedp == true) {
8606 unique = Hitlist_push(unique,hitlistpool,(void *) hit);
8607 } else {
8608 Stage3end_free(&hit);
8609 }
8610 }
8611
8612 #ifdef USE_ALLOCA_FOR_HITS
8613 FREEA(hits);
8614 FREEA(eliminate);
8615 #else
8616 FREE(hits);
8617 FREE(eliminate);
8618 #endif
8619
8620 debug4(printf("Returning %d unique distant splices\n",List_length(unique)));
8621 return unique;
8622 }
8623
8624
8625
8626
#if 0
/* Disabled code: everything up to the matching #endif is compiled out.
   Judging by its name and its debug messages, this is presumably an
   earlier version of Stage3end_remove_overlaps.

   The cells of hitlist are released with Hitlist_free, and the hits
   are split into two groups by distant_splice_p.  The distant group is
   handled by remove_overlaps_distant and is joined to the result with
   List_append at the very end.  The other ("local") hits are sorted
   with hit_sort_cmp and then go through four passes:

     Step 1: exact duplicates (hit_equal on adjacent sorted entries)
     Step 2: bad superstretches within subsumption clusters
     Step 3: best hit within each subsumption cluster, scanning
             clusters first from the left and then from the right
     Step 4: duplicates by hit_equal once more

   After each pass the survivors are copied into a fresh array.  A hit
   marked for elimination is kept anyway when its paired_usedp flag is
   true; otherwise it is released with Stage3end_free.  In Steps 1 and
   4 the transcripts of an eliminated hit are first handed over with
   Stage3end_transfer_transcripts_one.

   querylength is not referenced anywhere in this body. */
List_T
Stage3end_remove_overlaps_old (List_T hitlist, Hitlistpool_T hitlistpool,
			       int querylength, bool finalp) {
  List_T unique = NULL, distant = NULL, local = NULL, p;
  T best_hit, hit, parent, *hits, *prev;
  int cmp;
  int nkept, n, i, j, k, besti;
  bool *eliminate, equalp;
  int *parenti;
#ifdef PRE_RESOLVE_MULTIMAPPING
  long int best_tally;
#endif


  debug4(printf("Entered Stage3end_remove_overlaps with %d hits: %s\n",
		List_length(hitlist),finalp == true ? "FINAL" : "not final"));

  /* Partition the input into local and distant hits */
  for (p = hitlist; p != NULL; p = List_next(p)) {
    hit = (T) List_head(p);
    if (hit->distant_splice_p == false) {
      local = Hitlist_push(local,hitlistpool,(void *) hit);
    } else {
      distant = Hitlist_push(distant,hitlistpool,(void *) hit);
    }
  }
  Hitlist_free(&hitlist);

  distant = remove_overlaps_distant(distant,hitlistpool);

  if ((n = List_length(local)) == 0) {
    return distant;
  } else {
    /* eliminate gets n entries here and is reused by every later step,
       where n has been reduced to the number of survivors */
#ifdef USE_ALLOCA_FOR_HITS
    eliminate = (bool *) CALLOCA(n,sizeof(bool));
    hits = (T *) MALLOCA(n * sizeof(T));
    List_fill_array((void **) hits,local);
    Hitlist_free(&local);
#else
    eliminate = (bool *) CALLOC(n,sizeof(bool));
    hits = (T *) List_to_array(local,NULL);
    Hitlist_free(&local);
#endif
  }


  /* Step 1. Check for exact duplicates */
  /* Probably don't want to eliminate aliases at this point */
  debug4(printf("Step 1. Checking for exact duplicates\n"));
  qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);

  debug4(
	 for (i = 0; i < n; i++) {
	   hit = hits[i];
	   printf(" Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d",
		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
		  hit->circularalias,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
	   if (hit->transcripts != NULL) {
	     Transcript_print_list(hit->transcripts);
	   }
	   printf("\n");
	 }
	 );

  /* Within each run of entries equal to hits[i], keep hits[i] and mark
     the rest.  Index 0 is never marked by this loop. */
  i = 0;
  while (i < n) {
    j = i+1;
    debug4(printf(" %d,%d",i,j));
    while (j < n && hit_equal(hits[j],hits[i]) == true) {
      debug4(printf(" %d is identical to %d => eliminating\n",j,i));
      eliminate[j] = true;
      j++;
    }
    i = j;
  }
  debug4(printf("\n"));


  /* Count survivors: unmarked hits plus marked hits already used in a pair */
  nkept = 0;
  for (i = 0; i < n; i++) {
    if (eliminate[i] == false) {
      nkept++;
    } else if (hits[i]->paired_usedp == true) {
      nkept++;
    }
  }
  if (nkept == 0) {
    /* All entries eliminated one another, so keep the first one */
    eliminate[0] = false;
    nkept = 1;
  }

  prev = hits;
#ifdef USE_ALLOCA_FOR_HITS
  hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
#else
  hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
#endif

  /* Copy survivors in their original order.  best_hit tracks the most
     recent unmarked hit and receives the transcripts of each
     eliminated hit that follows it. */
  for (i = 0, j = 0; i < n; i++) {
    hit = prev[i];
    if (eliminate[i] == false) {
      debug4(printf(" Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      best_hit = hits[j++] = hit;
    } else if (hit->paired_usedp == true) {
      debug4(printf(" Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      hits[j++] = hit;
    } else {
      debug4(printf(" Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      Stage3end_transfer_transcripts_one(best_hit,hit);
      Stage3end_free(&hit);
    }
  }

#ifdef USE_ALLOCA_FOR_HITS
  FREEA(prev);
#else
  FREE(prev);
#endif


  /* Step 2: Check for superstretches */
  n = nkept;
  debug4(printf("Step 2. Checking for superstretches among %d hits within subsumption clusters\n",n));

  for (i = 0; i < n; i++) {
    eliminate[i] = false;
  }

  debug4(
	 for (i = 0; i < n; i++) {
	   hit = hits[i];
	   printf(" Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
	   if (hit->transcripts != NULL) {
	     Transcript_print_list(hit->transcripts);
	   }
	   printf("\n");
	 }
	 );

  /* Find clusters */
  /* A cluster [i, j] is a run of consecutive entries that are each in a
     subsumption relationship with hits[i] */
  i = 0;
  while (i < n) {
    j = i;
    /* Previously checked if (hits[i]->distant_splice_p == false) */
    while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
      j = j+1;
    }

    if (j > i) {
      debug4(printf("Cluster from %d up through %d\n",i,j));

      /* Find bad superstretches */
      for (k = i; k <= j; k++) {
	/* Previously checked if (hits[i]->distant_splice_p == false) */
	if (hit_bad_superstretch_p(hits[k],hits,k,j,finalp) == true) {
	  eliminate[k] = true;
	  /* parenti[k] = j; */
	}
      }
    }

    i = j+1;
  }

  /* Count and copy survivors as in Step 1, but without transferring
     transcripts */
  nkept = 0;
  for (i = 0; i < n; i++) {
    if (eliminate[i] == false) {
      nkept++;
    } else if (hits[i]->paired_usedp == true) {
      nkept++;
    }
  }
  if (nkept == 0) {
    /* All entries eliminated one another, so keep the first one */
    eliminate[0] = false;
    nkept = 1;
  }

  prev = hits;
#ifdef USE_ALLOCA_FOR_HITS
  hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
#else
  hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
#endif

  for (i = 0, j = 0; i < n; i++) {
    hit = prev[i];
    if (eliminate[i] == false) {
      debug4(printf(" Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      hits[j++] = hit;
    } else if (hit->paired_usedp == true) {
      debug4(printf(" Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      hits[j++] = hit;
    } else {
      debug4(printf(" Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      /* parent = prev[parenti[i]]; */
      /* Stage3end_transfer_transcripts_one(parent,hit); */
      Stage3end_free(&hit);
    }
  }

#ifdef USE_ALLOCA_FOR_HITS
  FREEA(prev);
#else
  FREE(prev);
#endif


  /* Step 3: Check for best within subsumption clusters */
  n = nkept;
  debug4(printf("Checking for best among %d hits within subsumption clusters\n",n));

  for (i = 0; i < n; i++) {
    eliminate[i] = false;
  }
  /* qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp); -- No need since original order was kept */

  debug4(
	 for (i = 0; i < n; i++) {
	   hit = hits[i];
	   printf(" Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
	   if (hit->transcripts != NULL) {
	     Transcript_print_list(hit->transcripts);
	   }
	   printf("\n");
	 }
	 );

  /* Find clusters from left */
  /* For each cluster, pick the best member with
     Stage3end_hit_goodness_cmp, then mark every other member that
     compares worse than, or equal to, the best */
  i = 0;
  while (i < n) {
    j = i;
    /* Previously checked if (hits[i]->distant_splice_p == false) */
    while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
      j = j+1;
    }

    if (j > i) {
      debug4(printf("Cluster from %d up through %d\n",i,j));

      best_hit = hits[i];
      besti = i;
      debug4(printf("Assume best is %d\n",besti));

      for (k = i+1; k <= j; k++) {
	/* Previously checked if (hits[i]->distant_splice_p == false) */
	cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp);
	debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
	if (cmp > 0) {
	  best_hit = hits[k];
	  besti = k;
	  debug4(printf("Best is now %d\n",besti));
	}
      }

      for (k = i; k <= j; k++) {
	if (k == besti) {
	  /* Skip */
	  /* Previously checked if (hits[i]->distant_splice_p == false) */
	} else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp) < 0 || equalp == true) {
	  debug4(printf(" Eliminating hit %d from left, because beaten by %d\n",k,besti));
	  eliminate[k] = true;
	  /* parenti[k] = i; */
	}
      }
    }

    i = j+1;
  }


  /* Find clusters starting from right */
  /* Same procedure, but each cluster is anchored at its rightmost
     member hits[j] and extended downward.  Marks set by the left scan
     are left in place. */
  j = n - 1;
  while (j >= 0) {
    i = j;
    /* Previously checked if (hits[i]->distant_splice_p == false) */
    while (i-1 >= 0 && hit_subsumption(hits[j],hits[i-1]) == true) {
      i = i-1;
    }

    if (i < j) {
      debug4(printf("Cluster from %d down through %d\n",j,i));
      best_hit = hits[i];
      besti = i;
      debug4(printf("Assume best is %d\n",besti));

      for (k = i+1; k <= j; k++) {
	/* Previously checked if (hits[i]->distant_splice_p == false) */
	cmp = Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp);
	debug4(printf("Comparison of %d with best %d yields %d\n",k,besti,cmp));
	if (cmp > 0) {
	  best_hit = hits[k];
	  besti = k;
	  debug4(printf("Best is now %d\n",besti));
	}
      }

      for (k = i; k <= j; k++) {
	if (k == besti) {
	  /* Skip */
	  /* Previously checked if (hits[i]->distant_splice_p == false) */
	} else if (Stage3end_hit_goodness_cmp(&equalp,hits[k],best_hit,finalp) < 0 || equalp == true) {
	  debug4(printf(" Eliminating hit %d from right, because beaten by %d\n",k,besti));
	  eliminate[k] = true;
	  /* parenti[k] = i; */
	}
      }
    }

    j = i-1;
  }


  /* Count and copy survivors, again without transferring transcripts */
  nkept = 0;
  for (i = 0; i < n; i++) {
    if (eliminate[i] == false) {
      nkept++;
    } else if (hits[i]->paired_usedp == true) {
      nkept++;
    }
  }
  if (nkept == 0) {
    eliminate[0] = false;
    nkept = 1;
  }

  prev = hits;
#ifdef USE_ALLOCA_FOR_HITS
  hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
#else
  hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
#endif

  for (i = 0, j = 0; i < n; i++) {
    hit = prev[i];
    if (eliminate[i] == false) {
      debug4(printf(" Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      hits[j++] = hit;
    } else if (hit->paired_usedp == true) {
      debug4(printf(" Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      hits[j++] = hit;
    } else {
      debug4(printf(" Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
		    hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
		    hit->plusp,hit->sensedir));
      /* parent = prev[parenti[i]]; */
      /* Stage3end_transfer_transcripts_one(parent,hit); */
      Stage3end_free(&hit);
    }
  }

  /* parenti is allocated only now, with one entry per survivor; it is
     written and read in Step 4 only */
#ifdef USE_ALLOCA_FOR_HITS
  FREEA(prev);
  parenti = (int *) CALLOCA(nkept,sizeof(int));
#else
  FREE(prev);
  parenti = (int *) CALLOC(nkept,sizeof(int));
#endif


  /* Step 4: Check for identity */
  n = nkept;
  debug4(printf("Checking for duplicates among %d hits by identity\n",n));

  for (i = 0; i < n; i++) {
    eliminate[i] = false;
  }
  /* qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp); -- No need since original order was kept */

  debug4(
	 for (i = 0; i < n; i++) {
	   hit = hits[i];
	   printf(" Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
		  i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
		  hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
	   if (hit->transcripts != NULL) {
	     Transcript_print_list(hit->transcripts);
	   }
	   printf("\n");
	 }
	 );

  /* Mark each entry equal to hits[i] and remember i as its parent */
  i = 0;
  while (i < n) {
    debug4(printf("Looking at %d with score %d\n",i,hits[i]->refalt_score_within_trims));
    j = i+1;
    while (j < n && hit_equal(hits[j],hits[i]) == true) {
      debug4(printf(" %d equal to %d\n",j,i));
      eliminate[j] = true;
      parenti[j] = i;
      j++;
    }

    i = j;
  }

  /* Build the result list from the last index down to the first.  An
     eliminated hit hands its transcripts to its parent before being
     freed. */
  for (i = n-1; i >= 0; i--) {
    hit = hits[i];
    if (eliminate[i] == false) {
      unique = Hitlist_push(unique,hitlistpool,(void *) hit);
    } else if (hit->paired_usedp == true) {
      unique = Hitlist_push(unique,hitlistpool,(void *) hit);
    } else {
      parent = hits[parenti[i]]; /* Not prev, since we are using hits instead */
      Stage3end_transfer_transcripts_one(parent,hit);
      Stage3end_free(&hit);
    }
  }

#ifdef USE_ALLOCA_FOR_HITS
  FREEA(hits);
  FREEA(eliminate);
  FREEA(parenti);
#else
  FREE(hits);
  FREE(eliminate);
  FREE(parenti);
#endif


  /* Optional tally filter: when enabled, only hits whose tally equals
     the largest tally are put back on the list.  Hits with a smaller
     tally are dropped from the list but not freed here (the
     Stage3end_free call is commented out). */
#ifdef PRE_RESOLVE_MULTIMAPPING
  if (use_tally_p == true && tally_iit != NULL) {
    if ((n = List_length(unique)) > 1) {
#ifdef USE_ALLOCA_FOR_HITS
      hits = (T *) MALLOCA(n * sizeof(T));
      List_fill_array((void **) hits,unique);
      Hitlist_free(&unique);
#else
      hits = (T *) List_to_array(unique,NULL);
      Hitlist_free(&unique);
#endif

      best_tally = 0;
      for (i = 0; i < n; i++) {
	/* A negative tally is replaced by a computed one */
	if (hits[i]->tally < 0) {
	  hits[i]->tally = Stage3end_compute_tally(hits[i]);
	}
	if (hits[i]->tally > best_tally) {
	  best_tally = hits[i]->tally;
	}
      }

      unique = (List_T) NULL;
      for (i = 0; i < n; i++) {
	if (hits[i]->tally < best_tally) {
	  /* Stage3end_free(&(hits[i])); */
	} else {
	  unique = Hitlist_push(unique,hitlistpool,(void *) hits[i]);
	}
      }

#ifdef USE_ALLOCA_FOR_HITS
      FREEA(hits);
#else
      FREE(hits);
#endif
    }
  }
#endif

  /* Join the separately processed distant hits to the local result */
  unique = List_append(unique,distant);
  debug4(printf("Exited Stage3end_remove_overlaps with %d hits\n",List_length(unique)));
  return unique;
}
#endif
9113
9114
9115 /* Tries to match Stage3pair_remove_overlaps */
9116 List_T
Stage3end_remove_overlaps(List_T hitlist,Hitlistpool_T hitlistpool,int querylength,bool finalp)9117 Stage3end_remove_overlaps (List_T hitlist, Hitlistpool_T hitlistpool, int querylength, bool finalp) {
9118 List_T optimal, distant = NULL, local = NULL, p;
9119 T best_hit, hit, *hits, *prev;
9120
9121 int max_adj_nmatches, score;
9122 int best_nsegments;
9123 double max_splice_score;
9124
9125 int nkept, n, i, j, k;
9126 bool *eliminate, keptp;
9127 #ifdef PRE_RESOLVE_MULTIMAPPING
9128 long int best_tally;
9129 #endif
9130
9131
9132 debug4(printf("Entered Stage3end_remove_overlaps with %d hits: %s\n",
9133 List_length(hitlist),finalp == true ? "FINAL" : "not final"));
9134
9135 for (p = hitlist; p != NULL; p = List_next(p)) {
9136 hit = (T) List_head(p);
9137 if (hit->distant_splice_p == false) {
9138 local = Hitlist_push(local,hitlistpool,(void *) hit);
9139 } else {
9140 distant = Hitlist_push(distant,hitlistpool,(void *) hit);
9141 }
9142 }
9143 Hitlist_free(&hitlist);
9144
9145 distant = remove_overlaps_distant(distant,hitlistpool);
9146
9147 if ((n = List_length(local)) == 0) {
9148 return distant;
9149 } else {
9150 #ifdef USE_ALLOCA_FOR_HITS
9151 eliminate = (bool *) CALLOCA(n,sizeof(bool));
9152 hits = (T *) MALLOCA(n * sizeof(T));
9153 List_fill_array((void **) hits,local);
9154 Hitlist_free(&local);
9155 #else
9156 eliminate = (bool *) CALLOC(n,sizeof(bool));
9157 hits = (T *) List_to_array(local,NULL);
9158 Hitlist_free(&local);
9159 #endif
9160 }
9161 /* local alignments are in hits, but distant alignments are in distant */
9162
9163
9164 /* Step 1. Check for exact duplicates */
9165 /* Probably don't want to eliminate aliases at this point */
9166 debug4(printf("Step 1. Checking for exact duplicates\n"));
9167 qsort(hits,n,sizeof(Stage3end_T),hit_sort_cmp);
9168
9169 debug4(
9170 for (i = 0; i < n; i++) {
9171 hit = hits[i];
9172 printf(" Initial %d (%s): %p #%d:%u..%u, circularalias %d, nmatches %d (%d to_trims), score %d",
9173 i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
9174 hit->circularalias,hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
9175 if (hit->transcripts != NULL) {
9176 Transcript_print_list(hit->transcripts);
9177 }
9178 printf("\n");
9179 }
9180 );
9181
9182 i = 0;
9183 while (i < n) {
9184 j = i+1;
9185 debug4(printf(" %d,%d",i,j));
9186 while (j < n && hit_equal(hits[j],hits[i]) == true) {
9187 debug4(printf(" %d is identical to %d => eliminating\n",j,i));
9188 eliminate[j] = true;
9189 j++;
9190 }
9191 i = j;
9192 }
9193 debug4(printf("\n"));
9194
9195
9196 nkept = 0;
9197 for (i = 0; i < n; i++) {
9198 if (eliminate[i] == false) {
9199 nkept++;
9200 } else if (hits[i]->paired_usedp == true) {
9201 nkept++;
9202 }
9203 }
9204 if (nkept == 0) {
9205 /* All entries eliminated one another, so keep the first one */
9206 eliminate[0] = false;
9207 nkept = 1;
9208 }
9209
9210 prev = hits;
9211 #ifdef USE_ALLOCA_FOR_HITS
9212 hits = (Stage3end_T *) MALLOCA(nkept * sizeof(Stage3end_T));
9213 #else
9214 hits = (Stage3end_T *) MALLOC(nkept * sizeof(Stage3end_T));
9215 #endif
9216
9217 for (i = 0, j = 0; i < n; i++) {
9218 hit = prev[i];
9219 if (eliminate[i] == false) {
9220 debug4(printf(" Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9221 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9222 hit->plusp,hit->sensedir));
9223 best_hit = hits[j++] = hit;
9224 } else if (hit->paired_usedp == true) {
9225 debug4(printf(" Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9226 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9227 hit->plusp,hit->sensedir));
9228 hits[j++] = hit;
9229 } else {
9230 debug4(printf(" Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9231 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9232 hit->plusp,hit->sensedir));
9233 Stage3end_transfer_transcripts_one(best_hit,hit);
9234 Stage3end_free(&hit);
9235 }
9236 }
9237
9238 #ifdef USE_ALLOCA_FOR_HITS
9239 FREEA(prev);
9240 #else
9241 FREE(prev);
9242 #endif
9243
9244
9245 /* Step 2: Check for superstretches */
9246 hitlist = (List_T) NULL;
9247 n = nkept;
9248 debug4(printf("Step 2. Checking for superstretches among %d hits within subsumption clusters\n",n));
9249
9250 for (i = 0; i < n; i++) {
9251 eliminate[i] = false;
9252 }
9253
9254 debug4(
9255 for (i = 0; i < n; i++) {
9256 hit = hits[i];
9257 printf(" Initial %d (%s): %p #%d:%u..%u, nmatches %d (%d to_trims), score %d",
9258 i,Method_string(hit->method),hit,hit->chrnum,hit->genomicstart-hit->chroffset,hit->genomicend-hit->chroffset,
9259 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,hit->refalt_score_within_trims);
9260 if (hit->transcripts != NULL) {
9261 Transcript_print_list(hit->transcripts);
9262 }
9263 printf("\n");
9264 }
9265 );
9266
9267 /* Find clusters */
9268 i = 0;
9269 while (i < n) {
9270 j = i;
9271 /* Previously checked if (hits[i]->distant_splice_p == false) */
9272 while (j+1 < n && hit_subsumption(hits[i],hits[j+1]) == true) {
9273 j = j+1;
9274 }
9275
9276 if (j > i) {
9277 debug4(printf("Cluster from %d up through %d\n",i,j));
9278
9279 /* Find bad superstretches */
9280 for (k = i; k <= j; k++) {
9281 /* Previously checked if (hits[i]->distant_splice_p == false) */
9282 if (hit_bad_superstretch_p(hits[k],hits,k,j,finalp) == true) {
9283 eliminate[k] = true;
9284 /* parenti[k] = j; */
9285 }
9286 }
9287 }
9288
9289 i = j+1;
9290 }
9291
9292 nkept = 0;
9293 for (i = 0; i < n; i++) {
9294 if (eliminate[i] == false) {
9295 nkept++;
9296 } else if (hits[i]->paired_usedp == true) {
9297 nkept++;
9298 }
9299 }
9300 if (nkept == 0) {
9301 /* All entries eliminated one another, so keep the first one */
9302 eliminate[0] = false;
9303 nkept = 1;
9304 }
9305
9306
9307 for (i = 0, j = 0; i < n; i++) {
9308 hit = hits[i];
9309 if (eliminate[i] == false) {
9310 debug4(printf(" Keeping #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9311 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9312 hit->plusp,hit->sensedir));
9313 hitlist = Hitlist_push(hitlist,hitlistpool,(void *) hit);
9314 } else if (hit->paired_usedp == true) {
9315 debug4(printf(" Already paired #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9316 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9317 hit->plusp,hit->sensedir));
9318 hitlist = Hitlist_push(hitlist,hitlistpool,(void *) hit);
9319 } else {
9320 debug4(printf(" Eliminating #%d:%u..%u, nmatches (trimmed) %d (plusp = %d, sensedir = %d)\n",
9321 hit->chrnum,hit->low - hit->chroffset,hit->high - hit->chroffset,hit->refalt_nmatches_plus_spliced_trims,
9322 hit->plusp,hit->sensedir));
9323 /* parent = prev[parenti[i]]; */
9324 /* Stage3end_transfer_transcripts_one(parent,hit); */
9325 Stage3end_free(&hit);
9326 }
9327 }
9328
9329 #ifdef USE_ALLOCA_FOR_HITS
9330 FREEA(hits);
9331 FREEA(eliminate);
9332 #else
9333 FREE(hits);
9334 FREE(eliminate);
9335 #endif
9336
9337
9338 /* Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
9339 /* Same as step 1 of Stage3pair_optimal_score_final */
9340 debug8(printf(" Step 3. Maximize nmatches adjusted by score (with slop)\n"));
9341 optimal = (List_T) NULL;
9342
9343 keptp = false;
9344 hits = (T *) List_to_array_n(&n,hitlist);
9345 eliminate = (bool *) CALLOC(n,sizeof(bool));
9346 qsort(hits,n,sizeof(T),hit_position_cmp);
9347 i = 0;
9348 while (i < n) {
9349 j = i+1;
9350 while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
9351 j++;
9352 }
9353 if (j - 1 > 1) {
9354 debug4(printf("Found a group from %d to %d\n",i,j));
9355 max_adj_nmatches = 0;
9356 for (k = i; k < j; k++) {
9357 hit = hits[k];
9358 if ((score = hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall) > max_adj_nmatches) {
9359 max_adj_nmatches = score;
9360 }
9361 }
9362 debug4(printf("max_adj_nmatches = %d\n",max_adj_nmatches));
9363
9364 for (k = i; k < j; k++) {
9365 hit = hits[k];
9366 if (hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall < max_adj_nmatches - ADJ_NMATCHES_SLOP) {
9367 debug4(printf("Within loci end (adj score %d (%d-%d) < %d w/slop): Eliminating hit %p at %u..%u with nmatches %d (%d+ to trims)\n",
9368 hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall,
9369 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_score_overall,max_adj_nmatches,
9370 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9371 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims));
9372 eliminate[k] = true;
9373 } else {
9374 debug4(printf("Within loci end (adj score %d (%d-%d) == %d w/slop): Keeping hit %p at %u..%u with nmatches %d (%d+ to trims)\n",
9375 hit->refalt_nmatches_plus_spliced_trims - hit->refalt_score_overall,
9376 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_score_overall,max_adj_nmatches,
9377 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9378 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims));
9379 keptp = true;
9380 }
9381 }
9382 }
9383
9384 i = j;
9385 }
9386
9387 if (keptp == false) {
9388 optimal = hitlist;
9389 } else {
9390 for (k = 0; k < n; k++) {
9391 hit = hits[k];
9392 if (eliminate[k] == true) {
9393 debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9394 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9395 hit->nsegments,
9396 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
9397 hit->sensedir,hit->splice_score));
9398 Stage3end_free(&hit);
9399 /* eliminatedp = true; */
9400 } else {
9401 optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
9402 }
9403 }
9404 Hitlist_free(&hitlist);
9405 }
9406 FREE(hits);
9407 FREE(eliminate);
9408 hitlist = optimal;
9409
9410
9411 /* Eliminate within loci: minimize nsegments and maximize splice score */
9412 /* Since we have achieved same number of matches, we should minimize nsegments to achieve parsimony */
9413 debug4(printf(" Step 4. Minimize nsegments and splice score\n"));
9414 optimal = (List_T) NULL;
9415
9416 keptp = false;
9417 hits = (T *) List_to_array_n(&n,hitlist);
9418 eliminate = (bool *) CALLOC(n,sizeof(bool));
9419 qsort(hits,n,sizeof(T),hit_position_cmp);
9420 i = 0;
9421 while (i < n) {
9422 j = i+1;
9423 while (j < n && hit_overlap_p(hits[j],hits[i]) == true) {
9424 j++;
9425 }
9426 if (j - 1 > 1) {
9427 debug4(printf("Found a group from %d to %d\n",i,j));
9428 best_nsegments = querylength;
9429 max_splice_score = 0.0;
9430 for (k = i; k < j; k++) {
9431 hit = hits[k];
9432 if (hit->nsegments < best_nsegments) {
9433 best_nsegments = hit->nsegments;
9434 max_splice_score = hit->splice_score;
9435 } else if (hit->nsegments == best_nsegments) {
9436 max_splice_score = hit->splice_score;
9437 }
9438 }
9439 debug4(printf("best_nsegments %d, max_splice_score %f\n",
9440 best_nsegments,max_splice_score));
9441
9442 for (k = i; k < j; k++) {
9443 hit = hits[k];
9444 if (hit->nsegments > best_nsegments) {
9445 debug4(printf("Within loci end (nsegments %d < %d): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9446 hit->nsegments,best_nsegments,hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9447 hit->nsegments,hit->refalt_nmatches_plus_spliced_trims,
9448 hit->refalt_nmatches_to_trims,hit->sensedir,hit->splice_score));
9449 eliminate[k] = true;
9450 } else if (hit->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
9451 debug4(printf("Within loci end (splice_score w/slop %f < %f): Marking hit %p for elimination at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9452 hit->splice_score,max_splice_score,hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9453 hit->nsegments,hit->refalt_nmatches_plus_spliced_trims,
9454 hit->refalt_nmatches_to_trims,hit->sensedir,hit->splice_score));
9455 eliminate[k] = true;
9456 } else {
9457 keptp = true;
9458 }
9459 }
9460 }
9461
9462 i = j;
9463 }
9464
9465 if (keptp == false) {
9466 optimal = hitlist;
9467 } else {
9468 for (k = 0; k < n; k++) {
9469 hit = hits[k];
9470 if (eliminate[k] == true) {
9471 debug4(printf("Within loci end: Eliminating hit %p at %u..%u with nsegments %d, nmatches %d (%d to_trims), sensedir %d, splice score %f\n",
9472 hit,hit->low - hit->chroffset,hit->high - hit->chroffset,
9473 hit->nsegments,
9474 hit->refalt_nmatches_plus_spliced_trims,hit->refalt_nmatches_to_trims,
9475 hit->sensedir,hit->splice_score));
9476 Stage3end_free(&hit);
9477 /* eliminatedp = true; */
9478 } else {
9479 optimal = Hitlist_push(optimal,hitlistpool,(void *) hit);
9480 }
9481 }
9482 Hitlist_free(&hitlist);
9483 }
9484 FREE(hits);
9485 FREE(eliminate);
9486 hitlist = optimal;
9487
9488 /* Step 5. Removing outerlength not applicable to a single end */
9489
9490 return List_append(hitlist,distant);
9491 }
9492
9493
9494
9495 #ifdef PRE_RESOLVE_MULTIMAPPING
9496 List_T
Stage3end_resolve_multimapping(List_T hits,Hitlistpool_T hitlistpool)9497 Stage3end_resolve_multimapping (List_T hits, Hitlistpool_T hitlistpool) {
9498 List_T resolve1, resolve2, resolve3, p;
9499 Stage3end_T hit;
9500
9501 /* Overlap_T best_overlap; */
9502 long int best_tally;
9503 double tally_threshold;
9504 bool runlengthp;
9505
9506 if (List_length(hits) <= 1) {
9507 return hits;
9508 }
9509
9510 resolve1 = hits;
9511
9512 if (tally_iit == NULL) {
9513 resolve2 = resolve1;
9514 } else {
9515 best_tally = 0L;
9516 for (p = resolve1; p != NULL; p = p->rest) {
9517 hit = (Stage3end_T) p->first;
9518 if ((hit->tally = Stage3end_compute_tally(hit)) > best_tally) {
9519 best_tally = hit->tally;
9520 }
9521 }
9522 if (best_tally == 0L) {
9523 resolve2 = resolve1;
9524 } else {
9525 resolve2 = (List_T) NULL;
9526 #ifdef USE_TALLY_RATIO
9527 tally_threshold = (double) best_tally / TALLY_RATIO;
9528 #else
9529 tally_threshold = 1.0;
9530 #endif
9531 for (p = resolve1; p != NULL; p = p->rest) {
9532 hit = (Stage3end_T) p->first;
9533 if ((double) hit->tally < tally_threshold) {
9534 Stage3end_free(&hit);
9535 } else {
9536 resolve2 = Hitlist_push(resolve2,hitlistpool,(void *) hit);
9537 }
9538 }
9539 Hitlist_free(&resolve1);
9540 }
9541 }
9542
9543
9544 if (List_length(resolve2) <= 1) {
9545 return resolve2;
9546 }
9547
9548 if (runlength_iit == NULL) {
9549 resolve3 = resolve2;
9550 } else {
9551 runlengthp = false;
9552 for (p = resolve2; p != NULL; p = p->rest) {
9553 hit = (Stage3end_T) p->first;
9554 if (Stage3end_runlength_p(hit) == true) {
9555 runlengthp = true;
9556 }
9557 }
9558 if (runlengthp == false) {
9559 resolve3 = resolve2;
9560 } else {
9561 resolve3 = (List_T) NULL;
9562 for (p = resolve2; p != NULL; p = p->rest) {
9563 hit = (Stage3end_T) p->first;
9564 if (Stage3end_runlength_p(hit) == false) {
9565 Stage3end_free(&hit);
9566 } else {
9567 resolve3 = Hitlist_push(resolve3,hitlistpool,(void *) hit);
9568 }
9569 }
9570 Hitlist_free(&resolve2);
9571 }
9572 }
9573
9574
9575 return resolve3;
9576 }
9577 #endif
9578
9579
9580 Pairtype_T
Stage3_determine_pairtype(T hit5,T hit3,Stage3pair_T stage3pair)9581 Stage3_determine_pairtype (T hit5, T hit3, Stage3pair_T stage3pair) {
9582 int pairmax;
9583
9584 debug14(printf("Entered Stage3_determine_pairtype\n"));
9585 if (hit5->effective_chrnum != hit3->effective_chrnum) {
9586 debug14(printf("Returning unpaired\n"));
9587 return UNPAIRED;
9588 } else if (hit5->plusp != hit3->plusp) {
9589 debug14(printf("Returning paired_inversion\n"));
9590 return PAIRED_INVERSION;
9591 } else if (hit5->plusp == true) {
9592 if (hit3->genomicend < hit5->genomicstart) {
9593 debug14(printf("Returning paired_scramble\n"));
9594 return PAIRED_SCRAMBLE;
9595 } else {
9596 if (circularp[hit5->effective_chrnum] == true) {
9597 pairmax = pairmax_circular;
9598 } else {
9599 pairmax = pairmax_linear;
9600 }
9601 if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9602 debug14(printf("Returning concordant based on transcriptome\n"));
9603 return CONCORDANT;
9604 } else if (hit3->genomicstart > hit5->genomicend + pairmax) {
9605 debug14(printf("Returning paired_toolong\n"));
9606 return PAIRED_TOOLONG;
9607 } else {
9608 debug14(printf("Returning concordant\n"));
9609 return CONCORDANT;
9610 }
9611 }
9612 } else {
9613 if (hit3->genomicend > hit5->genomicstart) {
9614 debug14(printf("Returning paired_scramble\n"));
9615 return PAIRED_SCRAMBLE;
9616 } else {
9617 if (circularp[hit3->effective_chrnum] == true) {
9618 pairmax = pairmax_circular;
9619 } else {
9620 pairmax = pairmax_linear;
9621 }
9622 if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9623 debug14(printf("Returning concordant based on transcriptome\n"));
9624 return CONCORDANT;
9625 } else if (hit3->genomicstart + pairmax < hit5->genomicend) {
9626 debug14(printf("Returning paired_toolong\n"));
9627 return PAIRED_TOOLONG;
9628 } else {
9629 debug14(printf("Returning concordant\n"));
9630 return CONCORDANT;
9631 }
9632 }
9633 }
9634 }
9635
9636
9637 #if 0
9638 /* Previously, samprint.c called this, but it can lead to incorrect answers when transcripts are added later */
9639 Pairtype_T
9640 Stage3pair_pairtype (Stage3pair_T this) {
9641 return this->pairtype;
9642 }
9643 #else
9644 Pairtype_T
Stage3pair_determine_pairtype(Stage3pair_T this)9645 Stage3pair_determine_pairtype (Stage3pair_T this) {
9646 return Stage3_determine_pairtype(this->hit5,this->hit3,this);
9647 }
9648 #endif
9649
9650 bool
Stage3pair_circularp(Stage3pair_T this)9651 Stage3pair_circularp (Stage3pair_T this) {
9652 return this->circularp;
9653 }
9654
9655 bool
Stage3pair_altlocp(Stage3pair_T this)9656 Stage3pair_altlocp (Stage3pair_T this) {
9657 if (altlocp[this->hit5->chrnum] == true) {
9658 return true;
9659 } else if (altlocp[this->hit3->chrnum] == true) {
9660 return true;
9661 } else {
9662 return false;
9663 }
9664 }
9665
9666
#if 0
/* Disabled: this function is compiled out by the surrounding #if 0.

   Returns a text label for a pair of hits that is not concordant:
     - different chrnum                => UNPAIRED_INTERCHROM_TEXT
     - different plusp                 => PAIRED_INVERSION_TEXT
     - same strand, genomicstart order
       reversed for that strand        => PAIRED_SCRAMBLE_TEXT
     - otherwise                       => UNPAIRED_TOOLONG_TEXT

   NOTE(review): the *_TEXT macros returned here are not among the
   text macros defined at the top of this file -- confirm they exist
   before re-enabling. */
static char *
unpaired_type_text (T hit5, T hit3) {
  if (hit5->chrnum != hit3->chrnum) {
    return UNPAIRED_INTERCHROM_TEXT;
  } else if (hit5->plusp != hit3->plusp) {
    return PAIRED_INVERSION_TEXT;
  } else if (hit5->plusp == true) {
    /* Plus strand: hit3 starting before hit5 is a scramble */
    if (hit3->genomicstart < hit5->genomicstart) {
      return PAIRED_SCRAMBLE_TEXT;
    } else {
      return UNPAIRED_TOOLONG_TEXT;
    }
  } else {
    /* Minus strand: hit5 starting before hit3 is a scramble */
    if (hit5->genomicstart < hit3->genomicstart) {
      return PAIRED_SCRAMBLE_TEXT;
    } else {
      return UNPAIRED_TOOLONG_TEXT;
    }
  }
}
#endif
9689
9690
9691
9692
9693 /* Has a copy in pair.c */
9694 static void
print_pair_info(Filestring_T fp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype)9695 print_pair_info (Filestring_T fp, T hit5, T hit3, int insertlength, int pairscore,
9696 Pairtype_T pairtype) {
9697
9698 assert(hit5->effective_chrnum == hit3->effective_chrnum); /* Same chromosomes */
9699
9700 #if 0
9701 /* Doesn't hold for paired (inversion) */
9702 assert(hit5->plusp == hit3->plusp); /* Same direction */
9703 #endif
9704
9705 #ifndef NO_COMPARE
9706 FPRINTF(fp,"pair_score:%d",pairscore);
9707 FPRINTF(fp,",insert_length:%d",insertlength);
9708 #endif
9709
9710 switch (pairtype) {
9711 case CONCORDANT: break;
9712 case PAIRED_SCRAMBLE: FPRINTF(fp,",pairtype:scramble"); break;
9713 case PAIRED_INVERSION: FPRINTF(fp,",pairtype:inversion"); break;
9714 case PAIRED_TOOLONG: FPRINTF(fp,",pairtype:toolong"); break;
9715 case CONCORDANT_TRANSLOCATIONS: break;
9716 case PAIRED_UNSPECIFIED: abort();
9717 case UNPAIRED: abort();
9718 case UNSPECIFIED: abort();
9719 }
9720
9721 return;
9722 }
9723
9724
9725
9726
9727 static void
print_substrings(Filestring_T fp,Stage3pair_T stage3pair,T this,int score,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,bool invertp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype,int mapq_score,bool first_read_p)9728 print_substrings (Filestring_T fp, Stage3pair_T stage3pair, T this,
9729 int score, Univ_IIT_T chromosome_iit, Shortread_T queryseq,
9730 Shortread_T headerseq, char *acc_suffix, bool invertp, T hit5, T hit3, int insertlength,
9731 int pairscore, Pairtype_T pairtype, int mapq_score, bool first_read_p) {
9732 char *single_chr, *chr;
9733 bool allocp, alloc1p, pairinfo_printed_p = false;
9734 List_T substrings, junctions, p, q;
9735 Substring_T substring;
9736 Junction_T pre_junction, post_junction;
9737 int nblocks;
9738
9739 if (this->chrnum == 0) {
9740 single_chr = (char *) NULL;
9741 alloc1p = false;
9742 } else {
9743 single_chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc1p);
9744 }
9745 if (invertp == true) {
9746 substrings = this->substrings_Nto1;
9747 junctions = this->junctions_Nto1;
9748 } else {
9749 substrings = this->substrings_1toN;
9750 junctions = this->junctions_1toN;
9751 }
9752
9753 if (output_type == M8_OUTPUT) {
9754 for (p = substrings; p != NULL; p = List_next(p)) {
9755 substring = (Substring_T) List_head(p);
9756 if (Substring_has_alts_p(substring) == true) {
9757 /* Skip */
9758 } else {
9759 if ((chr = single_chr) == NULL) {
9760 chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9761 }
9762 Substring_print_m8(fp,substring,headerseq,acc_suffix,chr,invertp);
9763 if (single_chr == NULL && allocp == true) {
9764 FREE(chr);
9765 }
9766 }
9767 }
9768
9769 } else {
9770 if ((nblocks = List_length(substrings)) == 1) {
9771 post_junction = (Junction_T) NULL;
9772 } else {
9773 post_junction = (Junction_T) List_head(junctions);
9774 }
9775 substring = (Substring_T) List_head(substrings);
9776 if (Substring_has_alts_p(substring) == true) {
9777 nblocks -= 1;
9778 }
9779 substring = (Substring_T) List_last_value(substrings);
9780 if (Substring_has_alts_p(substring) == true) {
9781 nblocks -= 1;
9782 }
9783
9784
9785 /* First line */
9786 substring = (Substring_T) List_head(substrings);
9787 if (Substring_has_alts_p(substring) == true) {
9788 /* Skip */
9789 } else {
9790 if ((chr = single_chr) == NULL) {
9791 chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9792 }
9793 FPRINTF(fp," ");
9794 Substring_print_alignment(fp,/*pre_junction*/NULL,substring,post_junction,queryseq,genomecomp,chr,invertp);
9795 if (single_chr == NULL && allocp == true) {
9796 FREE(chr);
9797 }
9798
9799 /* Alignment info */
9800 #ifndef NO_COMPARE
9801 FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9802 if (method_print_p == true) {
9803 Method_print(fp,this->method);
9804 }
9805 #endif
9806
9807 /* Transcriptome info */
9808 if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9809 Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9810 /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9811 } else if (this->transcripts != NULL) {
9812 Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9813 }
9814
9815 /* Pairing info */
9816 if (hit5 != NULL && hit3 != NULL) {
9817 FPRINTF(fp,"\t");
9818 print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9819 }
9820 pairinfo_printed_p = true;
9821
9822 FPRINTF(fp,"\n");
9823 }
9824
9825 if ((p = List_next(substrings)) == NULL) {
9826 /* Done */
9827 } else {
9828 /* Middle lines */
9829 for (q = List_next(junctions); q != NULL; p = List_next(p), q = List_next(q)) {
9830 pre_junction = post_junction;
9831 post_junction = List_head(q);
9832
9833 substring = (Substring_T) List_head(p);
9834 if (Substring_has_alts_p(substring) == true) {
9835 /* Skip */
9836 } else {
9837 if (pairinfo_printed_p == true) {
9838 FPRINTF(fp,",");
9839 } else {
9840 FPRINTF(fp," ");
9841 }
9842 if ((chr = single_chr) == NULL) {
9843 chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9844 }
9845 Substring_print_alignment(fp,pre_junction,substring,post_junction,queryseq,genomecomp,chr,invertp);
9846 if (single_chr == NULL && allocp == true) {
9847 FREE(chr);
9848 }
9849
9850 if (pairinfo_printed_p == false) {
9851 /* Alignment info if not already printed */
9852 #ifndef NO_COMPARE
9853 FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9854 if (method_print_p == true) {
9855 Method_print(fp,this->method);
9856 }
9857 #endif
9858
9859 /* Transcriptome info if not already printed */
9860 if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9861 Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9862 /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9863 } else if (this->transcripts != NULL) {
9864 Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9865 }
9866
9867 /* Pairing info if not already printed */
9868 if (hit5 != NULL && hit3 != NULL) {
9869 FPRINTF(fp,"\t");
9870 print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9871 }
9872 pairinfo_printed_p = true;
9873 }
9874
9875 FPRINTF(fp,"\n");
9876 }
9877 }
9878
9879 /* Last line */
9880 pre_junction = post_junction;
9881
9882 substring = (Substring_T) List_head(p);
9883 if (Substring_has_alts_p(substring) == true) {
9884 /* Skip */
9885 } else {
9886 if (pairinfo_printed_p == true) {
9887 FPRINTF(fp,",");
9888 } else {
9889 FPRINTF(fp," ");
9890 }
9891 if ((chr = single_chr) == NULL) {
9892 chr = Univ_IIT_label(chromosome_iit,Substring_chrnum(substring),&allocp);
9893 }
9894 Substring_print_alignment(fp,pre_junction,substring,/*post_junction*/NULL,queryseq,genomecomp,chr,invertp);
9895 if (single_chr == NULL && allocp == true) {
9896 FREE(chr);
9897 }
9898
9899 if (pairinfo_printed_p == false) {
9900 /* Alignment info if not already printed */
9901 #ifndef NO_COMPARE
9902 FPRINTF(fp,"\tsegs:%d,align_score:%d,mapq:%d",nblocks,score,mapq_score);
9903 if (method_print_p == true) {
9904 Method_print(fp,this->method);
9905 }
9906 #endif
9907
9908 /* Transcriptome info if not already printed */
9909 if (stage3pair != NULL && Transcript_concordant_p(hit5->transcripts,hit3->transcripts) == true) {
9910 Transcript_concordance_print(fp,hit5->transcripts,hit3->transcripts,transcript_iit,
9911 /*concordantp*/true,first_read_p,invertp,/*header*/"\tTranscripts:");
9912 } else if (this->transcripts != NULL) {
9913 Transcript_singleend_print(fp,this->transcripts,transcript_iit,invertp,/*header*/"\tTranscripts:");
9914 }
9915
9916 /* Pairing info if not already printed */
9917 if (hit5 != NULL && hit3 != NULL) {
9918 FPRINTF(fp,"\t");
9919 print_pair_info(fp,hit5,hit3,insertlength,pairscore,pairtype);
9920 }
9921 /* pairinfo_printed_p = true; */
9922 }
9923 FPRINTF(fp,"\n");
9924 }
9925 }
9926 }
9927
9928 if (alloc1p == true) {
9929 FREE(single_chr);
9930 }
9931 }
9932
9933
9934
9935 /* May substitute paired-end loglik for single-end loglik */
9936 void
Stage3end_print(Filestring_T fp,Stage3pair_T stage3pair,T this,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,bool invertp,T hit5,T hit3,int insertlength,int pairscore,Pairtype_T pairtype,int mapq_score,bool first_read_p)9937 Stage3end_print (Filestring_T fp, Stage3pair_T stage3pair, T this,
9938 Univ_IIT_T chromosome_iit, Shortread_T queryseq, Shortread_T headerseq,
9939 char *acc_suffix, bool invertp, T hit5, T hit3, int insertlength,
9940 int pairscore, Pairtype_T pairtype, int mapq_score, bool first_read_p) {
9941
9942 /* TODO: Instead of score_within_trims, which contains penalties for
9943 ambiguous lengths, use (querylength - this->nmatches_plus_spliced_trims) instead */
9944 print_substrings(fp,stage3pair,this,this->refalt_score_within_trims,
9945 chromosome_iit,queryseq,headerseq,acc_suffix,invertp,
9946 hit5,hit3,insertlength,pairscore,pairtype,mapq_score,first_read_p);
9947
9948 return;
9949 }
9950
9951
9952 static void
print_query_header(Filestring_T fp,char initchar,Shortread_T queryseq,bool invertp)9953 print_query_header (Filestring_T fp, char initchar, Shortread_T queryseq, bool invertp) {
9954 FPRINTF(fp,"%c",initchar);
9955 if (invertp == false) {
9956 Shortread_print_oneline(fp,queryseq);
9957 } else {
9958 Shortread_print_oneline_revcomp(fp,queryseq);
9959 }
9960
9961 return;
9962 }
9963
9964
9965
9966 static void
print_barcode_and_quality(Filestring_T fp,Shortread_T queryseq,bool invertp,int quality_shift)9967 print_barcode_and_quality (Filestring_T fp, Shortread_T queryseq, bool invertp, int quality_shift) {
9968 char *barcode;
9969
9970 if ((barcode = Shortread_barcode(queryseq)) != NULL) {
9971 FPRINTF(fp,"\tbarcode:%s",barcode);
9972 }
9973
9974 if (Shortread_quality_string(queryseq) != NULL) {
9975 FPRINTF(fp,"\t");
9976 if (invertp == false) {
9977 Shortread_print_quality(fp,queryseq,/*hardclip_low*/0,/*hardclip_high*/0,
9978 quality_shift,/*show_chopped_p*/true);
9979 } else {
9980 Shortread_print_quality_revcomp(fp,queryseq,/*hardclip_low*/0,/*hardclip_high*/0,
9981 quality_shift,/*show_chopped_p*/true);
9982 }
9983 }
9984
9985 return;
9986 }
9987
9988
9989 void
Stage3pair_print_end(Filestring_T fp,Filestring_T fp_failedinput,Result_T result,Resulttype_T resulttype,char initchar,bool firstp,Univ_IIT_T chromosome_iit,Shortread_T queryseq,Shortread_T headerseq1,Shortread_T headerseq2,int maxpaths,bool quiet_if_excessive_p,bool invertp,int quality_shift)9990 Stage3pair_print_end (Filestring_T fp, Filestring_T fp_failedinput,
9991 Result_T result, Resulttype_T resulttype,
9992 char initchar, bool firstp, Univ_IIT_T chromosome_iit,
9993 Shortread_T queryseq, Shortread_T headerseq1, Shortread_T headerseq2,
9994 int maxpaths, bool quiet_if_excessive_p,
9995 bool invertp, int quality_shift) {
9996 Stage3pair_T *stage3pairarray, stage3pair;
9997 T *stage3array, *stage3array_mate, this, hit5, hit3;
9998 int npaths_primary, npaths_altloc, npaths_mate_primary, npaths_mate_altloc, pathnum;
9999 int first_absmq, second_absmq;
10000 bool excessivep, translocationp;
10001
10002 if (resulttype == PAIREDEND_NOMAPPING) {
10003 if (output_type != M8_OUTPUT) {
10004 Filestring_set_split_output(fp,OUTPUT_NM);
10005 print_query_header(fp,initchar,queryseq,invertp);
10006 FPRINTF(fp,"\t0 %s",UNPAIRED_TEXT);
10007
10008 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10009
10010 FPRINTF(fp,"\t");
10011 Shortread_print_header(fp,headerseq1,headerseq2);
10012 FPRINTF(fp,"\n");
10013 }
10014 /* If failedinput_root != NULL, then this case is handled by calling procedure */
10015
10016 } else if (resulttype == CONCORDANT_UNIQ) {
10017 stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10018 stage3pair = stage3pairarray[0];
10019 hit5 = stage3pair->hit5;
10020 hit3 = stage3pair->hit3;
10021
10022 if (stage3pair->circularp == true) {
10023 Filestring_set_split_output(fp,OUTPUT_CC);
10024 } else {
10025 Filestring_set_split_output(fp,OUTPUT_CU);
10026 }
10027
10028 if (omit_concordant_uniq_p == true && stage3pair->circularp == false) {
10029 /* Skip printing */
10030 Filestring_set_split_output(fp,OUTPUT_NONE);
10031
10032 } else {
10033 if (output_type != M8_OUTPUT) {
10034 print_query_header(fp,initchar,queryseq,invertp);
10035 FPRINTF(fp,"\t1 %s",CONCORDANT_TEXT);
10036
10037 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10038
10039 FPRINTF(fp,"\t");
10040 Shortread_print_header(fp,headerseq1,headerseq2);
10041 }
10042
10043 if (firstp == true) {
10044 Stage3end_print(fp,stage3pair,hit5,
10045 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10046 invertp,hit5,hit3,stage3pair->insertlength,
10047 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10048 stage3pair->hit3->refalt_score_within_trims,
10049 stage3pair->pairtype,stage3pair->mapq_score,
10050 /*first_read_p*/true);
10051 } else {
10052 Stage3end_print(fp,stage3pair,hit3,
10053 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10054 invertp,hit5,hit3,stage3pair->insertlength,
10055 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10056 stage3pair->hit3->refalt_score_within_trims,
10057 stage3pair->pairtype,stage3pair->mapq_score,
10058 /*first_read_p*/false);
10059 }
10060
10061 if (output_type != M8_OUTPUT) {
10062 FPRINTF(fp,"\n");
10063 }
10064 }
10065
10066 } else if (resulttype == CONCORDANT_TRANSLOC) {
10067 Filestring_set_split_output(fp,OUTPUT_CT);
10068 stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10069
10070 if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10071 if (output_type != M8_OUTPUT) {
10072 /* No xs category for transloc, so ignore quiet-if-excessive_p */
10073 print_query_header(fp,initchar,queryseq,invertp);
10074 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10075 FPRINTF(fp," (transloc)");
10076
10077 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10078
10079 FPRINTF(fp,"\t");
10080 Shortread_print_header(fp,headerseq1,headerseq2);
10081
10082 /* No further output */
10083 FPRINTF(fp,"\n");
10084 }
10085
10086 if (failedinput_root != NULL) {
10087 Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10088 }
10089
10090 } else {
10091 if (output_type != M8_OUTPUT) {
10092 print_query_header(fp,initchar,queryseq,invertp);
10093 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10094 FPRINTF(fp," (transloc)");
10095
10096 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10097
10098 FPRINTF(fp,"\t");
10099 Shortread_print_header(fp,headerseq1,headerseq2);
10100 }
10101
10102 for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10103 stage3pair = stage3pairarray[pathnum-1];
10104 hit5 = stage3pair->hit5;
10105 hit3 = stage3pair->hit3;
10106
10107 if (firstp == true) {
10108 Stage3end_print(fp,stage3pair,hit5,
10109 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10110 invertp,hit5,hit3,stage3pair->insertlength,
10111 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10112 stage3pair->hit3->refalt_score_within_trims,
10113 stage3pair->pairtype,stage3pair->mapq_score,
10114 /*first_read_p*/true);
10115 } else {
10116 Stage3end_print(fp,stage3pair,hit3,
10117 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10118 invertp,hit5,hit3,stage3pair->insertlength,
10119 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10120 stage3pair->hit3->refalt_score_within_trims,
10121 stage3pair->pairtype,stage3pair->mapq_score,
10122 /*first_read_p*/false);
10123 }
10124 }
10125
10126 if (output_type != M8_OUTPUT) {
10127 FPRINTF(fp,"\n");
10128 }
10129 }
10130
10131
10132 } else if (resulttype == CONCORDANT_MULT) {
10133 stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10134
10135 if (omit_concordant_mult_p == true) {
10136 /* Skip printing */
10137 Filestring_set_split_output(fp,OUTPUT_NONE);
10138
10139 } else if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10140 Filestring_set_split_output(fp,OUTPUT_CX);
10141 if (output_type != M8_OUTPUT) {
10142 print_query_header(fp,initchar,queryseq,invertp);
10143 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10144
10145 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10146
10147 FPRINTF(fp,"\t");
10148 Shortread_print_header(fp,headerseq1,headerseq2);
10149
10150 /* No further output */
10151 FPRINTF(fp,"\n");
10152
10153 if (failedinput_root != NULL) {
10154 Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10155 }
10156 }
10157
10158 } else {
10159 Filestring_set_split_output(fp,OUTPUT_CM);
10160 if (output_type != M8_OUTPUT) {
10161 print_query_header(fp,initchar,queryseq,invertp);
10162 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,CONCORDANT_TEXT);
10163
10164 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10165
10166 FPRINTF(fp,"\t");
10167 Shortread_print_header(fp,headerseq1,headerseq2);
10168 }
10169
10170 for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10171 stage3pair = stage3pairarray[pathnum-1];
10172 hit5 = stage3pair->hit5;
10173 hit3 = stage3pair->hit3;
10174
10175 if (firstp == true) {
10176 Stage3end_print(fp,stage3pair,hit5,
10177 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10178 invertp,hit5,hit3,stage3pair->insertlength,
10179 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10180 stage3pair->hit3->refalt_score_within_trims,
10181 stage3pair->pairtype,stage3pair->mapq_score,
10182 /*first_read_p*/true);
10183 } else {
10184 Stage3end_print(fp,stage3pair,hit3,
10185 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10186 invertp,hit5,hit3,stage3pair->insertlength,
10187 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10188 stage3pair->hit3->refalt_score_within_trims,
10189 stage3pair->pairtype,stage3pair->mapq_score,
10190 /*first_read_p*/false);
10191 }
10192 }
10193
10194 if (output_type != M8_OUTPUT) {
10195 FPRINTF(fp,"\n");
10196 }
10197 }
10198
10199 } else if (resulttype == PAIRED_UNIQ) {
10200 stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10201 stage3pair = stage3pairarray[0];
10202
10203 if (stage3pair->circularp == true) {
10204 Filestring_set_split_output(fp,OUTPUT_PC);
10205 } else if (stage3pair->pairtype == PAIRED_INVERSION) {
10206 Filestring_set_split_output(fp,OUTPUT_PI);
10207 } else if (stage3pair->pairtype == PAIRED_SCRAMBLE) {
10208 Filestring_set_split_output(fp,OUTPUT_PS);
10209 } else if (stage3pair->pairtype == PAIRED_TOOLONG) {
10210 Filestring_set_split_output(fp,OUTPUT_PL);
10211 } else {
10212 fprintf(stderr,"Unexpected pairtype %d\n",stage3pair->pairtype);
10213 abort();
10214 }
10215
10216 if (output_type != M8_OUTPUT) {
10217 print_query_header(fp,initchar,queryseq,invertp);
10218 FPRINTF(fp,"\t1 %s",PAIRED_TEXT);
10219
10220 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10221
10222 FPRINTF(fp,"\t");
10223 Shortread_print_header(fp,headerseq1,headerseq2);
10224 }
10225
10226 hit5 = stage3pair->hit5;
10227 hit3 = stage3pair->hit3;
10228
10229 if (firstp == true) {
10230 Stage3end_print(fp,stage3pair,hit5,
10231 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10232 invertp,hit5,hit3,stage3pair->insertlength,
10233 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10234 stage3pair->hit3->refalt_score_within_trims,
10235 stage3pair->pairtype,stage3pair->mapq_score,
10236 /*first_read_p*/true);
10237 } else {
10238 Stage3end_print(fp,stage3pair,hit3,
10239 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10240 invertp,hit5,hit3,stage3pair->insertlength,
10241 stage3pair->hit5->refalt_score_within_trims +
10242 stage3pair->hit3->refalt_score_within_trims,
10243 stage3pair->pairtype,stage3pair->mapq_score,
10244 /*first_read_p*/false);
10245 }
10246
10247 if (output_type != M8_OUTPUT) {
10248 FPRINTF(fp,"\n");
10249 }
10250
10251 } else if (resulttype == PAIRED_MULT) {
10252 stage3pairarray = (Stage3pair_T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10253
10254 if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10255 Filestring_set_split_output(fp,OUTPUT_PX);
10256 if (output_type != M8_OUTPUT) {
10257 print_query_header(fp,initchar,queryseq,invertp);
10258 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,PAIRED_TEXT);
10259
10260 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10261
10262 FPRINTF(fp,"\t");
10263 Shortread_print_header(fp,headerseq1,headerseq2);
10264
10265 /* No further output */
10266 FPRINTF(fp,"\n");
10267
10268 if (failedinput_root != NULL) {
10269 Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10270 }
10271 }
10272
10273 } else {
10274 Filestring_set_split_output(fp,OUTPUT_PM);
10275 if (output_type != M8_OUTPUT) {
10276 print_query_header(fp,initchar,queryseq,invertp);
10277 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,PAIRED_TEXT);
10278
10279 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10280
10281 FPRINTF(fp,"\t");
10282 Shortread_print_header(fp,headerseq1,headerseq2);
10283 }
10284
10285 for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10286 stage3pair = stage3pairarray[pathnum-1];
10287 hit5 = stage3pair->hit5;
10288 hit3 = stage3pair->hit3;
10289
10290 if (firstp == true) {
10291 Stage3end_print(fp,stage3pair,hit5,
10292 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10293 invertp,hit5,hit3,stage3pair->insertlength,
10294 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10295 stage3pair->hit3->refalt_score_within_trims,
10296 stage3pair->pairtype,stage3pair->mapq_score,
10297 /*first_read_p*/true);
10298 } else {
10299 Stage3end_print(fp,stage3pair,hit3,
10300 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10301 invertp,hit5,hit3,stage3pair->insertlength,
10302 /*pairscore*/stage3pair->hit5->refalt_score_within_trims +
10303 stage3pair->hit3->refalt_score_within_trims,
10304 stage3pair->pairtype,stage3pair->mapq_score,
10305 /*first_read_p*/false);
10306 }
10307 }
10308
10309 if (output_type != M8_OUTPUT) {
10310 FPRINTF(fp,"\n");
10311 }
10312 }
10313
10314
10315 } else {
10316 /* Print as singles */
10317 if (firstp == true) {
10318 /* Get stage3array_mate first to avoid incorrect values for npaths */
10319 stage3array_mate = (T *) Result_array2(&npaths_mate_primary,&npaths_mate_altloc,&first_absmq,&second_absmq,result);
10320 stage3array = (T *) Result_array(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10321 } else {
10322 /* Get stage3array_mate first to avoid incorrect values for npaths */
10323 stage3array_mate = (T *) Result_array(&npaths_mate_primary,&npaths_mate_altloc,&first_absmq,&second_absmq,result);
10324 stage3array = (T *) Result_array2(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,result);
10325 }
10326
10327 excessivep = false;
10328 translocationp = false;
10329 if (resulttype == HALFMAPPING_UNIQ) {
10330 if (npaths_primary + npaths_altloc > 0 && Stage3end_circularpos(stage3array[0]) > 0) {
10331 Filestring_set_split_output(fp,OUTPUT_HC);
10332 } else if (npaths_mate_primary + npaths_mate_altloc > 0 && Stage3end_circularpos(stage3array_mate[0]) > 0) {
10333 Filestring_set_split_output(fp,OUTPUT_HC);
10334 } else {
10335 Filestring_set_split_output(fp,OUTPUT_HU);
10336 }
10337
10338 } else if (resulttype == HALFMAPPING_TRANSLOC) {
10339 Filestring_set_split_output(fp,OUTPUT_HT);
10340 translocationp = true;
10341
10342 } else if (resulttype == HALFMAPPING_MULT) {
10343 if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10344 Filestring_set_split_output(fp,OUTPUT_HX);
10345 excessivep = true;
10346 } else {
10347 Filestring_set_split_output(fp,OUTPUT_HM);
10348 }
10349
10350 } else if (resulttype == UNPAIRED_UNIQ) {
10351 if (npaths_primary + npaths_altloc > 0 && Stage3end_circularpos(stage3array[0]) > 0) {
10352 Filestring_set_split_output(fp,OUTPUT_UC);
10353 } else if (npaths_mate_primary + npaths_mate_altloc > 0 && Stage3end_circularpos(stage3array_mate[0]) > 0) {
10354 Filestring_set_split_output(fp,OUTPUT_UC);
10355 } else {
10356 Filestring_set_split_output(fp,OUTPUT_UU);
10357 }
10358
10359 } else if (resulttype == UNPAIRED_TRANSLOC) {
10360 Filestring_set_split_output(fp,OUTPUT_UT);
10361 translocationp = true;
10362
10363 } else if (resulttype == UNPAIRED_MULT) {
10364 if (quiet_if_excessive_p && npaths_primary + npaths_altloc > maxpaths) {
10365 Filestring_set_split_output(fp,OUTPUT_UX);
10366 excessivep = true;
10367 } else {
10368 Filestring_set_split_output(fp,OUTPUT_UM);
10369 }
10370
10371 } else {
10372 fprintf(stderr,"Resulttype is %s\n",Resulttype_string(resulttype));
10373 abort();
10374 }
10375
10376 if (output_type != M8_OUTPUT) {
10377 print_query_header(fp,initchar,queryseq,invertp);
10378 FPRINTF(fp,"\t%d %s",npaths_primary + npaths_altloc,UNPAIRED_TEXT);
10379 if (translocationp == true) {
10380 FPRINTF(fp," (transloc)");
10381 }
10382
10383 print_barcode_and_quality(fp,queryseq,invertp,quality_shift);
10384
10385 FPRINTF(fp,"\t");
10386 Shortread_print_header(fp,headerseq1,headerseq2);
10387 }
10388
10389 if (excessivep == true) {
10390 /* No output */
10391 if (failedinput_root != NULL) {
10392 Shortread_print_query_singleend(fp_failedinput,queryseq,headerseq1);
10393 }
10394
10395 } else {
10396 if (firstp == true) {
10397 for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10398 this = stage3array[pathnum-1];
10399 Stage3end_print(fp,/*stage3pair*/NULL,this,
10400 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/1",
10401 invertp,/*hit5*/(T) NULL,/*hit3*/(T) NULL,
10402 /*insertlength*/0,/*pairscore*/0,/*pairtype*/UNPAIRED,this->mapq_score,
10403 /*first_read_p*/true);
10404 }
10405 } else {
10406 for (pathnum = 1; pathnum <= npaths_primary + npaths_altloc && pathnum <= maxpaths; pathnum++) {
10407 this = stage3array[pathnum-1];
10408 Stage3end_print(fp,/*stage3pair*/NULL,this,
10409 chromosome_iit,queryseq,headerseq1,/*acc_suffix*/"/2",
10410 invertp,/*hit5*/(T) NULL,/*hit3*/(T) NULL,
10411 /*insertlength*/0,/*pairscore*/0,/*pairtype*/UNPAIRED,this->mapq_score,
10412 /*first_read_p*/false);
10413 }
10414 }
10415 }
10416
10417 if (output_type != M8_OUTPUT) {
10418 FPRINTF(fp,"\n");
10419 }
10420 }
10421
10422 return;
10423 }
10424
10425
10426
10427 /* Used only for --merge-overlap features, so obey hardclip and not querystart/queryend */
10428 /* If use querylength_adj, ss.bug.4 fails. If use querylength, ss.bug.3 fails */
10429 static List_T
Stage3end_convert_to_pairs_out(List_T pairs,T hit,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)10430 Stage3end_convert_to_pairs_out (List_T pairs, T hit, Shortread_T queryseq,
10431 int hardclip_low, int hardclip_high, int queryseq_offset) {
10432 List_T p, q;
10433 /* Chrpos_T genomicpos1, genomicpos2; */
10434 Substring_T substring, prev_substring;
10435 Junction_T junction;
10436 Junctiontype_T type;
10437 char *deletion_string;
10438
10439 if (hit->hittype == TRANSLOC_SPLICE) {
10440 /* Cannot handle translocations within a single GMAP alignment */
10441 abort();
10442 return NULL;
10443
10444 } else {
10445 p = hit->substrings_1toN;
10446 prev_substring = (Substring_T) List_head(p);
10447 pairs = Substring_convert_to_pairs_out(pairs,prev_substring,hit->querylength,
10448 queryseq,hardclip_low,hardclip_high,queryseq_offset);
10449
10450 for (q = hit->junctions_1toN, p = List_next(p); p != NULL; q = List_next(q), p = List_next(p)) {
10451 junction = (Junction_T) List_head(q);
10452 substring = (Substring_T) List_head(p);
10453
10454 if ((type = Junction_type(junction)) == INS_JUNCTION) {
10455 pairs = Substring_add_insertion_out(pairs,prev_substring,substring,hit->querylength,
10456 /*insertionlength*/Junction_nindels(junction),queryseq,
10457 hardclip_low,hardclip_high,queryseq_offset);
10458 } else if (type == DEL_JUNCTION) {
10459 deletion_string = Junction_deletion_string(junction,genomecomp,hit->plusp);
10460 pairs = Substring_add_deletion_out(pairs,prev_substring,substring,hit->querylength,
10461 deletion_string,/*deletionlength*/Junction_nindels(junction),
10462 hardclip_low,hardclip_high,queryseq_offset);
10463 } else if (type == SPLICE_JUNCTION) {
10464 pairs = Substring_add_intron_out(pairs,prev_substring,substring,hit->querylength,
10465 hardclip_low,hardclip_high,queryseq_offset);
10466
10467 } else {
10468 abort();
10469 }
10470
10471 pairs = Substring_convert_to_pairs_out(pairs,substring,hit->querylength,
10472 queryseq,hardclip_low,hardclip_high,queryseq_offset);
10473 prev_substring = substring;
10474 }
10475
10476 debug15(Simplepair_dump_list(pairs,true));
10477 return pairs;
10478 }
10479 }
10480
10481
10482 /* Don't want querylength_adj */
10483 struct Simplepair_T *
Stage3pair_merge(int * npairs,int * querylength_merged,char ** queryseq_merged,char ** quality_merged,Stage3pair_T this,Shortread_T queryseq5,Shortread_T queryseq3,int querylength5,int querylength3,int clipdir,int hardclip5_low,int hardclip5_high,int hardclip3_low,int hardclip3_high)10484 Stage3pair_merge (int *npairs, int *querylength_merged, char **queryseq_merged, char **quality_merged,
10485 Stage3pair_T this, Shortread_T queryseq5, Shortread_T queryseq3,
10486 int querylength5, int querylength3, int clipdir,
10487 int hardclip5_low, int hardclip5_high, int hardclip3_low, int hardclip3_high) {
10488 struct Simplepair_T *pairarray, *newpair;
10489 Simplepair_T oldpair;
10490 List_T pairs, pairs5, pairs3, p;
10491 T hit5, hit3;
10492 int querylengthA, querylengthB;
10493 char *queryseq_ptr_5, *queryseq_ptr_3, *quality_ptr_5, *quality_ptr_3;
10494 #ifdef CHECK_ASSERTIONS
10495 Chrpos_T genomicpos1, genomicpos2;
10496 #endif
10497
10498 hit5 = this->hit5;
10499 hit3 = this->hit3;
10500 queryseq_ptr_5 = Shortread_fullpointer_uc(queryseq5);
10501 queryseq_ptr_3 = Shortread_fullpointer_uc(queryseq3);
10502 quality_ptr_5 = Shortread_quality_string(queryseq5);
10503 quality_ptr_3 = Shortread_quality_string(queryseq3);
10504
10505 if (hit5->plusp == true) {
10506 if (clipdir > 0) {
10507 pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,/*queryseq_offset*/0);
10508 pairs5 = Simplepair_strip_gaps_at_head(pairs5);
10509
10510 pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,
10511 /*queryseq_offset*/querylength5-hardclip5_low-hardclip5_high-hardclip3_low-hardclip3_high);
10512 pairs3 = Simplepair_strip_gaps_at_tail(pairs3);
10513
10514 #ifdef CHECK_ASSERTIONS
10515 genomicpos1 = Simplepair_head_genomepos(pairs5);
10516 genomicpos2 = Simplepair_last_genomepos(pairs3);
10517 if (genomicpos2 != genomicpos1 + 1U) {
10518 printf("Accession %s, plus\n",Shortread_accession(queryseq5));
10519 printf("Expected genomicpos2 %u == genomicpos1 %u + 1\n",genomicpos2,genomicpos1);
10520 Simplepair_dump_list(pairs5,true);
10521 Simplepair_dump_list(pairs3,true);
10522 abort();
10523 }
10524 #endif
10525
10526 pairs = List_append(pairs3,pairs5);
10527
10528 querylengthA = querylength5 - hardclip5_low - hardclip5_high;
10529 querylengthB = querylength3 - hardclip3_low - hardclip3_high;
10530 *querylength_merged = querylengthA + querylengthB;
10531
10532 *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10533 strncpy(*queryseq_merged,queryseq_ptr_5,querylengthA);
10534 strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_3[querylength3 - querylengthB]),querylengthB);
10535 (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10536
10537 if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10538 *quality_merged = (char *) NULL;
10539 } else {
10540 *quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10541 strncpy(*quality_merged,quality_ptr_5,querylengthA);
10542 strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_3[querylength3 - querylengthB]),querylengthB);
10543 (*quality_merged)[querylengthA+querylengthB] = '\0';
10544 }
10545
10546 } else if (clipdir < 0) {
10547 pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,/*queryseq_offset*/0);
10548 pairs3 = Simplepair_strip_gaps_at_head(pairs3);
10549
10550 pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,
10551 /*queryseq_offset*/querylength3-hardclip3_low-hardclip3_high-hardclip5_low-hardclip5_high);
10552 pairs5 = Simplepair_strip_gaps_at_tail(pairs5);
10553
10554 #ifdef CHECK_ASSERTIONS
10555 genomicpos1 = Simplepair_head_genomepos(pairs3);
10556 genomicpos2 = Simplepair_last_genomepos(pairs5);
10557 if (genomicpos2 != genomicpos1 + 1U) {
10558 printf("Accession %s, plus, clipdir %d\n",Shortread_accession(queryseq5),clipdir);
10559 printf("Expected genomicpos2 %u == genomicpos1 %u + 1\n",genomicpos2,genomicpos1);
10560 printf("Begin of pairs3\n");
10561 Simplepair_dump_list(pairs3,true);
10562 printf("Begin of pairs5\n");
10563 Simplepair_dump_list(pairs5,true);
10564 abort();
10565 }
10566 #endif
10567
10568 pairs = List_append(pairs5,pairs3);
10569
10570 querylengthA = querylength3 - hardclip3_low - hardclip3_high;
10571 querylengthB = querylength5 - hardclip5_low - hardclip5_high;
10572 *querylength_merged = querylengthA + querylengthB;
10573
10574 *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10575 strncpy(*queryseq_merged,queryseq_ptr_3,querylengthA);
10576 strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_5[querylength5 - querylengthB]),querylengthB);
10577 (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10578
10579 if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10580 *quality_merged = (char *) NULL;
10581 } else {
10582 *quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10583 strncpy(*quality_merged,quality_ptr_3,querylengthA);
10584 strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_5[querylength5 - querylengthB]),querylengthB);
10585 (*quality_merged)[querylengthA+querylengthB] = '\0';
10586 }
10587
10588 } else {
10589 abort();
10590 }
10591
10592 } else {
10593 if (clipdir > 0) {
10594 pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,/*queryseq_offset*/0);
10595 pairs3 = Simplepair_strip_gaps_at_head(pairs3);
10596
10597 pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,
10598 /*queryseq_offset*/querylength3-hardclip3_low-hardclip3_high-hardclip5_low-hardclip5_high);
10599 pairs5 = Simplepair_strip_gaps_at_tail(pairs5);
10600
10601 #ifdef CHECK_ASSERTIONS
10602 genomicpos1 = Simplepair_head_genomepos(pairs3);
10603 genomicpos2 = Simplepair_last_genomepos(pairs5);
10604 if (genomicpos2 != genomicpos1 - 1U) {
10605 printf("Accession %s, minus\n",Shortread_accession(queryseq5));
10606 printf("Expected genomicpos2 %u == genomicpos1 %u - 1\n",genomicpos2,genomicpos1);
10607 Simplepair_dump_list(pairs3,true);
10608 Simplepair_dump_list(pairs5,true);
10609 abort();
10610 }
10611 #endif
10612
10613 pairs = List_append(pairs5,pairs3);
10614
10615 querylengthA = querylength3 - hardclip3_low - hardclip3_high;
10616 querylengthB = querylength5 - hardclip5_low - hardclip5_high;
10617 *querylength_merged = querylengthA + querylengthB;
10618
10619 *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10620 strncpy(*queryseq_merged,queryseq_ptr_3,querylengthA);
10621 strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_5[querylength5 - querylengthB]),querylengthB);
10622 (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10623
10624 if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10625 *quality_merged = (char *) NULL;
10626 } else {
10627 *quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10628 strncpy(*quality_merged,quality_ptr_3,querylengthA);
10629 strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_5[querylength5 - querylengthB]),querylengthB);
10630 (*quality_merged)[querylengthA+querylengthB] = '\0';
10631 }
10632
10633 } else if (clipdir < 0) {
10634 pairs5 = Stage3end_convert_to_pairs_out(NULL,hit5,queryseq5,hardclip5_low,hardclip5_high,/*queryseq_offset*/0);
10635 pairs5 = Simplepair_strip_gaps_at_head(pairs5);
10636
10637 pairs3 = Stage3end_convert_to_pairs_out(NULL,hit3,queryseq3,hardclip3_low,hardclip3_high,
10638 /*queryseq_offset*/querylength5-hardclip5_low-hardclip5_high-hardclip3_low-hardclip3_high);
10639 pairs3 = Simplepair_strip_gaps_at_tail(pairs3);
10640
10641 #ifdef CHECK_ASSERTIONS
10642 genomicpos1 = Simplepair_head_genomepos(pairs5);
10643 genomicpos2 = Simplepair_last_genomepos(pairs3);
10644 if (genomicpos2 != genomicpos1 - 1U) {
10645 printf("Accession %s, minus\n",Shortread_accession(queryseq5));
10646 printf("Expected genomicpos2 %u == genomicpos1 %u - 1\n",genomicpos2,genomicpos1);
10647 Simplepair_dump_list(pairs5,true);
10648 Simplepair_dump_list(pairs3,true);
10649 abort();
10650 }
10651 #endif
10652
10653 pairs = List_append(pairs3,pairs5);
10654
10655 querylengthA = querylength5 - hardclip5_low - hardclip5_high;
10656 querylengthB = querylength3 - hardclip3_low - hardclip3_high;
10657 *querylength_merged = querylengthA + querylengthB;
10658
10659 *queryseq_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10660 strncpy(*queryseq_merged,queryseq_ptr_5,querylengthA);
10661 strncpy(&((*queryseq_merged)[querylengthA]),&(queryseq_ptr_3[querylength3 - querylengthB]),querylengthB);
10662 (*queryseq_merged)[querylengthA+querylengthB] = '\0';
10663
10664 if (quality_ptr_5 == NULL || quality_ptr_3 == NULL) {
10665 *quality_merged = (char *) NULL;
10666 } else {
10667 *quality_merged = (char *) MALLOC_OUT((querylengthA+querylengthB+1) * sizeof(char));
10668 strncpy(*quality_merged,quality_ptr_5,querylengthA);
10669 strncpy(&((*quality_merged)[querylengthA]),&(quality_ptr_3[querylength3 - querylengthB]),querylengthB);
10670 (*quality_merged)[querylengthA+querylengthB] = '\0';
10671 }
10672
10673 } else {
10674 abort();
10675 }
10676 }
10677
10678 pairs = List_reverse(pairs);
10679 /* Simplepair_dump_list(pairs,true); */
10680
10681 *npairs = List_length(pairs);
10682 newpair = pairarray = (struct Simplepair_T *) MALLOC_OUT((*npairs)*sizeof(struct Simplepair_T));
10683 for (p = pairs; p != NULL; p = p->rest) {
10684 oldpair = (Simplepair_T) p->first;
10685 memcpy(newpair++,oldpair,sizeof(struct Simplepair_T));
10686 Simplepair_free_out(&oldpair);
10687 }
10688 List_free_out(&pairs);
10689
10690 return pairarray;
10691 }
10692
10693
#if 0
/* Compiled out.  Computes an insert length for the pair and stores the
   relative order of the ends in *pair_relationship: 0 when the ends are
   on opposite strands, +1 or -1 for non-overlapping same-strand ends,
   and whatever pair_insert_length reports otherwise. */
static int
compute_insertlength (int *pair_relationship, Stage3pair_T this) {
  T hit5 = this->hit5;
  T hit3 = this->hit3;
  int querylength5 = hit5->querylength;
  int querylength3 = hit3->querylength;

  debug10(printf("Computing insertlength on %u..%u to %u..%u\n",
		 hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
		 hit3->genomicend - hit3->chroffset,hit3->genomicstart - hit3->chroffset));

  if (hit5->plusp == true && hit3->plusp == false) {
    /* Have 5-start..end and 3-end..start */
    /*   or 3-end..start and 5-start..end */
    *pair_relationship = 0;
    if (hit5->genomicend < hit3->genomicend) {
      return (hit3->genomicend - hit5->genomicend) + querylength5 + querylength3;
    }
    if (hit3->genomicstart < hit5->genomicstart) {
      return (hit5->genomicstart - hit3->genomicstart) + querylength5 + querylength3;
    }
    return pair_insert_length_unpaired(hit5,hit3);
  }

  if (hit5->plusp == false && hit3->plusp == true) {
    /* Have 5-end..start and 3-start..end */
    /*   or 3-start..end and 5-end..start */
    *pair_relationship = 0;
    if (hit5->genomicstart < hit3->genomicstart) {
      return (hit3->genomicstart - hit5->genomicstart) + querylength5 + querylength3;
    }
    if (hit3->genomicend < hit5->genomicend) {
      return (hit5->genomicend - hit3->genomicend) + querylength5 + querylength3;
    }
    return pair_insert_length_unpaired(hit5,hit3);
  }

  if (hit5->plusp == true) {
    /* Concordant directions on same chromosome (plus) */
    debug10(printf("Concordant on plus strand\n"));
    /* Have 5-start..end and 3-start..end */
    if (hit5->genomicend < hit3->genomicstart) {
      /* No overlap */
      *pair_relationship = +1;
      return (hit3->genomicstart - hit5->genomicend) + querylength5 + querylength3;
    }
    return pair_insert_length(pair_relationship,hit5,hit3);
  }

  /* Concordant directions on same chromosome (minus) */
  debug10(printf("Concordant on minus strand\n"));
  /* Have 3-end..start and 5-end..start */
  if (hit3->genomicstart < hit5->genomicend) {
    /* No overlap */
    *pair_relationship = -1;
    return (hit5->genomicend - hit3->genomicstart) + querylength5 + querylength3;
  }
  return pair_insert_length(pair_relationship,hit5,hit3);
}
#endif
10762
10763
10764 /* Need to make a copy of the hit before calling */
10765 static void
resolve_ambiguity_5(T this,int * mismatch_positions_alloc,Compress_T query_compress,int alts_resolve)10766 resolve_ambiguity_5 (T this, int *mismatch_positions_alloc, Compress_T query_compress, int alts_resolve) {
10767 Substring_T substring, anchor, substring1;
10768 Junction_T junction;
10769 Univcoord_T left, ignore;
10770 Chrpos_T splice_distance;
10771 double donor_prob, acceptor_prob;
10772 List_T p;
10773
10774 substring = (Substring_T) List_head(this->substrings_Nto1);
10775 anchor = (Substring_T) List_head(List_next(this->substrings_Nto1));
10776 junction = (Junction_T) List_head(this->junctions_Nto1);
10777 left = Substring_set_alt(&donor_prob,&acceptor_prob,&ignore,&this->genomicend,substring,alts_resolve);
10778 if (this->plusp == true) {
10779 splice_distance = left - Substring_left(anchor);
10780 this->high = this->genomicend - (this->querylength - this->queryend_chrbound);
10781 } else {
10782 splice_distance = Substring_left(anchor) - left;
10783 this->low = this->genomicend + (this->querylength - this->queryend_chrbound);
10784 }
10785 assert(this->low < this->high);
10786
10787 if (splice_distance > 0) {
10788 Junction_set_unambiguous(junction,splice_distance,donor_prob,acceptor_prob);
10789 } else {
10790 this->substrings_Nto1 = List_next(this->substrings_Nto1);
10791 this->junctions_Nto1 = List_next(this->junctions_Nto1);
10792 this->substrings_1toN = List_drop_last(this->substrings_1toN,(void **) &substring);
10793 this->junctions_1toN = List_drop_last(this->junctions_1toN,(void **) &junction);
10794
10795 this->nsplices -= 1;
10796 if (this->nsplices == 0) {
10797 this->splice_score = 0.0;
10798 } else {
10799 this->splice_score = (this->splice_score * 2*(this->nsplices+1) - Junction_splice_score(junction)) / (2*this->nsplices);
10800 }
10801 this->nsegments -= 1;
10802
10803 anchor = Substring_extend_anchor_queryend(anchor,substring,mismatch_positions_alloc,query_compress);
10804 List_head_set(this->substrings_Nto1,(void *) anchor);
10805 List_last_set(this->substrings_1toN,(void *) anchor);
10806 Substring_free(&substring);
10807 Junction_free(&junction);
10808
10809 /* Update information for hit */
10810 this->trim_queryend = Substring_trim_queryend(anchor);
10811 this->mandatory_trim_queryend = Substring_mandatory_trim_queryend(anchor);
10812 this->trim_queryend_splicep = Substring_trim_queryend_splicep(anchor);
10813
10814 this->refalt_nmatches_to_trims = this->ref_nmatches_to_trims = 0;
10815 for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
10816 substring = (Substring_T) List_head(p);
10817 this->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
10818 this->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
10819 }
10820
10821 substring1 = (Substring_T) List_head(this->substrings_1toN);
10822 this->refalt_nmatches_plus_spliced_trims = this->refalt_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(anchor);
10823 this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims + Substring_start_amb_length(substring1) + Substring_end_amb_length(anchor);
10824 for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
10825 junction = List_head(p);
10826 this->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10827 this->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10828 }
10829 this->ref_score_overall = this->querylength - this->ref_nmatches_to_trims;
10830 this->refalt_score_overall = this->querylength - this->refalt_nmatches_to_trims;
10831 this->refalt_score_within_trims = this->querylength - this->refalt_nmatches_plus_spliced_trims;
10832 if (Substring_trim_querystart_splicep(substring1) == false) {
10833 this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(substring1)/END_BINSIZE);
10834 } else {
10835 this->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(substring1)/END_BINSIZE);
10836 }
10837 if (Substring_trim_queryend_splicep(anchor) == false) {
10838 this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((this->querylength - Substring_queryend(anchor))/END_BINSIZE);
10839 } else {
10840 this->refalt_score_within_trims += SPLICED_END_PENALTY*((this->querylength - Substring_queryend(anchor))/END_BINSIZE);
10841 }
10842
10843 if (this->chrlength < (Univcoord_T) this->querylength) {
10844 this->ref_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10845 this->refalt_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10846 this->refalt_score_within_trims -= ((Univcoord_T) this->querylength - this->chrlength);
10847 }
10848 assert(this->refalt_score_within_trims >= 0);
10849 }
10850
10851 return;
10852 }
10853
10854
10855 /* Need to make a copy of the hit before calling */
10856 static void
resolve_ambiguity_3(T this,int * mismatch_positions_alloc,Compress_T query_compress,int alts_resolve)10857 resolve_ambiguity_3 (T this, int *mismatch_positions_alloc, Compress_T query_compress, int alts_resolve) {
10858 Substring_T substring, anchor, substringN;
10859 Junction_T junction;
10860 Univcoord_T left, ignore;
10861 Chrpos_T splice_distance;
10862 double donor_prob, acceptor_prob;
10863 List_T p;
10864
10865 substring = (Substring_T) List_head(this->substrings_1toN);
10866 anchor = (Substring_T) List_head(List_next(this->substrings_1toN));
10867 junction = (Junction_T) List_head(this->junctions_1toN);
10868 left = Substring_set_alt(&donor_prob,&acceptor_prob,&this->genomicstart,&ignore,substring,alts_resolve);
10869 if (this->plusp == true) {
10870 splice_distance = Substring_left(anchor) - left;
10871 this->low = this->genomicstart + this->querystart_chrbound;
10872 } else {
10873 splice_distance = left - Substring_left(anchor);
10874 this->high = this->genomicstart - this->querystart_chrbound;
10875 }
10876 assert(this->low < this->high);
10877
10878 if (splice_distance > 0) {
10879 Junction_set_unambiguous(junction,splice_distance,donor_prob,acceptor_prob);
10880 } else {
10881 this->substrings_1toN = List_next(this->substrings_1toN);
10882 this->junctions_1toN = List_next(this->junctions_1toN);
10883 this->substrings_Nto1 = List_drop_last(this->substrings_Nto1,(void **) &substring);
10884 this->junctions_Nto1 = List_drop_last(this->junctions_Nto1,(void **) &junction);
10885 this->splice_score = (this->splice_score * 2*this->nsplices - Junction_splice_score(junction)) / (2*(this->nsplices - 1));
10886
10887 this->nsplices -= 1;
10888 if (this->nsplices == 0) {
10889 this->splice_score = 0.0;
10890 } else {
10891 this->splice_score = (this->splice_score * 2*(this->nsplices+1) - Junction_splice_score(junction)) / (2*this->nsplices);
10892 }
10893 this->nsegments -= 1;
10894
10895 anchor = Substring_extend_anchor_querystart(anchor,substring,mismatch_positions_alloc,query_compress);
10896 List_head_set(this->substrings_1toN,(void *) anchor);
10897 List_last_set(this->substrings_Nto1,(void *) anchor);
10898 Substring_free(&substring);
10899 Junction_free(&junction);
10900
10901 /* Update information for hit */
10902 this->trim_querystart = Substring_trim_querystart(anchor);
10903 this->mandatory_trim_querystart = Substring_mandatory_trim_querystart(anchor);
10904 this->trim_querystart_splicep = Substring_trim_querystart_splicep(anchor);
10905
10906 this->refalt_nmatches_to_trims = this->ref_nmatches_to_trims = 0;
10907 for (p = this->substrings_1toN; p != NULL; p = List_next(p)) {
10908 substring = (Substring_T) List_head(p);
10909 this->refalt_nmatches_to_trims += Substring_nmatches_to_trims(substring);
10910 this->ref_nmatches_to_trims += Substring_ref_nmatches_to_trims(substring);
10911 }
10912
10913 substringN = (Substring_T) List_head(this->substrings_Nto1);
10914 this->refalt_nmatches_plus_spliced_trims = this->refalt_nmatches_to_trims + Substring_start_amb_length(anchor) + Substring_end_amb_length(substringN);
10915 this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims + Substring_start_amb_length(anchor) + Substring_end_amb_length(substringN);
10916 for (p = this->junctions_1toN; p != NULL; p = List_next(p)) {
10917 junction = List_head(p);
10918 this->refalt_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10919 this->ref_nmatches_plus_spliced_trims += Junction_ninserts(junction);
10920 }
10921 this->ref_score_overall = this->querylength - this->ref_nmatches_to_trims;
10922 this->refalt_score_overall = this->querylength - this->refalt_nmatches_to_trims;
10923 this->refalt_score_within_trims = this->querylength - this->refalt_nmatches_plus_spliced_trims;
10924 if (Substring_trim_querystart_splicep(anchor) == false) {
10925 this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*(Substring_querystart(anchor)/END_BINSIZE);
10926 } else {
10927 this->refalt_score_within_trims += SPLICED_END_PENALTY*(Substring_querystart(anchor)/END_BINSIZE);
10928 }
10929 if (Substring_trim_queryend_splicep(substringN) == false) {
10930 this->refalt_score_within_trims -= NONSPLICED_END_RESTORE*((this->querylength - Substring_queryend(substringN))/END_BINSIZE);
10931 } else {
10932 this->refalt_score_within_trims += SPLICED_END_PENALTY*((this->querylength - Substring_queryend(substringN))/END_BINSIZE);
10933 }
10934
10935 if (this->chrlength < (Univcoord_T) this->querylength) {
10936 this->ref_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10937 this->refalt_score_overall -= ((Univcoord_T) this->querylength - this->chrlength);
10938 this->refalt_score_within_trims -= ((Univcoord_T) this->querylength - this->chrlength);
10939 }
10940 assert(this->refalt_score_within_trims >= 0);
10941 }
10942
10943 return;
10944 }
10945
10946
10947
10948 /* Should not set ambiguous flag in substrings, because resolution of
10949 an ambiguity depends on a particular pair of ends */
10950
10951 static void
resolve_inside_alts_splice_plus(int * alts_resolve_5,int * alts_resolve_3,int * alts_status_inside,T hit5,T hit3,int querylength5,int querylength3)10952 resolve_inside_alts_splice_plus (int *alts_resolve_5, int *alts_resolve_3,
10953 int *alts_status_inside, T hit5, T hit3, int querylength5, int querylength3) {
10954 Chrpos_T best_insertlength, insertlength;
10955 Univcoord_T genomicstart, genomicend;
10956 int besti5 = -1, besti3 = -1, i, j;
10957 int best_nmismatches, nmismatches;
10958
10959 Substring_T substring5, substring3;
10960 Univcoord_T *end_alts_coords, *start_alts_coords;
10961 int *end_alts_nmismatches, *start_alts_nmismatches;
10962 int end_amb_length_5, start_amb_length_3;
10963
10964
10965 debug9(printf("resolve plus: hit5 %p (%s) and hit3 %p (%s)\n",
10966 hit5,Method_string(hit5->method),hit3,Method_string(hit3->method)));
10967
10968 substring5 = (Substring_T) List_head(hit5->substrings_Nto1); /* the substring for concordance */
10969 debug9(printf("Testing substring5 %p %d..%d alts_p %d\n",
10970 substring5,Stage3end_substrings_querystart(hit5),Stage3end_substrings_queryend(hit5),
10971 Substring_has_alts_p(substring5)));
10972
10973 substring3 = (Substring_T) List_head(hit3->substrings_1toN); /* the substring for concordance (was Nto1) */
10974 debug9(printf("Testing substring3 %p %d..%d alts_p %d\n",
10975 substring3,Stage3end_substrings_querystart(hit3),Stage3end_substrings_queryend(hit3),
10976 Substring_has_alts_p(substring3)));
10977
10978 if (substring5 != NULL && Substring_has_alts_p(substring5) == true &&
10979 substring3 != NULL && Substring_has_alts_p(substring3) == true) {
10980 debug9(printf("Resolve plus case 1: Got alts at 5' and alts at 3':"));
10981 end_alts_coords = Substring_alts_coords(substring5);
10982 end_alts_nmismatches = Substring_alts_nmismatches(substring5);
10983 start_alts_coords = Substring_alts_coords(substring3);
10984 start_alts_nmismatches = Substring_alts_nmismatches(substring3);
10985 end_amb_length_5 = end_amb_length(hit5);
10986 start_amb_length_3 = start_amb_length(hit3);
10987
10988 best_insertlength = (Chrpos_T) -1;
10989 best_nmismatches = querylength5 + querylength3;
10990 for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
10991 genomicend = end_alts_coords[i] + end_amb_length_5;
10992 for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
10993 genomicstart = start_alts_coords[j] - start_amb_length_3;
10994 debug9(printf(" %u,%u",(Chrpos_T) (genomicend - hit5->chroffset),(Chrpos_T) (genomicstart - hit3->chroffset)));
10995 if (genomicend < genomicstart) {
10996 /* Look for valid insertlength */
10997 insertlength = genomicstart - genomicend + querylength5 + querylength3;
10998 debug9(printf(" (insertlength %u)",insertlength));
10999
11000 if (insertlength < best_insertlength) {
11001 besti5 = i;
11002 besti3 = j;
11003 best_insertlength = insertlength;
11004 best_nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j];
11005 debug9(printf("*"));
11006 } else if (insertlength == best_insertlength &&
11007 (nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j]) < best_nmismatches) {
11008 besti5 = i;
11009 besti3 = j;
11010 best_nmismatches = nmismatches;
11011 debug9(printf("*"));
11012 } else if (nmismatches == best_nmismatches) {
11013 debug9(printf("tie"));
11014 }
11015 }
11016 }
11017 }
11018
11019 if (besti5 >= 0 && besti3 >= 0) {
11020 debug9(printf("\nBEST HAS INSERTLENGTH %u AND NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11021 *alts_resolve_5 = besti5;
11022 *alts_resolve_3 = besti3;
11023 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11024 hit5->genomicend = end_alts_coords[besti5] + end_amb_length_5;
11025 hit3->genomicstart = start_alts_coords[besti3] - start_amb_length_3;
11026 }
11027 debug9(printf("\n"));
11028
11029 } else if (substring5 != NULL && Substring_has_alts_p(substring5) == true) {
11030 debug9(printf("Resolve plus case 2: Got alts at 5':"));
11031 end_alts_coords = Substring_alts_coords(substring5);
11032 end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11033 end_amb_length_5 = end_amb_length(hit5);
11034
11035 best_insertlength = (Chrpos_T) -1;
11036 best_nmismatches = querylength5;
11037 for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11038 genomicend = end_alts_coords[i] + end_amb_length_5;
11039 debug9(printf(" %u",(Chrpos_T) (genomicend - hit5->chroffset)));
11040 if (genomicend < hit3->genomicstart /*allow overlap*/+ querylength3) {
11041 /* Look for valid insertlength */
11042 insertlength = hit3->genomicstart - genomicend + querylength5 + querylength3;
11043 debug9(printf(" (insertlength %u)",insertlength));
11044
11045 if (insertlength < best_insertlength) {
11046 besti5 = i;
11047 best_insertlength = insertlength;
11048 best_nmismatches = end_alts_nmismatches[i];
11049 debug9(printf("*"));
11050 } else if (insertlength == best_insertlength &&
11051 (nmismatches = end_alts_nmismatches[i]) < best_nmismatches) {
11052 besti5 = i;
11053 best_nmismatches = nmismatches;
11054 debug9(printf("*"));
11055 } else if (nmismatches == best_nmismatches) {
11056 debug9(printf("tie"));
11057 }
11058 }
11059 }
11060
11061 if (besti5 >= 0) {
11062 debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11063 *alts_resolve_5 = besti5;
11064 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11065 hit5->genomicend = end_alts_coords[besti5] + end_amb_length_5;
11066 }
11067 debug9(printf("\n"));
11068
11069 } else if (substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11070 debug9(printf("Resolve plus case 3: Got alts at 3':"));
11071 start_alts_coords = Substring_alts_coords(substring3);
11072 start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11073 start_amb_length_3 = start_amb_length(hit3);
11074
11075 best_insertlength = (Chrpos_T) -1;
11076 best_nmismatches = querylength3;
11077 for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11078 genomicstart = start_alts_coords[j] - start_amb_length_3;
11079 debug9(printf(" %u",(Chrpos_T) (genomicstart - hit3->chroffset)));
11080 if (hit5->genomicend < genomicstart /*allow overlap*/+ querylength5) {
11081 /* Look for valid insertlength */
11082 insertlength = genomicstart - hit5->genomicend + querylength5 + querylength3;
11083 debug9(printf(" (insertlength %u)",insertlength));
11084
11085 if (insertlength < best_insertlength) {
11086 besti3 = j;
11087 best_insertlength = insertlength;
11088 best_nmismatches = start_alts_nmismatches[j];
11089 debug9(printf("*"));
11090 } else if (insertlength == best_insertlength &&
11091 (nmismatches = start_alts_nmismatches[j]) < best_nmismatches) {
11092 besti3 = j;
11093 best_nmismatches = nmismatches;
11094 debug9(printf("*"));
11095 } else if (nmismatches == best_nmismatches) {
11096 debug9(printf("tie"));
11097 }
11098 }
11099 }
11100
11101 if (besti3 >= 0) {
11102 debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11103 *alts_resolve_3 = besti3;
11104 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11105 hit3->genomicstart = start_alts_coords[besti3] - start_amb_length_3;
11106 }
11107 debug9(printf("\n"));
11108 }
11109
11110 return;
11111 }
11112
11113
11114 static void
resolve_inside_alts_splice_minus(int * alts_resolve_5,int * alts_resolve_3,int * alts_status_inside,T hit5,T hit3,int querylength5,int querylength3)11115 resolve_inside_alts_splice_minus (int *alts_resolve_5, int *alts_resolve_3,
11116 int *alts_status_inside, T hit5, T hit3, int querylength5, int querylength3) {
11117 Chrpos_T best_insertlength, insertlength;
11118 Univcoord_T genomicstart, genomicend;
11119 int besti5 = -1, besti3 = -1, i, j;
11120 int best_nmismatches, nmismatches;
11121
11122 Substring_T substring5, substring3;
11123 Univcoord_T *end_alts_coords, *start_alts_coords;
11124 int *end_alts_nmismatches, *start_alts_nmismatches;
11125 int end_amb_length_5, start_amb_length_3;
11126
11127
11128 debug9(printf("resolve minus: hit5 %p (%s) and hit3 %p (%s)\n",
11129 hit5,Method_string(hit5->method),hit3,Method_string(hit3->method)));
11130
11131 substring5 = (Substring_T) List_head(hit5->substrings_Nto1); /* the substring for concordance */
11132 debug9(printf("Testing substring5 %p %d..%d alts_p %d\n",
11133 substring5,Stage3end_substrings_querystart(hit5),Stage3end_substrings_queryend(hit5),
11134 Substring_has_alts_p(substring5)));
11135
11136 substring3 = (Substring_T) List_head(hit3->substrings_1toN); /* the substring for concordance */
11137 debug9(printf("Testing substring3 %p %d..%d alts_p %d\n",
11138 substring3,Stage3end_substrings_querystart(hit3),Stage3end_substrings_queryend(hit3),
11139 Substring_has_alts_p(substring3)));
11140
11141 if (substring5 != NULL && Substring_has_alts_p(substring5) == true &&
11142 substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11143 debug9(printf("Resolve minus case 1: Got alts at 5' and alts at 3':"));
11144 end_alts_coords = Substring_alts_coords(substring5);
11145 end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11146 start_alts_coords = Substring_alts_coords(substring3);
11147 start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11148 end_amb_length_5 = end_amb_length(hit5);
11149 start_amb_length_3 = start_amb_length(hit3);
11150
11151 best_insertlength = (Chrpos_T) -1;
11152 best_nmismatches = querylength5 + querylength3;
11153 for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11154 genomicend = end_alts_coords[i] - end_amb_length_5;
11155 for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11156 genomicstart = start_alts_coords[j] + start_amb_length_3;
11157 debug9(printf(" %u,%u",(Chrpos_T) (genomicend - hit5->chroffset),(Chrpos_T) (genomicstart - hit3->chroffset)));
11158 if (genomicstart < genomicend) {
11159 /* Look for valid insertlength */
11160 insertlength = genomicend - genomicstart + querylength5 + querylength3;
11161 debug9(printf(" (insertlength %u)",insertlength));
11162
11163 if (insertlength < best_insertlength) {
11164 besti5 = i;
11165 besti3 = j;
11166 best_insertlength = insertlength;
11167 best_nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j];
11168 debug9(printf("*"));
11169 } else if (insertlength == best_insertlength &&
11170 (nmismatches = end_alts_nmismatches[i] + start_alts_nmismatches[j]) < best_nmismatches) {
11171 besti5 = i;
11172 besti3 = j;
11173 best_nmismatches = nmismatches;
11174 debug9(printf("*"));
11175 } else if (nmismatches == best_nmismatches) {
11176 debug9(printf("tie"));
11177 }
11178 }
11179 }
11180 }
11181
11182 if (besti5 >= 0 && besti3 >= 0) {
11183 debug9(printf("\nBEST HAS INSERTLENGTH %u AND NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11184 *alts_resolve_5 = besti5;
11185 *alts_resolve_3 = besti3;
11186 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11187 hit5->genomicend = end_alts_coords[besti5] - end_amb_length_5;
11188 hit3->genomicstart = start_alts_coords[besti3] + start_amb_length_3;
11189 }
11190 debug9(printf("\n"));
11191
11192 } else if (substring5 != NULL && Substring_has_alts_p(substring5) == true) {
11193 debug9(printf("Resolve minus case 2: Got alts at 5':"));
11194 end_alts_coords = Substring_alts_coords(substring5);
11195 end_alts_nmismatches = Substring_alts_nmismatches(substring5);
11196 end_amb_length_5 = end_amb_length(hit5);
11197
11198 best_insertlength = (Chrpos_T) -1;
11199 best_nmismatches = querylength5;
11200 for (i = 0; i < Substring_alts_ncoords(substring5); i++) {
11201 genomicend = end_alts_coords[i] - end_amb_length_5;
11202 debug9(printf(" %u",(Chrpos_T) (genomicend - hit5->chroffset)));
11203 debug9(printf(" (%u <? %u + %d)",hit3->genomicstart,genomicend,querylength3));
11204 if (hit3->genomicstart < genomicend /*allow overlap*/+ querylength3) {
11205 /* Look for valid insertlength */
11206 insertlength = genomicend - hit3->genomicstart + querylength5 + querylength3;
11207 debug9(printf(" (insertlength %u)",insertlength));
11208
11209 if (insertlength < best_insertlength) {
11210 besti5 = i;
11211 best_insertlength = insertlength;
11212 best_nmismatches = end_alts_nmismatches[i];
11213 debug9(printf("*"));
11214 } else if (insertlength == best_insertlength &&
11215 (nmismatches = end_alts_nmismatches[i]) < best_nmismatches) {
11216 besti5 = i;
11217 best_nmismatches = nmismatches;
11218 debug9(printf("*"));
11219 } else if (nmismatches == best_nmismatches) {
11220 debug9(printf("tie"));
11221 }
11222 }
11223 }
11224
11225 if (besti5 >= 0) {
11226 debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11227 *alts_resolve_5 = besti5;
11228 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11229 hit5->genomicend = end_alts_coords[besti5] - end_amb_length_5;
11230 }
11231 debug9(printf("\n"));
11232
11233 } else if (substring3 != NULL && Substring_has_alts_p(substring3) == true) {
11234 debug9(printf("Resolve minus case 3: Got alts at 3':"));
11235 start_alts_coords = Substring_alts_coords(substring3);
11236 start_alts_nmismatches = Substring_alts_nmismatches(substring3);
11237 start_amb_length_3 = start_amb_length(hit3);
11238
11239 best_insertlength = (Chrpos_T) -1;
11240 best_nmismatches = querylength3;
11241 for (j = 0; j < Substring_alts_ncoords(substring3); j++) {
11242 genomicstart = start_alts_coords[j] + start_amb_length_3;
11243 debug9(printf(" %u",(Chrpos_T) (genomicstart - hit3->chroffset)));
11244 if (genomicstart < hit5->genomicend /*allow overlap*/+ querylength5) {
11245 /* Look for valid insertlength */
11246 insertlength = hit5->genomicend - genomicstart + querylength5 + querylength3;
11247 debug9(printf(" (insertlength %u)",insertlength));
11248
11249 if (insertlength < best_insertlength) {
11250 besti3 = j;
11251 best_insertlength = insertlength;
11252 best_nmismatches = start_alts_nmismatches[j];
11253 debug9(printf("*"));
11254 } else if (insertlength == best_insertlength &&
11255 (nmismatches = start_alts_nmismatches[j]) < best_nmismatches) {
11256 besti3 = j;
11257 best_nmismatches = nmismatches;
11258 debug9(printf("*"));
11259 } else if (nmismatches == best_nmismatches) {
11260 debug9(printf("tie"));
11261 }
11262 }
11263 }
11264
11265 if (besti3 >= 0) {
11266 debug9(printf("\nBEST HAS INSERTLENGTH %u WITH NMISMATCHES %d\n",best_insertlength,best_nmismatches));
11267 *alts_resolve_3 = besti3;
11268 *alts_status_inside = ALTS_RESOLVED_BYLENGTH;
11269 hit3->genomicstart = start_alts_coords[besti3] + start_amb_length_3;
11270 }
11271 debug9(printf("\n"));
11272 }
11273
11274 return;
11275 }
11276
11277
11278
11279 static void
alias_circular(T hit)11280 alias_circular (T hit) {
11281 Chrpos_T chrlength = hit->chrlength;
11282 List_T p;
11283 Substring_T substring;
11284
11285 assert(hit->circularalias == -1);
11286 for (p = hit->substrings_1toN; p != NULL; p = List_next(p)) {
11287 substring = (Substring_T) List_head(p);
11288 Substring_alias_circular(substring);
11289 }
11290
11291 /* Doesn't fix hitpair->low and hitpair->high */
11292 hit->genomicstart += chrlength;
11293 hit->genomicend += chrlength;
11294 hit->low += chrlength;
11295 hit->high += chrlength;
11296
11297 hit->circularalias = +1;
11298
11299 return;
11300 }
11301
11302
11303 /* Previously allowed for private5p or private3p to be true. But now
11304 always copying (because concordance procedure can delete hits), and
11305 so private5p and private3p are essentially true. */
11306 Stage3pair_T
Stage3pair_new(T hit5_orig,T hit3_orig,int genestrand,int sensedir,Pairtype_T pairtype,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,Listpool_T listpool,bool expect_concordant_p,bool transcriptome_guided_p)11307 Stage3pair_new (T hit5_orig, T hit3_orig, int genestrand, int sensedir, Pairtype_T pairtype,
11308 int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
11309 Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
11310 Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
11311 Listpool_T listpool, bool expect_concordant_p, bool transcriptome_guided_p) {
11312 Stage3pair_T new;
11313 Stage3end_T hit5, hit3;
11314 Substring_T substring1, substringN;
11315 int alts_resolve_5, alts_resolve_3;
11316
11317 /* int found_score = 0; */
11318 bool overreach5p, overreach3p;
11319 Chrpos_T pairmax;
11320
11321 int querylength5 = hit5_orig->querylength;
11322 int querylength3 = hit3_orig->querylength;
11323
11324 char *remap_sequence;
11325 int remap_seqlength;
11326 List_T transcripts;
11327
11328
11329 debug0(printf("\nStage3pair_new called with pairtype %s and chrnum %d, %d (effective %d, %d), expect_concordant_p %d\n",
11330 Pairtype_string(pairtype),hit5_orig->chrnum,hit3_orig->chrnum,
11331 hit5_orig->effective_chrnum,hit3_orig->effective_chrnum,expect_concordant_p));
11332
11333 /* Always make a copy, because concordance procedure might delete the hit */
11334 hit5 = Stage3end_copy(hit5_orig,listpool);
11335 hit3 = Stage3end_copy(hit3_orig,listpool);
11336
11337 new = (Stage3pair_T) MALLOC_OUT(sizeof(*new));
11338
11339 if (pairtype == PAIRED_UNSPECIFIED || pairtype == UNSPECIFIED) {
11340 /* Can get here from running GMAP improvement on a paired result */
11341 pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/NULL);
11342 debug10(printf(" Changing pairtype to %s\n",Pairtype_string(pairtype)));
11343 if (pairtype == CONCORDANT) {
11344 expect_concordant_p = true;
11345 }
11346 }
11347 new->pairtype = pairtype;
11348 new->genestrand = genestrand;
11349 new->sensedir = sensedir;
11350
11351 alts_resolve_5 = -1;
11352 alts_resolve_3 = -1;
11353 new->alts_status_inside = ALTS_NOT_AMBIGUOUS;
11354
11355
11356 #if 0
11357 new->mapq_loglik = hit5->mapq_loglik + hit3->mapq_loglik;
11358 new->mapq_score = 0;
11359 new->absmq_score = 0;
11360 #endif
11361
11362 if (hit5->plusp == true && hit3->plusp == false) {
11363 debug10(printf("plus/minus\n"));
11364 new->dir = 0;
11365
11366 /* Have 5-start..end and 3-end..start */
11367 /* or 3-end..start and 5-start..end */
11368
11369 new->pair_relationship = 0;
11370 if (hit5->genomicend < hit3->genomicend) {
11371 new->insertlength = (hit3->genomicend - hit5->genomicend) + querylength5 + querylength3;
11372 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11373 } else if (hit3->genomicstart < hit5->genomicstart) {
11374 new->insertlength = (hit5->genomicstart - hit3->genomicstart) + querylength5 + querylength3;
11375 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11376 } else {
11377 new->insertlength = pair_insert_length_unpaired(hit5,hit3); /* was 0 */
11378 new->insertlength_expected_sign = false;
11379 }
11380
11381 } else if (hit5->plusp == false && hit3->plusp == true) {
11382 debug10(printf("minus/plus\n"));
11383 new->dir = 0;
11384
11385 /* Have 5-end..start and 3-start..end */
11386 /* or 3-start..end and 5-end..start */
11387
11388 new->pair_relationship = 0;
11389 if (hit5->genomicstart < hit3->genomicstart) {
11390 new->insertlength = (hit3->genomicstart - hit5->genomicstart) + querylength5 + querylength3;
11391 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11392 } else if (hit3->genomicend < hit5->genomicend) {
11393 new->insertlength = (hit5->genomicend - hit3->genomicend) + querylength5 + querylength3;
11394 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11395 } else {
11396 new->insertlength = pair_insert_length_unpaired(hit5,hit3); /* was 0 */
11397 new->insertlength_expected_sign = false;
11398 }
11399
11400 } else if (hit5->plusp == true) {
11401 /* Concordant directions on same chromosome (plus) */
11402 debug10(printf("*Concordant on plus strand\n"));
11403 new->dir = +1;
11404
11405 if (expect_concordant_p == true) {
11406 overreach5p = overreach3p = false;
11407 if (hit5->hittype == SPLICE) {
11408
11409 substringN = (Substring_T) List_head(hit5->substrings_Nto1);
11410 if (Substring_alignstart_trim(substringN) > hit3->genomicend) {
11411 substring1 = (Substring_T) List_head(hit5->substrings_1toN);
11412 if (Substring_alignend_trim(substring1) < hit3->genomicstart) {
11413 overreach5p = true;
11414 }
11415 }
11416 }
11417 if (hit3->hittype == SPLICE) {
11418 substring1 = (Substring_T) List_head(hit3->substrings_1toN);
11419 if (Substring_alignend_trim(substring1) < hit5->genomicstart) {
11420 substringN = (Substring_T) List_head(hit3->substrings_Nto1);
11421 if (Substring_alignstart_trim(substringN) > hit5->genomicend) {
11422 overreach3p = true;
11423 }
11424 }
11425 }
11426
11427 if (overreach5p == true || overreach3p == true) {
11428 /* Either overreach */
11429 debug0(printf(" Returning NULL because of dual overreach\n"));
11430 Stage3end_free(&hit5); /* This was the copy */
11431 Stage3end_free(&hit3); /* This was the copy */
11432 FREE_OUT(new);
11433 return (Stage3pair_T) NULL;
11434
11435 #if 0
11436 } else if (overreach5p == true) {
11437 /* Overreach of hit5 */
11438 debug9(printf("Overreach of hit5 of type SPLICE. Removing substring2\n"));
11439 if (hit5->sensedir == SENSE_FORWARD) {
11440 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit5->substring1),
11441 /*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
11442 /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11443 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11444 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11445 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11446 /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
11447 /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11448 } else if (hit5->sensedir == SENSE_ANTI) {
11449 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11450 /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit5->substring1),/*donor*/NULL,
11451 /*acceptor*/hit5->substring1,/*distance*/0U,
11452 /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11453 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11454 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11455 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11456 /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
11457 /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11458 } else {
11459 abort();
11460 }
11461 Stage3end_free(&hit5); /* This was the copy */
11462 hit5 = copy;
11463
11464 } else if (overreach3p == true) {
11465 /* Overreach of hit3 */
11466 debug9(printf("Overreach of hit3 of type SPLICE. Removing substring1\n"));
11467 if (hit3->sensedir == SENSE_FORWARD) {
11468 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11469 /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit3->substring2),/*donor*/NULL,
11470 /*acceptor*/hit3->substring2,/*distance*/0U,
11471 /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11472 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11473 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11474 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11475 /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
11476 /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11477 } else if (hit3->sensedir == SENSE_ANTI) {
11478 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit3->substring2),
11479 /*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
11480 /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11481 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11482 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11483 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11484 /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
11485 /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11486 } else {
11487 abort();
11488 }
11489 Stage3end_free(&hit3); /* This was the copy */
11490 hit3 = copy;
11491 #endif
11492 }
11493
11494 /* Try to resolve ambiguity on inside of concordant ends */
11495 debug9(printf("Calling resolve_inside_alts_splice_plus\n"));
11496 resolve_inside_alts_splice_plus(&alts_resolve_5,&alts_resolve_3,
11497 &new->alts_status_inside,hit5,hit3,querylength5,querylength3);
11498 if (alts_resolve_5 >= 0) {
11499 resolve_ambiguity_5(hit5,mismatch_positions_alloc_5,query5_compress_fwd,alts_resolve_5);
11500 }
11501 if (alts_resolve_3 >= 0) {
11502 resolve_ambiguity_3(hit3,mismatch_positions_alloc_3,query3_compress_fwd,alts_resolve_3);
11503 }
11504
11505 debug9(printf("For pair %p (%p and %p), set alts_resolve_5 to be %d and alts_resolve_3 to be %d\n",
11506 new,hit5,hit3,alts_resolve_5,alts_resolve_3));
11507 }
11508
11509 /* Have 5-start..end and 3-start..end */
11510 if (hit5->genomicend < hit3->genomicstart) {
11511 /* No overlap */
11512 new->pair_relationship = +1;
11513 new->insertlength = (hit3->genomicstart - hit5->genomicend) + querylength5 + querylength3;
11514 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11515 debug10(printf("plus, no overlap: insert length %d = start3 %u - end5 %u + %d + %d\n",
11516 new->insertlength,hit3->genomicstart - hit3->chroffset,
11517 hit5->genomicend - hit5->chroffset,querylength5,querylength3));
11518 #if 0
11519 } else if (hit5->genomicend > hit3->genomicend + SUBSUMPTION_SLOP) {
11520 /* hit5 subsumes hit3 */
11521 debug10(printf("plus, subsumption %u > %u\n",
11522 hit5->genomicend - hit5->chroffset,hit3->genomicend - hit3->chroffset));
11523 new->pair_relationship = 0;
11524 new->insertlength = 0;
11525 new->insertlength_expected_sign = false;
11526 #endif
11527 } else {
11528 new->insertlength = pair_insert_length(&new->pair_relationship,hit5,hit3);
11529 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11530 }
11531
11532
11533 } else {
11534 /* Concordant directions on same chromosome (minus) */
11535 debug10(printf("*Concordant on minus strand\n"));
11536 new->dir = -1;
11537
11538 if (expect_concordant_p == true) {
11539 overreach5p = overreach3p = false;
11540 if (hit5->hittype == SPLICE) {
11541 debug10(printf("Have splice on 5' end\n"));
11542 substringN = (Substring_T) List_head(hit5->substrings_Nto1);
11543 if (Substring_alignstart_trim(substringN) < hit3->genomicend) {
11544 substring1 = (Substring_T) List_head(hit5->substrings_1toN);
11545 if (Substring_alignend_trim(substring1) > hit3->genomicstart) {
11546 overreach5p = true;
11547 }
11548 }
11549 }
11550 if (hit3->hittype == SPLICE) {
11551 debug10(printf("Have splice on 3' end\n"));
11552 substring1 = (Substring_T) List_head(hit3->substrings_1toN);
11553 if (Substring_alignend_trim(substring1) > hit5->genomicstart) {
11554 substringN = (Substring_T) List_head(hit3->substrings_Nto1);
11555 if (Substring_alignstart_trim(substringN) < hit5->genomicend) {
11556 overreach3p = true;
11557 }
11558 }
11559 }
11560
11561 if (overreach5p == true || overreach3p == true) {
11562 /* Either overreach */
11563 debug0(printf(" Returning NULL because of dual overreach\n"));
11564 Stage3end_free(&hit5); /* This was the copy */
11565 Stage3end_free(&hit3); /* This was the copy */
11566 FREE_OUT(new);
11567 return (Stage3pair_T) NULL;
11568
11569 #if 0
11570 } else if (overreach5p == true) {
11571 /* Overreach of hit5 */
11572 debug9(printf("Overreach of hit5 of type SPLICE. Removing substring2\n"));
11573 if (hit5->sensedir == SENSE_FORWARD) {
11574 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit5->substring1),
11575 /*nmismatches_acceptor*/0,/*donor*/hit5->substring1,/*acceptor*/NULL,/*distance*/0U,
11576 /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11577 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11578 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11579 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11580 /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/true,
11581 /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11582 } else if (hit5->sensedir == SENSE_ANTI) {
11583 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11584 /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit5->substring1),/*donor*/NULL,
11585 /*acceptor*/hit5->substring1,/*distance*/0U,
11586 /*shortdistancep*/true,localsplicing_penalty,hit5->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11587 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11588 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11589 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11590 /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/true,
11591 /*sensedir*/hit5->sensedir,listpool,hit5->method,hit5->level);
11592 } else {
11593 abort();
11594 }
11595 Stage3end_free(&hit5); /* This was the copy */
11596 hit5 = copy;
11597
11598 } else if (overreach3p == true) {
11599 /* Overreach of hit3 */
11600 debug9(printf("Overreach of hit3 of type SPLICE. Removing substring1\n"));
11601 if (hit3->sensedir == SENSE_FORWARD) {
11602 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/0,
11603 /*nmismatches_acceptor*/Substring_nmismatches_bothdiff(hit3->substring2),/*donor*/NULL,
11604 /*acceptor*/hit3->substring2,/*distance*/0U,
11605 /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11606 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11607 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11608 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11609 /*copy_donor_p*/false,/*copy_acceptor_p*/true,/*first_read_p*/false,
11610 /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11611 } else if (hit3->sensedir == SENSE_ANTI) {
11612 copy = Stage3end_new_splice(&found_score,/*nmismatches_donor*/Substring_nmismatches_bothdiff(hit3->substring2),
11613 /*nmismatches_acceptor*/0,/*donor*/hit3->substring2,/*acceptor*/NULL,/*distance*/0U,
11614 /*shortdistancep*/true,localsplicing_penalty,hit3->querylength,/*amb_length*/0,/*amb_prob*/0.0,
11615 /*alts_coords_donor*/NULL,/*alts_coords_acceptor*/NULL,
11616 /*alts_nmismatches_donor*/NULL,/*alts_nmismatches_acceptor*/NULL,
11617 /*alts_probs_donor*/NULL,/*alts_probs_acceptor*/NULL,
11618 /*copy_donor_p*/true,/*copy_acceptor_p*/false,/*first_read_p*/false,
11619 /*sensedir*/hit3->sensedir,listpool,hit3->method,hit3->level);
11620 } else {
11621 abort();
11622 }
11623 Stage3end_free(&hit3); /* This was the copy */
11624 hit3 = copy;
11625 #endif
11626 }
11627
11628 /* Try to resolve ambiguity on inside of concordant ends */
11629 debug9(printf("Calling resolve_inside_alts_splice_minus\n"));
11630 resolve_inside_alts_splice_minus(&alts_resolve_5,&alts_resolve_3,
11631 &new->alts_status_inside,hit5,hit3,querylength5,querylength3);
11632 if (alts_resolve_5 >= 0) {
11633 resolve_ambiguity_5(hit5,mismatch_positions_alloc_5,query5_compress_rev,alts_resolve_5);
11634 }
11635 if (alts_resolve_3 >= 0) {
11636 resolve_ambiguity_3(hit3,mismatch_positions_alloc_3,query3_compress_rev,alts_resolve_3);
11637 }
11638
11639 debug9(printf("For pair %p (%p and %p), set alts_resolve_5 to be %d and alts_resolve_3 to be %d\n",
11640 new,hit5,hit3,alts_resolve_5,alts_resolve_3));
11641 }
11642
11643 /* Have 3-end..start and 5-end..start */
11644 if (hit3->genomicstart < hit5->genomicend) {
11645 /* No overlap */
11646 new->pair_relationship = -1;
11647 new->insertlength = (hit5->genomicend - hit3->genomicstart) + querylength5 + querylength3;
11648 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11649 debug10(printf("minus, no overlap: insert length %d = end5 %u - start3 %u + %d + %d\n",
11650 new->insertlength,hit5->genomicend - hit5->chroffset,
11651 hit3->genomicstart - hit3->chroffset,querylength5,querylength3));
11652 #if 0
11653 } else if (hit3->genomicstart > hit5->genomicstart + SUBSUMPTION_SLOP) {
11654 /* hit3 subsumes hit5 */
11655 debug10(printf("minus, subsumption %u > %u\n",
11656 hit3->genomicstart - hit3->chroffset,hit5->genomicstart - hit5->chroffset));
11657 new->pair_relationship = 0;
11658 new->insertlength = 0;
11659 new->insertlength_expected_sign = false;
11660 #endif
11661 } else {
11662 new->insertlength = pair_insert_length(&new->pair_relationship,hit5,hit3);
11663 new->insertlength_expected_sign = insertlength_expected(new->insertlength);
11664 }
11665 }
11666
11667 debug10(printf("\nGot initial insertlength of %d\n",new->insertlength));
11668
11669 new->hit5 = hit5;
11670 new->hit3 = hit3;
11671
11672 /* Was new->insertlength <= 0, but this eliminates legitimate overlaps */
11673 /* Was new->insertlength < -pairmax, but this allows overreach */
11674 if (new->insertlength <= 0) { /* Not possible, since insertlength is unsigned */
11675 /* Not concordant */
11676 #ifdef USE_BINGO
11677 new->absdifflength_bingo_p = false;
11678 #endif
11679 #ifdef USE_ABSDIFFLENGTH
11680 new->absdifflength = (Chrpos_T) -1;
11681 #endif
11682
11683 if (expect_concordant_p == true) {
11684 debug0(printf(" Returning NULL, because insertlength %u, so not concordant\n",new->insertlength));
11685 Stage3end_free(&hit5); /* This was the copy */
11686 Stage3end_free(&hit3); /* This was the copy */
11687 FREE_OUT(new);
11688 return (Stage3pair_T) NULL;
11689 }
11690
11691 } else {
11692 if (transcriptome_guided_p == true) {
11693 pairmax = (Chrpos_T) -1;
11694 } else if (circularp[hit5->effective_chrnum] == true) {
11695 pairmax = pairmax_circular;
11696 } else {
11697 pairmax = pairmax_linear;
11698 }
11699 if (new->insertlength > pairmax && expect_concordant_p == true) {
11700 debug0(printf(" Returning NULL because insertlength %u > pairmax %d\n",new->insertlength,pairmax));
11701 Stage3end_free(&hit5); /* This was the copy */
11702 Stage3end_free(&hit3); /* This was the copy */
11703 FREE_OUT(new);
11704 return (Stage3pair_T) NULL;
11705
11706 } else {
11707 #ifdef USE_ABSDIFFLENGTH
11708 if (new->insertlength < expected_pairlength) {
11709 new->absdifflength = expected_pairlength - new->insertlength;
11710 } else {
11711 new->absdifflength = new->insertlength - expected_pairlength;
11712 }
11713 #endif
11714 #ifdef USE_BINGO
11715 if (new->absdifflength <= pairlength_deviation) {
11716 new->absdifflength_bingo_p = true;
11717 } else {
11718 new->absdifflength_bingo_p = false;
11719 }
11720 #endif
11721 }
11722 }
11723
11724 if (SENSE_CONSISTENT_P(hit5->sensedir_for_concordance,hit3->sensedir_for_concordance)) {
11725 debug0(printf("senses %d and %d are consistent\n",hit5->sensedir_for_concordance,hit3->sensedir_for_concordance));
11726 new->sense_consistent_p = true;
11727
11728 } else if (expect_concordant_p == true) {
11729 debug0(printf(" Returning NULL, because senses are not consistent\n"));
11730 Stage3end_free(&hit5); /* This was the copy */
11731 Stage3end_free(&hit3); /* This was the copy */
11732 FREE_OUT(new);
11733 return (Stage3pair_T) NULL;
11734
11735 } else {
11736 debug0(printf("senses are inconsistent, but allowable\n"));
11737 new->sense_consistent_p = false;
11738 }
11739
11740 /* No longer add scores from hit5 and hit3 */
11741
11742 /* new->overlap_known_gene_p = false; -- initialized later when resolving multimappers */
11743 /* new->tally = -1L; */
11744
11745 new->low = (hit5->low < hit3->low) ? hit5->low : hit3->low;
11746 new->high = (hit5->high > hit3->high) ? hit5->high : hit3->high;
11747 debug0(printf("hit5 %u..%u and hit3 %u..%u => %u..%u\n",
11748 hit5->low,hit5->high,hit3->low,hit3->high,new->low,new->high));
11749
11750 #if 0
11751 if (new->low > new->high) {
11752 fprintf(stderr,"new->low %u > new->high %u, hit5->chrnum %d\n",
11753 new->low - new->chroffset,new->high - new->chroffset,hit5->chrnum);
11754 abort();
11755 }
11756 #endif
11757
11758 if (hit5->chrnum == 0 || hit3->chrnum == 0) {
11759 new->outerlength = querylength5 + querylength3;
11760 } else {
11761 assert(new->low < new->high);
11762 new->outerlength = new->high - new->low;
11763 }
11764
11765 if (expect_concordant_p == true) {
11766 hit5_orig->paired_usedp = hit5->paired_usedp = true;
11767 hit3_orig->paired_usedp = hit3->paired_usedp = true;
11768 }
11769
11770 new->nsplices = hit5->nsplices + hit3->nsplices;
11771
11772 debug0(printf("Created new pair %p from %p and %p (nmatches_to_trims %d+%d)\n",
11773 new,hit5,hit3,hit5->refalt_nmatches_to_trims,hit3->refalt_nmatches_to_trims));
11774 debug0(printf(" methods %s and %s\n",Method_string(hit5->method),Method_string(hit3->method)));
11775 debug0(printf(" sensedirs %d and %d\n",hit5->sensedir,hit3->sensedir));
11776 debug0(printf(" chrpos_1toN %u..%u and %u..%u\n",
11777 hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,
11778 hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset));
11779 debug0(printf(" chrpos_LtoH %u..%u and %u..%u\n",
11780 hit5->low - hit5->chroffset,hit5->high - hit5->chroffset,
11781 hit3->low - hit3->chroffset,hit3->high - hit3->chroffset));
11782 debug0(printf(" outerlength %u = %u - %u\n",new->outerlength,new->high,new->low));
11783
11784 if (hit5->circularpos < 0 && hit3->circularpos < 0) {
11785 new->circularp = false;
11786 } else {
11787 new->circularp = true;
11788 }
11789
11790 /* Fixing insertlength for circular pairs */
11791 if (new->insertlength > hit5->chrlength) {
11792 new->insertlength -= hit5->chrlength;
11793 }
11794
11795 if (hit5->circularalias == +1) {
11796 debug0(printf("Unaliasing 5' end\n"));
11797 unalias_circular(hit5);
11798 }
11799
11800 if (hit3->circularalias == +1) {
11801 debug0(printf("Unaliasing 3' end\n"));
11802 unalias_circular(hit3);
11803 }
11804
11805 if (remap_transcriptome_p == false) {
11806 /* Do not remap */
11807
11808 } else if (hit5->transcripts != NULL && hit3->transcripts != NULL) {
11809 /* No need to remap */
11810
11811 } else if (hit5->transcripts != NULL && hit3->transcripts == NULL) {
11812 debug0(printf("Remapping 3' end to transcriptome to match 5' end at %d:%u..%u\n",
11813 hit5->chrnum,hit5->low - hit5->chroffset,hit5->high - hit5->chroffset));
11814 remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit3,genomecomp);
11815 debug0(printf("%s\n",remap_sequence));
11816
11817 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,hit3->chrnum,
11818 /*lowbound*/hit3->low - hit3->chroffset,
11819 /*highbound*/hit3->high - hit3->chroffset,
11820 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
11821 hit3->transcripts = transcripts;
11822 }
11823 FREE(remap_sequence);
11824
11825 } else if (hit5->transcripts == NULL && hit3->transcripts != NULL) {
11826 debug0(printf("Remapping 5' end to transcriptome to match 3' end at %d:%u..%u\n",
11827 hit3->chrnum,hit3->low - hit3->chroffset,hit3->high - hit3->chroffset));
11828
11829 remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit5,genomecomp);
11830 debug0(printf("%s\n",remap_sequence));
11831 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,hit5->chrnum,
11832 /*lowbound*/hit5->low - hit5->chroffset,
11833 /*highbound*/hit5->high - hit5->chroffset,
11834 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
11835 hit5->transcripts = transcripts;
11836 }
11837 FREE(remap_sequence);
11838 }
11839
11840 #if 0
11841 /* Need this in addition to Stage3end_filter_concordant_tr, to
11842 eliminate any inconsistent transcripts */
11843 Transcript_concordance(&new->transcripts5,&new->transcripts3,hit5->transcripts,hit3->transcripts);
11844 debug0(printf("%d transcripts5, %d transcripts3\n",List_length(new->transcripts5),List_length(new->transcripts3)));
11845 #endif
11846
11847 pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/new);
11848
11849 /* assert((int) new->insertlength >= 0); */
11850 return new;
11851 }
11852
11853
11854 /* Used for eliminating exact duplicates. Also sorts secondarily by hittype. */
11855 static int
hitpair_sort_cmp(const void * a,const void * b)11856 hitpair_sort_cmp (const void *a, const void *b) {
11857 Stage3pair_T x = * (Stage3pair_T *) a;
11858 Stage3pair_T y = * (Stage3pair_T *) b;
11859
11860 Univcoord_T x_hit5_high, x_hit5_low, y_hit5_high, y_hit5_low;
11861 Univcoord_T x_hit3_high, x_hit3_low, y_hit3_high, y_hit3_low;
11862 Univcoord_T x_low, x_high, y_low, y_high;
11863
11864 debug8(printf(" Comparing (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d-%d, score %f+%f\n",
11865 Pairtype_string(x->pairtype),Method_string(x->hit5->method),
11866 Method_string(x->hit3->method),x,
11867 x->hit5->low - x->hit5->chroffset,x->hit5->high - x->hit5->chroffset,
11868 x->hit3->low - x->hit3->chroffset,x->hit3->high - x->hit3->chroffset,
11869 x->dir,x->hit5->circularalias,x->hit3->circularalias,
11870 x->hit5->refalt_nmatches_plus_spliced_trims,x->hit3->refalt_nmatches_plus_spliced_trims,
11871 x->hit5->refalt_nmatches_to_trims,x->hit3->refalt_nmatches_to_trims,
11872 amb_length(x->hit5),amb_length(x->hit3),x->hit5->sensedir,x->hit3->sensedir,
11873 x->hit5->splice_score,x->hit3->splice_score));
11874
11875 debug8(printf(" with (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d-%d, score %f+%f\n",
11876 Pairtype_string(y->pairtype),Method_string(y->hit5->method),
11877 Method_string(y->hit3->method),y,
11878 y->hit5->low - y->hit5->chroffset,y->hit5->high - y->hit5->chroffset,
11879 y->hit3->low - y->hit3->chroffset,y->hit3->high - y->hit3->chroffset,
11880 y->dir,y->hit5->circularalias,y->hit3->circularalias,
11881 y->hit5->refalt_nmatches_plus_spliced_trims,y->hit3->refalt_nmatches_plus_spliced_trims,
11882 y->hit5->refalt_nmatches_to_trims,y->hit3->refalt_nmatches_to_trims,
11883 amb_length(y->hit5),amb_length(y->hit3),y->hit5->sensedir,y->hit3->sensedir,
11884 y->hit5->splice_score,y->hit3->splice_score));
11885
11886 x_hit5_low = normalize_coord(x->hit5->low,x->hit5->circularalias,x->hit5->chrlength);
11887 x_hit5_high = normalize_coord(x->hit5->high,x->hit5->circularalias,x->hit5->chrlength);
11888
11889 x_hit3_low = normalize_coord(x->hit3->low,x->hit3->circularalias,x->hit3->chrlength);
11890 x_hit3_high = normalize_coord(x->hit3->high,x->hit3->circularalias,x->hit3->chrlength);
11891
11892 x_low = (x_hit5_low < x_hit3_low) ? x_hit5_low : x_hit3_low;
11893 x_high = (x_hit5_high > x_hit3_high) ? x_hit5_high : x_hit3_high;
11894
11895
11896 y_hit5_low = normalize_coord(y->hit5->low,y->hit5->circularalias,y->hit5->chrlength);
11897 y_hit5_high = normalize_coord(y->hit5->high,y->hit5->circularalias,y->hit5->chrlength);
11898
11899 y_hit3_low = normalize_coord(y->hit3->low,y->hit3->circularalias,y->hit3->chrlength);
11900 y_hit3_high = normalize_coord(y->hit3->high,y->hit3->circularalias,y->hit3->chrlength);
11901
11902 y_low = (y_hit5_low < y_hit3_low) ? y_hit5_low : y_hit3_low;
11903 y_high = (y_hit5_high > y_hit3_high) ? y_hit5_high : y_hit3_high;
11904
11905
11906 if (x->dir != 0 && y->dir == 0) {
11907 return -1;
11908 } else if (x->dir == 0 && y->dir != 0) {
11909 return +1;
11910 } else if (x->dir > 0 && y->dir < 0) {
11911 return -1;
11912 } else if (x->dir < 0 && y->dir > 0) {
11913 return +1;
11914
11915 /* low to high pattern needed for finding overlaps */
11916 } else if (x_low < y_low) {
11917 debug8(printf("Returning -1 for low\n"));
11918 return -1;
11919 } else if (y_low < x_low) {
11920 debug8(printf("Returning +1 for low\n"));
11921 return +1;
11922
11923 } else if (x_high > y_high) {
11924 debug8(printf("Returning -1 for high\n"));
11925 return -1;
11926 } else if (y_high > x_high) {
11927 debug8(printf("Returning +1 for high\n"));
11928 return +1;
11929
11930 /* Need to check inside ends to avoid declaring unequal hitpairs equal */
11931 } else if (x_hit5_low < y_hit5_low) {
11932 return -1;
11933 } else if (y_hit5_low < x_hit5_low) {
11934 return +1;
11935
11936 } else if (x_hit5_high < y_hit5_high) {
11937 return -1;
11938 } else if (y_hit5_high < x_hit5_high) {
11939 return +1;
11940
11941 } else if (x_hit3_low < y_hit3_low) {
11942 return -1;
11943 } else if (y_hit3_low < x_hit3_low) {
11944 return +1;
11945
11946 } else if (x_hit3_high < y_hit3_high) {
11947 return -1;
11948 } else if (y_hit3_high < x_hit3_high) {
11949 return +1;
11950
11951
11952 } else if (x->hit5->refalt_score_within_trims +
11953 x->hit3->refalt_score_within_trims <
11954 y->hit5->refalt_score_within_trims +
11955 y->hit3->refalt_score_within_trims) {
11956 return -1;
11957 } else if (y->hit5->refalt_score_within_trims +
11958 y->hit3->refalt_score_within_trims <
11959 x->hit5->refalt_score_within_trims +
11960 x->hit3->refalt_score_within_trims) {
11961 return +1;
11962 } else if (x->hit5->refalt_nmatches_plus_spliced_trims +
11963 x->hit3->refalt_nmatches_plus_spliced_trims >
11964 y->hit5->refalt_nmatches_plus_spliced_trims +
11965 y->hit3->refalt_nmatches_plus_spliced_trims) {
11966 return -1;
11967 } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
11968 y->hit3->refalt_nmatches_plus_spliced_trims >
11969 x->hit5->refalt_nmatches_plus_spliced_trims +
11970 x->hit3->refalt_nmatches_plus_spliced_trims) {
11971 return +1;
11972 } else if (x->hit5->ref_nmatches_plus_spliced_trims +
11973 x->hit3->ref_nmatches_plus_spliced_trims >
11974 y->hit5->ref_nmatches_plus_spliced_trims +
11975 y->hit3->ref_nmatches_plus_spliced_trims) {
11976 return -1;
11977 } else if (y->hit5->ref_nmatches_plus_spliced_trims +
11978 y->hit3->ref_nmatches_plus_spliced_trims >
11979 x->hit5->ref_nmatches_plus_spliced_trims +
11980 x->hit3->ref_nmatches_plus_spliced_trims) {
11981 return +1;
11982
11983 } else if (x->alts_status_inside < y->alts_status_inside) {
11984 return -1;
11985 } else if (y->alts_status_inside < x->alts_status_inside) {
11986 return +1;
11987
11988 } else if (x->sense_consistent_p == true && y->sense_consistent_p == false) {
11989 debug8(printf(" => loses by sense_consistent_p\n"));
11990 return -1;
11991 } else if (x->sense_consistent_p == false && y->sense_consistent_p == true) {
11992 debug8(printf(" => wins by sense_consistent_p\n"));
11993 return +1;
11994
11995 } else if (x->hit5->splice_score + x->hit3->splice_score >
11996 y->hit5->splice_score + y->hit3->splice_score) {
11997 debug8(printf(" => loses by splice score\n"));
11998 return -1;
11999
12000 } else if (y->hit5->splice_score + y->hit3->splice_score >
12001 x->hit5->splice_score + x->hit3->splice_score) {
12002 debug8(printf(" => wins by splice score\n"));
12003 return +1;
12004
12005 } else {
12006 debug8(printf(" => identical for sorting purposes\n"));
12007 return 0;
12008 }
12009 }
12010
12011
#if 0
/* Same as hitpair_sort_cmp, except for hittype, nmatches_to_trims, and indel_low */
/* Disabled three-way comparison of two pairs.  Keys, in order: dir
   (nonzero before zero, positive before negative); normalized pair low and
   high (ascending); normalized hit5 and hit3 low/high (ascending); summed
   refalt, then ref, nmatches_plus_spliced_trims (descending);
   alts_status_inside (ascending); sense_consistent_p (true first).
   Returns 0 when all of these tie. */
static int
hitpair_equiv_cmp (Stage3pair_T x, Stage3pair_T y) {
  Univcoord_T x5_low, x5_high, x3_low, x3_high, x_span_low, x_span_high;
  Univcoord_T y5_low, y5_high, y3_low, y3_high, y_span_low, y_span_high;

  /* Normalized end coordinates and overall span of x */
  x5_low = normalize_coord(x->hit5->low,x->hit5->circularalias,x->hit5->chrlength);
  x5_high = normalize_coord(x->hit5->high,x->hit5->circularalias,x->hit5->chrlength);
  x3_low = normalize_coord(x->hit3->low,x->hit3->circularalias,x->hit3->chrlength);
  x3_high = normalize_coord(x->hit3->high,x->hit3->circularalias,x->hit3->chrlength);
  x_span_low = (x5_low < x3_low) ? x5_low : x3_low;
  x_span_high = (x5_high > x3_high) ? x5_high : x3_high;

  /* Normalized end coordinates and overall span of y */
  y5_low = normalize_coord(y->hit5->low,y->hit5->circularalias,y->hit5->chrlength);
  y5_high = normalize_coord(y->hit5->high,y->hit5->circularalias,y->hit5->chrlength);
  y3_low = normalize_coord(y->hit3->low,y->hit3->circularalias,y->hit3->chrlength);
  y3_high = normalize_coord(y->hit3->high,y->hit3->circularalias,y->hit3->chrlength);
  y_span_low = (y5_low < y3_low) ? y5_low : y3_low;
  y_span_high = (y5_high > y3_high) ? y5_high : y3_high;

  /* Direction */
  if (x->dir != 0 && y->dir == 0) {
    return -1;
  }
  if (x->dir == 0 && y->dir != 0) {
    return +1;
  }
  if (x->dir > 0 && y->dir < 0) {
    return -1;
  }
  if (x->dir < 0 && y->dir > 0) {
    return +1;
  }

  /* Overall span, both bounds ascending */
  if (x_span_low != y_span_low) {
    return (x_span_low < y_span_low) ? -1 : +1;
  }
  if (x_span_high != y_span_high) {
    return (x_span_high < y_span_high) ? -1 : +1;
  }

  /* 5' end */
  if (x5_low != y5_low) {
    return (x5_low < y5_low) ? -1 : +1;
  }
  if (x5_high != y5_high) {
    return (x5_high < y5_high) ? -1 : +1;
  }

  /* 3' end */
  if (x3_low != y3_low) {
    return (x3_low < y3_low) ? -1 : +1;
  }
  if (x3_high != y3_high) {
    return (x3_high < y3_high) ? -1 : +1;
  }

  /* Larger summed match counts first (refalt, then ref) */
  if (x->hit5->refalt_nmatches_plus_spliced_trims + x->hit3->refalt_nmatches_plus_spliced_trims >
      y->hit5->refalt_nmatches_plus_spliced_trims + y->hit3->refalt_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->hit5->refalt_nmatches_plus_spliced_trims + y->hit3->refalt_nmatches_plus_spliced_trims >
      x->hit5->refalt_nmatches_plus_spliced_trims + x->hit3->refalt_nmatches_plus_spliced_trims) {
    return +1;
  }
  if (x->hit5->ref_nmatches_plus_spliced_trims + x->hit3->ref_nmatches_plus_spliced_trims >
      y->hit5->ref_nmatches_plus_spliced_trims + y->hit3->ref_nmatches_plus_spliced_trims) {
    return -1;
  }
  if (y->hit5->ref_nmatches_plus_spliced_trims + y->hit3->ref_nmatches_plus_spliced_trims >
      x->hit5->ref_nmatches_plus_spliced_trims + x->hit3->ref_nmatches_plus_spliced_trims) {
    return +1;
  }

#if 0
  /* Causes hits to not be recognized as equivalent */
  if (x->nsplices < y->nsplices) {
    return -1;
  }
  if (y->nsplices < x->nsplices) {
    return +1;
  }
#endif

  /* Status of inside alts */
  if (x->alts_status_inside < y->alts_status_inside) {
    return -1;
  }
  if (y->alts_status_inside < x->alts_status_inside) {
    return +1;
  }

#if 0
  if (x->hit5->start_amb_length + x->hit5->end_amb_length +
      x->hit3->start_amb_length + x->hit3->end_amb_length > 0 &&
      y->hit5->start_amb_length + y->hit5->end_amb_length +
      y->hit3->start_amb_length + y->hit3->end_amb_length == 0) {
    return -1;
  }
  if (y->hit5->start_amb_length + y->hit5->end_amb_length +
      y->hit3->start_amb_length + y->hit3->end_amb_length > 0 &&
      x->hit5->start_amb_length + x->hit5->end_amb_length +
      x->hit3->start_amb_length + x->hit3->end_amb_length == 0) {
    return +1;
  }
#endif

  /* Sense-consistent pairs first */
  if (x->sense_consistent_p == true && y->sense_consistent_p == false) {
    return -1;
  }
  if (x->sense_consistent_p == false && y->sense_consistent_p == true) {
    return +1;
  }

#if 0
  if (x->indel_low < y->indel_low) {
    return -1;
  }
  if (y->indel_low < x->indel_low) {
    return +1;
  }
#endif

#if 0
  if (x->sense_consistent_p == true) {
    /* Used for sorting, but not equiv */
    if ((x->hit5->sensedir_for_concordance != 0 || x->hit3->sensedir_for_concordance != 0) &&
	(y->hit5->sensedir_for_concordance == 0 && y->hit3->sensedir_for_concordance == 0)) {
      return -1;
    }
    if ((y->hit5->sensedir_for_concordance != 0 || y->hit3->sensedir_for_concordance != 0) &&
	(x->hit5->sensedir_for_concordance == 0 && x->hit3->sensedir_for_concordance == 0)) {
      return +1;
    }
    return 0;
  }
#endif

#if 0
  if (x->hit5->sensedir_for_concordance == y->hit5->sensedir_for_concordance &&
      x->hit3->sensedir_for_concordance == y->hit3->sensedir_for_concordance) {
    return 0;
  }
  if (x->hit5->sensedir_for_concordance > y->hit5->sensedir_for_concordance) {
    return +1;
  }
  if (y->hit5->sensedir_for_concordance > x->hit5->sensedir_for_concordance) {
    return -1;
  }
  if (x->hit3->sensedir_for_concordance > y->hit3->sensedir_for_concordance) {
    return +1;
  }
  if (y->hit3->sensedir_for_concordance > x->hit3->sensedir_for_concordance) {
    return -1;
  }
#endif

  return 0;
}
#endif
12170
12171
12172 static int
hitpair_position_cmp(const void * a,const void * b)12173 hitpair_position_cmp (const void *a, const void *b) {
12174 Stage3pair_T x = * (Stage3pair_T *) a;
12175 Stage3pair_T y = * (Stage3pair_T *) b;
12176
12177 if (x->dir < y->dir) {
12178 return -1;
12179 } else if (y->dir < x->dir) {
12180 return +1;
12181 } else if (x->sensedir < y->sensedir) {
12182 return -1;
12183 } else if (y->sensedir < x->sensedir) {
12184 return +1;
12185 } else if (x->low < y->low) {
12186 return -1;
12187 } else if (y->low < x->low) {
12188 return +1;
12189 } else if (x->high > y->high) {
12190 return -1;
12191 } else if (y->high > x->high) {
12192 return +1;
12193 } else {
12194 return 0;
12195 }
12196 }
12197
12198
12199 static bool
hitpair_equal(Stage3pair_T x,Stage3pair_T y)12200 hitpair_equal (Stage3pair_T x, Stage3pair_T y) {
12201 List_T p, q;
12202 Substring_T substring_x, substring_y;
12203
12204 if (x->dir != y->dir) {
12205 return false; /* Different strands */
12206 } else {
12207 p = x->hit5->substrings_1toN;
12208 q = y->hit5->substrings_1toN;
12209 while (p != NULL && q != NULL) {
12210 substring_x = (Substring_T) p->first;
12211 substring_y = (Substring_T) q->first;
12212 if (Substring_equal(substring_x,substring_y) == false) {
12213 return false;
12214 }
12215 p = List_next(p);
12216 q = List_next(q);
12217 }
12218 if (p != NULL || q != NULL) {
12219 return false;
12220 }
12221
12222 p = x->hit3->substrings_1toN;
12223 q = y->hit3->substrings_1toN;
12224 while (p != NULL && q != NULL) {
12225 substring_x = (Substring_T) p->first;
12226 substring_y = (Substring_T) q->first;
12227 if (Substring_equal(substring_x,substring_y) == false) {
12228 return false;
12229 }
12230 p = List_next(p);
12231 q = List_next(q);
12232 }
12233 if (p != NULL || q != NULL) {
12234 return false;
12235 }
12236
12237 return true;
12238 }
12239 }
12240
12241
12242 static bool
hitpair_overlap_p(Stage3pair_T x,Stage3pair_T y)12243 hitpair_overlap_p (Stage3pair_T x, Stage3pair_T y) {
12244 /* printf("Checking for overlap of %u..%u and %u..%u ",x->low,x->high,y->low,y->high); */
12245 if (x->hit5->chrnum != y->hit5->chrnum) {
12246 /* printf("=> false\n"); */
12247 return false; /* Different chrnums */
12248 } else if (x->hit3->chrnum != y->hit3->chrnum) {
12249 return false; /* Different chrnums */
12250 } else if (x->dir != y->dir) {
12251 /* printf("=> false\n"); */
12252 return false; /* Different strands */
12253 } else if (x->high < y->low) {
12254 /* printf("=> false\n"); */
12255 return false;
12256 } else if (x->low > y->high) {
12257 /* printf("=> false\n"); */
12258 return false;
12259 } else {
12260 /* printf("=> true\n"); */
12261 return true;
12262 }
12263 }
12264
12265
12266 static bool
hitpair_subsumption(Stage3pair_T x,Stage3pair_T y)12267 hitpair_subsumption (Stage3pair_T x, Stage3pair_T y) {
12268 if (x->dir != y->dir) {
12269 return false; /* Different strands */
12270
12271 } else if (x->sensedir != y->sensedir) {
12272 return false;
12273
12274 } else if (x->low <= y->low && x->high >= y->high) {
12275 return true;
12276 } else if (y->low <= x->low && y->high >= x->high) {
12277 return true;
12278
12279 /* Test each end of the pair. Example: 1586..1512 and 1400..1468 should subsume 1586..1512 and 1564..1617 */
12280 } else if (x->hit5->low <= y->hit5->low && x->hit5->high >= y->hit5->high) {
12281 return true;
12282 } else if (y->hit5->low <= x->hit5->low && y->hit5->high >= x->hit5->high) {
12283 return true;
12284
12285 } else if (x->hit3->low <= y->hit3->low && x->hit3->high >= y->hit3->high) {
12286 return true;
12287 } else if (y->hit3->low <= x->hit3->low && y->hit3->high >= x->hit3->high) {
12288 return true;
12289
12290 } else {
12291 return false;
12292 }
12293 }
12294
12295
12296 static int
pair_matches_cmp(const void * a,const void * b)12297 pair_matches_cmp (const void *a, const void *b) {
12298 Stage3pair_T x = * (Stage3pair_T *) a;
12299 Stage3pair_T y = * (Stage3pair_T *) b;
12300
12301 if (x->hit5->refalt_nmatches_plus_spliced_trims +
12302 x->hit3->refalt_nmatches_plus_spliced_trims >
12303 y->hit5->refalt_nmatches_plus_spliced_trims +
12304 y->hit3->refalt_nmatches_plus_spliced_trims) {
12305 return -1;
12306 } else if (y->hit5->refalt_nmatches_plus_spliced_trims +
12307 y->hit3->refalt_nmatches_plus_spliced_trims >
12308 x->hit5->refalt_nmatches_plus_spliced_trims +
12309 x->hit3->refalt_nmatches_plus_spliced_trims) {
12310 return +1;
12311 } else if (x->hit5->ref_nmatches_plus_spliced_trims +
12312 x->hit3->ref_nmatches_plus_spliced_trims >
12313 y->hit5->ref_nmatches_plus_spliced_trims +
12314 y->hit3->ref_nmatches_plus_spliced_trims) {
12315 return -1;
12316 } else if (y->hit5->ref_nmatches_plus_spliced_trims +
12317 y->hit3->ref_nmatches_plus_spliced_trims >
12318 x->hit5->ref_nmatches_plus_spliced_trims +
12319 x->hit3->ref_nmatches_plus_spliced_trims) {
12320 return +1;
12321 } else {
12322 return 0;
12323 }
12324 }
12325
12326 List_T
Stage3pair_sort_bymatches(List_T hits,Hitlistpool_T hitlistpool)12327 Stage3pair_sort_bymatches (List_T hits, Hitlistpool_T hitlistpool) {
12328 List_T sorted = NULL;
12329 Stage3pair_T *array;
12330 int n, i;
12331
12332
12333 if ((n = List_length(hits)) == 0) {
12334 return (List_T) NULL;
12335 } else {
12336 #ifdef USE_ALLOCA_FOR_HITS
12337 array = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
12338 List_fill_array((void **) array,hits);
12339 Hitlist_free(&hits);
12340 #else
12341 array = (Stage3pair_T *) List_to_array(hits,NULL);
12342 Hitlist_free(&hits);
12343 #endif
12344
12345 qsort(array,n,sizeof(Stage3pair_T),pair_matches_cmp);
12346 for (i = n-1; i >= 0; i--) {
12347 sorted = Hitlist_push(sorted,hitlistpool,(void *) array[i]);
12348 }
12349 #ifdef USE_ALLOCA_FOR_HITS
12350 FREEA(array);
12351 #else
12352 FREE(array);
12353 #endif
12354
12355 return sorted;
12356 }
12357 }
12358
12359
12360
#if 0
/* Disabled.  Sorts the pairs with hitpair_sort_cmp, then scans the sorted
   array in runs: every pair that hitpair_equal reports as identical to the
   first pair of its run is marked, freed with Stage3pair_free, and left out
   of the result.  The input list is released with Hitlist_free; the
   surviving pairs are pushed onto a new list from last to first.  Returns
   NULL for an empty input list.

   NOTE(review): the body pushes onto hitlistpool, which is neither a
   parameter nor a local of this function, so the block cannot compile as
   written.  A Hitlistpool_T must be supplied (compare
   Stage3pair_sort_bymatches) before this is re-enabled. */
List_T
Stage3pair_remove_duplicates_exact (List_T hitpairlist) {
  List_T unique = NULL;
  Stage3pair_T hitpair, *hitpairs;
  int n, i, j;
  bool *eliminate;

  /* Count before tracing: the entry message previously printed n while it
     was still uninitialized */
  n = List_length(hitpairlist);
  debug8(printf("Entered Stage3pair_remove_duplicates_exact with %d pairs\n",n));
  if (n == 0) {
    return NULL;
  }

#ifdef USE_ALLOCA_FOR_HITS
  eliminate = (bool *) CALLOCA(n,sizeof(bool));
  hitpairs = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
  List_fill_array((void **) hitpairs,hitpairlist);
  Hitlist_free(&hitpairlist);
#else
  eliminate = (bool *) CALLOC(n,sizeof(bool));
  hitpairs = (Stage3pair_T *) List_to_array(hitpairlist,NULL);
  Hitlist_free(&hitpairlist);
#endif

  debug8(printf("Checking for exact duplicates\n"));
  qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);

  debug8(
	 for (i = 0; i < n; i++) {
	   hitpair = hitpairs[i];
	   printf(" Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d (%d to_trims)\n",
		  i,Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
		  Method_string(hitpair->hit3->method),hitpair,
		  hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
		  hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
		  hitpair->dir,hitpair->hit5->circularalias,hitpair->hit3->circularalias,
		  hitpair->nmatches_plus_spliced_trims,hitpair->nmatches_to_trims);
	 }
	 );

  /* Each run starts at i; later members equal to hitpairs[i] are marked */
  i = 0;
  while (i < n) {
    j = i+1;
    while (j < n && hitpair_equal(hitpairs[j],hitpairs[i]) == true) {
      debug8(printf(" %d is identical to %d => eliminating\n",j,i));
      eliminate[j] = true;
      j++;
    }
    i = j;
  }

  /* Keep the unmarked pairs, free the marked ones */
  for (i = n-1; i >= 0; i--) {
    hitpair = hitpairs[i];
    if (eliminate[i] == false) {
      unique = Hitlist_push(unique,hitlistpool,(void *) hitpair);
    } else {
      Stage3pair_free(&hitpair);
    }
  }

#ifdef USE_ALLOCA_FOR_HITS
  FREEA(hitpairs);
  FREEA(eliminate);
#else
  FREE(hitpairs);
  FREE(eliminate);
#endif

  debug8(printf("Exited Stage3pair_remove_duplicates_exact with %d pairs\n",List_length(unique)));
  return unique;
}
#endif
12433
12434
12435 static int
hitpair_goodness_cmp(bool * equalp,Stage3pair_T hitpair,Stage3pair_T best_hitpair,bool finalp)12436 hitpair_goodness_cmp (bool *equalp, Stage3pair_T hitpair,
12437 Stage3pair_T best_hitpair, bool finalp) {
12438 double prob1, prob2;
12439 /* Chrpos_T total_querylength, best_total_querylength; */
12440 double zscore, best_zscore;
12441
12442 #if 0
12443 int hitpair_nmatches, best_hitpair_nmatches;
12444 int max_trim_querystart, max_trim_queryend;
12445 Stage3end_T hit5, besthit5, hit3, besthit3;
12446
12447 if (hitpair->absdifflength_bingo_p < best_hitpair->absdifflength_bingo_p) {
12448 /* k is worse */
12449 debug8(printf(" => loses by absdifflength (bingo)\n"));
12450 return -1;
12451 } else if (hitpair->absdifflength_bingo_p > best_hitpair->absdifflength_bingo_p) {
12452 /* k is better */
12453 debug8(printf(" => wins by absdifflength (bingo)\n"));
12454 return +1;
12455 }
12456 #endif
12457
12458 #ifdef PRE_RESOLVE_MULTIMAPPING
12459 if (TALLY_RATIO*Stage3pair_tally(hitpair) < Stage3pair_tally(best_hitpair)) {
12460 /* k is worse */
12461 debug8(printf(" => loses by tally\n"));
12462 return -1;
12463 } else if (Stage3pair_tally(hitpair) > TALLY_RATIO*Stage3pair_tally(best_hitpair)) {
12464 /* k is better */
12465 debug8(printf(" => wins by tally\n"));
12466 return +1;
12467 }
12468 #endif
12469
12470 *equalp = false;
12471
12472 #if 0
12473 /* Don't want to use nmatches_to_trims */
12474 /* Previously, we favored ambiguous splices over definitive ones, but
12475 now that we are generating Stage3end_T objects with and without the
12476 end exons, we prefer definitive splices */
12477 if (known_ambiguous_p(hitpair->hit5) == true && known_ambiguous_p(best_hitpair->hit5) == false &&
12478 known_ambiguous_p(hitpair->hit3) == known_ambiguous_p(best_hitpair->hit3) &&
12479 hitpair->insertlength <= best_hitpair->insertlength) {
12480 debug8(printf("Case 1\n"));
12481 return -1;
12482
12483 } else if (known_ambiguous_p(hitpair->hit5) == false && known_ambiguous_p(best_hitpair->hit5) == true &&
12484 known_ambiguous_p(hitpair->hit3) == known_ambiguous_p(best_hitpair->hit3) &&
12485 hitpair->insertlength >= best_hitpair->insertlength) {
12486 debug8(printf("Case 2\n"));
12487 return +1;
12488
12489 } else if (known_ambiguous_p(hitpair->hit3) == true && known_ambiguous_p(best_hitpair->hit3) == false &&
12490 known_ambiguous_p(hitpair->hit5) == known_ambiguous_p(best_hitpair->hit5) &&
12491 hitpair->insertlength <= best_hitpair->insertlength) {
12492 debug8(printf("Case 3\n"));
12493 return -1;
12494
12495 } else if (known_ambiguous_p(hitpair->hit3) == false && known_ambiguous_p(best_hitpair->hit3) == true &&
12496 known_ambiguous_p(hitpair->hit5) == known_ambiguous_p(best_hitpair->hit5) &&
12497 hitpair->insertlength > best_hitpair->insertlength) {
12498 debug8(printf("Case 4\n"));
12499 return +1;
12500 }
12501 #endif
12502
12503
12504 if (hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12505 hitpair->hit3->refalt_nmatches_plus_spliced_trims >
12506 best_hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12507 best_hitpair->hit3->refalt_nmatches_plus_spliced_trims + NMATCHES_SLOP) {
12508 /* Significantly more matches */
12509 debug8(printf("More matches (to_trims)\n"));
12510 return +1;
12511 } else if (hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12512 hitpair->hit3->refalt_nmatches_plus_spliced_trims <
12513 best_hitpair->hit5->refalt_nmatches_plus_spliced_trims +
12514 best_hitpair->hit3->refalt_nmatches_plus_spliced_trims - NMATCHES_SLOP) {
12515 /* Fewer matches */
12516 debug8(printf("Fewer matches (to_trims)\n"));
12517 return -1;
12518
12519 #if 0
12520 } else if ((hitpair->hit5->hittype != TRANSCRIPTOME || hitpair->hit3->hittype != TRANSCRIPTOME) &&
12521 (best_hitpair->hit5->hittype == TRANSCRIPTOME || best_hitpair->hit3->hittype == TRANSCRIPTOME)) {
12522 /* k is worse */
12523 debug8(printf(" => loses by transcriptome\n"));
12524 return -1;
12525
12526 } else if ((hitpair->hit5->hittype == TRANSCRIPTOME || hitpair->hit3->hittype == TRANSCRIPTOME) &&
12527 (best_hitpair->hit5->hittype != TRANSCRIPTOME || best_hitpair->hit3->hittype != TRANSCRIPTOME)) {
12528 /* k is better */
12529 debug8(printf(" => wins by transcriptome\n"));
12530 return +1;
12531 #endif
12532
12533 #if 0
12534 } else if (hitpair->nmatches_plus_spliced_trims < best_hitpair->nmatches_plus_spliced_trims - NMATCHES_SLOP) {
12535 /* k is worse */
12536 debug8(printf(" => loses by nmatches\n"));
12537 return -1;
12538 } else if (hitpair->nmatches_plus_spliced_trims > best_hitpair->nmatches_plus_spliced_trims + NMATCHES_SLOP) {
12539 /* k is better */
12540 debug8(printf(" => wins by nmatches\n"));
12541 return +1;
12542 #endif
12543
12544 #if 0
12545 } else if (hitpair->nsplices > best_hitpair->nsplices) {
12546 /* k is worse */
12547 debug8(printf(" => loses by nsplices: %d > %d in best\n",hitpair->nsplices,best_hitpair->nsplices));
12548 return -1;
12549 } else if (hitpair->nsplices < best_hitpair->nsplices) {
12550 /* k is better */
12551 debug8(printf(" => wins by nsplices: %d < %d in best\n",hitpair->nsplices,best_hitpair->nsplices));
12552 return +1;
12553 #endif
12554
12555 } else if (hitpair->alts_status_inside > best_hitpair->alts_status_inside) {
12556 /* k is worse */
12557 debug8(printf(" => loses by alts_status_inside\n"));
12558 return -1;
12559 } else if (hitpair->alts_status_inside < best_hitpair->alts_status_inside) {
12560 /* k is better */
12561 debug8(printf(" => wins by alts_status_inside\n"));
12562 return +1;
12563
12564
12565 } else if (hitpair->hit5->hittype > best_hitpair->hit5->hittype &&
12566 hitpair->hit3->hittype >= best_hitpair->hit3->hittype) {
12567 /* k is worse */
12568 debug8(printf(" => loses by hittype\n"));
12569 return -1;
12570
12571 } else if (hitpair->hit5->hittype >= best_hitpair->hit5->hittype &&
12572 hitpair->hit3->hittype > best_hitpair->hit3->hittype) {
12573 /* k is worse */
12574 debug8(printf(" => loses by hittype\n"));
12575 return -1;
12576
12577 } else if (hitpair->hit5->hittype < best_hitpair->hit5->hittype &&
12578 hitpair->hit3->hittype <= best_hitpair->hit3->hittype) {
12579 /* k is better */
12580 debug8(printf(" => wins by hittype\n"));
12581 return +1;
12582
12583 } else if (hitpair->hit5->hittype <= best_hitpair->hit5->hittype &&
12584 hitpair->hit3->hittype < best_hitpair->hit3->hittype) {
12585 /* k is better */
12586 debug8(printf(" => wins by hittype\n"));
12587 return +1;
12588
12589 #if 0
12590 } else if (n_amb_ends(hitpair->hit5) + n_amb_ends(hitpair->hit3) >
12591 n_amb_ends(best_hitpair->hit5) + n_amb_ends(best_hitpair->hit3)) {
12592 /* k is worse */
12593 debug8(printf(" => loses by ambiguity\n"));
12594 return -1;
12595
12596 } else if (n_amb_ends(hitpair->hit5) + n_amb_ends(hitpair->hit3) <
12597 n_amb_ends(best_hitpair->hit5) + n_amb_ends(best_hitpair->hit3)) {
12598 /* k is better */
12599 debug8(printf(" => wins by ambiguity\n"));
12600 return +1;
12601 #endif
12602
12603 } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score >
12604 best_hitpair->hit5->splice_score + best_hitpair->hit3->splice_score) {
12605 /* k is worse */
12606 debug8(printf(" => loses by splice score\n"));
12607 return -1;
12608
12609 } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score >
12610 best_hitpair->hit5->splice_score + best_hitpair->hit3->splice_score) {
12611 /* k is better */
12612 debug8(printf(" => wins by splice score\n"));
12613 return +1;
12614
12615 #if 0
12616 } else if (hitpair->absdifflength < best_hitpair->absdifflength) {
12617 /* k is worse */
12618 debug8(printf(" => loses by absdifflength\n"));
12619 return -1;
12620 } else if (hitpair->absdifflength > best_hitpair->absdifflength) {
12621 /* k is better */
12622 debug8(printf(" => wins by absdifflength\n"));
12623 return +1;
12624 #endif
12625
12626 } else if (finalp == false) {
12627 debug8(printf(" => indistinguishable\n"));
12628 return 0;
12629
12630 #ifdef USE_ABSDIFFLENGTH
12631 /* If insert length is within deviation of expected pairlength, favor it */
12632 } else if (best_hitpair->absdifflength <= (Chrpos_T) pairlength_deviation &&
12633 hitpair->absdifflength > (Chrpos_T) pairlength_deviation) {
12634 /* k is worse */
12635 debug8(printf(" => loses by absdifflength within deviation %d\n",pairlength_deviation));
12636 return -1;
12637 } else if (hitpair->absdifflength <= (Chrpos_T) pairlength_deviation &&
12638 best_hitpair->absdifflength > (Chrpos_T) pairlength_deviation) {
12639 /* k is better */
12640 debug8(printf(" => wins by absdifflength within deviation %d\n",pairlength_deviation));
12641 return +1;
12642 #endif
12643
12644 #if 0
12645 /* Previously favored longer insert lengths to give more compact
12646 splices. However, we now accept splices first that give
12647 expected pairlength */
12648 } else if (hitpair->insertlength_expected_sign == -1 && best_hitpair->insertlength_expected_sign == +1) {
12649 /* k is worse */
12650 debug8(printf(" => loses by insertlength_expected_sign\n"));
12651 return -1;
12652 } else if (hitpair->insertlength_expected_sign == +1 && best_hitpair->insertlength_expected_sign == -1) {
12653 /* k is better */
12654 debug8(printf(" => wins by insertlength_expected_sign\n"));
12655 return +1;
12656 #endif
12657
12658 /* Next we look at splice probability */
12659 } else {
12660 debug8(printf(" => prob"));
12661 prob1 = Stage3end_prob(hitpair->hit5) + Stage3end_prob(hitpair->hit3);
12662 prob2 = Stage3end_prob(best_hitpair->hit5) + Stage3end_prob(best_hitpair->hit3);
12663 if (prob1 + 0.3 < prob2) {
12664 /* k is worse */
12665 debug8(printf(" => loses by dual splice prob %f vs %f\n",prob1,prob2));
12666 return -1;
12667 } else if (prob1 > prob2 + 0.3) {
12668 /* k is better */
12669 debug8(printf(" => wins by dual splice prob %f vs %f\n",prob1,prob2));
12670 return +1;
12671 } else {
12672 debug8(printf(" => neither wins\n"));
12673 }
12674
12675
12676 #if 0
12677 /* Overlapping ends worse than separate ends */
12678 total_querylength = (Chrpos_T) (hitpair->hit5->querylength + hitpair->hit3->querylength);
12679 best_total_querylength = (Chrpos_T) (best_hitpair->hit5->querylength + best_hitpair->hit3->querylength);
12680
12681 if (hitpair->insertlength <= total_querylength && best_hitpair->insertlength > best_total_querylength) {
12682 debug8(printf(" => loses by being overlapping\n"));
12683 return -1;
12684 } else if (hitpair->insertlength > total_querylength && best_hitpair->insertlength <= best_total_querylength) {
12685 debug8(printf(" => wins by being separate\n"));
12686 return +1;
12687
12688 /* Next, favor shorter outerlengths to give more compact splices or closer pairs */
12689 } else if (hitpair->outerlength > best_hitpair->outerlength + OUTERLENGTH_SLOP) {
12690 /* k is worse */
12691 debug8(printf(" => loses by outerlength\n"));
12692 return -1;
12693 } else if (hitpair->outerlength + OUTERLENGTH_SLOP < best_hitpair->outerlength) {
12694 /* k is better */
12695 debug8(printf(" => wins by outerlength\n"));
12696 return +1;
12697
12698 } else {
12699 #if 0
12700 if (hitpair->insertlength_expected_sign >= 0 && best_hitpair->insertlength_expected_sign >= 0) {
12701 /* Both insert lengths are short, so favor shorter insert length */
12702 debug8(printf(" => short insertlengths"));
12703 /* Favor shorter insert lengths */
12704 if (hitpair->insertlength > best_hitpair->insertlength) {
12705 /* k is worse */
12706 debug8(printf(" => loses by insertlength\n"));
12707 return -1;
12708 } else if (hitpair->insertlength < best_hitpair->insertlength) {
12709 /* k is better */
12710 debug8(printf(" => wins by insertlength\n"));
12711 return +1;
12712 }
12713 }
12714 #endif
12715
12716 /* Both insert lengths are long, so favor longer insert length to give more compact splices */
12717 debug8(printf(" => long insertlengths"));
12718 if (hitpair->insertlength < best_hitpair->insertlength) {
12719 /* k is worse */
12720 debug8(printf(" => loses by insertlength\n"));
12721 return -1;
12722 } else if (hitpair->insertlength > best_hitpair->insertlength) {
12723 /* k is better */
12724 debug8(printf(" => wins by insertlength\n"));
12725 return +1;
12726 }
12727
12728 debug8(printf(" => equal\n"));
12729 *equalp = true;
12730 return 0;
12731 }
12732 #endif
12733
12734 /* Look at expected pairlength and pairlength deviation */
12735 if (hitpair->insertlength < expected_pairlength) {
12736 zscore = (double) (expected_pairlength - (Chrpos_T) hitpair->insertlength) / (double) pairlength_deviation;
12737 } else {
12738 zscore = (double) ((Chrpos_T) hitpair->insertlength - expected_pairlength) / (double) pairlength_deviation;
12739 }
12740 if (best_hitpair->insertlength < expected_pairlength) {
12741 best_zscore = (double) (expected_pairlength - (Chrpos_T) best_hitpair->insertlength) / (double) pairlength_deviation;
12742 } else {
12743 best_zscore = (double) ((Chrpos_T) best_hitpair->insertlength - expected_pairlength) / (double) pairlength_deviation;
12744 }
12745 debug8(printf("expected_pairlength %u, pairlength_deviation %u\n",expected_pairlength,pairlength_deviation));
12746 debug8(printf("Comparing insertlength %d (z score %f) with best_insertlength %d (zscore %f)\n",
12747 hitpair->insertlength,zscore,best_hitpair->insertlength,best_zscore));
12748
12749 if (zscore > best_zscore + 1.0) {
12750 /* k is worse */
12751 debug8(printf(" => loses by insertlength and zscore\n"));
12752 return -1;
12753 } else if (best_zscore > zscore + 1.0) {
12754 /* k is better */
12755 debug8(printf(" => wins by insertlength and zscore\n"));
12756 return +1;
12757 }
12758
12759 debug8(printf(" => equal\n"));
12760 *equalp = true;
12761 return 0;
12762 }
12763 }
12764
12765
#if 0
/* Disabled: everything down to the matching #endif is compiled out.

   Array-based test of whether hitpair_k should be dropped.  Scans
   hitpairs[k+1] through hitpairs[j] inclusive; for each entry where
   hitpair_subsumption(hitpair_k, entry) is true, returns true if
   hitpair_goodness_cmp ranks that entry above hitpair_k or reports the
   two as equal.  Returns false if no such entry is found. */
static bool
hitpair_bad_superstretch_p (Stage3pair_T hitpair_k, Stage3pair_T *hitpairs, int k, int j,
			    bool finalp) {
  int a;
  bool equalp;

  for (a = k+1; a <= j; a++) {
    if (hitpair_subsumption(hitpair_k,hitpairs[a]) == true) {
      debug8(printf("Testing %d because stretches over %d",k,a));
      if (hitpair_goodness_cmp(&equalp,hitpairs[a],
			       hitpair_k,finalp) > 0 || equalp == true) {
	debug8(printf(" => eliminating\n"));
	return true;
      }
      debug8(printf("\n"));
    }
  }
  return false;
}
#endif
12787
12788
/* List-based filter over a list of pairs.
   NOTE(review): this was labelled "Recursive, list-based approach", but
   the body below contains no recursive call.

   Walks the list, grouping each head element p with the run of
   following elements for which hitpair_subsumption(head, element) is
   true; the group is the half-open range [p, q).  Within each group,
   four successive passes eliminate pairs by freeing them with
   Stage3pair_free and setting their list cell's first field to NULL:
     (1) insertlength greater than the group minimum + INSERTLENGTH_SLOP
     (2) more segments (hit5 + hit3 nsegments) than the group minimum,
         or combined splice_score below the best score seen at that
         minimum by more than SPLICE_SCORE_SLOP
     (3) outerlength greater than the minimum among survivors (no slop)
     (4) insertlength greater than the minimum among survivors (no slop)
   Pairs that survive pass (4) are pushed onto the result.

   *keep_p is set to true on entry and to false whenever a pair is
   eliminated.  querylength5 + querylength3 is used only as the initial
   upper bound for the segment count in pass (2).  The input list is
   released with Hitlist_free, and the result is returned through
   List_reverse. */
static List_T
pair_remove_bad_superstretches (bool *keep_p, Stage3pair_T superstretch, List_T list,
				Hitlistpool_T hitlistpool, int querylength5, int querylength3,
				bool finalp) {
  List_T result = NULL, p, q, r;
  Stage3pair_T stage3pair, hitpair;
  Chrpos_T best_insertlength, best_outerlength;
  int best_nsegments, nsegments;
  double max_splice_score, splice_score;
  bool equalp;

  *keep_p = true;

  p = list;
  while (p != NULL) {
    stage3pair = (Stage3pair_T) List_head(p);

    /* Advance q past every following element that the head subsumes */
    q = List_next(p);
    while (q != NULL && hitpair_subsumption(stage3pair,(Stage3pair_T) List_head(q)) == true) {
#ifdef DEBUG8
      printf(" This (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d+%d (%d+%d to trims), insertlength %d, alts_status_inside %d, amb_lengths %d and %d\n",
	     Pairtype_string(stage3pair->pairtype),Method_string(stage3pair->hit5->method),
	     Method_string(stage3pair->hit3->method),stage3pair,
	     stage3pair->hit5->low - stage3pair->hit5->chroffset,stage3pair->hit5->high - stage3pair->hit5->chroffset,
	     stage3pair->hit3->low - stage3pair->hit3->chroffset,stage3pair->hit3->high - stage3pair->hit3->chroffset,
	     stage3pair->dir,stage3pair->hit5->refalt_nmatches_plus_spliced_trims,stage3pair->hit3->refalt_nmatches_plus_spliced_trims,
	     stage3pair->hit5->refalt_nmatches_to_trims,stage3pair->hit3->refalt_nmatches_to_trims,
	     stage3pair->insertlength,stage3pair->alts_status_inside,amb_length(stage3pair->hit5),amb_length(stage3pair->hit3));

      hitpair = (Stage3pair_T) List_head(q);
      printf("subsumes that (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), nmatches: %d+%d (%d+%d to trims), insertlength %d, alts_status_inside %d, amb_lengths %d and %d\n",
	     Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
	     Method_string(hitpair->hit3->method),hitpair,
	     hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
	     hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
	     hitpair->dir,hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
	     hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
	     hitpair->insertlength,hitpair->alts_status_inside,amb_length(hitpair->hit5),amb_length(hitpair->hit3));
#endif
      q = List_next(q);
    }

    /* NOTE(review): q starts at List_next(p) and only moves forward, so
       on an acyclic list q can never equal p and this branch appears
       unreachable.  It is the only place superstretch, finalp and
       equalp are used.  As written, every element, including one that
       subsumes nothing, is handled by the cluster path below.  Confirm
       the intended condition before changing it: simply testing
       q == List_next(p) would make p = List_next(q) skip an element. */
    if (q == p) {
      result = Hitlist_push(result,hitlistpool,(void *) stage3pair);
      if (superstretch != NULL &&
	  (hitpair_goodness_cmp(&equalp,stage3pair,superstretch,finalp) > 0 || equalp == true)) {
	*keep_p = false;
      }
      p = List_next(q);

    } else {
      /* Cluster */

      /* (1) Find smallest insert length with slop across loci */
      debug8(printf("Finding smallest insertlength\n"));
      /* (Chrpos_T) -1 serves as the starting value for the minimum */
      best_insertlength = (Chrpos_T) -1;
      for (r = p; r != q; r = List_next(r)) {
	hitpair = (Stage3pair_T) r->first;
	if (hitpair->insertlength < best_insertlength) {
	  best_insertlength = hitpair->insertlength;
	}
      }

      for (r = p; r != q; r = List_next(r)) {
	hitpair = (Stage3pair_T) r->first;

	if (hitpair->insertlength > best_insertlength + INSERTLENGTH_SLOP) { /* Initial slop */
	  debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
			hitpair->insertlength,best_insertlength,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
	  /* A NULL cell marks this pair as eliminated for the later passes */
	  Stage3pair_free(&hitpair);
	  r->first = (Stage3pair_T) NULL;
	  *keep_p = false;

	} else {
	  debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
			hitpair->insertlength,hitpair->outerlength,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair); -- wait for last filtering step */
	}
      }


      /* (2) Find best nsegments and splice score */
      debug8(printf("Finding best nsegments and splice score\n"));
      /* The combined query length is the starting upper bound for the
	 segment count; max_splice_score tracks the best combined score
	 among pairs at the current minimum segment count. */
      best_nsegments = querylength5 + querylength3;
      max_splice_score = 0.0;
      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */
	} else if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
	  best_nsegments = nsegments;
	  max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;

	} else if (nsegments == best_nsegments) {
	  if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
	    max_splice_score = splice_score;
	  }
	}
      }
      debug8(printf("best_nsegments %d, max_splice_score %f\n",best_nsegments,max_splice_score));

      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */

	} else if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
	  debug8(printf("Within loci pair (nsegments %d > %d): Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to trims), sensedirs %d and %d, splice scores %f and %f\n",
			nsegments,best_nsegments,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
	  Stage3pair_free(&hitpair);
	  r->first = (Stage3pair_T) NULL;
	  *keep_p = false;

	} else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
	  debug8(printf("Within loci pair (splice_score w/slop %f < %f): Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to trims), sensedirs %d and %d, splice scores %f and %f\n",
			hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
	  Stage3pair_free(&hitpair);
	  r->first = (Stage3pair_T) NULL;
	  *keep_p = false;

	} else {
	  debug8(printf("Keeping hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair); -- wait until last filtering step */
	}
      }


      /* (3) Find smallest outerlength across loci */
      debug8(printf("Finding smallest outerlength"));
      best_outerlength = (Chrpos_T) -1;
      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */
	} else if (hitpair->outerlength < best_outerlength) {
	  best_outerlength = hitpair->outerlength;
	}
      }
      debug8(printf(" => %u\n",best_outerlength));

      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */

	} else if (hitpair->outerlength > best_outerlength /*+ OUTERLENGTH_SLOP*/) { /* No slop for final */
	  debug8(printf("Final (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
			hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
	  Stage3pair_free(&hitpair);
	  r->first = (Stage3pair_T) NULL;
	  *keep_p = false;

	} else {
	  debug8(printf("Final (outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims)\n",
			hitpair->outerlength,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
	  /* result = Hitlist_push(result,hitlistpool,(void *) hitpair); -- wait for last filtering step */
	}
      }


      /* (4) Find smallest insert length with slop across loci */
      /* Unlike pass (1), the elimination test below applies no slop */
      debug8(printf("Finding smallest insertlength\n"));
      best_insertlength = (Chrpos_T) -1;
      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */
	} else if (hitpair->insertlength < best_insertlength) {
	  best_insertlength = hitpair->insertlength;
	}
      }
      debug8(printf(" => %u\n",best_insertlength));

      for (r = p; r != q; r = List_next(r)) {
	if ((hitpair = (Stage3pair_T) r->first) == NULL) {
	  /* Already eliminated */

	} else if (hitpair->insertlength > best_insertlength /*+ INSERTLENGTH_SLOP*/) { /* No slop for final */
	  debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
			hitpair->insertlength,best_insertlength,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
	  Stage3pair_free(&hitpair);
	  r->first = (Stage3pair_T) NULL;
	  *keep_p = false;

	} else {
	  debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims), ref %d+%d (%d+%d)\n",
			hitpair->insertlength,hitpair->outerlength,
			hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
			hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
			List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
			hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
			hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
			hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
			hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims));
	  /* Only pairs that survive all four passes reach the result */
	  result = Hitlist_push(result,hitlistpool,(void *) hitpair);
	  debug8(printf(" result now has length %d\n",List_length(result)));
	}
      }

      /* Continue with the first element outside this group */
      p = q;
    }
  }

  Hitlist_free(&list);

  debug8(printf("pair_remove_bad_superstretches returning result of length %d\n",List_length(result)));
  return List_reverse(result);
}
13037
13038
13039 static List_T
pair_remove_overlaps(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool translocp,bool finalp)13040 pair_remove_overlaps (List_T hitpairlist, Hitlistpool_T hitlistpool,
13041 int querylength5, int querylength3,
13042 bool translocp, bool finalp) {
13043 List_T unique = NULL;
13044 Stage3pair_T hitpair, parent, *hitpairs;
13045 int nkept, n, i, j;
13046 bool *eliminate;
13047 int *parenti;
13048 bool keep_p;
13049
13050 n = List_length(hitpairlist);
13051 debug8(printf(" Entering pair_remove_overlaps with %d pairs: %s\n",
13052 n,finalp == true ? "FINAL" : "not final"));
13053
13054 if (n <= 1) {
13055 debug8(printf(" Exiting pair_remove_overlaps with %d < 2 pairs\n",n));
13056 return hitpairlist;
13057 } else {
13058 #ifdef USE_ALLOCA_FOR_HITS
13059 eliminate = (bool *) CALLOCA(n,sizeof(bool));
13060 parenti = (int *) CALLOCA(n,sizeof(int));
13061 hitpairs = (Stage3pair_T *) MALLOCA(n * sizeof(Stage3pair_T));
13062 List_fill_array((void **) hitpairs,hitpairlist);
13063 Hitlist_free(&hitpairlist);
13064 #else
13065 eliminate = (bool *) CALLOC(n,sizeof(bool));
13066 parenti = (int *) CALLOC(n,sizeof(int));
13067 hitpairs = (Stage3pair_T *) List_to_array(hitpairlist,NULL);
13068 Hitlist_free(&hitpairlist);
13069 #endif
13070 }
13071
13072 /* Step 1. Check for exact duplicates */
13073 debug8(printf(" Step 1. Checking for exact duplicates\n"));
13074 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_sort_cmp);
13075
13076 debug8(
13077 for (i = 0; i < n; i++) {
13078 hitpair = hitpairs[i];
13079 printf(" Initial %d (%s, %s-%s): %p, %u..%u|%u..%u (dir = %d), circularalias %d|%d, nmatches: %d+%d (%d+%d to trims), amb_lengths %d and %d, sensedirs %d and %d.",
13080 i,Pairtype_string(hitpair->pairtype),Method_string(hitpair->hit5->method),
13081 Method_string(hitpair->hit3->method),hitpair,
13082 hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13083 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13084 hitpair->dir,hitpair->hit5->circularalias,hitpair->hit3->circularalias,
13085 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13086 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13087 amb_length(hitpair->hit5),amb_length(hitpair->hit3),hitpair->hit5->sensedir,hitpair->hit3->sensedir);
13088 if (hitpair->hit5->hittype == TRANSLOC_SPLICE) {
13089 printf(" 5' TRANSLOC splice probs %f",hitpair->hit5->splice_score);
13090 }
13091 if (hitpair->hit3->hittype == TRANSLOC_SPLICE) {
13092 printf(" 3' TRANSLOC splice probs %f",hitpair->hit3->splice_score);
13093 }
13094 printf("\n");
13095 }
13096 );
13097
13098 i = 0;
13099 while (i < n) {
13100 j = i+1;
13101 debug8(printf(" %d,%d",i,j));
13102 while (j < n && hitpair_equal(hitpairs[j],hitpairs[i]) == true) {
13103 debug8(printf(" %d is identical to %d => eliminating\n",j,i));
13104 eliminate[j] = true;
13105 parenti[j] = i;
13106 j++;
13107 }
13108 i = j;
13109 }
13110 debug8(printf("\n"));
13111
13112 nkept = 0;
13113 for (i = 0; i < n; i++) {
13114 if (eliminate[i] == false) {
13115 nkept++;
13116 }
13117 }
13118 debug8(printf("nkept = %d\n",nkept));
13119
13120 if (nkept == 0) {
13121 /* All entries eliminated one another, so keep the first one */
13122 debug8(printf("All entries eliminate one another, so keep the first one\n"));
13123 eliminate[0] = false;
13124 nkept = 1;
13125 }
13126
13127 for (i = n - 1; i >= 0; --i) {
13128 hitpair = hitpairs[i];
13129 if (eliminate[i] == false) {
13130 debug8(printf(" Keeping %s|%s %u..%u|%u..%u, nmatches (trimmed) %d+%d, score %d+%d, (dir = %d)\n",
13131 Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
13132 hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13133 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13134 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13135 hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,hitpair->dir));
13136 unique = Hitlist_push(unique,hitlistpool,(void *) hitpair);
13137
13138 } else {
13139 debug8(printf(" Eliminating %s|%s %u..%u|%u..%u, nmatches (trimmed) %d+%d, score %d+%d, (dir = %d)\n",
13140 Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
13141 hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13142 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13143 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13144 hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,hitpair->dir));
13145
13146 parent = hitpairs[parenti[i]];
13147 Stage3pair_transfer_transcripts_one(parent,hitpair);
13148 Stage3pair_free(&hitpair);
13149 }
13150 }
13151
13152 #ifdef USE_ALLOCA_FOR_HITS
13153 FREEA(hitpairs);
13154 FREEA(eliminate);
13155 FREEA(parenti);
13156 #else
13157 FREE(hitpairs);
13158 FREE(eliminate);
13159 FREE(parenti);
13160 #endif
13161
13162
13163 debug8(printf(" Step 2. Checking for bad superstretches\n"));
13164 if (0 && translocp == true) {
13165 return unique;
13166 } else {
13167 return pair_remove_bad_superstretches(&keep_p,/*superstretch*/NULL,unique,
13168 hitlistpool,querylength5,querylength3,finalp);
13169 }
13170 }
13171
13172
13173 static int
calc_insertlength_score(Chrpos_T insertlength)13174 calc_insertlength_score (Chrpos_T insertlength) {
13175 if (insertlength > 80000) {
13176 return 2;
13177 } else if (insertlength > 1000) {
13178 return 1;
13179 } else {
13180 return 0;
13181 }
13182 }
13183
13184
13185 List_T
Stage3pair_remove_overlaps(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool translocp,bool finalp)13186 Stage3pair_remove_overlaps (List_T hitpairlist, Hitlistpool_T hitlistpool,
13187 int querylength5, int querylength3,
13188 bool translocp, bool finalp) {
13189 List_T optimal, unique_separate, unique_overlapping,
13190 separate = NULL, overlapping = NULL, p;
13191 Stage3pair_T hitpair_separate, hitpair_overlapping, *hitpairs, hitpair;
13192
13193 Stage3pair_T *array_separate, *array_overlapping;
13194 Univcoord_T low, high;
13195
13196 int max_adj_nmatches, score;
13197 int best_nsegments, nsegments;
13198 int best_insertlength_score, insertlength_score;
13199 double max_splice_score, splice_score;
13200 Chrpos_T best_outerlength;
13201
13202 bool subsumedp, equalp, *eliminate, keptp;
13203 int n_separate, n_overlapping, n, i, j, k;
13204
13205
13206 debug8(printf("Entered Stage3pair_remove_overlaps with %d hitpairs\n",List_length(hitpairlist)));
13207 for (p = hitpairlist; p != NULL; p = List_next(p)) {
13208 hitpair = (Stage3pair_T) List_head(p);
13209 if (hitpair->insertlength <= (Chrpos_T) (hitpair->hit5->querylength + hitpair->hit3->querylength)) {
13210 overlapping = Hitlist_push(overlapping,hitlistpool,(void *) hitpair);
13211 } else {
13212 separate = Hitlist_push(separate,hitlistpool,(void *) hitpair);
13213 }
13214 }
13215 Hitlist_free(&hitpairlist);
13216
13217 debug8(printf("Calling Stage3pair_remove_overlaps for separate pair ends\n"));
13218 unique_separate = pair_remove_overlaps(separate,hitlistpool,querylength5,querylength3,translocp,finalp);
13219
13220 debug8(printf("Calling Stage3pair_remove_overlaps for overlapping pair ends\n"));
13221 unique_overlapping = pair_remove_overlaps(overlapping,hitlistpool,querylength5,querylength3,translocp,finalp);
13222
13223 if (unique_overlapping == NULL) {
13224 debug8(printf("Unique overlapping is NULL\n"));
13225 hitpairlist = unique_separate;
13226 } else if (unique_separate == NULL) {
13227 debug8(printf("Unique separate is NULL\n"));
13228 hitpairlist = unique_overlapping;
13229 } else {
13230 debug8(printf("Have both overlapping and separate\n"));
13231 n_overlapping = List_length(unique_overlapping);
13232 #ifdef USE_ALLOCA_FOR_HITS
13233 array_overlapping = (Stage3pair_T *) MALLOCA(n_overlapping * sizeof(Stage3pair_T));
13234 List_fill_array((void **) array_overlapping,unique_overlapping);
13235 #else
13236 array_overlapping = (Stage3pair_T *) List_to_array(unique_overlapping,NULL);
13237 #endif
13238
13239 n_separate = List_length(unique_separate);
13240 #ifdef USE_ALLOCA_FOR_HITS
13241 array_separate = (Stage3pair_T *) MALLOCA(n_separate * sizeof(Stage3pair_T));
13242 List_fill_array((void **) array_separate,unique_separate);
13243 #else
13244 array_separate = (Stage3pair_T *) List_to_array(unique_separate,NULL);
13245 #endif
13246
13247 qsort(array_overlapping,n_overlapping,sizeof(Stage3pair_T),hitpair_position_cmp);
13248 qsort(array_separate,n_separate,sizeof(Stage3pair_T),hitpair_position_cmp);
13249
13250 /* 1. First, favor overlapping (with smaller insertlengths) */
13251 /* Keep unique_overlapping and filter unique_separate into indep_separate */
13252 Hitlist_free(&unique_separate);
13253 unique_separate = (List_T) NULL;
13254
13255 i = j = 0;
13256 for (i = 0; i < n_separate; i++) {
13257 hitpair_separate = array_separate[i];
13258 low = hitpair_separate->low;
13259 high = hitpair_separate->high;
13260 while (j >= 0 && array_overlapping[j]->high >= low) {
13261 j--;
13262 }
13263 j += 1;
13264
13265 subsumedp = false;
13266 while (j < n_overlapping && subsumedp == false && array_overlapping[j]->low <= high) {
13267 if (hitpair_goodness_cmp(&equalp,array_overlapping[j],
13268 hitpair_separate,finalp) > 0) {
13269 debug8(printf("overlapping pair %d better than separate pair %d\n",j,i));
13270 subsumedp = hitpair_subsumption(hitpair_separate,array_overlapping[j]);
13271 debug8(printf(" checking if separate pair %d subsumes overlapping pair %d => %d\n",
13272 i,j,subsumedp));
13273 }
13274 j++;
13275 }
13276 j -= 1;
13277
13278 if (subsumedp == true) {
13279 Stage3pair_free(&hitpair_separate);
13280 } else {
13281 unique_separate = Hitlist_push(unique_separate,hitlistpool,(void *) hitpair_separate);
13282 }
13283 }
13284
13285 #ifdef USE_ALLOCA_FOR_HITS
13286 FREEA(array_separate);
13287 #else
13288 FREE(array_separate);
13289 #endif
13290
13291 if ((n_separate = List_length(unique_separate)) == 0) {
13292 #ifdef USE_ALLOCA_FOR_HITS
13293 FREEA(array_overlapping);
13294 #else
13295 FREE(array_overlapping);
13296 #endif
13297 hitpairlist = unique_overlapping;
13298
13299 } else {
13300 #ifdef USE_ALLOCA_FOR_HITS
13301 array_separate = (Stage3pair_T *) MALLOCA(n_separate * sizeof(Stage3pair_T));
13302 List_fill_array((void **) array_separate,unique_separate);
13303 #else
13304 array_separate = (Stage3pair_T *) List_to_array(unique_separate,NULL);
13305 #endif
13306
13307 /* 2. Second, favor separate (with larger insertlengths) */
13308 /* Keep indep_separate and filter unique_overlapping into indep_overlapping */
13309 Hitlist_free(&unique_overlapping);
13310 unique_overlapping = (List_T) NULL;
13311
13312 i = j = 0;
13313 for (i = 0; i < n_overlapping; i++) {
13314 hitpair_overlapping = array_overlapping[i];
13315 low = hitpair_overlapping->low;
13316 high = hitpair_overlapping->high;
13317 while (j >= 0 && array_separate[j]->high >= low) {
13318 j--;
13319 }
13320 j += 1;
13321
13322 subsumedp = false;
13323 while (j < n_separate && subsumedp == false && array_separate[j]->low <= high) {
13324 if (hitpair_goodness_cmp(&equalp,array_separate[j],
13325 hitpair_overlapping,finalp) > 0) {
13326 debug8(printf("separate pair %d better than overlapping pair %d\n",j,i));
13327 subsumedp = hitpair_subsumption(array_separate[j],hitpair_overlapping);
13328 debug8(printf(" checking if separate pair %d subsumes overlapping pair %d => %d\n",
13329 j,i,subsumedp));
13330 }
13331 j++;
13332 }
13333 j -= 1;
13334
13335 if (subsumedp == true) {
13336 Stage3pair_free(&hitpair_overlapping);
13337 } else {
13338 unique_overlapping = Hitlist_push(unique_overlapping,hitlistpool,(void *) hitpair_overlapping);
13339 }
13340 }
13341 }
13342
13343 #ifdef USE_ALLOCA_FOR_HITS
13344 FREEA(array_separate);
13345 FREEA(array_overlapping);
13346 #else
13347 FREE(array_separate);
13348 FREE(array_overlapping);
13349 #endif
13350
13351 hitpairlist = List_append(unique_overlapping,unique_separate);
13352 }
13353
13354
13355 /* Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
13356 /* Same as step 1 of Stage3pair_optimal_score_final */
13357 debug8(printf(" Step 3. Maximize nmatches adjusted by score (with slop)\n"));
13358 optimal = (List_T) NULL;
13359
13360 keptp = false;
13361 hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13362 eliminate = (bool *) CALLOC(n,sizeof(bool));
13363 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13364 i = 0;
13365 while (i < n) {
13366 j = i+1;
13367 while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13368 j++;
13369 }
13370 if (j - i > 1) {
13371 debug8(printf("Found a group from %d to %d\n",i,j));
13372 max_adj_nmatches = 0;
13373 for (k = i; k < j; k++) {
13374 hitpair = hitpairs[k];
13375 if ((score = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13376 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall) > max_adj_nmatches) {
13377 max_adj_nmatches = score;
13378 }
13379 }
13380 debug8(printf("max_adj_nmatches = %d\n",max_adj_nmatches));
13381
13382 for (k = i; k < j; k++) {
13383 hitpair = hitpairs[k];
13384 if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13385 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall < max_adj_nmatches - ADJ_NMATCHES_SLOP) {
13386 debug8(printf("Within loci pair (adj score %d (%d+%d -%d-%d) < %d w/slop): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13387 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13388 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,
13389 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13390 hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,max_adj_nmatches,
13391 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13392 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13393 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13394 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13395 eliminate[k] = true;
13396
13397 } else {
13398 debug8(printf("Within loci pair (adj score %d (%d+%d -%d-%d) == %d w/slop): Keeping hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13399 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
13400 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,
13401 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13402 hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,max_adj_nmatches,
13403 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13404 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13405 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13406 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13407 keptp = true;
13408 }
13409 }
13410 }
13411
13412 i = j;
13413 }
13414
13415 if (keptp == false) {
13416 optimal = hitpairlist;
13417 } else {
13418 for (k = 0; k < n; k++) {
13419 hitpair = hitpairs[k];
13420 if (eliminate[k] == true) {
13421 debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13422 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13423 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13424 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13425 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13426 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13427 Stage3pair_free(&hitpair);
13428 /* eliminatedp = true; */
13429 } else {
13430 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13431 }
13432 }
13433 Hitlist_free(&hitpairlist);
13434 }
13435 FREE(hitpairs);
13436 FREE(eliminate);
13437 hitpairlist = optimal;
13438
13439
13440 /* Eliminate within loci: minimize nsegments and maximize splice score (for approximately equal insertlengths) */
13441 /* Since we have achieved same number of matches, we should minimize nsegments to achieve parsimony */
13442 debug8(printf(" Step 4. Minimize nsegments and splice score (for approximately equal insertlengths)\n"));
13443 optimal = (List_T) NULL;
13444
13445 keptp = false;
13446 hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13447 eliminate = (bool *) CALLOC(n,sizeof(bool));
13448 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13449 i = 0;
13450 while (i < n) {
13451 j = i+1;
13452 while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13453 j++;
13454 }
13455 if (j - i > 1) {
13456 debug8(printf("Found a group from %d to %d\n",i,j));
13457 best_nsegments = querylength5 + querylength3;
13458 best_insertlength_score = 99;
13459 max_splice_score = 0.0;
13460 for (k = i; k < j; k++) {
13461 hitpair = hitpairs[k];
13462 if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
13463 best_nsegments = nsegments;
13464 best_insertlength_score = calc_insertlength_score(hitpair->insertlength);
13465 max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
13466
13467 } else if (nsegments == best_nsegments) {
13468 if ((insertlength_score = calc_insertlength_score(hitpair->insertlength)) < best_insertlength_score) {
13469 best_insertlength_score = insertlength_score;
13470 max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
13471
13472 } else if (insertlength_score == best_insertlength_score) {
13473 if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
13474 max_splice_score = splice_score;
13475 }
13476 }
13477 }
13478 }
13479 debug8(printf("best_nsegments %d, best_insertlength_score %d, max_splice_score %f\n",
13480 best_nsegments,best_insertlength_score,max_splice_score));
13481
13482 for (k = i; k < j; k++) {
13483 hitpair = hitpairs[k];
13484 if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
13485 debug8(printf("Within loci pair (nsegments %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13486 nsegments,best_nsegments,
13487 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13488 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13489 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13490 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13491 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13492 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13493 eliminate[k] = true;
13494
13495 } else if (calc_insertlength_score(hitpair->insertlength) > best_insertlength_score) {
13496 debug8(printf("Within loci pair (insertlength score %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13497 calc_insertlength_score(hitpair->insertlength),best_insertlength_score,
13498 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13499 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13500 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13501 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13502 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13503 eliminate[k] = true;
13504
13505 } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
13506 debug8(printf("Within loci pair (splice_score w/slop %f < %f): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13507 hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
13508 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13509 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13510 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13511 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13512 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13513 eliminate[k] = true;
13514
13515 } else {
13516 keptp = true;
13517 }
13518 }
13519 }
13520
13521 i = j;
13522 }
13523
13524 if (keptp == false) {
13525 optimal = hitpairlist;
13526 } else {
13527 for (k = 0; k < n; k++) {
13528 hitpair = hitpairs[k];
13529 if (eliminate[k] == true) {
13530 debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13531 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13532 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13533 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13534 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13535 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13536 Stage3pair_free(&hitpair);
13537 /* eliminatedp = true; */
13538 } else {
13539 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13540 }
13541 }
13542 Hitlist_free(&hitpairlist);
13543 }
13544 FREE(hitpairs);
13545 FREE(eliminate);
13546 hitpairlist = optimal;
13547
13548
13549 /* Eliminate within loci: minimize outerlength */
13550 debug8(printf(" Step 5. Minimize outerlength\n"));
13551 optimal = (List_T) NULL;
13552
13553 keptp = false;
13554 hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
13555 eliminate = (bool *) CALLOC(n,sizeof(bool));
13556 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
13557 i = 0;
13558 while (i < n) {
13559 j = i+1;
13560 while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
13561 j++;
13562 }
13563 if (j - i > 1) {
13564 debug8(printf("Found a group from %d to %d\n",i,j));
13565 best_outerlength = (Chrpos_T) -1U;
13566 for (k = i; k < j; k++) {
13567 hitpair = hitpairs[k];
13568 if (hitpair->outerlength < best_outerlength) {
13569 best_outerlength = hitpair->outerlength;
13570 }
13571 }
13572 debug8(printf("best_outerlength %u\n",best_outerlength));
13573
13574 for (k = i; k < j; k++) {
13575 hitpair = hitpairs[k];
13576 if (hitpair->outerlength > best_outerlength) {
13577 debug8(printf("Within loci pair (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13578 hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
13579 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13580 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13581 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13582 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13583 eliminate[k] = true;
13584
13585 } else {
13586 debug8(printf("Within loci pair (outerlength %u == %u): Keeping hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims)\n",
13587 hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
13588 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13589 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13590 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
13591 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims));
13592 keptp = true;
13593 }
13594 }
13595 }
13596
13597 i = j;
13598 }
13599
13600 if (keptp == false) {
13601 optimal = hitpairlist;
13602 } else {
13603 for (k = 0; k < n; k++) {
13604 hitpair = hitpairs[k];
13605 if (eliminate[k] == true) {
13606 debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
13607 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
13608 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
13609 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
13610 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
13611 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
13612 Stage3pair_free(&hitpair);
13613 /* eliminatedp = true; */
13614 } else {
13615 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
13616 }
13617 }
13618 Hitlist_free(&hitpairlist);
13619 }
13620 FREE(hitpairs);
13621 FREE(eliminate);
13622 hitpairlist = optimal;
13623
13624
13625 return hitpairlist;
13626 }
13627
13628
#ifdef PRE_RESOLVE_MULTIMAPPING
/* Narrows a list of multimapping hit pairs using optional annotation
   tracks, and returns the narrowed list.  A list of at most one pair is
   returned as is.

   Tally filter (only when tally_iit is non-NULL): each pair's tally is
   set to the sum of Stage3end_compute_tally over its two ends.  If the
   best tally is non-zero, pairs whose tally is below the threshold
   (best/TALLY_RATIO with USE_TALLY_RATIO, otherwise 1.0) are freed.

   Runlength filter (only when runlength_iit is non-NULL and more than one
   pair remains): if any pair has an end for which Stage3end_runlength_p
   is true, pairs with no such end are freed.

   Whenever a filter rebuilds the list, the cells of the previous list are
   released.  The gene-overlap filter is compiled out. */
List_T
Stage3pair_resolve_multimapping (List_T hitpairs, Hitlistpool_T hitlistpool) {
  List_T after_genes, after_tally, after_runlength, cell;
  Stage3pair_T pair;

  long int top_tally;
  double min_tally;
  bool any_runlength_p;


  if (List_length(hitpairs) <= 1) {
    return hitpairs;
  }

#if 0
  if (genes_iit == NULL) {
    after_genes = hitpairs;
  } else {
    best_overlap = NO_KNOWN_GENE;
    for (cell = hitpairs; cell != NULL; cell = cell->rest) {
      pair = (Stage3pair_T) cell->first;
      if ((pair->gene_overlap = Stage3pair_gene_overlap(pair)) > best_overlap) {
        best_overlap = pair->gene_overlap;
      }
    }
    if (best_overlap == NO_KNOWN_GENE) {
      after_genes = hitpairs;
    } else {
      after_genes = (List_T) NULL;
      for (cell = hitpairs; cell != NULL; cell = cell->rest) {
        pair = (Stage3pair_T) cell->first;
        if (pair->gene_overlap < best_overlap) {
          Stage3pair_free(&pair);
        } else {
          after_genes = Hitlist_push(after_genes,hitlistpool,(void *) pair);
        }
      }
      Hitlist_free(&hitpairs);
    }
  }

  if (List_length(after_genes) <= 1) {
    return after_genes;
  }
#else
  after_genes = hitpairs;
#endif

  /* Tally filter: by default the list passes through unchanged */
  after_tally = after_genes;
  if (tally_iit != NULL) {
    top_tally = 0L;
    cell = after_genes;
    while (cell != NULL) {
      pair = (Stage3pair_T) cell->first;
      pair->tally = Stage3end_compute_tally(pair->hit5) + Stage3end_compute_tally(pair->hit3);
      if (pair->tally > top_tally) {
        top_tally = pair->tally;
      }
      cell = cell->rest;
    }

    if (top_tally != 0L) {
#ifdef USE_TALLY_RATIO
      min_tally = (double) top_tally / TALLY_RATIO;
#else
      min_tally = 1.0;
#endif
      after_tally = (List_T) NULL;
      cell = after_genes;
      while (cell != NULL) {
        pair = (Stage3pair_T) cell->first;
        if ((double) pair->tally < min_tally) {
          Stage3pair_free(&pair);
        } else {
          after_tally = Hitlist_push(after_tally,hitlistpool,(void *) pair);
        }
        cell = cell->rest;
      }
      Hitlist_free(&after_genes);
    }
  }

  if (List_length(after_tally) <= 1) {
    return after_tally;
  }

  /* Runlength filter: again, pass-through unless some pair qualifies */
  after_runlength = after_tally;
  if (runlength_iit != NULL) {
    any_runlength_p = false;
    cell = after_tally;
    while (cell != NULL) {
      pair = (Stage3pair_T) cell->first;
      if (Stage3end_runlength_p(pair->hit5) == true || Stage3end_runlength_p(pair->hit3) == true) {
        any_runlength_p = true;
      }
      cell = cell->rest;
    }

    if (any_runlength_p != false) {
      after_runlength = (List_T) NULL;
      cell = after_tally;
      while (cell != NULL) {
        pair = (Stage3pair_T) cell->first;
        if (Stage3end_runlength_p(pair->hit5) == false && Stage3end_runlength_p(pair->hit3) == false) {
          Stage3pair_free(&pair);
        } else {
          after_runlength = Hitlist_push(after_runlength,hitlistpool,(void *) pair);
        }
        cell = cell->rest;
      }
      Hitlist_free(&after_tally);
    }
  }


  return after_runlength;
}
#endif
13743
13744
#if 0
/* Eliminates entire pair even if only one end is bad.  Should filter each end, and not each pair */
/* Disabled.  Returns the subset of hits in which both ends pass the
   mismatch and coverage limits; the cells of the input list are released
   and every rejected pair is freed.

   A pair is rejected when either end has refalt_score_within_trims above
   its max_mismatches limit, or when either end's untrimmed length
   (querylength - trim_querystart - trim_queryend) is below its
   min_coverage limit. */
List_T
Stage3pair_filter (List_T hits, Hitlistpool_T hitlistpool,
                   int max_mismatches_5, int max_mismatches_3,
                   int min_coverage_5, int min_coverage_3) {
  List_T newhits = NULL, p;
  Stage3end_T hit5, hit3;
  Stage3pair_T hitpair;

  /* Previously had option filter_within_trims_p to look at ref_score_overall */
  for (p = hits; p != NULL; p = List_next(p)) {
    hitpair = (Stage3pair_T) List_head(p);
    hit5 = hitpair->hit5;
    hit3 = hitpair->hit3;
    debug(printf("refalt_score_within_trims is %d and %d\n",hit5->refalt_score_within_trims,hit3->refalt_score_within_trims));

    if (hit5->refalt_score_within_trims > max_mismatches_5 || hit3->refalt_score_within_trims > max_mismatches_3) {
      Stage3pair_free(&hitpair);
    } else if (hit5->querylength - hit5->trim_querystart - hit5->trim_queryend < min_coverage_5 ||
               hit3->querylength - hit3->trim_querystart - hit3->trim_queryend < min_coverage_3) {
      /* One short end is enough to reject the pair, matching the mismatch test above */
      Stage3pair_free(&hitpair);
    } else {
      newhits = Hitlist_push(newhits,hitlistpool,(void *) hitpair);
    }
  }

  Hitlist_free(&hits);
  return newhits;
}
#endif
13776
13777
13778 Stage3pair_T *
Stage3pair_eval_and_sort(int npaths,int * first_absmq,int * second_absmq,Stage3pair_T * stage3pairarray,char * queryuc_ptr_5,char * queryuc_ptr_3,char * quality_string_5,char * quality_string_3)13779 Stage3pair_eval_and_sort (int npaths, int *first_absmq, int *second_absmq,
13780 Stage3pair_T *stage3pairarray,
13781 char *queryuc_ptr_5, char *queryuc_ptr_3,
13782 char *quality_string_5, char *quality_string_3) {
13783 float maxlik, loglik;
13784
13785 float total, q;
13786 int mapq_score;
13787
13788 int compute_npaths;
13789 int randomi, i;
13790 Stage3pair_T temp, hitpair;
13791
13792 if (npaths == 0) {
13793 /* Skip */
13794 *first_absmq = 0;
13795 *second_absmq = 0;
13796
13797 } else if (npaths == 1) {
13798 hitpair = stage3pairarray[0];
13799 hitpair->mapq_loglik = MAPQ_MAXIMUM_SCORE;
13800 hitpair->mapq_score = MAPQ_max_quality_score(quality_string_5,hitpair->hit5->querylength);
13801 if ((mapq_score = MAPQ_max_quality_score(quality_string_3,hitpair->hit3->querylength)) > stage3pairarray[0]->mapq_score) {
13802 hitpair->mapq_score = mapq_score;
13803 }
13804 hitpair->absmq_score = MAPQ_MAXIMUM_SCORE;
13805
13806 Stage3end_display_prep(hitpair->hit5,queryuc_ptr_5,/*first_read_p*/true);
13807 Stage3end_display_prep(hitpair->hit3,queryuc_ptr_3,/*first_read_p*/false);
13808
13809 *first_absmq = hitpair->absmq_score;
13810 *second_absmq = 0;
13811
13812 } else {
13813
13814 /* Resolve ambiguities, needed for computing mapq */
13815 for (i = 0; i < npaths; i++) {
13816 hitpair = stage3pairarray[i];
13817 Stage3end_display_prep(hitpair->hit5,queryuc_ptr_5,/*first_read_p*/true);
13818 Stage3end_display_prep(hitpair->hit3,queryuc_ptr_3,/*first_read_p*/false);
13819 }
13820
13821
13822 /* Compute mapq_loglik */
13823 for (i = 0; i < npaths; i++) {
13824 hitpair = stage3pairarray[i];
13825 hitpair->mapq_loglik =
13826 Stage3end_compute_mapq(hitpair->hit5,quality_string_5);
13827 hitpair->mapq_loglik +=
13828 Stage3end_compute_mapq(hitpair->hit3,quality_string_3);
13829 }
13830
13831 /* Sort by nmatches, then mapq, and then insert length */
13832 qsort(stage3pairarray,npaths,sizeof(Stage3pair_T),Stage3pair_output_cmp);
13833
13834 if (want_random_p) {
13835 /* Randomize among best alignments */
13836 i = 1;
13837 while (i < npaths && Stage3pair_output_cmp(&(stage3pairarray[i]),&(stage3pairarray[0])) == 0) {
13838 i++;
13839 }
13840 if (i > 1) { /* i is number of ties */
13841 /* randomi = (int) ((double) i * rand()/((double) RAND_MAX + 1.0)); */
13842 randomi = (int) (rand() / (((double) RAND_MAX + 1.0) / (double) i));
13843 /* fprintf(stderr,"%d dups => random %d\n",i,randomi); */
13844 temp = stage3pairarray[0];
13845 stage3pairarray[0] = stage3pairarray[randomi];
13846 stage3pairarray[randomi] = temp;
13847 }
13848 }
13849
13850 /* Enforce monotonicity */
13851 for (i = npaths - 1; i > 0; i--) {
13852 if (stage3pairarray[i-1]->mapq_loglik < stage3pairarray[i]->mapq_loglik) {
13853 stage3pairarray[i-1]->mapq_loglik = stage3pairarray[i]->mapq_loglik;
13854 }
13855 }
13856 maxlik = stage3pairarray[0]->mapq_loglik;
13857
13858 /* Subtract maxlik to avoid underflow */
13859 for (i = 0; i < npaths; i++) {
13860 stage3pairarray[i]->mapq_loglik -= maxlik;
13861 }
13862
13863 #if 0
13864 /* Save on computation if possible */
13865 /* Doesn't work */
13866 if (npaths < maxpaths) {
13867 compute_npaths = npaths;
13868 } else {
13869 compute_npaths = maxpaths;
13870 }
13871 if (compute_npaths < 2) {
13872 compute_npaths = 2;
13873 }
13874 #else
13875 compute_npaths = npaths;
13876 #endif
13877
13878
13879 /* Compute absolute mapq */
13880 for (i = 0; i < compute_npaths; i++) {
13881 loglik = stage3pairarray[i]->mapq_loglik + MAPQ_MAXIMUM_SCORE;
13882 if (loglik < 0.0) {
13883 loglik = 0.0;
13884 }
13885 stage3pairarray[i]->absmq_score = rint(loglik);
13886 }
13887 *first_absmq = stage3pairarray[0]->absmq_score;
13888 *second_absmq = stage3pairarray[1]->absmq_score;
13889
13890
13891 /* Compute Bayesian mapq */
13892 total = 0.0;
13893 for (i = 0; i < npaths; i++) {
13894 total += (stage3pairarray[i]->mapq_loglik = fasterexp(stage3pairarray[i]->mapq_loglik));
13895 }
13896
13897 /* Obtain posterior probabilities of being true */
13898 for (i = 0; i < compute_npaths; i++) {
13899 stage3pairarray[i]->mapq_loglik /= total;
13900 }
13901
13902 /* Convert to Phred scores */
13903 for (i = 0; i < compute_npaths; i++) {
13904 if ((q = 1.0 - stage3pairarray[i]->mapq_loglik) < 2.5e-10 /* 10^-9.6 */) {
13905 stage3pairarray[i]->mapq_score = 96;
13906 } else {
13907 stage3pairarray[i]->mapq_score = rint(-10.0 * log10(q));
13908 }
13909 }
13910
13911 #if 0
13912 /* Apply filtering for mapq unique -- currently not used since mapq_unique_score is high */
13913 if (stage3pairarray[0]->mapq_score >= mapq_unique_score &&
13914 stage3pairarray[1]->mapq_score < mapq_unique_score) {
13915 for (i = 1; i < *npaths; i++) {
13916 Stage3pair_free(&(stage3pairarray[i]));
13917 }
13918 *npaths = 1;
13919 }
13920 #endif
13921 }
13922
13923 return stage3pairarray;
13924 }
13925
13926
13927 static List_T
Stage3pair_optimal_score_prefinal(bool * eliminatedp,List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3)13928 Stage3pair_optimal_score_prefinal (bool *eliminatedp, List_T hitpairlist,
13929 Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
13930 List_T optimal = NULL, p, q;
13931 Stage3pair_T hitpair;
13932 T hit5, hit3;
13933 Substring_T substring;
13934 Junction_T junction;
13935 int cutoff_level_5, cutoff_level_3, ref_nmismatches;
13936 int n;
13937 int minscore5 = querylength5, minscore3 = querylength3, minscore = querylength5 + querylength3;
13938 #ifdef USE_OPTIMAL_SCORE_BINGO
13939 int minscore_bingo = querylength5 + querylength3;
13940 #endif
13941 int trim_querystart_5 = 0, trim_queryend_5 = 0, trim_querystart_3 = 0, trim_queryend_3 = 0,
13942 trim_querystart_0, trim_queryend_0;
13943
13944
13945 #if 0 /* DISTANT_SPLICE_SPECIAL */
13946 bool shortdistance_p = false;
13947 #endif
13948
13949
13950 *eliminatedp = false;
13951 n = List_length(hitpairlist);
13952 debug8(printf("\nEntered Stage3pair_optimal_score_prefinal with %d hitpairs\n",n));
13953
13954 if (n <= 1) {
13955 return hitpairlist;
13956 }
13957
13958
13959 /* Use eventrim for comparing alignments. Previously picked
13960 smallest trims, but now picking largest ones */
13961 for (p = hitpairlist; p != NULL; p = p->rest) {
13962 hitpair = (Stage3pair_T) p->first;
13963 hit5 = hitpair->hit5;
13964 hit3 = hitpair->hit3;
13965
13966 debug8(printf("hit5 %u..%u method %s, nsegments %d, nindels %d, trim_querystart: %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d, sensedir %d, splice score %f\n",
13967 hit5->genomicstart - hit5->chroffset,hit5->genomicend - hit5->chroffset,Method_string(hit5->method),
13968 hit5->nsegments,hit5->nindels,hit5->trim_querystart,hit5->trim_querystart_splicep ? " (splice)" : "",
13969 hit5->trim_queryend,hit5->trim_queryend_splicep ? " (splice)" : "",
13970 start_amb_length(hit5),end_amb_length(hit5),hit5->sensedir,hit5->splice_score));
13971
13972 debug8(printf("hit3 %u..%u method %s, nsegments %d, nindels %d, trim_querystart %d%s, trim_queryend %d%s, start_ambig %d, end_ambig %d, sensedir %d, splice score %f\n\n",
13973 hit3->genomicstart - hit3->chroffset,hit3->genomicend - hit3->chroffset,Method_string(hit3->method),
13974 hit3->nsegments,hit3->nindels,hit3->trim_querystart,hit3->trim_querystart_splicep ? " (splice)" : "",
13975 hit3->trim_queryend,hit3->trim_queryend_splicep ? " (splice)" : "",
13976 start_amb_length(hit3),end_amb_length(hit3),hit3->sensedir,hit3->splice_score));
13977
13978 if (hit5->trim_querystart_splicep == true) {
13979 /* Skip */
13980 } else if (hit5->trim_querystart > trim_querystart_5) {
13981 trim_querystart_5 = hit5->trim_querystart;
13982 }
13983 if (hit5->trim_queryend_splicep == true) {
13984 /* Skip */
13985 } else if (hit5->trim_queryend > trim_queryend_5) {
13986 trim_queryend_5 = hit5->trim_queryend;
13987 }
13988
13989 if (hit3->trim_querystart_splicep == true) {
13990 /* Skip */
13991 } else if (hit3->trim_querystart > trim_querystart_3) {
13992 trim_querystart_3 = hit3->trim_querystart;
13993 }
13994 if (hit3->trim_queryend_splicep == true) {
13995 /* Skip */
13996 } else if (hit3->trim_queryend > trim_queryend_3) {
13997 trim_queryend_3 = hit3->trim_queryend;
13998 }
13999 }
14000
14001 if (trim_querystart_5 == querylength5) {
14002 trim_querystart_5 = 0;
14003 }
14004 if (trim_queryend_5 == querylength5) {
14005 trim_queryend_5 = 0;
14006 }
14007 if (trim_querystart_3 == querylength3) {
14008 trim_querystart_3 = 0;
14009 }
14010 if (trim_queryend_3 == querylength3) {
14011 trim_queryend_3 = 0;
14012 }
14013
14014 debug8(printf("overall 5': trim_querystart %d, trim_queryend %d\n",trim_querystart_5,trim_queryend_5));
14015 debug8(printf("overall 3': trim_querystart %d, trim_queryend %d\n",trim_querystart_3,trim_queryend_3));
14016
14017
14018 for (p = hitpairlist; p != NULL; p = p->rest) {
14019 hitpair = (Stage3pair_T) p->first;
14020 hit5 = hitpair->hit5;
14021 hit3 = hitpair->hit3;
14022
14023 #ifdef CONSIDER_ENDS_IN_EVAL
14024 hit5->score_eventrim = hit5->trim_querystart / 8 + hit5->trim_queryend / 8;
14025 #else
14026 hit5->score_eventrim = 0;
14027 #endif
14028
14029 debug8(printf("score 5' OTHER:"));
14030
14031 if (trim_querystart_5 + trim_queryend_5 >= querylength5) {
14032 for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
14033 substring = (Substring_T) List_head(q);
14034 hit5->score_eventrim += Substring_nmismatches_bothdiff(substring);
14035 }
14036
14037 } else {
14038 for (q = hit5->substrings_1toN; q != NULL; q = List_next(q)) {
14039 substring = (Substring_T) List_head(q);
14040 trim_querystart_0 = trim_querystart_5;
14041 trim_queryend_0 = trim_queryend_5;
14042 if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
14043 trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
14044 }
14045 if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
14046 trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
14047 }
14048 hit5->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
14049 debug8(printf(" substring (%d..%d) %d.",trim_querystart_5,trim_queryend_5,
14050 Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
14051 }
14052 }
14053
14054 for (q = hit5->junctions_1toN; q != NULL; q = List_next(q)) {
14055 junction = (Junction_T) List_head(q);
14056 if (Junction_nindels(junction) > 0) {
14057 hit5->score_eventrim += indel_penalty_middle;
14058 debug8(printf(" => add %d.",indel_penalty_middle));
14059 }
14060 }
14061
14062
14063 #if 0
14064 /* Accept a single indel */
14065 #ifdef SCORE_INDELS_EVENTRIM
14066 if (hit5->hittype == INSERTION || hit5->hittype == DELETION) {
14067 debug8(printf(" indel at %d",hit5->indel_pos));
14068 if (hit5->indel_pos > trim_querystart_5 && hit5->indel_pos < querylength5 - trim_queryend_5) {
14069 hit5->score_eventrim += indel_penalty_middle;
14070 debug8(printf(" => add %d.",indel_penalty_middle));
14071 }
14072 }
14073 #endif
14074 #endif
14075 debug8(printf(" RESULT: %d\n",hit5->score_eventrim));
14076
14077 if (hitpair->hit5->score_eventrim < minscore5) {
14078 minscore5 = hitpair->hit5->score_eventrim;
14079 }
14080
14081
14082 #ifdef CONSIDER_ENDS_IN_EVAL
14083 hit3->score_eventrim = hit3->trim_querystart / 8 + hit3->trim_queryend / 8;
14084 #else
14085 hit3->score_eventrim = 0;
14086 #endif
14087
14088 debug8(printf("score 3' OTHER:"));
14089
14090 if (trim_querystart_3 + trim_queryend_3 >= querylength3) {
14091 for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
14092 substring = (Substring_T) List_head(q);
14093 hit3->score_eventrim += Substring_nmismatches_bothdiff(substring);
14094 }
14095
14096 } else {
14097 for (q = hit3->substrings_1toN; q != NULL; q = List_next(q)) {
14098 substring = (Substring_T) List_head(q);
14099 trim_querystart_0 = trim_querystart_3;
14100 trim_queryend_0 = trim_queryend_3;
14101 if (Substring_mandatory_trim_querystart(substring) > trim_querystart_0) {
14102 trim_querystart_0 = Substring_mandatory_trim_querystart(substring);
14103 }
14104 if (Substring_mandatory_trim_queryend(substring) > trim_queryend_0) {
14105 trim_queryend_0 = Substring_mandatory_trim_queryend(substring);
14106 }
14107 hit3->score_eventrim += Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0);
14108 debug8(printf(" substring (%d..%d) %d.",trim_querystart_3,trim_queryend_3,
14109 Substring_count_mismatches_region(&ref_nmismatches,substring,trim_querystart_0,trim_queryend_0)));
14110 }
14111 }
14112
14113 for (q = hit3->junctions_1toN; q != NULL; q = List_next(q)) {
14114 junction = (Junction_T) List_head(q);
14115 if (Junction_nindels(junction) > 0) {
14116 hit3->score_eventrim += indel_penalty_middle;
14117 debug8(printf(" => add %d.",indel_penalty_middle));
14118 }
14119 }
14120
14121 #if 0
14122 /* Accept a single indel */
14123 #ifdef SCORE_INDELS_EVENTRIM
14124 if (hit3->hittype == INSERTION || hit3->hittype == DELETION) {
14125 debug8(printf(" indel at %d",hit3->indel_pos));
14126 if (hit3->indel_pos > trim_querystart_3 && hit3->indel_pos < querylength3 - trim_queryend_3) {
14127 hit3->score_eventrim += indel_penalty_middle;
14128 debug8(printf(" => add %d.",indel_penalty_middle));
14129 }
14130 }
14131 #endif
14132 #endif
14133 debug8(printf(" RESULT: %d\n",hit3->score_eventrim));
14134
14135 if (hitpair->hit3->score_eventrim < minscore3) {
14136 minscore3 = hitpair->hit3->score_eventrim;
14137 }
14138
14139
14140 /* Compute for hitpair */
14141 debug8(printf("hitpair score_eventrim %d = %d + %d\n",
14142 hit5->score_eventrim + hit3->score_eventrim,
14143 hit5->score_eventrim,hit3->score_eventrim));
14144 hitpair->score_eventrim = hit5->score_eventrim + hit3->score_eventrim;
14145 if (hitpair->score_eventrim < minscore) {
14146 minscore = hitpair->score_eventrim;
14147 }
14148
14149 }
14150 debug8(printf("MINSCORE: %d\n",minscore));
14151
14152
14153 /* Prefinal: Use score_eventrim */
14154 debug8(printf("Stage3pair_optimal_score_prefinal over %d pairs: minscore = %d and %d + subopt:%d\n",
14155 n,minscore5,minscore3,subopt_levels));
14156
14157 /* finalp == false. Add suboptimal_mismatches to each end. */
14158 minscore5 += subopt_levels;
14159 minscore3 += subopt_levels;
14160 cutoff_level_5 = minscore5;
14161 cutoff_level_3 = minscore3;
14162
14163 for (p = hitpairlist; p != NULL; p = p->rest) {
14164 hitpair = (Stage3pair_T) p->first;
14165
14166 if (hitpair->hit5->score_eventrim > cutoff_level_5 + SCORE_EVENTRIM_SLOP && hitpair->hit3->score_eventrim > cutoff_level_3 + SCORE_EVENTRIM_SLOP) {
14167 debug8(printf("Prefinal: Eliminating hit pair %p at %u..%u|%u..%u with score_eventrim_5 %d > cutoff_level_5 %d and score_eventrim_3 %d > cutoff_level_3 %d, sensedirs %d and %d, splice scores %f and %f\n",
14168 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14169 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14170 hitpair->hit5->score_eventrim,cutoff_level_5,hitpair->hit3->score_eventrim,cutoff_level_3,
14171 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14172 Stage3pair_free(&hitpair);
14173 *eliminatedp = true;
14174
14175 } else {
14176 debug8(printf("Prefinal: Keeping hit pair %p at %u..%u|%u..%u with score_eventrim_5 %d <= cutoff_level_5 %d or score_eventrim_3 %d <= cutoff_level_3 %d, sensedirs %d and %d, splice scores %f and %f\n",
14177 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14178 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14179 hitpair->hit5->score_eventrim,cutoff_level_5,hitpair->hit3->score_eventrim,cutoff_level_3,
14180 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14181 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14182 }
14183 }
14184 Hitlist_free(&hitpairlist);
14185
14186
14187 #if 0
14188 /* Filter on nsegments */
14189 if (finalp == true && optimal != NULL) {
14190 hitpairlist = optimal;
14191 optimal = (List_T) NULL;
14192
14193 hitpair = (Stage3pair_T) hitpairlist->first;
14194 best_nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments;
14195 best_nsegments_5 = hitpair->hit5->nsegments;
14196 best_nsegments_3 = hitpair->hit3->nsegments;
14197
14198 for (p = hitpairlist; p != NULL; p = p->rest) {
14199 hitpair = (Stage3pair_T) p->first;
14200 if (hitpair->hit5->nsegments + hitpair->hit3->nsegments < best_nsegments) {
14201 best_nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments;
14202 }
14203 if (hitpair->hit5->nsegments < best_nsegments_5) {
14204 best_nsegments_5 = hitpair->hit5->nsegments;
14205 }
14206 if (hitpair->hit3->nsegments < best_nsegments_3) {
14207 best_nsegments_3 = hitpair->hit3->nsegments;
14208 }
14209 }
14210
14211 for (p = hitpairlist; p != NULL; p = p->rest) {
14212 hitpair = (Stage3pair_T) p->first;
14213 if (hitpair->hit5->nsegments + hitpair->hit3->nsegments > best_nsegments + 2) {
14214 debug8(printf("Eliminating hit pair %p with nsegments %d+%d, sensedirs %d and %d, splice scores %f and %f\n",
14215 hitpair,hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14216 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14217 Stage3pair_free(&hitpair);
14218 *eliminatedp = true;
14219 } else {
14220 debug8(printf("Keeping hit pair %p with nsegments %d+%d, sensedirs %d and %d, splice scores %f and %f\n",
14221 hitpair,hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14222 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14223 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14224 }
14225 }
14226
14227 Hitlist_free(&hitpairlist);
14228 }
14229 #endif
14230
14231
14232 #if 0
14233 /* Filter on pairlength */
14234 if (optimal != NULL) {
14235 hitpairlist = optimal;
14236 optimal = (List_T) NULL;
14237
14238 hitpair = (Stage3pair_T) hitpairlist->first;
14239 best_absdifflength = hitpair->absdifflength;
14240 best_outerlength = hitpair->outerlength;
14241
14242 for (p = hitpairlist; p != NULL; p = p->rest) {
14243 hitpair = (Stage3pair_T) p->first;
14244 if (hitpair->absdifflength < best_absdifflength) {
14245 best_absdifflength = hitpair->absdifflength;
14246 best_outerlength = hitpair->outerlength;
14247 } else if (hitpair->absdifflength > best_absdifflength) {
14248 /* Skip */
14249 } else if (hitpair->outerlength < best_outerlength) {
14250 best_outerlength = hitpair->outerlength;
14251 }
14252 }
14253
14254 for (p = hitpairlist; p != NULL; p = p->rest) {
14255 hitpair = (Stage3pair_T) p->first;
14256 if (hitpair->absdifflength > best_absdifflength) {
14257 debug8(printf("Eliminating hit pair %p with absdifflength %d\n",hitpair,hitpair->absdifflength));
14258 Stage3pair_free(&hitpair);
14259 *eliminatedp = true;
14260 } else if (hitpair->outerlength > best_outerlength + OUTERLENGTH_SLOP) {
14261 debug8(printf("Eliminating hit pair %p with outerlength %u\n",hitpair,hitpair->outerlength));
14262 Stage3pair_free(&hitpair);
14263 *eliminatedp = true;
14264 } else {
14265 debug8(printf("Keeping hit pair %p with absdifflength %d and outerlength %d\n",
14266 hitpair,hitpair->absdifflength,hitpair->outerlength));
14267 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14268 }
14269 }
14270
14271 Hitlist_free(&hitpairlist);
14272 }
14273 #endif
14274
14275 debug8(printf("Exiting Stage3pair_optimal_score_prefinal with %d hits\n",List_length(optimal)));
14276 return optimal;
14277 }
14278
14279
14280 /* Desired criteria: (A) within locus: (A.1) nsegments within locus,
14281 to get most complete alignment; (A.2) insertlength; and (A.3)
14282 splice_score, to get the correct sensedir. (B) between loci:
14283 nmatches (and not nmatches_to_trims), to end alignments at the
14284 splice site */
14285
14286 #if 0
14287 static List_T
14288 Stage3pair_optimal_score_final_old (bool *eliminatedp, List_T hitpairlist,
14289 Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
14290 List_T optimal = NULL, p;
14291 Stage3pair_T *hitpairs, hitpair;
14292 int n, i, j, k;
14293 int best_nsegments, nsegments;
14294 int best_insertlength_score, insertlength_score;
14295 int best_nmatches_to_trims, nmatches_to_trims;
14296 double max_splice_score, splice_score;
14297 int max_nmatches = 0, cutoff_level;
14298 /* int trim5_left, trim5_right, trim3_left, trim3_right, min_trim; */
14299 bool *eliminate, keptp;
14300
14301 /* Relies on Path_solve_from_diagonals to maximize the number of segments at each locus */
14302
14303 *eliminatedp = false;
14304 n = List_length(hitpairlist);
14305 debug8(printf("\nEntered Stage3pair_optimal_score_final with %d hitpairs\n",n));
14306
14307 if (n <= 1) {
14308 return hitpairlist;
14309 }
14310
14311 #ifdef DEBUG8
14312 for (p = hitpairlist; p != NULL; p = p->rest) {
14313 hitpair = (Stage3pair_T) p->first;
14314 printf("%p %p %u..%u|%u..%u methods %s and %s, nsegments %d+%d, nmatches %d+%d (%d+%d to trims), pairlength %u, outerlength %u, sensedirs %d and %d, splice scores %f and %f\n",
14315 hitpair->hit5,hitpair->hit3,
14316 hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14317 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14318 Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
14319 hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14320 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14321 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14322 hitpair->insertlength,hitpair->outerlength,
14323 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score);
14324 }
14325 #endif
14326
14327
14328 /* Prune based on refalt_nmatches_plus_spliced_trims (to get the splice ends) */
14329 max_nmatches = 0;
14330 for (p = hitpairlist; p != NULL; p = p->rest) {
14331 hitpair = (Stage3pair_T) p->first;
14332 if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims > max_nmatches) {
14333 max_nmatches = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims;
14334 assert(max_nmatches <= querylength5 + querylength3);
14335 }
14336 }
14337
14338 /* May not want to be greedy on cutoff level here. Might want to raise subopt_levels */
14339 cutoff_level = max_nmatches - subopt_levels;
14340 debug8(printf("(1) refalt cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));
14341
14342 for (p = hitpairlist; p != NULL; p = p->rest) {
14343 hitpair = (Stage3pair_T) p->first;
14344
14345 if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
14346 debug8(printf("Final (nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14347 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14348 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14349 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14350 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14351 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14352 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14353 Stage3pair_free(&hitpair);
14354 *eliminatedp = true;
14355
14356 } else {
14357 debug8(printf("Final (nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14358 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14359 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14360 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14361 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14362 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14363 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14364 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14365 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14366 }
14367 }
14368 Hitlist_free(&hitpairlist);
14369 hitpairlist = optimal;
14370 optimal = (List_T) NULL;
14371
14372
14373 /* Prune based on ref_nmatches_plus_spliced_trims (to get the splice ends) */
14374 max_nmatches = 0;
14375 for (p = hitpairlist; p != NULL; p = p->rest) {
14376 hitpair = (Stage3pair_T) p->first;
14377 if (hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims > max_nmatches) {
14378 max_nmatches = hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims;
14379 assert(max_nmatches <= querylength5 + querylength3);
14380 }
14381 }
14382
14383 /* May not want to be greedy on cutoff level here. Might want to raise subopt_levels */
14384 cutoff_level = max_nmatches - subopt_levels;
14385 debug8(printf("(2) ref cutoff level %d = max_nmatches %d\n",cutoff_level,max_nmatches));
14386
14387 for (p = hitpairlist; p != NULL; p = p->rest) {
14388 hitpair = (Stage3pair_T) p->first;
14389
14390 if (hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims < cutoff_level /*- NMATCHES_SLOP*/) {
14391 debug8(printf("Final (nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14392 hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14393 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14394 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14395 hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14396 hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14397 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14398 Stage3pair_free(&hitpair);
14399 *eliminatedp = true;
14400
14401 } else {
14402 debug8(printf("Final (nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14403 hitpair->hit5->ref_nmatches_plus_spliced_trims + hitpair->hit3->ref_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14404 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14405 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14406 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14407 hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14408 hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14409 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14410 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14411 }
14412 }
14413 Hitlist_free(&hitpairlist);
14414 hitpairlist = optimal;
14415 optimal = (List_T) NULL;
14416
14417
14418 /* Prune based on refalt_nmatches_to_trims */
14419 best_nmatches_to_trims = 0;
14420 for (p = hitpairlist; p != NULL; p = p->rest) {
14421 hitpair = (Stage3pair_T) p->first;
14422 if (hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims > best_nmatches_to_trims) {
14423 best_nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims;
14424 assert(best_nmatches_to_trims <= querylength5 + querylength3);
14425 }
14426 }
14427
14428 cutoff_level = best_nmatches_to_trims - subopt_levels;
14429 debug8(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
14430
14431 /* Do not allow slop for final */
14432 for (p = hitpairlist; p != NULL; p = p->rest) {
14433 hitpair = (Stage3pair_T) p->first;
14434
14435 if (hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
14436 debug8(printf("Final (nmatches_to_trims %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14437 hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14438 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14439 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14440 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14441 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14442 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14443 Stage3pair_free(&hitpair);
14444 *eliminatedp = true;
14445
14446 } else {
14447 debug8(printf("Final (nmatches %d (%d ref) to trims >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14448 hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims,
14449 hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14450 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14451 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14452 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14453 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14454 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14455 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14456 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14457 }
14458 }
14459 Hitlist_free(&hitpairlist);
14460 hitpairlist = optimal;
14461 optimal = (List_T) NULL;
14462
14463
14464 /* Eliminate within loci (1): refalt_nmatches_to_trims_only */
14465 keptp = false;
14466 hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
14467 eliminate = (bool *) CALLOC(n,sizeof(bool));
14468 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
14469 i = 0;
14470 while (i < n) {
14471 j = i+1;
14472 while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
14473 j++;
14474 }
14475 if (j - i > 1) {
14476 debug8(printf("Found a group from %d to %d\n",i,j));
14477 best_nmatches_to_trims = 0;
14478 for (k = i; k < j; k++) {
14479 hitpair = hitpairs[k];
14480 if ((nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims) > best_nmatches_to_trims) {
14481 best_nmatches_to_trims = nmatches_to_trims;
14482 }
14483 }
14484 debug8(printf("best_nmatches_to_trims %d\n",best_nmatches_to_trims));
14485
14486 for (k = i; k < j; k++) {
14487 hitpair = hitpairs[k];
14488 /* Do not allow slop for final */
14489 if ((nmatches_to_trims = hitpair->hit5->refalt_nmatches_to_trims + hitpair->hit3->refalt_nmatches_to_trims) < best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/) {
14490 debug8(printf("Within loci pair (nmatches_to_trims %d < %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14491 nmatches_to_trims,best_nmatches_to_trims /*- NMATCHES_TO_TRIMS_SLOP*/,
14492 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14493 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14494 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14495 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14496 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14497 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14498 eliminate[k] = true;
14499 } else {
14500 keptp = true;
14501 }
14502 }
14503 }
14504
14505 i = j;
14506 }
14507
14508 if (keptp == false) {
14509 optimal = hitpairlist;
14510 } else {
14511 for (k = 0; k < n; k++) {
14512 hitpair = hitpairs[k];
14513 if (eliminate[k] == true) {
14514 debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14515 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14516 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14517 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14518 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14519 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14520 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14521 Stage3pair_free(&hitpair);
14522 *eliminatedp = true;
14523 } else {
14524 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14525 }
14526 }
14527 Hitlist_free(&hitpairlist);
14528 }
14529 FREE(hitpairs);
14530 FREE(eliminate);
14531 hitpairlist = optimal;
14532 optimal = (List_T) NULL;
14533
14534
14535 /* Eliminate within loci (2): nsegments and splice score */
14536 keptp = false;
14537 hitpairs = (Stage3pair_T *) List_to_array_n(&n,hitpairlist);
14538 eliminate = (bool *) CALLOC(n,sizeof(bool));
14539 qsort(hitpairs,n,sizeof(Stage3pair_T),hitpair_position_cmp);
14540 i = 0;
14541 while (i < n) {
14542 j = i+1;
14543 while (j < n && hitpair_overlap_p(hitpairs[j],hitpairs[i]) == true) {
14544 j++;
14545 }
14546 if (j - i > 1) {
14547 debug8(printf("Found a group from %d to %d\n",i,j));
14548 best_nsegments = 0;
14549 best_insertlength_score = 99;
14550 max_splice_score = 0.0;
14551 for (k = i; k < j; k++) {
14552 hitpair = hitpairs[k];
14553 if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) > best_nsegments) {
14554 best_nsegments = nsegments;
14555 best_insertlength_score = calc_insertlength_score(hitpair->insertlength);
14556 max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
14557
14558 } else if (nsegments == best_nsegments) {
14559 if ((insertlength_score = calc_insertlength_score(hitpair->insertlength)) < best_insertlength_score) {
14560 best_insertlength_score = insertlength_score;
14561 max_splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score;
14562
14563 } else if (insertlength_score == best_insertlength_score) {
14564 if ((splice_score = hitpair->hit5->splice_score + hitpair->hit3->splice_score) > max_splice_score) {
14565 max_splice_score = splice_score;
14566 }
14567 }
14568 }
14569 }
14570 debug8(printf("best_nsegments %d, best_insertlength_score %d, max_splice_score %f\n",
14571 best_nsegments,best_insertlength_score,max_splice_score));
14572
14573 for (k = i; k < j; k++) {
14574 hitpair = hitpairs[k];
14575 if ((nsegments = hitpair->hit5->nsegments + hitpair->hit3->nsegments) < best_nsegments) {
14576 debug8(printf("Within loci pair (nsegments %d < %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14577 nsegments,best_nsegments,
14578 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14579 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14580 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14581 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14582 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14583 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14584 eliminate[k] = true;
14585
14586 } else if (calc_insertlength_score(hitpair->insertlength) > best_insertlength_score) {
14587 debug8(printf("Within loci pair (insertlength score %d > %d): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14588 calc_insertlength_score(hitpair->insertlength),best_insertlength_score,
14589 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14590 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14591 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14592 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14593 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14594 eliminate[k] = true;
14595
14596 } else if (hitpair->hit5->splice_score + hitpair->hit3->splice_score < max_splice_score - SPLICE_SCORE_SLOP) {
14597 debug8(printf("Within loci pair (splice_score w/slop %f < %f): Marking hit pair %p for elimination at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14598 hitpair->hit5->splice_score + hitpair->hit3->splice_score,max_splice_score,
14599 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14600 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14601 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14602 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14603 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14604 eliminate[k] = true;
14605
14606 } else {
14607 keptp = true;
14608 }
14609 }
14610 }
14611
14612 i = j;
14613 }
14614
14615 if (keptp == false) {
14616 optimal = hitpairlist;
14617 } else {
14618 for (k = 0; k < n; k++) {
14619 hitpair = hitpairs[k];
14620 if (eliminate[k] == true) {
14621 debug8(printf("Within loci pair: Eliminating hit pair %p at %u..%u|%u..%u with nsegments %d+%d, pairlength %u, nmatches %d+%d (%d+%d to_trims), sensedirs %d and %d, splice scores %f and %f\n",
14622 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14623 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14624 hitpair->hit5->nsegments,hitpair->hit3->nsegments,hitpair->insertlength,
14625 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14626 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14627 Stage3pair_free(&hitpair);
14628 *eliminatedp = true;
14629 } else {
14630 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14631 }
14632 }
14633 Hitlist_free(&hitpairlist);
14634 }
14635 FREE(hitpairs);
14636 FREE(eliminate);
14637 hitpairlist = optimal;
14638 /* optimal = (List_T) NULL; */
14639
14640
14641 #if 0
14642 /* Filter on trim amount */
14643 hitpairlist = optimal;
14644 optimal = (List_T) NULL;
14645 min_trim = querylength5 + querylength3;
14646 for (p = hitpairlist; p != NULL; p = p->rest) {
14647 hitpair = (Stage3pair_T) p->first;
14648
14649 if (hitpair->hit5->trim_querystart_splicep == true) {
14650 /* Skip */
14651 trim5_left = 0;
14652 } else {
14653 trim5_left = hitpair->hit5->trim_querystart;
14654 }
14655 if (hitpair->hit5->trim_queryend_splicep == true) {
14656 /* Skip */
14657 trim5_right = 0;
14658 } else {
14659 trim5_right = hitpair->hit5->trim_queryend;
14660 }
14661
14662 if (hitpair->hit3->trim_querystart_splicep == true) {
14663 /* Skip */
14664 trim3_left = 0;
14665 } else {
14666 trim3_left = hitpair->hit3->trim_querystart;
14667 }
14668 if (hitpair->hit3->trim_queryend_splicep == true) {
14669 /* Skip */
14670 trim3_right = 0;
14671 } else {
14672 trim3_right = hitpair->hit3->trim_queryend;
14673 }
14674
14675 if (trim5_left + trim5_right + trim3_left + trim3_right < min_trim) {
14676 min_trim = trim5_left + trim5_right + trim3_left + trim3_right;
14677 }
14678 }
14679
14680 for (p = hitpairlist; p != NULL; p = p->rest) {
14681 hitpair = (Stage3pair_T) p->first;
14682
14683 if (hitpair->hit5->trim_querystart_splicep == true) {
14684 /* Skip */
14685 trim5_left = 0;
14686 } else {
14687 trim5_left = hitpair->hit5->trim_querystart;
14688 }
14689 if (hitpair->hit5->trim_queryend_splicep == true) {
14690 /* Skip */
14691 trim5_right = 0;
14692 } else {
14693 trim5_right = hitpair->hit5->trim_queryend;
14694 }
14695
14696 if (hitpair->hit3->trim_querystart_splicep == true) {
14697 /* Skip */
14698 trim3_left = 0;
14699 } else {
14700 trim3_left = hitpair->hit3->trim_querystart;
14701 }
14702 if (hitpair->hit3->trim_queryend_splicep == true) {
14703 /* Skip */
14704 trim3_right = 0;
14705 } else {
14706 trim3_right = hitpair->hit3->trim_queryend;
14707 }
14708
14709 if (trim5_left + trim5_right + trim3_left + trim3_right > min_trim) {
14710 debug8(printf("Final (trim): Eliminating hit pair %p at %u..%u|%u..%u for trim %d+%d+%d+%d\n",
14711 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14712 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14713 trim5_left,trim5_right,trim3_left,trim3_right));
14714 Stage3pair_free(&hitpair);
14715 *eliminatedp = true;
14716
14717 } else {
14718 debug8(printf("Final (trim): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) for trim %d+%d+%d+%d\n",
14719 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14720 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14721 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14722 trim5_left,trim5_right,trim3_left,trim3_right));
14723 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14724 }
14725 }
14726 Hitlist_free(&hitpairlist);
14727 hitpairlist = optimal;
14728 optimal = (List_T) NULL;
14729 #endif
14730
14731
14732 #if 0
14733 /* Not good, especially for homologous chromosomes. Use insert_length only within loci, not between */
14734 /* Then find smallest insert length and outerlength across loci */
14735 best_insertlength = (Chrpos_T) -1;
14736 for (p = hitpairlist; p != NULL; p = p->rest) {
14737 hitpair = (Stage3pair_T) p->first;
14738 if (hitpair->insertlength < best_insertlength) {
14739 best_insertlength = hitpair->insertlength;
14740 }
14741 }
14742
14743 for (p = hitpairlist; p != NULL; p = p->rest) {
14744 hitpair = (Stage3pair_T) p->first;
14745
14746 if (hitpair->insertlength > best_insertlength /*+ INSERTLENGTH_SLOP*/) { /* No slop for final */
14747 debug8(printf("Final (insertlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14748 hitpair->insertlength,best_insertlength,
14749 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14750 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14751 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14752 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14753 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14754 Stage3pair_free(&hitpair);
14755 *eliminatedp = true;
14756
14757 } else {
14758 debug8(printf("Final (insertlength %u, outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14759 hitpair->insertlength,hitpair->outerlength,
14760 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14761 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14762 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14763 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14764 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14765 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14766 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14767 }
14768 }
14769 Hitlist_free(&hitpairlist);
14770 hitpairlist = optimal;
14771 optimal = (List_T) NULL;
14772 #endif
14773
14774 #if 0
14775 /* Not good, especially for homologous chromosomes. Use outerlength only within loci, not between */
14776 /* Finally find smallest outerlength across loci */
14777 best_outerlength = (Chrpos_T) -1;
14778 for (p = hitpairlist; p != NULL; p = p->rest) {
14779 hitpair = (Stage3pair_T) p->first;
14780 if (hitpair->outerlength < best_outerlength) {
14781 best_outerlength = hitpair->outerlength;
14782 }
14783 }
14784
14785 for (p = hitpairlist; p != NULL; p = p->rest) {
14786 hitpair = (Stage3pair_T) p->first;
14787
14788 if (hitpair->outerlength > best_outerlength /*+ OUTERLENGTH_SLOP*/) {
14789 debug8(printf("Final (outerlength %u > %u): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14790 hitpair->outerlength,best_outerlength /*+ OUTERLENGTH_SLOP*/,
14791 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14792 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14793 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14794 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14795 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14796 Stage3pair_free(&hitpair);
14797 *eliminatedp = true;
14798
14799 } else {
14800 debug8(printf("Final (outerlength %u): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14801 hitpair->outerlength,
14802 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14803 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14804 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14805 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14806 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14807 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14808 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14809 }
14810 }
14811 Hitlist_free(&hitpairlist);
14812 hitpairlist = optimal;
14813 /* optimal = (List_T) NULL; */
14814 #endif
14815
14816 debug8(printf("Exiting Stage3pair_optimal_score_final with %d hits\n",List_length(hitpairlist)));
14817 return hitpairlist;
14818 }
14819 #endif
14820
14821
14822 static List_T
Stage3pair_optimal_score_final(bool * eliminatedp,List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3)14823 Stage3pair_optimal_score_final (bool *eliminatedp, List_T hitpairlist,
14824 Hitlistpool_T hitlistpool, int querylength5, int querylength3) {
14825 List_T optimal = NULL, p;
14826 Stage3pair_T hitpair;
14827 int n;
14828 int max_adj_nmatches, score;
14829 int best_nmatches_to_trims;
14830 int cutoff_level;
14831 /* int trim5_left, trim5_right, trim3_left, trim3_right, min_trim; */
14832
14833 /* Relies on Path_solve_from_diagonals to maximize the number of segments at each locus */
14834
14835 *eliminatedp = false;
14836 n = List_length(hitpairlist);
14837 debug8(printf("\nEntered Stage3pair_optimal_score_final with %d hitpairs\n",n));
14838
14839 if (n <= 1) {
14840 return hitpairlist;
14841 }
14842
14843 #ifdef DEBUG8
14844 for (p = hitpairlist; p != NULL; p = p->rest) {
14845 hitpair = (Stage3pair_T) p->first;
14846 printf("%p %p %u..%u|%u..%u methods %s and %s, nsegments %d+%d, nmatches %d+%d (%d+%d to trims), scores %d+%d, pairlength %u, outerlength %u, sensedirs %d and %d, splice scores %f and %f\n",
14847 hitpair->hit5,hitpair->hit3,
14848 hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14849 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14850 Method_string(hitpair->hit5->method),Method_string(hitpair->hit3->method),
14851 hitpair->hit5->nsegments,hitpair->hit3->nsegments,
14852 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14853 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,
14854 hitpair->hit5->refalt_score_overall,hitpair->hit3->refalt_score_overall,
14855 hitpair->insertlength,hitpair->outerlength,
14856 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score);
14857 }
14858 printf("\n");
14859 #endif
14860
14861
14862 /* (1) Prune based on nmatches adjusted by score to get a tradeoff between matches and parsimony */
14863 max_adj_nmatches = 0;
14864 for (p = hitpairlist; p != NULL; p = p->rest) {
14865 hitpair = (Stage3pair_T) p->first;
14866 if ((score = hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14867 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall) > max_adj_nmatches) {
14868 max_adj_nmatches = score;
14869 }
14870 }
14871
14872 /* May not want to be greedy on cutoff level here. Might want to raise subopt_levels */
14873 cutoff_level = max_adj_nmatches - subopt_levels;
14874 debug8(printf("(1) refalt cutoff level %d = max_adj_nmatches %d - subopt_levels %d\n",
14875 cutoff_level,max_adj_nmatches,subopt_levels));
14876
14877 for (p = hitpairlist; p != NULL; p = p->rest) {
14878 hitpair = (Stage3pair_T) p->first;
14879
14880 if (hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14881 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall < cutoff_level /*- NMATCHES_SLOP*/) {
14882 debug8(printf("Final (adj nmatches %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to_trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14883 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims
14884 - hitpair->hit5->refalt_score_overall - hitpair->hit3->refalt_score_overall,cutoff_level /*- NMATCHES_SLOP*/,
14885 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14886 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14887 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14888 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14889 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14890 Stage3pair_free(&hitpair);
14891 *eliminatedp = true;
14892
14893 } else {
14894 debug8(printf("Final (adj nmatches %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to_trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14895 hitpair->hit5->refalt_nmatches_plus_spliced_trims + hitpair->hit3->refalt_nmatches_plus_spliced_trims,cutoff_level /*- NMATCHES_SLOP*/,
14896 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14897 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14898 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14899 hitpair->hit5->refalt_nmatches_plus_spliced_trims,hitpair->hit3->refalt_nmatches_plus_spliced_trims,
14900 hitpair->hit5->refalt_nmatches_to_trims,hitpair->hit3->refalt_nmatches_to_trims,cutoff_level,
14901 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14902 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14903 }
14904 }
14905 Hitlist_free(&hitpairlist);
14906 hitpairlist = optimal;
14907 optimal = (List_T) NULL;
14908
14909
14910 /* (2) Prune based on ref_nmatches_to_trims */
14911 best_nmatches_to_trims = 0;
14912 for (p = hitpairlist; p != NULL; p = p->rest) {
14913 hitpair = (Stage3pair_T) p->first;
14914 if (hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims > best_nmatches_to_trims) {
14915 best_nmatches_to_trims = hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims;
14916 assert(best_nmatches_to_trims <= querylength5 + querylength3);
14917 }
14918 }
14919
14920 cutoff_level = best_nmatches_to_trims - subopt_levels;
14921 debug8(printf("cutoff level %d = best_nmatches_to_trims %d\n",cutoff_level,best_nmatches_to_trims));
14922
14923 /* Do not allow slop for final */
14924 for (p = hitpairlist; p != NULL; p = p->rest) {
14925 hitpair = (Stage3pair_T) p->first;
14926
14927 if (hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims < cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/) {
14928 debug8(printf("Final (nmatches_to_trims %d < %d): Eliminating hit pair %p at %u..%u|%u..%u with nmatches %d+%d (%d+%d to trims) < cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14929 hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14930 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14931 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14932 hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14933 hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14934 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14935 Stage3pair_free(&hitpair);
14936 *eliminatedp = true;
14937
14938 } else {
14939 debug8(printf("Final (nmatches_to_trims %d >= %d): Keeping hit pair %p at %u..%u|%u..%u (%d+%d substrings) with nmatches %d+%d (%d+%d to trims) >= cutoff_level %d, sensedirs %d and %d, splice scores %f and %f\n",
14940 hitpair->hit5->ref_nmatches_to_trims + hitpair->hit3->ref_nmatches_to_trims,cutoff_level /*- NMATCHES_TO_TRIMS_SLOP*/,
14941 hitpair,hitpair->hit5->low - hitpair->hit5->chroffset,hitpair->hit5->high - hitpair->hit5->chroffset,
14942 hitpair->hit3->low - hitpair->hit3->chroffset,hitpair->hit3->high - hitpair->hit3->chroffset,
14943 List_length(hitpair->hit5->substrings_1toN),List_length(hitpair->hit3->substrings_1toN),
14944 hitpair->hit5->ref_nmatches_plus_spliced_trims,hitpair->hit3->ref_nmatches_plus_spliced_trims,
14945 hitpair->hit5->ref_nmatches_to_trims,hitpair->hit3->ref_nmatches_to_trims,cutoff_level,
14946 hitpair->hit5->sensedir,hitpair->hit3->sensedir,hitpair->hit5->splice_score,hitpair->hit3->splice_score));
14947 optimal = Hitlist_push(optimal,hitlistpool,(void *) hitpair);
14948 }
14949 }
14950 Hitlist_free(&hitpairlist);
14951 hitpairlist = optimal;
14952 /* optimal = (List_T) NULL; */
14953
14954 /* Shouldn't need to eliminate within loci, since that was done during prefinal */
14955
14956 debug8(printf("Exiting Stage3pair_optimal_score_final with %d hits\n",List_length(hitpairlist)));
14957 return hitpairlist;
14958 }
14959
14960
14961
14962 List_T
Stage3pair_optimal_score(List_T hitpairlist,Hitlistpool_T hitlistpool,int querylength5,int querylength3,bool finalp)14963 Stage3pair_optimal_score (List_T hitpairlist, Hitlistpool_T hitlistpool,
14964 int querylength5, int querylength3, bool finalp) {
14965 List_T optimal;
14966 bool eliminatedp;
14967
14968 if (finalp == false) {
14969 optimal = Stage3pair_optimal_score_prefinal(&eliminatedp,hitpairlist,hitlistpool,
14970 querylength5,querylength3);
14971 while (eliminatedp == true) {
14972 optimal = Stage3pair_optimal_score_prefinal(&eliminatedp,optimal,hitlistpool,
14973 querylength5,querylength3);
14974 }
14975
14976 } else {
14977 optimal = Stage3pair_optimal_score_final(&eliminatedp,hitpairlist,hitlistpool,
14978 querylength5,querylength3);
14979 while (eliminatedp == true) {
14980 optimal = Stage3pair_optimal_score_final(&eliminatedp,optimal,hitlistpool,
14981 querylength5,querylength3);
14982 }
14983 }
14984
14985 return optimal;
14986 }
14987
14988
#if 0
/* Called when computing GMAP alignment in stage1hr.c */
/* Disabled.  Reports whether at least one pair in hitpairlist has the
   same sensedir_for_concordance on its 5' and 3' ends.  An empty list
   yields false. */
bool
Stage3pair_sense_consistent_p (List_T hitpairlist) {
  List_T cell = hitpairlist;
  Stage3pair_T pair;

  while (cell != NULL) {
    pair = (Stage3pair_T) List_head(cell);
    if (pair->hit5->sensedir_for_concordance == pair->hit3->sensedir_for_concordance) {
      /* One consistent pair is enough */
      return true;
    }
    cell = List_next(cell);
  }

  return false;
}
#endif
15008
15009
15010 /* Want to unalias plus and alias minus */
15011 List_T
Stage3end_linearize_5(List_T hitlist)15012 Stage3end_linearize_5 (List_T hitlist) {
15013 T hit;
15014 List_T p;
15015 #ifdef DEBUG12
15016 Chrpos_T chrlength;
15017 #endif
15018
15019 for (p = hitlist; p != NULL; p = List_next(p)) {
15020 hit = (T) List_head(p);
15021 debug12(chrlength = hit->chrlength);
15022 debug12(printf("Looking at 5' end %u..%u against chrlength %u\n",
15023 hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
15024
15025 if (hit->circularalias == 0) {
15026 /* Skip */
15027
15028 } else if (hit->circularalias == +1) {
15029 if (hit->plusp == true) {
15030 unalias_circular(hit);
15031 }
15032
15033 } else if (hit->circularalias == -1) {
15034 if (hit->plusp == false) {
15035 alias_circular(hit);
15036 }
15037 }
15038 }
15039
15040 return hitlist;
15041 }
15042
15043
15044 /* Want to alias plus and unalias minus */
15045 List_T
Stage3end_linearize_3(List_T hitlist)15046 Stage3end_linearize_3 (List_T hitlist) {
15047 T hit;
15048 List_T p;
15049 #ifdef DEBUG12
15050 Chrpos_T chrlength;
15051 #endif
15052
15053 for (p = hitlist; p != NULL; p = List_next(p)) {
15054 hit = (T) List_head(p);
15055 debug12(chrlength = hit->chrlength);
15056 debug12(printf("Looking at 3' end %u..%u against chrlength %u\n",
15057 hit->genomicstart - hit->chroffset,hit->genomicend - hit->chroffset,chrlength));
15058
15059 if (hit->circularalias == 0) {
15060 /* Skip */
15061
15062 } else if (hit->circularalias == -1) {
15063 if (hit->plusp == true) {
15064 alias_circular(hit);
15065 }
15066
15067 } else if (hit->circularalias == +1) {
15068 if (hit->plusp == false) {
15069 unalias_circular(hit);
15070 }
15071 }
15072 }
15073
15074 return hitlist;
15075 }
15076
15077
15078
15079 List_T
Stage3pair_remove_circular_alias(List_T hitpairlist,Hitlistpool_T hitlistpool)15080 Stage3pair_remove_circular_alias (List_T hitpairlist, Hitlistpool_T hitlistpool) {
15081 List_T newlist = NULL, p;
15082 Stage3pair_T hitpair;
15083
15084 debug12(printf("Stage3pair_remove_circular_alias called with %d hitpairs\n",
15085 List_length(hitpairlist)));
15086 for (p = hitpairlist; p != NULL; p = p->rest) {
15087 hitpair = (Stage3pair_T) p->first;
15088
15089 #if 0
15090 /* Not sure if this is necessary */
15091 if (hitpair->hit5->circularalias == +1 && hitpair->hit3->circularalias == +1) {
15092 /* First, try to salvage alias +1 */
15093 unalias_circular(hitpair->hit5);
15094 unalias_circular(hitpair->hit3);
15095 }
15096 #endif
15097
15098 #if 0
15099 if (hitpair->hit5->plusp == true) {
15100 trim = hitpair->hit5->trim_querystart;
15101 } else {
15102 trim = hitpair->hit3->trim_queryend;
15103 }
15104 #endif
15105
15106 if (hitpair->low >= hitpair->hit5->chroffset + hitpair->hit5->chrlength) {
15107 /* Both ends in circular alias */
15108 debug12(printf("Both ends in circular alias\n"));
15109 Stage3pair_free(&hitpair);
15110
15111 } else {
15112 newlist = Hitlist_push(newlist,hitlistpool,(void *) hitpair);
15113 }
15114 }
15115
15116 Hitlist_free(&hitpairlist);
15117 return newlist;
15118 }
15119
15120
15121