1 static char rcsid[] = "$Id: substring.c 223080 2020-09-13 14:20:25Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "substring.h"
7 #include <stdlib.h>
8 #include <string.h>
9 #include <ctype.h>
10 #include <math.h>		/* For log and exp */
11 
12 #include "assert.h"
13 #include "mem.h"
14 #include "maxent_hr.h"
15 #include "complement.h"
16 #include "genome128_hr.h"
17 #include "mapq.h"
18 #include "comp.h"
19 #include "splice.h"
20 #include "sense.h"
21 #include "simplepair.h"		/* For Simplepair_new_out */
22 
23 
24 /* Designed to allow 1 match to offset 1 mismatch.  To handle 2 matches vs 2 mismatches, penalize for multiple mismatches */
25 #define TRIM_MATCH_SCORE 1
26 #define TRIM_MISMATCH_SCORE_LAST -1 /* Requires 1 match to compensate */
27 #define TRIM_MISMATCH_SCORE_MULT -4 /* Requires 4 matches to compensate */
28 
29 #define KNOWN_SPLICESITE_EDGE 1	/* Cannot be 0; otherwise will hit found splice sites */
30 
31 #define CONSISTENT_TEXT "consistent"
32 #define TRANSLOCATION_TEXT "translocation"
33 #define INVERSION_TEXT "inversion"
34 #define SCRAMBLE_TEXT "scramble"
35 
36 #define MIN_EXON_LENGTH 20
37 
38 
39 #ifdef CHECK_ASSERTIONS
40 #define CHECK_NMISMATCHES 1
41 #endif
42 
43 
44 #ifdef DEBUG
45 #define debug(x) x
46 #else
47 #define debug(x)
48 #endif
49 
50 /* mark_mismatches */
51 #ifdef DEBUG1
52 #define debug1(x) x
53 #else
54 #define debug1(x)
55 #endif
56 
57 /* Substring_new */
58 #ifdef DEBUG2
59 #define debug2(x) x
60 #else
61 #define debug2(x)
62 #endif
63 
64 /* Substring_overlap_p and Substring_insert_length */
65 #ifdef DEBUG3
66 #define debug3(x) x
67 #else
68 #define debug3(x)
69 #endif
70 
71 
72 /* splice site probs */
73 #ifdef DEBUG4
74 #define debug4(x) x
75 #else
76 #define debug4(x)
77 #endif
78 
79 
80 /* Substring_convert_to_pairs */
81 #ifdef DEBUG6
82 #define debug6(x) x
83 #else
84 #define debug6(x)
85 #endif
86 
87 
88 /* contains known splicesite */
89 #ifdef DEBUG7
90 #define debug7(x) x
91 #else
92 #define debug7(x)
93 #endif
94 
95 
96 /* trimming.  may also want to turn on DEBUG8 in pair.c */
97 #ifdef DEBUG8
98 #define debug8(x) x
99 #else
100 #define debug8(x)
101 #endif
102 
103 /* bad_stretch_p */
104 #ifdef DEBUG9
105 #define debug9(x) x
106 #else
107 #define debug9(x)
108 #endif
109 
110 
111 /* Substring_set_alt */
112 #ifdef DEBUG10
113 #define debug10(x) x
114 #else
115 #define debug10(x)
116 #endif
117 
118 /* Substring_circularpos */
119 #ifdef DEBUG12
120 #define debug12(x) x
121 #else
122 #define debug12(x)
123 #endif
124 
125 
126 #define ACCEPTABLE_TRIM 3
127 #define END_SPLICESITE_PROB_CLOSE 0.95
128 #define END_SPLICESITE_PROB_FAR 0.90
129 #define MAX_NCONSECUTIVE_CLOSE 6
130 #define MAX_NCONSECUTIVE_FAR 20	/* Needs to be generous to find splices */
131 
132 
133 
134 #define LOG_99 -0.01005033585
135 #define LOG_01 -4.605170186
136 
137 #if 0
138 /* Switches on 5 consecutive mismatches */
139 #define LOG_99_9999 -0.01015034085
140 #define LOG_99_0001 -9.220390708
141 #define LOG_25_0001 -10.59663473
142 #define LOG_25_9999 -1.386394366
143 #define LOG_01_9999 -4.605270191
144 #define LOG_01_0001 -13.81551056
145 #define LOG_75_0001 -9.498022444
146 #define LOG_75_9999 -0.2877820775
147 #endif
148 
149 #if 0
150 #define LOG_99_999 -0.01105083619
151 #define LOG_99_001 -6.917805615
152 #define LOG_25_001 -8.29404964
153 #define LOG_25_999 -1.387294861
154 #define LOG_01_999 -4.606170686
155 #define LOG_01_001 -11.51292546
156 #define LOG_75_001 -7.195437351
157 #define LOG_75_999 -0.2886825728
158 #endif
159 
160 /* Switches on 4 consecutive mismatches */
161 #define LOG_99_99 -0.02010067171
162 #define LOG_99_01 -4.615220522
163 #define LOG_25_01 -5.991464547
164 #define LOG_25_99 -1.396344697
165 #define LOG_01_99 -4.615220522
166 #define LOG_01_01 -9.210340372
167 #define LOG_75_01 -4.892852258
168 #define LOG_75_99 -0.2977324083
169 
170 
171 static bool print_nsnpdiffs_p;
172 static bool print_snplabels_p;
173 static bool show_refdiff_p;
174 
175 static IIT_T snps_iit;
176 static int *snps_divint_crosstable;
177 
178 static Genome_T genomebits;
179 static Genome_T genomebits_alt;
180 
181 static Univ_IIT_T chromosome_iit;
182 static Univcoord_T genomelength;
183 static double genomesize;	/* For BLAST E-value */
184 static int nchromosomes;
185 static int circular_typeint;
186 
187 static bool novelsplicingp;
188 static bool knownsplicingp;
189 static Outputtype_T output_type;
190 
191 static Mode_T mode;
192 
193 static bool maskedp;
194 
195 static char complCode[128] = COMPLEMENT_LC;
196 
197 static char *
make_complement_inplace(char * sequence,unsigned int length)198 make_complement_inplace (char *sequence, unsigned int length) {
199   char temp;
200   unsigned int i, j;
201 
202   for (i = 0, j = length-1; i < length/2; i++, j--) {
203     temp = complCode[(int) sequence[i]];
204     sequence[i] = complCode[(int) sequence[j]];
205     sequence[j] = temp;
206   }
207   if (i == j) {
208     sequence[i] = complCode[(int) sequence[i]];
209   }
210 
211   return sequence;
212 }
213 
214 
215 
216 #define T Substring_T
217 
218 struct T {
219   /* Maintaining this pointer so we don't have to pass in the values
220      for Substring_display_prep based on plusp.  Caller needs to keep
221      these values until all computation is finished. */
222   Compress_T query_compress; /* Pointer to the associated query_compress */
223 
224 
225   int nmismatches_bothdiff;	/* Over region left after trimming */
226   int nmismatches_refdiff;	/* Over region left after trimming */
227   /* nsnpdiffs = nmismatches_bothdiff - nmismatches_refdiff */
228 
229   int nmatches_plus_spliced_trims;
230   int nmatches_to_trims;
231 
232   int ref_nmatches_plus_spliced_trims;
233   int ref_nmatches_to_trims;
234 
235   /* int trim_querystart; */
236   /* int trim_queryend; */
237   int mandatory_trim_querystart;	/* Resulting from unalias, plus SOFT_CLIPS_AVOID_CIRCULARIZATION */
238   int mandatory_trim_queryend;	/* Resulting from alias, plus SOFT_CLIPS_AVOID_CIRCULARIZATION */
239   int start_amb_length;
240   int end_amb_length;
241 
242   bool trim_querystart_splicep;
243   bool trim_queryend_splicep;
244 
245   Chrnum_T chrnum;
246   Univcoord_T chroffset;
247   Univcoord_T chrhigh;
248   Chrpos_T chrlength;
249 
250   Univcoord_T left; /* for plus: alignstart - querystart(orig).  for
251 		       minus: alignend - (querylength -
252 		       queryend(orig)).  Set when substring is created
253 		       or made unambiguous, and remains constant */
254 
255   Univcoord_T genomicstart;	/* For region corresponding to entire querylength (if extrapolated) */
256   Univcoord_T genomicend;
257 
258   Endtype_T start_endtype;
259   Endtype_T end_endtype;
260 
261   int querystart_pretrim;	/* From original invocation, before trimming */
262   int queryend_pretrim;		/* From original invocation, before trimming */
263   int querystart;		/* For part that aligns to genome, post-trim */
264   int queryend;
265 
266   int querystart_chrbound;	/* Bound on alignment if this were substring1 */
267   int queryend_chrbound;	/* Bound on alignment if this were substringN */
268 
269   int amb_splice_pos;		/* Used for ambiguous substrings.  Based on left, not querystart */
270   int querylength;
271 
272   Univcoord_T alignstart_trim;	/* For part that aligns to genome, excluding part that is trimmed (post-trim) */
273   Univcoord_T alignend_trim;
274 
275   bool plusp;
276   int genestrand;
277 
278   char *genomic_bothdiff; /* In same direction as query.  NULL if same
279   			     as query.  Has dashes outside of
280   			     querystart..(queryend-1) for indels and
281   			     splices.  Has lowercase to indicate
282   			     trimmed regions, terminal regions,
283   			     deletions, splice dinucleotides, and
284   			     mismatches from ref and alt.  Use for
285   			     MAPQ computations and scoring. */
286 
287   char *genomic_refdiff;  /* Same as above, but lowercase for
288 			     mismatches from ref only.  For
289 			     non-SNP-tolerant alignment, this is just
290 			     a pointer to genomic_bothdiff.  Use for
291 			     NM and MD computations.  */
292 
293   float mapq_loglik;
294 
295   int sensedir;
296 
297   Univcoord_T splicecoord_D;
298   int splicesitesD_knowni;	/* Needed for intragenic_splice_p in stage1hr.c */
299 
300   int siteD_pos;
301   double siteD_prob;
302 
303   /* for shortexon (always use *_1 for acceptor and *_2 for donor) */
304   /* for donor/acceptor: the ambiguous position */
305   Univcoord_T splicecoord_A;
306   int splicesitesA_knowni;
307 
308   int siteA_pos;
309   double siteA_prob;
310 
311   /* Note: For DNA fusions, use both splicecoord_D and splicecoord_A */
312   int siteN_pos;
313 
314 
315   int alts_ncoords;
316   Univcoord_T *alts_coords;
317   int *alts_knowni;
318   int *alts_nmismatches;
319   int *alts_ref_nmismatches;
320   double *alts_probs;
321 
322   Endtype_T amb_type;		/* Ambiguous DONs or ACCs */
323 };
324 
325 
326 void
Substring_alias_circular(T this)327 Substring_alias_circular (T this) {
328   Chrpos_T chrlength;
329   int i;
330 
331   if (this == NULL) {
332     /* Skip */
333 
334   } else if (this->alts_ncoords > 0) {
335     /* Cannot rely on this->left, which is 0.  Let mandatory_trim_querystart and mandatory_trim_queryend be 0 */
336     chrlength = this->chrlength;
337     for (i = 0; i < this->alts_ncoords; i++) {
338       this->alts_coords[i] += chrlength;
339     }
340 
341   } else {
342     chrlength = this->chrlength;
343 
344     if (this->left + this->querylength > this->chroffset + chrlength) {
345       debug2(printf("For alias, this->left %u + this->querylength %d > offset %u + chrlength %u\n",
346 		    this->left,this->querylength,this->chroffset,chrlength));
347       if (this->plusp == true) {
348 	this->mandatory_trim_queryend = (this->left + this->querylength) - (this->chroffset + chrlength);
349 	debug2(printf("For alias, plusp true, setting mandatory_trim_queryend to be %d\n",this->mandatory_trim_queryend));
350 	assert(this->mandatory_trim_queryend >= 0);
351       } else {
352 	this->mandatory_trim_querystart = (this->left + this->querylength) - (this->chroffset + chrlength);
353 	debug2(printf("For alias, plusp false, setting mandatory_trim_querystart to be %d\n",this->mandatory_trim_querystart));
354 	assert(this->mandatory_trim_querystart >= 0);
355       }
356     }
357 
358     this->left += chrlength;
359     this->genomicstart += chrlength;
360     this->genomicend += chrlength;
361     this->alignstart_trim += chrlength;
362     this->alignend_trim += chrlength;
363     this->splicecoord_D += chrlength;
364     this->splicecoord_A += chrlength;
365   }
366 
367   return;
368 }
369 
370 
371 void
Substring_unalias_circular(T this)372 Substring_unalias_circular (T this) {
373   Chrpos_T chrlength;
374   int i;
375 
376   if (this == NULL) {
377     /* Skip */
378 
379   } else if (this->alts_ncoords > 0) {
380     /* Cannot rely on this->left, which is 0.  Let mandatory_trim_querystart and mandatory_trim_queryend be 0 */
381     chrlength = this->chrlength;
382     for (i = 0; i < this->alts_ncoords; i++) {
383       this->alts_coords[i] -= chrlength;
384     }
385 
386   } else {
387     chrlength = this->chrlength;
388 
389     if (this->left < this->chroffset + chrlength) {
390       debug2(printf("For unalias, this->left %u < chroffset %u + chrlength %d\n",
391 		    this->left,this->chroffset,chrlength));
392       if (this->plusp == true) {
393 	this->mandatory_trim_querystart = (this->chroffset + chrlength) - this->left;
394 	debug2(printf("For unalias, plusp true, setting mandatory_trim_querystart to be %d\n",this->mandatory_trim_querystart));
395 	assert(this->mandatory_trim_querystart >= 0);
396       } else {
397 	this->mandatory_trim_queryend = (this->chroffset + chrlength) - this->left;
398 	debug2(printf("For unalias, plusp false, setting mandatory_trim_queryend to be %d\n",this->mandatory_trim_queryend));
399 	assert(this->mandatory_trim_queryend >= 0);
400       }
401     }
402 
403     this->left -= chrlength;
404     this->genomicstart -= chrlength;
405     this->genomicend -= chrlength;
406     this->alignstart_trim -= chrlength;
407     this->alignend_trim -= chrlength;
408     this->splicecoord_D -= chrlength;
409     this->splicecoord_A -= chrlength;
410   }
411 
412   return;
413 }
414 
415 
416 
417 static void
fill_w_dashes(char * string,int start,int end)418 fill_w_dashes (char *string, int start, int end) {
419   int i;
420 
421   for (i = start; i < end; i++) {
422     string[i] = '-';
423   }
424   return;
425 }
426 
427 static void
fill_w_stars(char * string,int start,int end)428 fill_w_stars (char *string, int start, int end) {
429   int i;
430 
431   for (i = start; i < end; i++) {
432     string[i] = '*';
433   }
434   return;
435 }
436 
437 
438 
439 void
Substring_free(T * old)440 Substring_free (T *old) {
441   if (*old) {
442     debug2(printf("Freeing substring %p\n",*old));
443     if ((*old)->alts_ncoords > 0) {
444       FREE_OUT((*old)->alts_coords);
445       FREE_OUT((*old)->alts_knowni);
446       FREE_OUT((*old)->alts_nmismatches);
447       FREE_OUT((*old)->alts_ref_nmismatches);
448       FREE_OUT((*old)->alts_probs);
449     }
450     if ((*old)->genomic_bothdiff != NULL) {
451       if ((*old)->genomic_refdiff != (*old)->genomic_bothdiff) {
452 	FREE_OUT((*old)->genomic_refdiff);
453       }
454       FREE_OUT((*old)->genomic_bothdiff);
455     }
456 
457     FREE_OUT(*old);
458   }
459   return;
460 }
461 
462 
463 void
Substring_list_gc(List_T * old)464 Substring_list_gc (List_T *old) {
465   List_T p;
466   T substring;
467 
468   for (p = *old; p != NULL; p = p->rest) {
469     substring = (Substring_T) p->first;
470     Substring_free(&substring);
471   }
472   /* List_free(&(*old)); -- allocated by Listpool_push */
473   return;
474 }
475 
476 
477 
478 bool
Substring_contains_p(T this,int querypos)479 Substring_contains_p (T this, int querypos) {
480   if (querypos >= this->querystart && querypos < this->queryend) {
481     return true;
482   } else {
483     return false;
484   }
485 }
486 
487 
488 int
Substring_compare(T substring1,T substring2,int alias1,int alias2,Chrpos_T chrlength1,Chrpos_T chrlength2)489 Substring_compare (T substring1, T substring2, int alias1, int alias2, Chrpos_T chrlength1, Chrpos_T chrlength2) {
490   Univcoord_T alignstart1, alignend1, alignstart2, alignend2;
491 
492   if (substring1 == NULL && substring2 == NULL) {
493     return 0;
494   } else if (substring1 == NULL) {
495     return -1;
496   } else if (substring2 == NULL) {
497     return +1;
498   } else {
499     alignstart1 = substring1->alignstart_trim;
500     alignend1 = substring1->alignend_trim;
501     if (alias1 < 0) {
502       alignstart1 += chrlength1;
503       alignend1 += chrlength1;
504     }
505 
506     alignstart2 = substring2->alignstart_trim;
507     alignend2 = substring2->alignend_trim;
508     if (alias2 < 0) {
509       alignstart2 += chrlength2;
510       alignend2 += chrlength2;
511     }
512 
513     if (alignstart1 < alignstart2) {
514       return -1;
515     } else if (alignstart1 > alignstart2) {
516       return +1;
517     } else if (alignend1 < alignend2) {
518       return -1;
519     } else if (alignend1 > alignend2) {
520       return +1;
521     } else {
522       return 0;
523     }
524   }
525 }
526 
527 
528 bool
Substring_equal_p(T substring1,T substring2)529 Substring_equal_p (T substring1, T substring2) {
530   Univcoord_T low1, high1, low2, high2;
531 
532   if (substring1->plusp == true) {
533     low1 = substring1->alignstart_trim;
534     high1 = substring1->alignend_trim;
535   } else {
536     low1 = substring1->alignend_trim;
537     high1 = substring1->alignstart_trim;
538   }
539   if (high1 > 0) {
540     high1 -= 1;
541   }
542 
543   if (substring2->plusp == true) {
544     low2 = substring2->alignstart_trim;
545     high2 = substring2->alignend_trim;
546   } else {
547     low2 = substring2->alignend_trim;
548     high2 = substring2->alignstart_trim;
549   }
550   if (high2 > 0) {
551     high2 -= 1;
552   }
553 
554   debug3(printf("Checking equivalence between %u..%u and %u..%u",low1,high1,low2,high2));
555 
556   if (low1 == low2 && high1 == high2) {
557     return true;
558   } else {
559     return false;
560   }
561 }
562 
563 
564 bool
Substring_overlap_p(T substring1,T substring2)565 Substring_overlap_p (T substring1, T substring2) {
566   Univcoord_T low1, high1, low2, high2;
567 
568   if (substring1->plusp == true) {
569     low1 = substring1->alignstart_trim;
570     high1 = substring1->alignend_trim;
571   } else {
572     low1 = substring1->alignend_trim;
573     high1 = substring1->alignstart_trim;
574   }
575   if (high1 > 0) {
576     high1 -= 1;
577   }
578 
579   if (substring2->plusp == true) {
580     low2 = substring2->alignstart_trim;
581     high2 = substring2->alignend_trim;
582   } else {
583     low2 = substring2->alignend_trim;
584     high2 = substring2->alignstart_trim;
585   }
586   if (high2 > 0) {
587     high2 -= 1;
588   }
589 
590   debug3(printf("Checking overlap between %u..%u and %u..%u",low1,high1,low2,high2));
591 
592   if (high2 < low1) {
593     debug3(printf(" => no because %u < %u\n",high2,low1));
594     return false;
595   } else if (low2 > high1) {
596     debug3(printf(" => no because %u > %u\n",low2,high1));
597     return false;
598   } else {
599     debug3(printf(" => yes\n"));
600     return true;
601   }
602 }
603 
604 
605 Chrpos_T
Substring_insert_length(int * pair_relationship,T substring5,T substring3)606 Substring_insert_length (int *pair_relationship, T substring5, T substring3) {
607   Univcoord_T low, high, low5, high5, low3, high3;
608 
609   if (substring5->plusp == true) {
610     low5 = substring5->genomicstart;
611     high5 = substring5->genomicend;
612   } else {
613     low5 = substring5->genomicend;
614     high5 = substring5->genomicstart;
615   }
616 
617   if (substring3->plusp == true) {
618     low3 = substring3->genomicstart;
619     high3 = substring3->genomicend;
620   } else {
621     low3 = substring3->genomicend;
622     high3 = substring3->genomicstart;
623   }
624 
625   if (low5 < low3) {
626     low = low5;
627   } else {
628     low = low3;
629   }
630 
631   if (high5 > high3) {
632     high = high5;
633   } else {
634     high = high3;
635   }
636 
637   if (low5 < low3 && high5 < high3) {
638     *pair_relationship = +1;
639   } else if (low3 < low5 && high3 < high5) {
640     *pair_relationship = -1;
641   } else if (substring5->plusp == true && substring3->plusp == true) {
642     *pair_relationship = +1;
643   } else if (substring5->plusp == false && substring3->plusp == false) {
644     *pair_relationship = -1;
645   } else {
646     *pair_relationship = 0;
647   }
648 
649   debug3(printf("Returning %u - %u.  pair_relationship %d\n",high,low,*pair_relationship));
650   return high - low;
651 }
652 
653 
654 bool
Substring_overlap_point_trimmed_p(T substring,Univcoord_T endpos)655 Substring_overlap_point_trimmed_p (T substring, Univcoord_T endpos) {
656   Univcoord_T low, high;
657 
658   if (substring == NULL) {
659     return false;
660   }
661 
662   if (substring->plusp == true) {
663     low = substring->alignstart_trim;
664     high = substring->alignend_trim;
665     if (high > 0) {
666       high -= 1;
667     }
668     debug3(printf("Checking overlap between plus %u..%u and %u",low,high,endpos));
669   } else {
670     low = substring->alignend_trim;
671     high = substring->alignstart_trim;
672     if (high > 0) {
673       high -= 1;
674     }
675     debug3(printf("Checking overlap between minus %u..%u and %u",low,high,endpos));
676   }
677 
678 
679   if (endpos < low) {
680     debug3(printf(" => no because %u < %u\n",endpos,low));
681     return false;
682   } else if (endpos > high) {
683     debug3(printf(" => no because %u > %u\n",endpos,high));
684     return false;
685   } else {
686     debug3(printf(" => yes\n"));
687     return true;
688   }
689 }
690 
691 
692 Univcoord_T
Substring_overlap_segment_trimmed(T substring1,T substring2)693 Substring_overlap_segment_trimmed (T substring1, T substring2) {
694   Univcoord_T maxlow, minhigh;
695   Univcoord_T low1, high1, low2, high2;
696 
697   if (substring1->plusp == true) {
698     low1 = substring1->alignstart_trim;
699     high1 = substring1->alignend_trim;
700   } else {
701     low1 = substring1->alignend_trim;
702     high1 = substring1->alignstart_trim;
703   }
704   if (high1 > 0) {
705     high1 -= 1;
706   }
707 
708   if (substring2->plusp == true) {
709     low2 = substring2->alignstart_trim;
710     high2 = substring2->alignend_trim;
711   } else {
712     low2 = substring2->alignend_trim;
713     high2 = substring2->alignstart_trim;
714   }
715   if (high2 > 0) {
716     high2 -= 1;
717   }
718 
719   debug3(printf("Checking overlap between %u..%u and %u..%u",low1,high1,low2,high2));
720 
721   if (high2 < low1) {
722     debug3(printf(" => no because %u < %u\n",high2,low1));
723     return 0;
724   } else if (low2 > high1) {
725     debug3(printf(" => no because %u > %u\n",low2,high1));
726     return 0;
727   } else {
728     maxlow = (low1 > low2) ? low1 : low2;
729     minhigh = (high1 < high2) ? high1 : high2;
730     debug3(printf(" => yes.  maxlow %llu, minhigh %llu.  returning %llu\n",
731 		  maxlow,minhigh,maxlow + (minhigh - maxlow)/2));
732     return maxlow + (minhigh - maxlow)/2;
733   }
734 }
735 
736 
737 static void
mark_mismatches_cmet_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)738 mark_mismatches_cmet_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
739   int i;
740 
741   debug1(printf("\n"));
742   debug1(printf("query:  %s\n",query));
743   debug1(printf("genome: %s\n",gbuffer));
744   debug1(printf("count:  "));
745 
746   if (genestrand == +2) {
747     for (i = start; i < end; i++) {
748       if (gbuffer[i] == 'G' && query[i] == 'A') {
749 	debug1(printf("."));
750 	gbuffer[i] = '.';
751       } else if (query[i] != gbuffer[i]) {
752 	debug1(printf("x"));
753 	assert(gbuffer[i] != OUTOFBOUNDS);
754 	gbuffer[i] = (char) tolower(gbuffer[i]);
755       } else {
756 	debug1(printf("*"));
757       }
758     }
759 
760   } else {
761     for (i = start; i < end; i++) {
762       if (gbuffer[i] == 'C' && query[i] == 'T') {
763 	debug1(printf("."));
764 	gbuffer[i] = '.';
765       } else if (query[i] != gbuffer[i]) {
766 	debug1(printf("x"));
767 	assert(gbuffer[i] != OUTOFBOUNDS);
768 	gbuffer[i] = (char) tolower(gbuffer[i]);
769       } else {
770 	debug1(printf("*"));
771       }
772     }
773   }
774 
775   return;
776 }
777 
778 
779 static void
mark_mismatches_cmet_sam(char * gbuffer,char * query,int start,int end,int genestrand)780 mark_mismatches_cmet_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
781   int i;
782 
783   debug1(printf("query:  %s\n",query));
784   debug1(printf("genome: %s\n",gbuffer));
785   debug1(printf("count:  "));
786 
787   if (genestrand == +2) {
788     for (i = start; i < end; i++) {
789       if (gbuffer[i] == 'G' && query[i] == 'A') {
790 	debug1(printf("."));
791 #if 0
792 	/* Want to show mismatches */
793 	gbuffer[i] = 'A';		/* Avoids showing mismatches in MD and NM strings */
794 #endif
795       } else if (query[i] != gbuffer[i]) {
796 	debug1(printf("x"));
797 	assert(gbuffer[i] != OUTOFBOUNDS);
798 	gbuffer[i] = (char) tolower(gbuffer[i]);
799       } else {
800 	debug1(printf("*"));
801       }
802     }
803 
804   } else {
805     for (i = start; i < end; i++) {
806       if (gbuffer[i] == 'C' && query[i] == 'T') {
807 	debug1(printf("."));
808 #if 0
809 	/* Want to show mismatches */
810 	gbuffer[i] = 'T';		/* Avoids showing mismatches in MD and NM strings */
811 #endif
812       } else if (query[i] != gbuffer[i]) {
813 	debug1(printf("x"));
814 	assert(gbuffer[i] != OUTOFBOUNDS);
815 	gbuffer[i] = (char) tolower(gbuffer[i]);
816       } else {
817 	debug1(printf("*"));
818       }
819     }
820   }
821 
822   return;
823 }
824 
825 
826 
827 static void
mark_mismatches_atoi_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)828 mark_mismatches_atoi_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
829   int i;
830 
831   debug1(printf("query:  %s\n",query));
832   debug1(printf("genome: %s\n",gbuffer));
833   debug1(printf("count:  "));
834 
835   if (genestrand == +2) {
836     for (i = start; i < end; i++) {
837       if (gbuffer[i] == 'T' && query[i] == 'C') {
838 	debug1(printf("."));
839 	gbuffer[i] = '.';
840       } else if (query[i] != gbuffer[i]) {
841 	debug1(printf("x"));
842 	assert(gbuffer[i] != OUTOFBOUNDS);
843 	gbuffer[i] = (char) tolower(gbuffer[i]);
844       } else {
845 	debug1(printf("*"));
846       }
847     }
848 
849   } else {
850     for (i = start; i < end; i++) {
851       if (gbuffer[i] == 'A' && query[i] == 'G') {
852 	debug1(printf("."));
853 	gbuffer[i] = '.';
854       } else if (query[i] != gbuffer[i]) {
855 	debug1(printf("x"));
856 	assert(gbuffer[i] != OUTOFBOUNDS);
857 	gbuffer[i] = (char) tolower(gbuffer[i]);
858       } else {
859 	debug1(printf("*"));
860       }
861     }
862   }
863 
864   return;
865 }
866 
867 
868 
869 static void
mark_mismatches_atoi_sam(char * gbuffer,char * query,int start,int end,int genestrand)870 mark_mismatches_atoi_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
871   int i;
872 
873   debug1(printf("query:  %s\n",query));
874   debug1(printf("genome: %s\n",gbuffer));
875   debug1(printf("count:  "));
876 
877   if (genestrand == +2) {
878     for (i = start; i < end; i++) {
879       if (gbuffer[i] == 'T' && query[i] == 'C') {
880 	debug1(printf("."));
881 #if 0
882 	/* Want to show mismatches */
883 	gbuffer[i] = 'C';		/* Avoids showing mismatches in MD and NM strings */
884 #endif
885       } else if (query[i] != gbuffer[i]) {
886 	debug1(printf("x"));
887 	assert(gbuffer[i] != OUTOFBOUNDS);
888 	gbuffer[i] = (char) tolower(gbuffer[i]);
889       } else {
890 	debug1(printf("*"));
891       }
892     }
893 
894   } else {
895     for (i = start; i < end; i++) {
896       if (gbuffer[i] == 'A' && query[i] == 'G') {
897 	debug1(printf("."));
898 #if 0
899 	/* Want to show mismatches */
900 	gbuffer[i] = 'G';		/* Avoids showing mismatches in MD and NM strings */
901 #endif
902       } else if (query[i] != gbuffer[i]) {
903 	debug1(printf("x"));
904 	assert(gbuffer[i] != OUTOFBOUNDS);
905 	gbuffer[i] = (char) tolower(gbuffer[i]);
906       } else {
907 	debug1(printf("*"));
908       }
909     }
910   }
911 
912   return;
913 }
914 
915 
916 static void
mark_mismatches_ttoc_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)917 mark_mismatches_ttoc_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
918   int i;
919 
920   debug1(printf("query:  %s\n",query));
921   debug1(printf("genome: %s\n",gbuffer));
922   debug1(printf("count:  "));
923 
924   if (genestrand == +2) {
925     for (i = start; i < end; i++) {
926       if (gbuffer[i] == 'A' && query[i] == 'G') {
927 	debug1(printf("."));
928 	gbuffer[i] = '.';
929       } else if (query[i] != gbuffer[i]) {
930 	debug1(printf("x"));
931 	assert(gbuffer[i] != OUTOFBOUNDS);
932 	gbuffer[i] = (char) tolower(gbuffer[i]);
933       } else {
934 	debug1(printf("*"));
935       }
936     }
937 
938   } else {
939     for (i = start; i < end; i++) {
940       if (gbuffer[i] == 'T' && query[i] == 'C') {
941 	debug1(printf("."));
942 	gbuffer[i] = '.';
943       } else if (query[i] != gbuffer[i]) {
944 	debug1(printf("x"));
945 	assert(gbuffer[i] != OUTOFBOUNDS);
946 	gbuffer[i] = (char) tolower(gbuffer[i]);
947       } else {
948 	debug1(printf("*"));
949       }
950     }
951   }
952 
953   return;
954 }
955 
956 
957 
958 static void
mark_mismatches_ttoc_sam(char * gbuffer,char * query,int start,int end,int genestrand)959 mark_mismatches_ttoc_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
960   int i;
961 
962   debug1(printf("query:  %s\n",query));
963   debug1(printf("genome: %s\n",gbuffer));
964   debug1(printf("count:  "));
965 
966   if (genestrand == +2) {
967     for (i = start; i < end; i++) {
968       if (gbuffer[i] == 'A' && query[i] == 'G') {
969 	debug1(printf("."));
970 #if 0
971 	/* Want to show mismatches */
972 	gbuffer[i] = 'G';		/* Avoids showing mismatches in MD and NM strings */
973 #endif
974       } else if (query[i] != gbuffer[i]) {
975 	debug1(printf("x"));
976 	assert(gbuffer[i] != OUTOFBOUNDS);
977 	gbuffer[i] = (char) tolower(gbuffer[i]);
978       } else {
979 	debug1(printf("*"));
980       }
981     }
982 
983   } else {
984     for (i = start; i < end; i++) {
985       if (gbuffer[i] == 'T' && query[i] == 'C') {
986 	debug1(printf("."));
987 #if 0
988 	/* Want to show mismatches */
989 	gbuffer[i] = 'C';		/* Avoids showing mismatches in MD and NM strings */
990 #endif
991       } else if (query[i] != gbuffer[i]) {
992 	debug1(printf("x"));
993 	assert(gbuffer[i] != OUTOFBOUNDS);
994 	gbuffer[i] = (char) tolower(gbuffer[i]);
995       } else {
996 	debug1(printf("*"));
997       }
998     }
999   }
1000 
1001   return;
1002 }
1003 
1004 
1005 
1006 /* The result variable should be an existing genomic_bothdiff or genomic_refdiff, and then reassigned to that */
1007 static char *
embellish_genomic(char * result,char * genomic_diff,char * query,int querystart,int queryend,int querylength,int mandatory_trim_querystart_in,int mandatory_trim_queryend_in,int extraleft,int extraright,bool plusp,int genestrand)1008 embellish_genomic (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
1009 		   int mandatory_trim_querystart_in, int mandatory_trim_queryend_in, int extraleft, int extraright,
1010 		   bool plusp, int genestrand) {
1011   int i, j, k;
1012   int mandatory_trim_querystart, mandatory_trim_queryend;
1013 
1014   debug1(printf("Entered embellish_genomic with querystart %d, queryend %d, querylength %d, mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",
1015 		querystart,queryend,querylength,mandatory_trim_querystart_in,mandatory_trim_queryend_in));
1016   debug1(printf("genomic_diff %s\n",genomic_diff));
1017 
1018   assert(mandatory_trim_querystart_in >= 0);
1019   assert(mandatory_trim_queryend_in >= 0);
1020   assert(mandatory_trim_querystart_in < querylength);
1021   assert(mandatory_trim_queryend_in < querylength);
1022 
1023   if (plusp == true) {
1024     mandatory_trim_querystart = mandatory_trim_querystart_in;
1025     mandatory_trim_queryend = mandatory_trim_queryend_in;
1026   } else {
1027     mandatory_trim_querystart = mandatory_trim_queryend_in;
1028     mandatory_trim_queryend = mandatory_trim_querystart_in;
1029   }
1030 
1031   if (result == NULL) {
1032     result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
1033   }
1034   result[querylength] = '\0';
1035 #if 0
1036   for (i = 0; i < querylength; i++) {
1037     result[i] = '?';
1038   }
1039 #endif
1040 
1041   /* Add aligned region with lower-case diffs, surrounded by dashes */
1042   fill_w_dashes(result,0,querystart);
1043   fill_w_stars(result,0,mandatory_trim_querystart);
1044 
1045   /* Don't need to know adj anymore, because each substring has its own left */
1046   debug1(printf("Copying from genomic_diff[%d] to result[%d] for a length of %d - %d\n",querystart,querystart,queryend,querystart));
1047   strncpy(&(result[querystart]),&(genomic_diff[querystart]),queryend-querystart);
1048   debug1(printf("A g1: %s (%d..%d) extraleft:%d extraright:%d\n",result,querystart,queryend,extraleft,extraright));
1049 
1050   if (mode == STANDARD) {
1051     /* Skip */
1052   } else if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
1053     mark_mismatches_cmet_gsnap(result,query,querystart,queryend,genestrand);
1054   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
1055     mark_mismatches_atoi_gsnap(result,query,querystart,queryend,genestrand);
1056   } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
1057     mark_mismatches_ttoc_gsnap(result,query,querystart,queryend,genestrand);
1058   } else {
1059     abort();
1060   }
1061 
1062   fill_w_dashes(result,queryend,querylength);
1063   fill_w_stars(result,querylength - mandatory_trim_queryend,querylength);
1064   debug1(printf("B g1: %s\n",result));
1065 
1066   /* Add terminal ends as lower-case.  Previously stopped at i >= 0 */
1067   for (k = 0, i = j = querystart - 1; k < extraleft && i >= mandatory_trim_querystart /*&& j >= 0*/; k++, i--, j--) {
1068     result[i] = (char) tolower(genomic_diff[j]);
1069     /* printf("k=%d i=%d result[i]=%c\n",k,i,result[i]); */
1070     assert(result[i] == 'a' || result[i] == 'c' || result[i] == 'g' || result[i] == 't' || result[i] == 'n' || result[i] == '*');
1071   }
1072 
1073   /* Add terminal ends as lower-case.  Previously stopped at i < querylength */
1074   for (k = 0, i = j = queryend; k < extraright && i < querylength - mandatory_trim_queryend /*&& j < genomiclength*/; k++, i++, j++) {
1075     result[i] = (char) tolower(genomic_diff[j]);
1076     /* printf("k=%d i=%d result[i]=%c\n",k,i,result[i]); */
1077   }
1078   debug1(printf("C g1: %s\n",result));
1079 
1080   return result;
1081 }
1082 
1083 
1084 static char *
embellish_genomic_sam(char * result,char * genomic_diff,char * query,int querystart,int queryend,int querylength,int mandatory_trim_querystart_in,int mandatory_trim_queryend_in,int extraleft,int extraright,bool plusp,int genestrand)1085 embellish_genomic_sam (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
1086 		       int mandatory_trim_querystart_in, int mandatory_trim_queryend_in, int extraleft, int extraright,
1087 		       bool plusp, int genestrand) {
1088   int i, j, k;
1089   int start, end;
1090   int mandatory_trim_querystart, mandatory_trim_queryend;
1091 
1092   assert(mandatory_trim_querystart_in >= 0);
1093   assert(mandatory_trim_queryend_in >= 0);
1094 
1095   if (plusp == true) {
1096     mandatory_trim_querystart = mandatory_trim_querystart_in;
1097     mandatory_trim_queryend = mandatory_trim_queryend_in;
1098   } else {
1099     mandatory_trim_querystart = mandatory_trim_queryend_in;
1100     mandatory_trim_queryend = mandatory_trim_querystart_in;
1101   }
1102 
1103 
1104   if (result == NULL) {
1105     result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
1106   }
1107   result[querylength] = '\0';
1108 
1109   if ((start = querystart - extraleft) < 0) {
1110     start = 0;
1111   }
1112   if ((end = queryend + extraright) > querylength) {
1113     end = querylength;
1114   }
1115 
1116   fill_w_dashes(result,0,start);
1117   fill_w_stars(result,0,mandatory_trim_querystart);
1118   strncpy(&(result[start]),&(genomic_diff[start]),end-start);
1119   fill_w_dashes(result,end,querylength);
1120   fill_w_stars(result,querylength - mandatory_trim_queryend,querylength);
1121 
1122   if (mode == STANDARD) {
1123     /* Skip */
1124   } else if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
1125     mark_mismatches_cmet_sam(result,query,querystart,queryend,genestrand);
1126   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
1127     mark_mismatches_atoi_sam(result,query,querystart,queryend,genestrand);
1128   } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
1129     mark_mismatches_ttoc_sam(result,query,querystart,queryend,genestrand);
1130   } else {
1131     abort();
1132   }
1133 
1134   /* Add terminal ends as lower-case.  Previously stopped at i >= 0 */
1135   for (k = 0, i = querystart-1, j = querystart-1; i >= mandatory_trim_querystart /*&& j >= 0*/; k++, i--, j--) {
1136     if (query[i] == genomic_diff[j]) {
1137       result[i] = genomic_diff[j];
1138     } else {
1139       result[i] = (char) tolower(genomic_diff[j]);
1140     }
1141     /* printf("k=%d i=%d j=%d result[i]=%c\n",k,i,j,result[i]); */
1142   }
1143 
1144   /* Add terminal ends as lower-case.  Previously stopped at i < querylength */
1145   for (k = 0, i = queryend, j = queryend; i < querylength - mandatory_trim_queryend /*&& j < genomiclength*/; k++, i++, j++) {
1146     if (query[i] == genomic_diff[j]) {
1147       result[i] = genomic_diff[j];
1148     } else {
1149       result[i] = (char) tolower(genomic_diff[j]);
1150     }
1151   }
1152 
1153   return result;
1154 }
1155 
1156 
1157 /* Still takes left as a parameter */
1158 int
Substring_trim_qstart_nosplice(int * nmismatches,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T left,Univcoord_T chroffset,int pos5,int pos3,int querylength,bool plusp,int genestrand)1159 Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
1160 				Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
1161 				int pos5, int pos3, int querylength, bool plusp, int genestrand) {
1162   int score = 0;
1163   int trimpos = pos5, pos, prevpos, i;
1164   int *mismatch_positions = mismatch_positions_alloc;
1165   int alignlength = pos3 - pos5;
1166   Univcoord_T univdiagonal;
1167 
1168 
1169   debug8(printf("Entered Substring_trim_qstart_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
1170 		pos5,pos3,TRIM_MISMATCH_SCORE_LAST,TRIM_MISMATCH_SCORE_MULT,TRIM_MATCH_SCORE));
1171   debug8(printf("Calling Genome_mismatches_right_trim with left %u, pos5 %d, pos3 %d\n",left,pos5,pos3));
1172   *nmismatches = Genome_mismatches_right_trim(mismatch_positions,/*max_mismatches*/alignlength,
1173 					     /*ome*/genomebits,/*ome_alt*/genomebits_alt,query_compress,
1174 					     left,pos5,pos3,plusp,genestrand);
1175   debug8(printf("%d mismatches:",*nmismatches));
1176   debug8(
1177 	 for (i = 0; i <= *nmismatches; i++) {
1178 	   printf(" %d",mismatch_positions[i]);
1179 	 }
1180 	 printf("\n");
1181 	 );
1182 
1183   if (*nmismatches == 0) {
1184     return pos5;
1185   }
1186 
1187   /* Advance until we get to a good region */
1188   pos = prevpos = pos3;
1189   i = 0;
1190   while (i < (*nmismatches) /*- 1*/ && score <= 0) {
1191     pos = mismatch_positions[i]; /* position at the mismatch */
1192     score += (prevpos - pos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1193     if (score < 0) {
1194       /* Distal matches did not compensate for the mismatch, so advance to here */
1195       debug8(printf("Advance qstart pos %d, score %d => 0, pos %d\n",pos,score,pos));
1196       score = 0;
1197     } else {
1198       debug8(printf("Advance qstart pos %d, score %d, pos %d\n",pos,score,pos));
1199     }
1200     prevpos = pos;			   /* On the mismatch */
1201     i++;
1202   }
1203 
1204   /* Now trim */
1205   trimpos = prevpos = pos;
1206   score = 0;
1207   debug8(printf("Trim qend pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1208   while (i < *nmismatches) {
1209     pos = mismatch_positions[i];
1210     score += (prevpos - pos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1211     if (score >= 0) {
1212       trimpos = pos;
1213       score = 0;
1214     }
1215     debug8(printf("Trim qstart pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1216     prevpos = pos;
1217     i++;
1218   }
1219 
1220   score += (prevpos - pos5 - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_LAST;
1221   if (score >= 0) {
1222     trimpos = pos5 - 1;		/* At the "mismatch" */
1223     /* score = 0; */
1224   }
1225 
1226   if (novelsplicingp == false && trimpos == 0 &&
1227       left + (Univcoord_T) querylength >= chroffset + (Univcoord_T) querylength) {
1228     /* For DNA-seq, if still within chromosome, take the initial mismatch at the beginning of the read */
1229     debug8(printf("Backing up 1 bp to start => trimpos %d\n",0));
1230     return 0;
1231   } else {
1232     debug8(printf("Final qstart pos %d => trimpos %d\n",pos,trimpos+1));
1233     return trimpos + 1;		/* One position after the mismatch */
1234   }
1235 }
1236 
1237 
1238 /* Still takes left as a parameter */
1239 int
Substring_trim_qend_nosplice(int * nmismatches,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T left,Univcoord_T chrhigh,int pos5,int pos3,int querylength,bool plusp,int genestrand)1240 Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
1241 			      Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
1242 			      int pos5, int pos3, int querylength, bool plusp, int genestrand) {
1243   int score = 0;
1244   int trimpos = pos3, pos, prevpos, i;
1245   int *mismatch_positions = mismatch_positions_alloc;
1246   int alignlength = pos3 - pos5;
1247 
1248 
1249   debug8(printf("Entered Substring_trim_qend_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
1250 		pos5,pos3,TRIM_MISMATCH_SCORE_LAST,TRIM_MISMATCH_SCORE_MULT,TRIM_MATCH_SCORE));
1251   debug8(printf("Calling Genome_mismatches_left_trim with left %u, pos5 %d, pos3 %d\n",left,pos5,pos3));
1252   *nmismatches = Genome_mismatches_left_trim(mismatch_positions,/*max_mismatches*/alignlength,
1253 					     /*ome*/genomebits,/*ome_alt*/genomebits_alt,query_compress,
1254 					     left,pos5,pos3,plusp,genestrand);
1255 
1256   debug8(printf("%d mismatches:",*nmismatches));
1257   debug8(
1258 	 for (i = 0; i <= *nmismatches; i++) {
1259 	   printf(" %d",mismatch_positions[i]);
1260 	 }
1261 	 printf("\n");
1262 	 );
1263 
1264   if (*nmismatches == 0) {
1265     return pos3;
1266   }
1267 
1268   /* Advance until we get to a good region */
1269   pos = prevpos = pos5;
1270   i = 0;
1271   while (i < (*nmismatches) /*- 1*/ && score <= 0) {
1272     pos = mismatch_positions[i]; /* position at the mismatch */
1273     score += (pos - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1274     if (score < 0) {
1275       /* Distal matches did not compensate for the mismatch, so advance to here */
1276       debug8(printf("Advance qend pos %d, score %d => 0, pos %d\n",pos,score,pos));
1277       score = 0;
1278     } else {
1279       debug8(printf("Advance qend pos %d, score %d, pos %d\n",pos,score,pos));
1280     }
1281     prevpos = pos;			   /* On the mismatch */
1282     i++;
1283   }
1284 
1285   /* Now trim */
1286   trimpos = prevpos = pos;
1287   score = 0;
1288   while (i < *nmismatches) {
1289     pos = mismatch_positions[i];
1290     score += (pos - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1291     if (score >= 0) {
1292       trimpos = pos;
1293       score = 0;
1294     }
1295     debug8(printf("Trim qend pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1296     prevpos = pos;
1297     i++;
1298   }
1299 
1300   score += (pos3 - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_LAST;
1301   if (score >= 0) {
1302     trimpos = pos3;
1303     /* score = 0; */
1304   }
1305 
1306   if (novelsplicingp == false && trimpos == querylength - 1 &&
1307       left + (Univcoord_T) querylength <= chrhigh) {
1308     /* For DNA-seq, if still within the chromosome, take the final mismatch at end of read */
1309     debug8(printf("Advancing 1 bp to end => trimpos %d\n",pos3));
1310     return querylength;
1311   } else {
1312     debug8(printf("Final qend pos %d => trimpos %d\n",pos,trimpos));
1313     return trimpos;		/* qend is outside the region */
1314   }
1315 }
1316 
1317 
1318 /* Returns true if splice was found, false otherwise.  If true, result
1319    is the number of spliceends.  If false, result is the single trim
1320    position */
1321 bool
Substring_trimmed_qstarts(int * result,Splicetype_T * splicetype,int ** ambig_qstarts,double ** ambig_probs_5,Univcoord_T univdiagonal,int qend,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,int sensedir)1322 Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qstarts, double **ambig_probs_5,
1323 			   Univcoord_T univdiagonal, int qend, int querylength,
1324 			   bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
1325 			   Univcoord_T chroffset, int sensedir) {
1326   Univcoord_T left;
1327   int trimpos, pos5;
1328   int nspliceends;
1329   int nmismatches;
1330 
1331 
1332   debug8(printf("\n***Entered Substring_trimmed_qstarts with univdiagonal %u, qend %d..%d, plusp %d, sensedir %d\n",
1333 		univdiagonal-chroffset,0,qend,plusp,sensedir));
1334 
1335   left = univdiagonal - (Univcoord_T) querylength;
1336   pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
1337 
1338   trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1339 					   left,chroffset,pos5,/*pos3*/qend,querylength,plusp,genestrand);
1340   debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qstart*/0));
1341 
1342   if (trimpos >= qend) {
1343     debug8(printf("trimpos %d >= qend %d, so returning -1\n",trimpos,qend));
1344     *result = -1;
1345     return false;
1346 
1347   } else if (novelsplicingp == false) {
1348     /* Keep given trim */
1349     *result = trimpos;
1350     return false;
1351 
1352   } else if (trimpos == pos5) {
1353     /* Keep given trim */
1354     *result = trimpos;
1355     return false;
1356 
1357   } else if (trimpos <= pos5 + ACCEPTABLE_TRIM) {
1358     /* Accept splice if probability is high */
1359     debug8(printf("Accepting splice if probability is high\n"));
1360     if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&(*ambig_qstarts),&(*ambig_probs_5),
1361 						      left,/*qstart*/pos5,qend,
1362 						      mismatch_positions_alloc,nmismatches,chroffset,
1363 						      END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1364 						      plusp,sensedir)) == 0) {
1365       *result = trimpos;  /* was pos5, but can lead to false positive mismatches at end */
1366       return false;
1367     } else {
1368       *result = nspliceends;
1369       return true;
1370     }
1371 
1372   } else {
1373     if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&(*ambig_qstarts),&(*ambig_probs_5),
1374 						      left,/*qstart*/trimpos,qend,
1375 						      mismatch_positions_alloc,nmismatches,chroffset,
1376 						      END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1377 						      plusp,sensedir)) == 0) {
1378       *result = trimpos;
1379       return false;
1380 
1381     } else {
1382       *result = nspliceends;
1383       return true;
1384     }
1385   }
1386 }
1387 
1388 
1389 bool
Substring_qstart_trim(int * trimpos,Splicetype_T * splicetype,double * ambig_prob_qstart,Univcoord_T univdiagonal,int pos3,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,int sensedir)1390 Substring_qstart_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qstart,
1391 		       Univcoord_T univdiagonal, int pos3, int querylength, bool plusp, int genestrand,
1392 		       int *mismatch_positions_alloc, Compress_T query_compress,
1393 		       Univcoord_T chroffset, int sensedir) {
1394   Univcoord_T left;
1395   int pos5;
1396   int nspliceends;
1397   int nmismatches;
1398   int *ambig_qstarts;
1399   double *ambig_probs_5;
1400 
1401   debug8(printf("\n***Entered Substring_qstart_trim with univdiagonal %u, pos3 %d, plusp %d, sensedir %d\n",
1402 		univdiagonal,pos3,plusp,sensedir));
1403 
1404   left = univdiagonal - (Univcoord_T) querylength;
1405   pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
1406 
1407   *trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1408 					    left,chroffset,pos5,pos3,querylength,plusp,genestrand);
1409   debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos5));
1410 
1411   if (*trimpos >= pos3) {
1412     debug8(printf("trimpos %d >= pos3 %d, so returning -1\n",*trimpos,pos3));
1413     *trimpos = -1;
1414     return false;
1415 
1416   } else if (novelsplicingp == false) {
1417     /* Keep given trim */
1418     *ambig_prob_qstart = 0.0;
1419     return false;
1420 
1421   } else if (*trimpos == pos5) {
1422     /* Keep given trim */
1423     *ambig_prob_qstart = 0.0;
1424     return false;
1425 
1426   } else if (*trimpos <= pos5 + ACCEPTABLE_TRIM) {
1427     /* Accept splice if probability is high */
1428     debug8(printf("Accepting splice if probability is high\n"));
1429     if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&ambig_qstarts,&ambig_probs_5,
1430 						      left,/*qstart*/pos5,/*qend*/pos3,
1431 						      mismatch_positions_alloc,nmismatches,chroffset,
1432 						      END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1433 						      plusp,sensedir)) == 0) {
1434       /* Keep given trim */
1435       /* *trimpos = pos5;  -- can lead to false positive mismatches at end */
1436       *ambig_prob_qstart = 0.0;
1437       return false;
1438 
1439     } else {
1440       /* TODO: Make sure this is the farthest one */
1441       *trimpos = ambig_qstarts[0];
1442       *ambig_prob_qstart = ambig_probs_5[0];
1443       FREE(ambig_qstarts);
1444       FREE(ambig_probs_5);
1445       return true;
1446     }
1447 
1448   } else {
1449     if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&ambig_qstarts,&ambig_probs_5,
1450 						      left,/*qstart*/(*trimpos),/*qend*/pos3,
1451 						      mismatch_positions_alloc,nmismatches,chroffset,
1452 						      END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1453 						      plusp,sensedir)) == 0) {
1454       /* Keep given trim */
1455       *ambig_prob_qstart = 0.0;
1456       return false;
1457 
1458     } else {
1459       /* TODO: Make sure this is the farthest one */
1460       *trimpos = ambig_qstarts[0];
1461       *ambig_prob_qstart = ambig_probs_5[0];
1462       FREE(ambig_qstarts);
1463       FREE(ambig_probs_5);
1464       return true;
1465     }
1466   }
1467 }
1468 
1469 
1470 /* Returns true if splice was found, false otherwise.  If true, result
1471    is the number of spliceends.  If false, result is the single trim
1472    position */
1473 bool
Substring_trimmed_qends(int * result,Splicetype_T * splicetype,int ** ambig_qends,double ** ambig_probs_3,Univcoord_T univdiagonal,int qstart,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,Univcoord_T chrhigh,int sensedir)1474 Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
1475 			 Univcoord_T univdiagonal, int qstart, int querylength,
1476 			 bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
1477 			 Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
1478   Univcoord_T left;
1479   int trimpos, pos3;
1480   int nspliceends;
1481   int nmismatches;
1482 
1483   debug8(printf("\n***Entered Substring_trimmed_qends with univdiagonal %u, qstart %d..%d, plusp %d, sensedir %d\n",
1484 		univdiagonal-chroffset,qstart,querylength,plusp,sensedir));
1485 
1486   left = univdiagonal - (Univcoord_T) querylength;
1487   pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
1488 
1489   trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1490 					 left,chrhigh,/*pos5*/qstart,pos3,querylength,plusp,genestrand);
1491   debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qend*/querylength));
1492 
1493   if (trimpos <= qstart) {
1494     debug8(printf("trimpos %d <= qstart %d, so returning -1\n",trimpos,qstart));
1495     *result = -1;
1496     return false;
1497 
1498   } else if (novelsplicingp == false) {
1499     /* Keep given trim */
1500     *result = trimpos;
1501     return false;
1502 
1503   } else if (trimpos == pos3) {
1504     /* Keep given trim */
1505     *result = trimpos;
1506     return false;
1507 
1508   } else if (trimpos >= pos3 - ACCEPTABLE_TRIM) {
1509     /* Accept splice if probability is high */
1510     debug8(printf("Accepting splice if probability is high\n"));
1511     if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&(*ambig_qends),&(*ambig_probs_3),
1512 						      left,qstart,/*qend*/pos3,querylength,
1513 						      mismatch_positions_alloc,nmismatches,chroffset,
1514 						      END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1515 						      plusp,sensedir)) == 0) {
1516       *result = trimpos;  /* was pos3, but can lead to false positive mismatches at end */
1517       return false;
1518     } else {
1519       *result = nspliceends;
1520       return true;
1521     }
1522 
1523   } else {
1524     if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&(*ambig_qends),&(*ambig_probs_3),
1525 						      left,qstart,/*qend*/trimpos,querylength,
1526 						      mismatch_positions_alloc,nmismatches,chroffset,
1527 						      END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1528 						      plusp,sensedir)) == 0) {
1529       *result = trimpos;
1530       return false;
1531     } else {
1532       *result = nspliceends;
1533       return true;
1534     }
1535   }
1536 }
1537 
1538 
1539 bool
Substring_qend_trim(int * trimpos,Splicetype_T * splicetype,double * ambig_prob_qend,Univcoord_T univdiagonal,int pos5,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,Univcoord_T chrhigh,int sensedir)1540 Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
1541 		     Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
1542 		     int *mismatch_positions_alloc, Compress_T query_compress,
1543 		     Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
1544   Univcoord_T left;
1545   int pos3;
1546   int nspliceends;
1547   int nmismatches;
1548   int *ambig_qends;
1549   double *ambig_probs_3;
1550 
1551   debug8(printf("\n***Entered Substring_qend_trim with univdiagonal %u, pos5 %d, plusp %d, sensedir %d\n",
1552 		univdiagonal,pos5,plusp,sensedir));
1553 
1554   left = univdiagonal - (Univcoord_T) querylength;
1555   pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
1556 
1557   *trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1558 					  left,chrhigh,pos5,pos3,querylength,plusp,genestrand);
1559   debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos3));
1560 
1561   if (*trimpos <= pos5) {
1562     debug8(printf("trimpos %d <= pos5 %d, so returning -1\n",*trimpos,pos5));
1563     *trimpos = -1;
1564     return false;
1565 
1566   } else if (novelsplicingp == false) {
1567     /* Keep given trim */
1568     *ambig_prob_qend = 0.0;
1569     return false;
1570 
1571   } else if (*trimpos == pos3) {
1572     /* Keep given trim */
1573     *ambig_prob_qend = 0.0;
1574     return false;
1575 
1576   } else if (*trimpos >= pos3 - ACCEPTABLE_TRIM) {
1577     /* Accept splice if probability is high */
1578     debug8(printf("Accepting splice if probability is high\n"));
1579     if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&ambig_qends,&ambig_probs_3,
1580 						      left,/*qstart*/pos5,/*qend*/pos3,querylength,
1581 						      mismatch_positions_alloc,nmismatches,chroffset,
1582 						      END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1583 						      plusp,sensedir)) == 0) {
1584       /* Keep given trim */
1585       /* *trimpos = pos3;  -- can lead to false positive mismatches at end */
1586       *ambig_prob_qend = 0.0;
1587       return false;
1588     } else {
1589       /* TODO: Make sure this is the farthest one */
1590       *trimpos = ambig_qends[0];
1591       *ambig_prob_qend = ambig_probs_3[0];
1592       FREE(ambig_qends);
1593       FREE(ambig_probs_3);
1594       return true;
1595     }
1596 
1597   } else {
1598     if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&ambig_qends,&ambig_probs_3,
1599 						      left,/*qstart*/pos5,/*qend*/*trimpos,querylength,
1600 						      mismatch_positions_alloc,nmismatches,chroffset,
1601 						      END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1602 						      plusp,sensedir)) == 0) {
1603       /* Keep given trim */
1604       *ambig_prob_qend = 0.0;
1605       return false;
1606     } else {
1607       /* TODO: Make sure this is the farthest one */
1608       *trimpos = ambig_qends[0];
1609       *ambig_prob_qend = ambig_probs_3[0];
1610       FREE(ambig_qends);
1611       FREE(ambig_probs_3);
1612       return true;
1613     }
1614   }
1615 }
1616 
1617 
1618 
1619 /* Modified from Substring_new to return nmatches_plus_spliced_trims only */
1620 int
Substring_compute_nmatches(int * ref_nmatches_plus_spliced_trims,Univcoord_T left,int querystart,int queryend,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice_querystart_p,bool splice_queryend_p,bool chrnum_fixed_p)1621 Substring_compute_nmatches (int *ref_nmatches_plus_spliced_trims, Univcoord_T left,
1622 			    int querystart, int queryend, int querylength,
1623 			    bool plusp, int genestrand, Compress_T query_compress,
1624 			    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1625 			    bool splice_querystart_p, bool splice_queryend_p, bool chrnum_fixed_p) {
1626   int nmatches_plus_spliced_trims, nmismatches_bothdiff, nmismatches_refdiff, amb_length;
1627   Univcoord_T alignstart, alignend;
1628   int outofbounds_start, outofbounds_end;
1629   int trim_at_querystart, trim_at_queryend;
1630 
1631 
1632   debug2(printf("\n***Entered Substring_compute_nmatches with left %u, chr %d, genome %u..%u, query %d..%d, plusp %d, splice_querystart_p %d, splice_queryend_p %d\n",
1633 		left,chrnum,left+querystart-chroffset,left+queryend-chroffset,querystart,queryend,plusp,splice_querystart_p,splice_queryend_p));
1634 
1635 #if 0
1636   if (plusp == true) {
1637     genomicstart = left;
1638     genomicend = left + querylength;
1639   } else {
1640     genomicstart = left + querylength;
1641     genomicend = left;
1642   }
1643 #endif
1644 
1645   trim_at_querystart = querystart;
1646   trim_at_queryend = querylength - queryend;
1647 
1648   if (chrnum_fixed_p == true) {
1649     /* Chromosomal information determined by the middle diagonal */
1650     if (plusp) {
1651       alignstart = left + trim_at_querystart;
1652       alignend = left + (querylength - trim_at_queryend);
1653 
1654       if (alignstart >= chroffset) {
1655 	outofbounds_start = 0;
1656       } else {
1657 	outofbounds_start = chroffset - alignstart;
1658       }
1659       if (alignend <= chrhigh) {
1660 	outofbounds_end = 0;
1661       } else {
1662 	outofbounds_end = alignend - chrhigh;
1663       }
1664 
1665     } else {
1666       alignstart = left + (querylength - trim_at_querystart);
1667       alignend = left + trim_at_queryend;
1668 
1669       if (alignend >= chroffset) {
1670 	outofbounds_end = 0;
1671       } else {
1672 	outofbounds_end = chroffset - alignend;
1673       }
1674       if (alignstart <= chrhigh) {
1675 	outofbounds_start = 0;
1676       } else {
1677 	outofbounds_start = alignstart - chrhigh;
1678       }
1679     }
1680 
1681   } else {
1682     /* Determine chromosomal information */
1683     if (plusp) {
1684       alignstart = left + trim_at_querystart;
1685       alignend = left + (querylength - trim_at_queryend);
1686 
1687       if (alignend < chroffset) {
1688 	/* Need to recompute chromosome bounds (unexpected since chroffset should be based on left) */
1689 	debug2(printf("Plus: recomputing chromosomal bounds\n"));
1690 	chrnum = Univ_IIT_get_one(chromosome_iit,alignstart,alignstart);
1691 	Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1692 
1693       } else if (alignstart >= chrhigh) {
1694 	/* Need to recompute chromosome bounds */
1695 	debug2(printf("Plus: recomputing chromosomal bounds\n"));
1696 	chrnum = Univ_IIT_get_one(chromosome_iit,alignstart,alignstart);
1697 	Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1698       }
1699 
1700       if (alignend <= chrhigh) {
1701 	/* Alignment is within the chromosome */
1702 	debug2(printf("alignend <= chrhigh, so alignment is within the chromosome\n"));
1703 	outofbounds_start = outofbounds_end = 0;
1704       } else {
1705 	/* Alignment straddles the high bound of the chromosome */
1706 	outofbounds_start = chrhigh - alignstart;
1707 	outofbounds_end = alignend - chrhigh;
1708 	if (outofbounds_start >= outofbounds_end) {
1709 	  /* Keep in existing chromosome */
1710 	  outofbounds_start = 0;
1711 	} else if (++chrnum > nchromosomes) {
1712 	  /* Went past the end of the genome */
1713 	  return -1;
1714 	} else {
1715 	  /* Move to next chromosome */
1716 	  debug2(printf("Moving to next chromosome\n"));
1717 	  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1718 	  if (alignend <= chrhigh) {
1719 	    /* Alignment is within the new chromosome */
1720 	    debug2(printf("alignend <= new chrhigh %u, so alignment is within the new chromosome\n",chrhigh));
1721 	    outofbounds_end = 0;
1722 	  } else {
1723 	    outofbounds_end = alignend - chrhigh;
1724 	  }
1725 	}
1726       }
1727 
1728     } else {
1729       alignstart = left + (querylength - trim_at_querystart);
1730       alignend = left + trim_at_queryend;
1731 
1732       if (alignstart < chroffset) {
1733 	/* Need to recompute chromosome bounds (unexpected since chroffset should be based on left) */
1734 	debug2(printf("Minus: Recomputing chromosomal bounds\n"));
1735 	chrnum = Univ_IIT_get_one(chromosome_iit,alignend,alignend);
1736 	Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1737 
1738       } else if (alignend >= chrhigh) {
1739 	/* Need to recompute chromosome bounds */
1740 	debug2(printf("Minus: Recomputing chromosomal bounds\n"));
1741 	chrnum = Univ_IIT_get_one(chromosome_iit,alignend,alignend);
1742 	Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1743       }
1744 
1745       if (alignstart <= chrhigh) {
1746 	/* Alignment is within the chromosome */
1747 	debug2(printf("alignstart <= chrhigh, so alignment is within the chromosome\n"));
1748 	outofbounds_start = outofbounds_end = 0;
1749       } else {
1750 	/* Alignment straddles the high bound of the chromosome */
1751 	outofbounds_start = alignstart - chrhigh;
1752 	outofbounds_end = chrhigh - alignend;
1753 	if (outofbounds_end >= outofbounds_start) {
1754 	  /* Keep in existing chromosome */
1755 	  outofbounds_end = 0;
1756 	} else if (++chrnum > nchromosomes) {
1757 	  /* Went past the end of the genome */
1758 	  return -1;
1759 	} else {
1760 	  /* Move to next chromosome */
1761 	  debug2(printf("Moving to next chromosome\n"));
1762 	  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1763 	  if (alignstart <= chrhigh) {
1764 	    debug2(printf("alignstart <= new chrhigh %u, so alignment is within the new chromosome\n",chrhigh));
1765 	    outofbounds_start = 0;
1766 	  } else {
1767 	    outofbounds_start = alignstart - chrhigh;
1768 	  }
1769 	}
1770       }
1771     }
1772   }
1773 
1774   /* outofbounds values are relative to preliminary alignstart and alignend */
1775   trim_at_querystart += outofbounds_start;
1776   trim_at_queryend += outofbounds_end;
1777 
1778   querystart = trim_at_querystart;
1779   queryend = querylength - trim_at_queryend;
1780   if (querystart >= queryend) {
1781     /* Can happen if multiple mismatches result in trimming down to 0 */
1782     return -1;
1783 
1784   } else if (plusp == true) {
1785     debug2(printf("Counting mismatches from querystart %d to queryend %d\n",querystart,queryend));
1786     nmismatches_bothdiff =
1787       Genome_count_mismatches_substring(&nmismatches_refdiff,genomebits,genomebits_alt,query_compress,left,
1788 					/*pos5*/querystart,/*pos3*/queryend,/*plusp*/true,genestrand);
1789   } else {
1790     debug2(printf("Counting mismatches from querystart %d to queryend %d\n",querystart,queryend));
1791     nmismatches_bothdiff =
1792       Genome_count_mismatches_substring(&nmismatches_refdiff,genomebits,genomebits_alt,query_compress,left,
1793 					/*pos5*/querylength - queryend,/*pos3*/querylength - querystart,
1794 					/*plusp*/false,genestrand);
1795   }
1796 
1797   /* Needs to be identical to code in Substring_nmatches.  Don't need to handle alts_substring, though. */
1798   amb_length = 0;
1799   if (splice_querystart_p == true) {
1800     amb_length += trim_at_querystart;
1801   }
1802   if (splice_queryend_p == true) {
1803     amb_length += trim_at_queryend;
1804   }
1805   nmatches_plus_spliced_trims = amb_length + (queryend - querystart) - nmismatches_bothdiff;
1806   *ref_nmatches_plus_spliced_trims = amb_length + (queryend - querystart) - nmismatches_refdiff;
1807 
1808   debug2(printf("Substring_compute_nmatches returning %d matches\n",nmatches_plus_spliced_trims));
1809   return nmatches_plus_spliced_trims;
1810 }
1811 
1812 
1813 /* Want querylength and not querylength_adj */
1814 /* If orig_nmismatches < 0, then this procedure (re-)computes the value */
1815 
1816 /* For trimming: querystart and queryend should be used as a basis for
1817    trim_novel_spliceends (which search to both sides of the point, but
1818    not for trim_querystart and trim_queryend, which search only
1819    inwards.  For those procedures, should use querystart of 0 and
1820    queryend of querylength, and the results are absolute, not relative
1821    to the given querystart and queryend */
1822 
1823 /* Assumes that chrnum is determined correctly, even for straddles, from middle diagonal */
1824 T
Substring_new(int nmismatches,int ref_nmismatches,Univcoord_T left,int querystart,int queryend,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,int sensedir)1825 Substring_new (int nmismatches, int ref_nmismatches, Univcoord_T left,
1826 	       int querystart, int queryend, int querylength, bool plusp, int genestrand,
1827 	       Compress_T query_compress, Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1828 	       bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
1829 	       bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
1830 	       int sensedir) {
1831   T new;
1832   int trim_querystart, trim_queryend;
1833   Univcoord_T alignstart, alignend;
1834   int outofbounds_start, outofbounds_end;
1835 
1836 
1837   debug2(printf("\n***Entered Substring_new with left %u, chr %d, genome %d..%d, query %d..%d, plusp %d, sensedir %d, splice_querystart_p %d, splice_queryend_p %d\n",
1838 		left,chrnum,(int) (left+querystart-chroffset),(int) (left+queryend-chroffset),querystart,queryend,plusp,sensedir,splice_querystart_p,splice_queryend_p));
1839 
1840   /* assert(queryend > querystart); -- Assertion does not hold.  Sometimes queryend == querystart */
1841 
1842   new = (T) MALLOC_OUT(sizeof(*new));
1843   debug2(printf("substring %p:\n",new));
1844 
1845   /* Required values if we abort and invoke Substring_free early */
1846   new->alts_ncoords = 0;
1847   new->genomic_bothdiff = (char *) NULL;
1848   new->genomic_refdiff = (char *) NULL;
1849 
1850   new->querylength = querylength;
1851 
1852   /* These values are required by substring_trim_novel_spliceends */
1853   new->left = left;
1854   new->plusp = plusp;
1855   new->genestrand = genestrand;
1856   new->query_compress = query_compress;
1857 
1858   if (plusp == true) {
1859     new->genomicstart = left;
1860     new->genomicend = left + querylength;
1861 
1862     if (left + querylength > chroffset + querylength) {
1863       new->querystart_chrbound = 0;
1864     } else {
1865       new->querystart_chrbound = chroffset - left;
1866     }
1867     if (left + querylength < chrhigh) {
1868       new->queryend_chrbound = querylength;
1869     } else {
1870       new->queryend_chrbound = chrhigh - left;
1871     }
1872 
1873   } else {
1874     new->genomicstart = left + querylength;
1875     new->genomicend = left;
1876 
1877     if (left + querylength < chrhigh) {
1878       new->querystart_chrbound = 0;
1879     } else {
1880       new->querystart_chrbound = (left + querylength) - chrhigh;
1881     }
1882     if (left + querylength > chroffset + querylength) {
1883       new->queryend_chrbound = querylength;
1884     } else {
1885       new->queryend_chrbound = (left + querylength) - chroffset;
1886     }
1887   }
1888 
1889   new->trim_querystart_splicep = splice_querystart_p;
1890   new->trim_queryend_splicep = splice_queryend_p;
1891 
1892   new->querystart_pretrim = querystart;
1893   new->queryend_pretrim = queryend;
1894 
1895   trim_querystart = querystart;
1896   trim_queryend = querylength - queryend;
1897 
1898   new->sensedir = sensedir;
1899 
1900   new->amb_splice_pos = 0;
1901   new->splicecoord_D = new->splicecoord_A = 0;
1902   new->siteD_pos = new->siteA_pos = new->siteN_pos = 0;
1903   new->siteD_prob = new->siteA_prob = 0.0;
1904 
1905   if (splice_querystart_p == false) {
1906     new->start_amb_length = 0;
1907     new->amb_type = END;
1908     new->start_endtype = END;
1909 
1910   } else if (splicetype_querystart == DONOR || splicetype_querystart == ANTIDONOR) {
1911     new->start_amb_length = querystart;
1912     new->amb_type = DON;
1913     new->start_endtype = DON;
1914     new->siteD_prob = ambig_prob_querystart;
1915 
1916   } else {
1917     new->start_amb_length = querystart;
1918     new->amb_type = ACC;
1919     new->start_endtype = ACC;
1920     new->siteA_prob = ambig_prob_querystart;
1921   }
1922 
1923   if (splice_queryend_p == false) {
1924     new->end_amb_length = 0;
1925     new->amb_type = END;
1926     new->end_endtype = END;
1927 
1928   } else if (splicetype_queryend == DONOR || splicetype_queryend == ANTIDONOR) {
1929     new->end_amb_length = querylength - queryend;
1930     new->amb_type = DON;
1931     new->end_endtype = DON;
1932     new->siteD_prob = ambig_prob_queryend;
1933 
1934   } else {
1935     new->end_amb_length = querylength - queryend;
1936     new->amb_type = ACC;
1937     new->end_endtype = ACC;
1938     new->siteA_prob = ambig_prob_queryend;
1939   }
1940 
1941 
1942   /* Chromosomal information determined by the middle diagonal */
1943   if (plusp) {
1944     alignstart = left + trim_querystart;
1945     alignend = left + (querylength - trim_queryend);
1946 
1947 #if 1
1948     /* Needed to protect against substrings on the wrong chromosome */
1949     if (alignend < chroffset || alignstart >= chrhigh) {
1950       debug2(printf("Substring fails because alignend %u < chroffset %u or alignstart %u >= chrhigh %u\n",
1951 		    alignend,chroffset,alignstart,chrhigh));
1952       Substring_free(&new);
1953       return (T) NULL;
1954     }
1955 #endif
1956 
1957     if (alignstart >= chroffset) {
1958       outofbounds_start = 0;
1959     } else {
1960       outofbounds_start = chroffset - alignstart;
1961     }
1962     if (alignend <= chrhigh) {
1963       outofbounds_end = 0;
1964     } else {
1965       outofbounds_end = alignend - chrhigh;
1966     }
1967 
1968   } else {
1969     alignstart = left + (querylength - trim_querystart);
1970     alignend = left + trim_queryend;
1971 
1972 #if 1
1973     /* Needed to protect against substrings on the wrong chromosome */
1974     if (alignstart < chroffset || alignend >= chrhigh) {
1975       debug2(printf("Substring fails because alignstart %u < chroffset %u or alignend %u >= chrhigh %u\n",
1976 		    alignstart,chroffset,alignend,chrhigh));
1977       Substring_free(&new);
1978       return (T) NULL;
1979     }
1980 #endif
1981 
1982     if (alignend >= chroffset) {
1983       outofbounds_end = 0;
1984     } else {
1985       outofbounds_end = chroffset - alignend;
1986     }
1987     if (alignstart <= chrhigh) {
1988       outofbounds_start = 0;
1989     } else {
1990       outofbounds_start = alignstart - chrhigh;
1991     }
1992   }
1993 
1994   /* outofbounds values are relative to preliminary alignstart and alignend */
1995   trim_querystart += outofbounds_start;
1996   trim_queryend += outofbounds_end;
1997 
1998 #if 0
1999   if (trim_querystart + trim_queryend >= querylength) {
2000     debug2(printf("Substring fails because trim_querystart %d + trim_queryend %d >= querylength %d\n",
2001 		  trim_querystart,trim_queryend,querylength));
2002     Substring_free(&new);
2003     return (T) NULL;
2004   }
2005 #endif
2006 
2007 #ifdef DEBUG2
2008   printf("Outofbounds start %d, outofbounds end %d\n",outofbounds_start,outofbounds_end);
2009   if (outofbounds_start > 0) {
2010     printf("Out of bounds: Revising trim_querystart to be %d\n",trim_querystart);
2011   }
2012   if (outofbounds_end > 0) {
2013     printf("Out of bounds: Revising trim_queryend to be %d\n",trim_queryend);
2014   }
2015   printf("Got trims of %d and %d\n",trim_querystart,trim_queryend);
2016 #endif
2017 
2018 
2019   debug2(printf("\n***chrnum %d (chroffset %u, chrhigh %u), plusp %d\n",chrnum,chroffset,chrhigh,plusp));
2020   new->chrnum = chrnum;
2021   new->chroffset = chroffset;
2022   new->chrhigh = chrhigh;
2023   new->chrlength = chrlength;
2024 
2025   /* mandatory_trim_querystart and mandatory_trim_queryend also set during aliasing for circular chromosomes */
2026   /* These calculations are equivalent to those for querystart_chrbound and queryend_chrbound */
2027   if (left + querylength < new->chroffset + querylength) {
2028     new->mandatory_trim_querystart = new->chroffset - left;
2029     assert(new->mandatory_trim_querystart >= 0);
2030   } else {
2031     new->mandatory_trim_querystart = 0;
2032   }
2033   if (left + querylength >= new->chrhigh) {
2034     new->mandatory_trim_queryend = (left + querylength) - new->chrhigh;
2035     assert(new->mandatory_trim_queryend >= 0);
2036   } else {
2037     new->mandatory_trim_queryend = 0;
2038   }
2039   debug2(printf("mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",new->mandatory_trim_querystart,new->mandatory_trim_queryend));
2040 
2041 
2042   if (querystart != trim_querystart || queryend != querylength - trim_queryend) {
2043     nmismatches = -1;		/* Need to recalculate, because of change in querystart */
2044   }
2045   new->querystart = trim_querystart;
2046   new->queryend = querylength - trim_queryend;
2047   debug2(printf("querystart %d, queryend %d\n",new->querystart,new->queryend));
2048 
2049   assert(new->querystart >= 0);
2050   assert(new->querystart <= querylength);
2051   assert(new->queryend >= 0);
2052   assert(new->queryend <= querylength);
2053 
2054   /* Compute coordinates */
2055   if (new->queryend <= new->querystart) {
2056     /* Can happen if multiple mismatches result in trimming down to 0 */
2057     Substring_free(&new);
2058     return (T) NULL;
2059 
2060   } else if (plusp == true) {
2061     new->alignstart_trim = left + new->querystart;
2062     new->alignend_trim = left + new->queryend;
2063 
2064     debug2(printf("left is %u\n",new->left));
2065     debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2066     debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2067     assert(new->alignstart_trim >= chroffset);
2068     assert(new->alignend_trim <= chrhigh);
2069 
2070     debug2(printf("Counting mismatches from querystart %d to queryend %d\n",new->querystart,new->queryend));
2071     if (nmismatches >= 0) {
2072       debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",left - chroffset,new->querystart,new->queryend));
2073       debug7(printf("%d vs %d\n",nmismatches,
2074 		    Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2075 						      /*pos5*/new->querystart,/*pos3*/new->queryend,
2076 						      /*plusp*/true,genestrand)));
2077 #ifdef CHECK_NMISMATCHES
2078       assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2079 							      /*pos5*/new->querystart,/*pos3*/new->queryend,
2080 							      /*plusp*/true,genestrand));
2081 #endif
2082     } else {
2083       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2084 						      /*pos5*/new->querystart,/*pos3*/new->queryend,
2085 						      /*plusp*/true,genestrand);
2086     }
2087 
2088   } else {
2089     new->alignstart_trim = left + (querylength - new->querystart);
2090     new->alignend_trim = left + (querylength - new->queryend);
2091 
2092     debug2(printf("left is %u\n",new->left));
2093     debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2094     debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2095     assert(new->alignstart_trim <= chrhigh);
2096     assert(new->alignend_trim >= chroffset);
2097 
2098     if (nmismatches >= 0) {
2099       debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",left - chroffset,new->querystart,new->queryend));
2100       debug7(printf("%d vs %d\n",nmismatches,
2101 		    Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2102 						      /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2103 						      /*plusp*/false,genestrand)));
2104 #ifdef CHECK_NMISMATCHES
2105       assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2106 							      /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2107 							      /*plusp*/false,genestrand));
2108 #endif
2109     } else {
2110       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2111 						      /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2112 						      /*plusp*/false,genestrand);
2113     }
2114   }
2115 
2116   new->nmismatches_bothdiff = nmismatches;
2117   new->nmismatches_refdiff = ref_nmismatches;
2118 
2119   new->nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff;
2120   new->nmatches_plus_spliced_trims = new->nmatches_to_trims + new->start_amb_length + new->end_amb_length;
2121 
2122   new->ref_nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_refdiff;
2123   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + new->start_amb_length + new->end_amb_length;
2124 
2125   debug2(printf("nmatches_to_trims %d = queryend %d - querystart %d - nmismatches_bothdiff %d\n",
2126 		new->nmatches_to_trims,new->queryend,new->querystart,new->nmismatches_bothdiff));
2127   assert(new->nmatches_to_trims >= 0);
2128 
2129   new->alts_ncoords = 0;
2130   new->alts_coords = (Univcoord_T *) NULL;
2131   new->alts_knowni = (int *) NULL;
2132   new->alts_nmismatches = (int *) NULL;
2133   new->alts_ref_nmismatches = (int *) NULL;
2134   new->alts_probs = (double *) NULL;
2135 
2136   debug2(printf("Substring_new returning %d matches\n",new->nmatches_plus_spliced_trims));
2137 #if 0
2138   debug2(printf("** Returning substring %p, query %d..%d, trim %d..%d, nmatches_plus_spliced_trims %d, nmismatches_refdiff %d, nmismatches_bothdiff %d, amb_lengths %d and %d\n",
2139 		new,new->querystart,new->queryend,trim_querystart,trim_queryend,nmatches_plus_spliced_trims,
2140 		new->nmismatches_refdiff,new->nmismatches_bothdiff,Substring_start_amb_length(new),Substring_end_amb_length(new)));
2141 #endif
2142 
2143   assert(new->nmismatches_bothdiff >= 0);
2144   return new;
2145 }
2146 
2147 
2148 /* Copied from Stage3end_new_substitution */
2149 T
Substring_extend_anchor_querystart(T anchor,T substring,int * mismatch_positions_alloc,Compress_T query_compress)2150 Substring_extend_anchor_querystart (T anchor, T substring, int *mismatch_positions_alloc,
2151 				    Compress_T query_compress) {
2152   T new_anchor;
2153   int qstart, qend, nmismatches, ref_nmismatches;
2154   bool splice_querystart_p, splice_queryend_p;
2155   Splicetype_T splicetype_querystart, splicetype_queryend;
2156   double ambig_prob_querystart, ambig_prob_queryend;
2157 
2158   if (substring->querystart != 0) {
2159     /* Keep previously computed bounds for anchor */
2160     return anchor;
2161 
2162   } else if (anchor->plusp == true) {
2163     qend = anchor->queryend;
2164     splice_querystart_p = Substring_qstart_trim(&qstart,&splicetype_querystart,&ambig_prob_querystart,
2165 						/*univdiagonal*/anchor->left + anchor->querylength,/*pos3*/qend,
2166 						anchor->querylength,/*plusp*/true,anchor->genestrand,
2167 						mismatch_positions_alloc,query_compress,
2168 						anchor->chroffset,anchor->sensedir);
2169     /* printf("Substring_extend_anchor_querystart, plus, yields splicep %d, qstart %d, prob %f\n",
2170        splice_querystart_p,qstart,ambig_prob_querystart); */
2171 
2172     if (qstart < 0) {
2173       return anchor;
2174 
2175     } else if (qend <= qstart) {
2176       return anchor;
2177 
2178     } else {
2179       if (anchor->end_endtype == END) {
2180 	splice_queryend_p = false;
2181       } else if (anchor->end_endtype == DON) {
2182 	splice_queryend_p = true;
2183 	splicetype_queryend = DONOR;
2184 	ambig_prob_queryend = anchor->siteD_prob;
2185       } else {
2186 	splice_queryend_p = true;
2187 	splicetype_queryend = ACCEPTOR;
2188 	ambig_prob_queryend = anchor->siteA_prob;
2189       }
2190 
2191       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2192 						      /*pos5*/qstart,/*pos3*/qend,
2193 						      /*plusp*/true,anchor->genestrand);
2194       if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,/*querystart*/qstart,/*queryend*/qend,
2195 				      anchor->querylength,/*plusp*/true,anchor->genestrand,query_compress,
2196 				      anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2197 				      splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2198 				      splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2199 				      anchor->sensedir)) == NULL) {
2200 	return anchor;
2201       } else {
2202 	Substring_free(&anchor);
2203 	return new_anchor;
2204       }
2205     }
2206 
2207   } else {
2208     qstart = anchor->querylength - anchor->queryend;
2209     splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
2210 					      /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
2211 					      anchor->querylength,/*plusp*/false,anchor->genestrand,
2212 					      mismatch_positions_alloc,query_compress,
2213 					      anchor->chroffset,anchor->chrhigh,anchor->sensedir);
2214     /* printf("Substring_extend_anchor_querystart, minus, yields splicep %d, qend %d, prob %f\n",
2215        splice_querystart_p,qend,ambig_prob_querystart); */
2216 
2217     if (qend < 0) {
2218       return anchor;
2219 
2220     } else if (qend <= qstart) {
2221       return anchor;
2222 
2223     } else {
2224       if (anchor->end_endtype == END) {
2225 	splice_queryend_p = false;
2226       } else if (anchor->end_endtype == DON) {
2227 	splice_queryend_p = true;
2228 	splicetype_queryend = ANTIDONOR;
2229 	ambig_prob_queryend = anchor->siteD_prob;
2230       } else {
2231 	splice_queryend_p = true;
2232 	splicetype_queryend = ANTIACCEPTOR;
2233 	ambig_prob_queryend = anchor->siteA_prob;
2234       }
2235 
2236       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2237 						      /*pos5*/qstart,/*pos3*/qend,
2238 						      /*plusp*/false,anchor->genestrand);
2239       if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,
2240 				      /*querystart*/anchor->querylength - qend,
2241 				      /*queryend*/anchor->querylength - qstart,
2242 				      anchor->querylength,/*plusp*/false,anchor->genestrand,query_compress,
2243 				      anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2244 				      splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2245 				      splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2246 				      anchor->sensedir)) == NULL) {
2247 	return anchor;
2248       } else {
2249 	Substring_free(&anchor);
2250 	return new_anchor;
2251       }
2252     }
2253   }
2254 }
2255 
2256 
2257 /* Copied from Stage3end_new_substitution */
2258 T
Substring_extend_anchor_queryend(T anchor,T substring,int * mismatch_positions_alloc,Compress_T query_compress)2259 Substring_extend_anchor_queryend (T anchor, T substring, int *mismatch_positions_alloc,
2260 				  Compress_T query_compress) {
2261   T new_anchor;
2262   int qstart, qend, nmismatches, ref_nmismatches;
2263   bool splice_querystart_p, splice_queryend_p;
2264   Splicetype_T splicetype_querystart, splicetype_queryend;
2265   double ambig_prob_querystart, ambig_prob_queryend;
2266 
2267   if (substring->queryend != substring->querylength) {
2268     /* Keep previously computed bounds for anchor */
2269     return anchor;
2270 
2271   } else if (anchor->plusp == true) {
2272     qstart = anchor->querystart;
2273     splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
2274 					    /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
2275 					    anchor->querylength,/*plusp*/true,anchor->genestrand,
2276 					    mismatch_positions_alloc,query_compress,
2277 					    anchor->chroffset,anchor->chrhigh,anchor->sensedir);
2278     /* printf("Substring_extend_anchor_queryend, plus, yields splicep %d, qend %d, prob %f\n",
2279        splice_queryend_p,qend,ambig_prob_queryend); */
2280 
2281     if (qend < 0) {
2282       return anchor;
2283 
2284     } else if (qend <= qstart) {
2285       return anchor;
2286 
2287     } else {
2288       if (anchor->start_endtype == END) {
2289 	splice_querystart_p = false;
2290       } else if (anchor->start_endtype == DON) {
2291 	splice_querystart_p = true;
2292 	splicetype_querystart = DONOR;
2293 	ambig_prob_querystart = anchor->siteD_prob;
2294       } else {
2295 	splice_querystart_p = true;
2296 	splicetype_querystart = ACCEPTOR;
2297 	ambig_prob_querystart = anchor->siteA_prob;
2298       }
2299 
2300       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2301 						      /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,anchor->genestrand);
2302       if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,/*querystart*/qstart,/*queryend*/qend,
2303 				      anchor->querylength,/*plusp*/true,anchor->genestrand,query_compress,
2304 				      anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2305 				      splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2306 				      splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2307 				      anchor->sensedir)) == NULL) {
2308 	return anchor;
2309       } else {
2310 	Substring_free(&anchor);
2311 	return new_anchor;
2312       }
2313     }
2314 
2315   } else {
2316     qend = anchor->querylength - anchor->querystart;
2317     splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
2318 					      /*univdiagonal*/anchor->left + anchor->querylength,
2319 					      /*pos3*/qend,substring->querylength,
2320 					      /*plusp*/false,anchor->genestrand,
2321 					      mismatch_positions_alloc,query_compress,
2322 					      anchor->chroffset,anchor->sensedir);
2323     /*printf("Substring_extend_anchor_queryend, minus, yields splicep %d, qstart %d, prob %f\n",
2324       splice_queryend_p,qstart,ambig_prob_queryend); */
2325 
2326     if (qstart < 0) {
2327       return anchor;
2328 
2329     } else if (qend <= qstart) {
2330       return anchor;
2331 
2332     } else {
2333       if (anchor->start_endtype == END) {
2334 	splice_querystart_p = false;
2335       } else if (anchor->start_endtype == DON) {
2336 	splice_querystart_p = true;
2337 	splicetype_querystart = ANTIDONOR;
2338 	ambig_prob_querystart = anchor->siteD_prob;
2339       } else {
2340 	splice_querystart_p = true;
2341 	splicetype_querystart = ANTIACCEPTOR;
2342 	ambig_prob_querystart = anchor->siteA_prob;
2343       }
2344 
2345       nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2346 						      /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,anchor->genestrand);
2347       if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,
2348 				      /*querystart*/anchor->querylength - qend,
2349 				      /*queryend*/anchor->querylength - qstart,
2350 				      anchor->querylength,/*plusp*/false,anchor->genestrand,query_compress,
2351 				      anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2352 				      splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2353 				      splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2354 				      anchor->sensedir)) == NULL) {
2355 	return anchor;
2356       } else {
2357 	Substring_free(&anchor);
2358 	return new_anchor;
2359       }
2360     }
2361   }
2362 }
2363 
2364 
2365 #if 0
2366 /* No longer used */
2367 /* Used when converting pairs to substrings.  Assumes nmismatches and
2368    sensedir are correct, and no trimming is needed */
2369 T
2370 Substring_new_simple (int nmismatches, Univcoord_T left, int querystart, int queryend, int querylength,
2371 		      bool plusp, int genestrand, Compress_T query_compress,
2372 		      Endtype_T start_endtype, Endtype_T end_endtype,
2373 		      Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2374 		      int sensedir) {
2375   T new;
2376 
2377 
2378   debug2(printf("\n***Entered Substring_new_simple with left %u, chr %d, genome %u..%u, query %d..%d, plusp %d\n",
2379 		left,chrnum,left+querystart-chroffset,left+queryend-chroffset,querystart,queryend,plusp));
2380 
2381   /* assert(queryend > querystart); -- Assertion does not hold.  Sometimes queryend == querystart */
2382   new = (T) MALLOC_OUT(sizeof(*new));
2383   debug2(printf("substring %p:\n",new));
2384 
2385 
2386   /* Required values if we abort and invoke Substring_free early */
2387   new->alts_ncoords = 0;
2388   new->genomic_bothdiff = (char *) NULL;
2389   new->genomic_refdiff = (char *) NULL;
2390 
2391   new->start_endtype = start_endtype;
2392   new->end_endtype = end_endtype;
2393 
2394   /* Revised later for the first and last substrings */
2395   new->querystart_pretrim = querystart;
2396   new->queryend_pretrim = queryend;
2397 
2398   new->querylength = querylength;
2399 
2400   /* These values are required by substring_trim_novel_spliceends */
2401   new->left = left;
2402   new->plusp = plusp;
2403   new->genestrand = genestrand;
2404   new->query_compress = query_compress;
2405 
2406   if (plusp == true) {
2407     new->genomicstart = left;
2408     new->genomicend = left + querylength;
2409 
2410     if (left + querylength >= chroffset + querylength) {
2411       new->querystart_chrbound = 0;
2412     } else {
2413       new->querystart_chrbound = chroffset - left;
2414     }
2415     if (left + querylength < chrhigh) {
2416       new->queryend_chrbound = querylength;
2417     } else {
2418       new->queryend_chrbound = chrhigh - left;
2419     }
2420 
2421   } else {
2422     new->genomicstart = left + querylength;
2423     new->genomicend = left;
2424 
2425     if (left + querylength < chrhigh) {
2426       new->querystart_chrbound = 0;
2427     } else {
2428       new->querystart_chrbound = (left + querylength) - chrhigh;
2429     }
2430     if (left + querylength > chroffset + querylength) {
2431       new->queryend_chrbound = querylength;
2432     } else {
2433       new->queryend_chrbound = (left + querylength) - chroffset;
2434     }
2435   }
2436 
2437   new->amb_splice_pos = 0;
2438 
2439   new->splicecoord_D = new->splicecoord_A = 0;
2440   new->siteD_pos = new->siteA_pos = new->siteN_pos = 0;
2441 
2442   new->siteD_prob = new->siteA_prob = 0.0;
2443 
2444   new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2445   new->start_amb_length = new->end_amb_length = 0;
2446   new->sensedir = sensedir;
2447 
2448   /* Assume coordinates are all within the given chromosome */
2449   debug2(printf("\n***chrnum %d (chroffset %u, chrhigh %u), plusp %d\n",chrnum,chroffset,chrhigh,plusp));
2450   new->chrnum = chrnum;
2451   new->chroffset = chroffset;
2452   new->chrhigh = chrhigh;
2453   new->chrlength = chrlength;
2454 
2455   /* mandatory_trim_querystart and mandatory_trim_queryend also set during aliasing for circular chromosomes */
2456   if (left < new->chroffset) {
2457     new->mandatory_trim_querystart = new->chroffset - left;
2458     assert(new->mandatory_trim_querystart >= 0);
2459   } else {
2460     new->mandatory_trim_querystart = 0;
2461   }
2462   if (left + querylength >= new->chrhigh) {
2463     new->mandatory_trim_queryend = (left + querylength) - new->chrhigh;
2464     assert(new->mandatory_trim_queryend >= 0);
2465   } else {
2466     new->mandatory_trim_queryend = 0;
2467   }
2468   debug2(printf("mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",new->mandatory_trim_querystart,new->mandatory_trim_queryend));
2469 
2470   new->querystart = querystart;
2471   new->queryend = queryend;
2472   debug2(printf("querystart %d, queryend %d\n",new->querystart,new->queryend));
2473 
2474   assert(new->querystart >= 0);
2475   assert(new->querystart <= querylength);
2476   assert(new->queryend >= 0);
2477   assert(new->queryend <= querylength);
2478 
2479   /* Compute coordinates */
2480   if (plusp == true) {
2481     new->alignstart_trim = left + new->querystart;
2482     new->alignend_trim = left + new->queryend;
2483 
2484     debug2(printf("left is %u\n",new->left));
2485     debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2486     debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2487     assert(new->alignstart_trim >= chroffset);
2488     assert(new->alignend_trim <= chrhigh);
2489 
2490   } else {
2491     new->alignstart_trim = left + (querylength - new->querystart);
2492     new->alignend_trim = left + (querylength - new->queryend);
2493 
2494     debug2(printf("left is %u\n",new->left));
2495     debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2496     debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2497     assert(new->alignstart_trim <= chrhigh);
2498     assert(new->alignend_trim >= chroffset);
2499   }
2500 
2501 
2502   new->nmismatches_bothdiff = nmismatches;
2503   new->nmismatches_refdiff = nmismatches;
2504   /* new->nmatches_plus_spliced_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff; */
2505 
2506   new->nmatches_plus_spliced_trims = new->nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff;
2507   new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_refdiff;
2508 
2509   new->alts_ncoords = 0;
2510   new->alts_coords = (Univcoord_T *) NULL;
2511   new->alts_knowni = (int *) NULL;
2512   new->alts_nmismatches = (int *) NULL;
2513   new->alts_ref_nmismatches = (int *) NULL;
2514   new->alts_probs = (double *) NULL;
2515   new->amb_type = END;
2516 
2517   debug2(printf("** Returning substring %p, query %d..%d, trim %d..%d, nmismatches %d\n",
2518 		new,new->querystart,new->queryend,querystart,querylength - queryend,new->nmismatches_bothdiff));
2519 
2520   assert(new->nmismatches_bothdiff >= 0);
2521 
2522   if (new->querystart >= new->queryend) {
2523     /* Can happen if multiple mismatches result in trimming down to 0 */
2524     Substring_free(&new);
2525     return (T) NULL;
2526   } else {
2527     return new;
2528   }
2529 }
2530 #endif
2531 
2532 
2533 static Univcoord_T
Univcoord_max(Univcoord_T * list,int n)2534 Univcoord_max (Univcoord_T *list, int n) {
2535   Univcoord_T m = list[0];
2536   int i;
2537 
2538   for (i = 1; i < n; i++) {
2539     if (list[i] > m) {
2540       m = list[i];
2541     }
2542   }
2543 
2544   return m;
2545 }
2546 
2547 static Univcoord_T
Univcoord_min(Univcoord_T * list,int n)2548 Univcoord_min (Univcoord_T *list, int n) {
2549   Univcoord_T m = list[0];
2550   int i;
2551 
2552   for (i = 1; i < n; i++) {
2553     if (list[i] < m) {
2554       m = list[i];
2555     }
2556   }
2557 
2558   return m;
2559 }
2560 
2561 static int
int_min(int * list,int n)2562 int_min (int *list, int n) {
2563   int m = list[0];
2564   int i;
2565 
2566   for (i = 1; i < n; i++) {
2567     if (list[i] < m) {
2568       m = list[i];
2569     }
2570   }
2571 
2572   return m;
2573 }
2574 
2575 
2576 T
Substring_new_alts_D(int querystart,int queryend,int splice_pos,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Univcoord_T * alts_coords,int * alts_knowni,int * alts_nmismatches,int * alts_ref_nmismatches,double * alts_probs,int alts_ncoords,double alts_common_prob,bool substring1p)2577 Substring_new_alts_D (int querystart, int queryend, int splice_pos, int querylength,
2578 		      bool plusp, int genestrand, Compress_T query_compress,
2579 		      Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2580 		      Univcoord_T *alts_coords, int *alts_knowni, int *alts_nmismatches,
2581 		      int *alts_ref_nmismatches, double *alts_probs,
2582 		      int alts_ncoords, double alts_common_prob, bool substring1p) {
2583   T new = (T) MALLOC_OUT(sizeof(*new));
2584 #ifdef DEBUG2
2585   int i;
2586 #endif
2587 
2588   debug2(printf("Entered Substring_new_alts_D with chrnum %d (chroffset %u, chrhigh %u), %d..%d, splice_pos %d, querylength %d, plusp %d\n",
2589 		chrnum,chroffset,chrhigh,querystart,queryend,splice_pos,querylength,plusp));
2590   debug2(printf("alts_common_prob %f\n",alts_common_prob));
2591 
2592   new->chrnum = chrnum;
2593   new->chroffset = chroffset;
2594   new->chrhigh = chrhigh;
2595   new->chrlength = chrlength;
2596 
2597   new->left = 0;
2598   if (plusp == true) {
2599     new->genomicstart = Univcoord_max(alts_coords,alts_ncoords);
2600     new->genomicend = Univcoord_min(alts_coords,alts_ncoords);
2601   } else {
2602     new->genomicstart = Univcoord_min(alts_coords,alts_ncoords);
2603     new->genomicend = Univcoord_max(alts_coords,alts_ncoords);
2604   }
2605 
2606   new->querystart_chrbound = 0;
2607   new->queryend_chrbound = querylength;
2608 
2609   new->start_endtype = END;
2610   new->end_endtype = END;
2611 
2612   new->querystart = new->querystart_pretrim = querystart;
2613   new->queryend = new->queryend_pretrim = queryend;
2614   new->querylength = querylength;
2615 
2616   new->alignstart_trim = 0;
2617   new->alignend_trim = 0;
2618 
2619   new->plusp = plusp;
2620   new->genestrand = genestrand;
2621   new->query_compress = query_compress;
2622 
2623   new->siteD_prob = 0.0;
2624   new->siteA_prob = alts_common_prob;
2625 
2626   new->nmismatches_bothdiff = int_min(alts_nmismatches,alts_ncoords);
2627   new->nmismatches_refdiff = int_min(alts_ref_nmismatches,alts_ncoords);
2628   debug2(printf("nmismatches_bothdiff due to alts_nmismatches is %d\n",new->nmismatches_bothdiff));
2629 
2630   /* Works because new->querystart == querystart and new->queryend == queryend */
2631   /* new->nmatches_plus_spliced_trims = (queryend - querystart) - new->nmismatches_bothdiff; */
2632 
2633   new->genomic_bothdiff = (char *) NULL;
2634   new->genomic_refdiff = (char *) NULL;
2635 
2636   if (substring1p == true) {
2637     debug2(printf("substring1p is true, so setting amb_lengths to be %d and %d\n",queryend,0));
2638     if (alts_common_prob >= 0.9) {
2639       /* Treat as a substring */
2640       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = queryend - new->nmismatches_bothdiff;
2641       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = queryend - new->nmismatches_refdiff;
2642       new->start_amb_length = new->end_amb_length = 0;
2643     } else {
2644       /* Treat as ambiguous splice site */
2645       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2646       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2647       new->start_amb_length = queryend - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2648       new->end_amb_length = 0;
2649     }
2650   } else {
2651     debug2(printf("substring1p is false, so setting amb_lengths to be %d and %d\n",0,querylength - querystart));
2652     if (alts_common_prob >= 0.9) {
2653       /* Treat as a substring */
2654       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_bothdiff;
2655       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_refdiff;
2656       new->start_amb_length = new->end_amb_length = 0;
2657     } else {
2658       /* Treat as ambiguous splice site */
2659       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2660       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2661       new->start_amb_length = 0;
2662       new->end_amb_length = querylength - querystart - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2663     }
2664   }
2665   new->mandatory_trim_querystart = 0;
2666   new->mandatory_trim_queryend = 0;
2667   new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2668   /* new->start_amb_length = new->end_amb_length = 0; -- Set above */
2669 
2670 #ifdef DEBUG2
2671   for (i = 0; i < alts_ncoords; i++) {
2672     printf("%u %f\n",alts_coords[i],alts_probs[i]);
2673   }
2674 #endif
2675   new->alts_ncoords = alts_ncoords;
2676   new->alts_coords = alts_coords;
2677   new->alts_knowni = alts_knowni;
2678   new->alts_nmismatches = alts_nmismatches;
2679   new->alts_ref_nmismatches = alts_ref_nmismatches;
2680   new->alts_probs = alts_probs;
2681   new->amb_splice_pos = splice_pos;
2682   new->amb_type = DON;
2683 
2684   assert(new->nmismatches_bothdiff >= 0);
2685 
2686   if (new->querystart >= new->queryend) {
2687     /* Can happen if multiple mismatches result in trimming down to 0 */
2688     Substring_free(&new);
2689     return (T) NULL;
2690   } else {
2691     return new;
2692   }
2693 }
2694 
2695 T
Substring_new_alts_A(int querystart,int queryend,int splice_pos,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Univcoord_T * alts_coords,int * alts_knowni,int * alts_nmismatches,int * alts_ref_nmismatches,double * alts_probs,int alts_ncoords,double alts_common_prob,bool substring1p)2696 Substring_new_alts_A (int querystart, int queryend, int splice_pos, int querylength,
2697 		      bool plusp, int genestrand, Compress_T query_compress,
2698 		      Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2699 		      Univcoord_T *alts_coords, int *alts_knowni, int *alts_nmismatches,
2700 		      int *alts_ref_nmismatches, double *alts_probs,
2701 		      int alts_ncoords, double alts_common_prob, bool substring1p) {
2702   T new = (T) MALLOC_OUT(sizeof(*new));
2703 #ifdef DEBUG2
2704   int i;
2705 #endif
2706 
2707   debug2(printf("Entered Substring_new_alts_A with chrnum %d (chroffset %u, chrhigh %u), %d..%d, splice_pos %d, querylength %d, plusp %d\n",
2708 		chrnum,chroffset,chrhigh,querystart,queryend,splice_pos,querylength,plusp));
2709   debug2(printf("alts_common_prob %f\n",alts_common_prob));
2710 
2711   new->chrnum = chrnum;
2712   new->chroffset = chroffset;
2713   new->chrhigh = chrhigh;
2714   new->chrlength = chrlength;
2715 
2716   new->left = 0;
2717   if (plusp == true) {
2718     new->genomicstart = Univcoord_max(alts_coords,alts_ncoords);
2719     new->genomicend = Univcoord_min(alts_coords,alts_ncoords);
2720   } else {
2721     new->genomicstart = Univcoord_min(alts_coords,alts_ncoords);
2722     new->genomicend = Univcoord_max(alts_coords,alts_ncoords);
2723   }
2724 
2725   new->querystart_chrbound = 0;
2726   new->queryend_chrbound = querylength;
2727 
2728   new->start_endtype = END;
2729   new->end_endtype = END;
2730 
2731   new->querystart = new->querystart_pretrim = querystart;
2732   new->queryend = new->queryend_pretrim = queryend;
2733   new->querylength = querylength;
2734 
2735   new->alignstart_trim = 0;
2736   new->alignend_trim = 0;
2737 
2738   new->plusp = plusp;
2739   new->genestrand = genestrand;
2740   new->query_compress = query_compress;
2741 
2742   new->siteA_prob = 0.0;
2743   new->siteD_prob = alts_common_prob;
2744 
2745   new->nmismatches_bothdiff = int_min(alts_nmismatches,alts_ncoords);
2746   new->nmismatches_refdiff = int_min(alts_ref_nmismatches,alts_ncoords);
2747   debug2(printf("nmismatches_bothdiff due to alts_nmismatches is %d\n",new->nmismatches_bothdiff));
2748 
2749   /* Works because new->querystart == querystart and new->queryend == queryend */
2750   /* new->nmatches_plus_spliced_trims = (queryend - querystart) - new->nmismatches_bothdiff; */
2751 
2752   new->genomic_bothdiff = (char *) NULL;
2753   new->genomic_refdiff = (char *) NULL;
2754 
2755   if (substring1p == true) {
2756     debug2(printf("substring1p is true, so setting trims to be %d and %d\n",querystart,0));
2757     if (alts_common_prob >= 0.9) {
2758       /* Treat as a substring */
2759       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = queryend - new->nmismatches_bothdiff;
2760       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = queryend - new->nmismatches_refdiff;
2761       new->start_amb_length = new->end_amb_length = 0;
2762     } else {
2763       /* Treat as ambiguous splice site */
2764       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2765       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2766       new->start_amb_length = queryend - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2767       new->end_amb_length = 0;
2768     }
2769   } else {
2770     debug2(printf("substring1p is false, so setting trims to be %d and %d\n",0,querylength - queryend));
2771     new->nmatches_to_trims = querylength - querystart - new->nmismatches_bothdiff;
2772     new->ref_nmatches_to_trims = querylength - querystart - new->nmismatches_refdiff;
2773     if (alts_common_prob >= 0.9) {
2774       /* Treat as a substring */
2775       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_bothdiff;
2776       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_refdiff;
2777       new->start_amb_length = new->end_amb_length = 0;
2778     } else {
2779       /* Treat as ambiguous splice site */
2780       new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2781       new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2782       new->start_amb_length = 0;
2783       new->end_amb_length = querylength - querystart - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2784     }
2785   }
2786   new->mandatory_trim_querystart = 0;
2787   new->mandatory_trim_queryend = 0;
2788   new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2789 
2790 #ifdef DEBUG2
2791   for (i = 0; i < alts_ncoords; i++) {
2792     printf("%u %f\n",alts_coords[i],alts_probs[i]);
2793   }
2794 #endif
2795 
2796   new->alts_ncoords = alts_ncoords;
2797   new->alts_coords = alts_coords;
2798   new->alts_knowni = alts_knowni;
2799   new->alts_nmismatches = alts_nmismatches;
2800   new->alts_ref_nmismatches = alts_ref_nmismatches;
2801   new->alts_probs = alts_probs;
2802   new->amb_splice_pos = splice_pos;
2803   new->amb_type = ACC;
2804 
2805   assert(new->nmismatches_bothdiff >= 0);
2806 
2807   if (new->querystart >= new->queryend) {
2808     /* Can happen if multiple mismatches result in trimming down to 0 */
2809     Substring_free(&new);
2810     return (T) NULL;
2811   } else {
2812     return new;
2813   }
2814 }
2815 
2816 
2817 Univcoord_T
Substring_set_alt(double * donor_prob,double * acceptor_prob,Univcoord_T * genomicstart,Univcoord_T * genomicend,T this,int bingoi)2818 Substring_set_alt (double *donor_prob, double *acceptor_prob, Univcoord_T *genomicstart, Univcoord_T *genomicend,
2819 		   T this, int bingoi) {
2820 
2821 #ifdef DEBUG10
2822   printf("Entered Substring_set_alt for %d..%d.  plusp %d, ",this->querystart,this->queryend,this->plusp);
2823   if (this->amb_type == DON) {
2824     printf("type DON\n");
2825   } else {
2826     printf("type ACC\n");
2827   }
2828 #endif
2829 
2830   assert(this->alts_probs != NULL);
2831 
2832   this->nmismatches_bothdiff = this->alts_nmismatches[bingoi];
2833   this->nmismatches_refdiff = this->alts_ref_nmismatches[bingoi];
2834 
2835   if (this->plusp == true) {
2836     if (this->amb_type == DON) {
2837       *acceptor_prob = this->siteA_prob;
2838       *donor_prob = this->siteD_prob = this->alts_probs[bingoi];
2839       this->splicecoord_D = this->alts_coords[bingoi];
2840       this->splicesitesD_knowni = this->alts_knowni[bingoi];
2841       this->left = this->splicecoord_D - this->amb_splice_pos;
2842     } else {
2843       *donor_prob = this->siteD_prob;
2844       *acceptor_prob = this->siteA_prob = this->alts_probs[bingoi];
2845       this->splicecoord_A = this->alts_coords[bingoi];
2846       this->splicesitesA_knowni = this->alts_knowni[bingoi];
2847       this->left = this->splicecoord_A - this->amb_splice_pos;
2848     }
2849 
2850     debug10(printf("left %u [%u]\n",this->left,this->left - this->chroffset));
2851     *genomicstart = this->genomicstart = this->left;
2852     *genomicend = this->genomicend = this->left + this->querylength;
2853 
2854     debug10(printf("genomicstart %u [%u]\n",*genomicstart,*genomicstart - this->chroffset));
2855     debug10(printf("genomicend %u [%u]\n",*genomicend,*genomicend - this->chroffset));
2856 
2857     this->alignstart_trim = this->genomicstart + this->querystart;
2858     this->alignend_trim =  this->genomicstart + this->queryend;
2859     /* this->nmatches_plus_spliced_trims = (this->alignend_trim - this->alignstart_trim) - this->nmismatches_bothdiff; */
2860 
2861     debug10(printf("querypos %d..%d, alignstart is %u (%u), alignend is %u (%u), genomicstart is %u, genomicend is %u\n",
2862 		  this->querystart,this->queryend,this->alignstart_trim,this->alignstart_trim - this->chroffset,
2863 		  this->alignend_trim,this->alignend_trim - this->chroffset,this->genomicstart,this->genomicend));
2864 
2865   } else {
2866     if (this->amb_type == DON) {
2867       *acceptor_prob = this->siteA_prob;
2868       *donor_prob = this->siteD_prob = this->alts_probs[bingoi];
2869       this->splicecoord_D = this->alts_coords[bingoi];
2870       this->splicesitesD_knowni = this->alts_knowni[bingoi];
2871       /* this->left = this->splicecoord_D - (this->querylength - this->amb_splice_pos); */
2872       this->left = this->splicecoord_D - this->amb_splice_pos; /* amb_splice_pos is based on left */
2873       debug10(printf("left %u [%u] = %u - (%d - %d)\n",
2874 		     this->left,this->left - this->chroffset,this->splicecoord_D - this->chroffset,
2875 		     this->querylength,this->amb_splice_pos));
2876     } else {
2877       *donor_prob = this->siteD_prob;
2878       *acceptor_prob = this->siteA_prob = this->alts_probs[bingoi];
2879       this->splicecoord_A = this->alts_coords[bingoi];
2880       this->splicesitesA_knowni = this->alts_knowni[bingoi];
2881       /* this->left = this->splicecoord_A - (this->querylength - this->amb_splice_pos); */
2882       this->left = this->splicecoord_A - this->amb_splice_pos; /* amb_splice_pos is based on left */
2883       debug10(printf("left %u [%u] = %u - (%d - %d)\n",
2884 		     this->left,this->left - this->chroffset,this->splicecoord_A - this->chroffset,
2885 		     this->querylength,this->amb_splice_pos));
2886     }
2887 
2888     *genomicend = this->genomicend = this->left;
2889     *genomicstart = this->genomicstart = this->left + this->querylength;
2890     debug10(printf("genomicstart %u [%u]\n",*genomicstart,*genomicstart - this->chroffset));
2891     debug10(printf("genomicend %u [%u]\n",*genomicend,*genomicend - this->chroffset));
2892 
2893     this->alignend_trim = this->genomicstart - this->queryend;
2894     this->alignstart_trim = this->genomicstart - this->querystart;
2895     /* this->nmatches_plus_spliced_trims = (this->alignstart_trim - this->alignend_trim) - this->nmismatches_bothdiff; */
2896 
2897     debug10(printf("querypos %d..%d, alignstart is %u (%u), alignend is %u (%u), genomicstart is %u, genomicend is %u\n",
2898 		  this->querystart,this->queryend,this->alignstart_trim,this->alignstart_trim - this->chroffset,
2899 		  this->alignend_trim,this->alignend_trim - this->chroffset,this->genomicstart,this->genomicend));
2900   }
2901 
2902   this->start_amb_length = this->end_amb_length = 0;
2903   this->amb_type = END;
2904 
2905   this->nmatches_plus_spliced_trims = this->nmatches_to_trims = this->queryend - this->querystart - this->nmismatches_bothdiff;
2906   this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims = this->queryend - this->querystart - this->nmismatches_refdiff;
2907 
2908   this->alts_ncoords = 0;	/* To signify that alts have been resolved */
2909   FREE_OUT(this->alts_coords);
2910   FREE_OUT(this->alts_knowni);
2911   FREE_OUT(this->alts_nmismatches);
2912   FREE_OUT(this->alts_ref_nmismatches);
2913   FREE_OUT(this->alts_probs);
2914 
2915   return this->left;
2916 }
2917 
2918 
2919 float
Substring_compute_mapq(T this,char * quality_string)2920 Substring_compute_mapq (T this, char *quality_string) {
2921   int mapq_start, mapq_end;
2922   float best_loglik, loglik;
2923   Univcoord_T left, splicecoord;
2924   int i;
2925 
2926   /* mapq */
2927   mapq_start = this->querystart;
2928   mapq_end = this->queryend;
2929 
2930   /* It appears from simulated reads that it is better not to trim in
2931      computing MAPQ.  The correct mapping then tends to be selected
2932      with a higher MAPQ score. */
2933   /* But if all ends are terminals, then terminal parts should not be
2934      included in MAPQ scoring */
2935 
2936   if (this->nmismatches_bothdiff == 0) {  /* was this->exactp == true */
2937     /* this->mapq_loglik = MAPQ_loglik_exact(quality_string,0,querylength); */
2938     this->mapq_loglik = 0.0;
2939 
2940   } else if (this->alts_ncoords > 0) {
2941     if (this->plusp == true) {
2942       splicecoord = this->alts_coords[0];
2943       left = splicecoord - this->amb_splice_pos;
2944       best_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2945 				this->query_compress,left,mapq_start,mapq_end,
2946 				this->querylength,quality_string,/*plusp*/true,this->genestrand);
2947       for (i = 1; i < this->alts_ncoords; i++) {
2948 	splicecoord = this->alts_coords[i];
2949 	left = splicecoord - this->amb_splice_pos;
2950 	if ((loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2951 				  this->query_compress,left,mapq_start,mapq_end,
2952 				  this->querylength,quality_string,/*plusp*/true,this->genestrand)) > best_loglik) {
2953 	  best_loglik = loglik;
2954 	}
2955       }
2956     } else {
2957       splicecoord = this->alts_coords[0];
2958       left = splicecoord - (this->querylength - this->amb_splice_pos);
2959       best_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2960 				this->query_compress,left,mapq_start,mapq_end,
2961 				this->querylength,quality_string,/*plusp*/false,this->genestrand);
2962       for (i = 1; i < this->alts_ncoords; i++) {
2963 	splicecoord = this->alts_coords[i];
2964 	left = splicecoord - (this->querylength - this->amb_splice_pos);
2965 	if ((loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2966 				  this->query_compress,left,mapq_start,mapq_end,
2967 				  this->querylength,quality_string,/*plusp*/false,this->genestrand)) > best_loglik) {
2968 	  best_loglik = loglik;
2969 	}
2970       }
2971     }
2972 
2973     this->mapq_loglik = best_loglik;
2974 
2975   } else {
2976     debug2(printf("trim_querystart %d, trim_queryend %d, mapq_start = %d, mapq_end = %d\n",
2977 		  this->querystart,this->querylength - this->queryend,mapq_start,mapq_end));
2978     this->mapq_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2979 				    this->query_compress,this->left,mapq_start,mapq_end,
2980 				    this->querylength,quality_string,this->plusp,this->genestrand);
2981     debug2(printf("Substring %u..%u gets loglik %f\n",this->genomicstart - this->chroffset,
2982 		  this->genomicend - this->chroffset,this->mapq_loglik));
2983   }
2984 
2985   return this->mapq_loglik;
2986 }
2987 
2988 
2989 /* Sets genomic_bothdiff, genomic_refdiff, and nmismatches_refdiff */
2990 int
Substring_display_prep(T this,char * queryuc_ptr,int querylength,int extraleft,int extraright,Genome_T genome)2991 Substring_display_prep (T this, char *queryuc_ptr, int querylength,
2992 			int extraleft, int extraright, Genome_T genome) {
2993 
2994 #if defined(HAVE_ALLOCA)
2995   char *genomic_diff, *gbuffer;
2996 #else
2997   char *genomic_diff, *gbuffer;
2998 #endif
2999 
3000   /* printf("left:%u nmismatches:%d extraleft:%d extraright:%d\n",
3001      this->left - this->chroffset,this->nmismatches_bothdiff,extraleft,extraright); */
3002 
3003   if (output_type == M8_OUTPUT && snps_iit == NULL && maskedp == false) {
3004     /* Print procedures do not refer to genomic sequence */
3005     this->genomic_refdiff = this->genomic_bothdiff = (char *) NULL;
3006     this->nmismatches_refdiff = this->nmismatches_bothdiff;
3007 
3008   } else if (this->nmismatches_bothdiff == 0 && extraleft == 0 && extraright == 0 &&
3009 	     snps_iit == NULL && maskedp == false) {
3010     /* Print procedures will refer to query sequence */
3011     this->genomic_refdiff = this->genomic_bothdiff = (char *) NULL;
3012     this->nmismatches_refdiff = 0;
3013 
3014   } else if (this->plusp == true) {
3015     /* Used to be this->genomiclength, but doesn't work for large insertions */
3016 #if defined(HAVE_ALLOCA)
3017     gbuffer = (char *) ALLOCA((querylength+1) * sizeof(char));
3018 #else
3019     gbuffer = (char *) MALLOC((querylength+1) * sizeof(char));
3020 #endif
3021 
3022     debug1(printf("Obtaining genomic_diff from left %u (%u) for querylength %d\n",
3023 		  this->left,this->left - this->chroffset,querylength));
3024     Genome_fill_buffer_simple(genome,this->left,querylength,gbuffer);
3025     genomic_diff = gbuffer;
3026     debug1(printf("%s\n",genomic_diff));
3027 
3028     if (maskedp == true) {
3029       Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,
3030 				 this->left,/*pos5*/this->querystart,/*pos3*/this->queryend,
3031 				 /*plusp*/true,this->genestrand);
3032     } else {
3033       Genome_mark_mismatches(genomic_diff,querylength,this->query_compress,
3034 			     this->left,/*pos5*/this->querystart,/*pos3*/this->queryend,
3035 			     /*plusp*/true,this->genestrand);
3036     }
3037 
3038     /* Need to perform embellish to put dashes in */
3039     this->genomic_bothdiff =
3040       embellish_genomic(this->genomic_bothdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3041 			querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3042 			extraleft,extraright,/*plusp*/true,this->genestrand);
3043 
3044     if (snps_iit == NULL && maskedp == false) {
3045       this->genomic_refdiff = this->genomic_bothdiff;
3046       this->nmismatches_refdiff = this->nmismatches_bothdiff;
3047 
3048     } else {
3049       this->nmismatches_refdiff =
3050 	Genome_count_mismatches_substring_ref(genomebits,this->query_compress,this->left,
3051 					      /*pos5*/this->alignstart_trim - this->left,
3052 					      /*pos3*/this->alignend_trim - this->left,
3053 					      /*plusp*/true,this->genestrand);
3054 
3055       Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,this->left,
3056 				 /*pos5*/this->querystart,/*pos3*/this->queryend,
3057 				 /*plusp*/true,this->genestrand);
3058       if (output_type == STD_OUTPUT) {
3059 	this->genomic_refdiff =
3060 	  embellish_genomic(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3061 			    querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3062 			    extraleft,extraright,/*plusp*/true,this->genestrand);
3063       }
3064     }
3065 
3066     if (output_type == SAM_OUTPUT) {
3067       this->genomic_refdiff =
3068 	embellish_genomic_sam(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3069 			      querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3070 			      extraleft,extraright,/*plusp*/true,this->genestrand);
3071     }
3072 
3073 #if defined(HAVE_ALLOCA)
3074     FREEA(gbuffer);
3075 #else
3076     FREE(gbuffer);
3077 #endif
3078 
3079   } else {
3080     /* Used to be this->genomiclength, but doesn't work for large insertions */
3081 #if defined(HAVE_ALLOCA)
3082     gbuffer = (char *) ALLOCA((querylength+1) * sizeof(char));
3083 #else
3084     gbuffer = (char *) MALLOC((querylength+1) * sizeof(char));
3085 #endif
3086 
3087     debug1(printf("Obtaining genomic_diff from left %u (%u) for querylength %d, and complemented\n",
3088 		  this->left,this->left - this->chroffset,querylength));
3089     Genome_fill_buffer_simple(genome,this->left,querylength,gbuffer);
3090     genomic_diff = make_complement_inplace(gbuffer,querylength);
3091     debug1(printf("%s\n",genomic_diff));
3092 
3093     if (maskedp == true) {
3094       Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,
3095 				 this->left,/*pos5*/querylength - this->queryend,
3096 				 /*pos3*/querylength - this->querystart,
3097 				 /*plusp*/false,this->genestrand);
3098     } else {
3099       Genome_mark_mismatches(genomic_diff,querylength,this->query_compress,
3100 			     this->left,/*pos5*/querylength - this->queryend,
3101 			     /*pos3*/querylength - this->querystart,
3102 			     /*plusp*/false,this->genestrand);
3103     }
3104 
3105     /* Need to perform embellish to put dashes in */
3106     this->genomic_bothdiff =
3107       embellish_genomic(this->genomic_bothdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3108 			querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3109 			extraleft,extraright,/*plusp*/false,this->genestrand);
3110 
3111     if (snps_iit == NULL && maskedp == false) {
3112       this->genomic_refdiff = this->genomic_bothdiff;
3113       this->nmismatches_refdiff = this->nmismatches_bothdiff;
3114 
3115     } else {
3116       this->nmismatches_refdiff =
3117 	Genome_count_mismatches_substring_ref(genomebits,this->query_compress,this->left,
3118 					      /*pos5*/this->alignend_trim - this->left,
3119 					      /*pos3*/this->alignstart_trim - this->left,/*plusp*/false,
3120 					      this->genestrand);
3121 
3122       Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,this->left,
3123 				 /*pos5*/querylength - this->queryend,
3124 				 /*pos3*/querylength - this->querystart,
3125 				 /*plusp*/false,this->genestrand);
3126 
3127       if (output_type == STD_OUTPUT) {
3128 	this->genomic_refdiff =
3129 	  embellish_genomic(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3130 			    querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3131 			    extraleft,extraright,/*plusp*/false,this->genestrand);
3132       }
3133     }
3134 
3135     if (output_type == SAM_OUTPUT) {
3136       this->genomic_refdiff =
3137 	embellish_genomic_sam(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3138 			      querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3139 			      extraleft,extraright,/*plusp*/false,this->genestrand);
3140     }
3141 
3142 #if defined(HAVE_ALLOCA)
3143     FREEA(gbuffer);
3144 #else
3145     FREE(gbuffer);
3146 #endif
3147   }
3148 
3149   return this->nmismatches_refdiff;
3150 }
3151 
3152 
3153 char *
Substring_genomic_sequence(int * seqlength,T this,Genome_T genome)3154 Substring_genomic_sequence (int *seqlength, T this, Genome_T genome) {
3155   char *gbuffer;
3156 
3157   *seqlength = this->queryend - this->querystart;
3158   gbuffer = (char *) MALLOC((*seqlength+1) * sizeof(char));
3159   if (this->plusp == true) {
3160     Genome_fill_buffer_simple(genome,this->left + this->querystart,*seqlength,gbuffer);
3161     return gbuffer;
3162   } else {
3163     Genome_fill_buffer_simple(genome,this->left + (this->querylength - this->queryend),*seqlength,gbuffer);
3164     return make_complement_inplace(gbuffer,*seqlength);
3165   }
3166 }
3167 
3168 
3169 Univcoord_T
Substring_left(T this)3170 Substring_left (T this) {
3171   return this->left;
3172 }
3173 
3174 
3175 Univcoord_T
Substring_splicecoord_D(T this)3176 Substring_splicecoord_D (T this) {
3177   return this->splicecoord_D;
3178 }
3179 
3180 Univcoord_T
Substring_splicecoord_A(T this)3181 Substring_splicecoord_A (T this) {
3182   return this->splicecoord_A;
3183 }
3184 
3185 
3186 /* Called only by samprint, for donor or acceptor substrings */
3187 char
Substring_chimera_strand(T this)3188 Substring_chimera_strand (T this) {
3189   int sensedir;
3190 
3191   if ((sensedir = this->sensedir) != SENSE_ANTI) {
3192     if (this->plusp == true) {
3193       return '+';
3194     } else {
3195       return '-';
3196     }
3197   } else {
3198     if (this->plusp == true) {
3199       return '-';
3200     } else {
3201       return '+';
3202     }
3203   }
3204 }
3205 
3206 
3207 /* Called only by samprint */
3208 Chrpos_T
Substring_chr_splicecoord_D(T this,char donor_strand)3209 Substring_chr_splicecoord_D (T this, char donor_strand) {
3210   if (donor_strand == '+') {
3211     return (Chrpos_T) (this->splicecoord_D - this->chroffset);
3212   } else if (donor_strand == '-') {
3213     return (Chrpos_T) (this->splicecoord_D - this->chroffset + 1);
3214   } else {
3215     abort();
3216   }
3217 }
3218 
3219 /* Called only by samprint */
3220 Chrpos_T
Substring_chr_splicecoord_A(T this,char acceptor_strand)3221 Substring_chr_splicecoord_A (T this, char acceptor_strand) {
3222   if (acceptor_strand == '+') {
3223     return (Chrpos_T) (this->splicecoord_A - this->chroffset + 1);
3224   } else if (acceptor_strand == '-') {
3225     return (Chrpos_T) (this->splicecoord_A - this->chroffset);
3226   } else {
3227     abort();
3228   }
3229 }
3230 
3231 int
Substring_splicesitesD_knowni(T this)3232 Substring_splicesitesD_knowni (T this) {
3233   return this->splicesitesD_knowni;
3234 }
3235 
3236 int
Substring_splicesitesA_knowni(T this)3237 Substring_splicesitesA_knowni (T this) {
3238   return this->splicesitesA_knowni;
3239 }
3240 
3241 bool
Substring_plusp(T this)3242 Substring_plusp (T this) {
3243   return this->plusp;
3244 }
3245 
3246 int
Substring_sensedir(T this)3247 Substring_sensedir (T this) {
3248   return this->sensedir;
3249 }
3250 
3251 int
Substring_genestrand(T this)3252 Substring_genestrand (T this) {
3253   return this->genestrand;
3254 }
3255 
3256 char *
Substring_genomic_bothdiff(T this)3257 Substring_genomic_bothdiff (T this) {
3258   return this->genomic_bothdiff;
3259 }
3260 
3261 char *
Substring_genomic_refdiff(T this)3262 Substring_genomic_refdiff (T this) {
3263   return this->genomic_refdiff;
3264 }
3265 
3266 int
Substring_nmismatches_bothdiff(T this)3267 Substring_nmismatches_bothdiff (T this) {
3268   return this->nmismatches_bothdiff;
3269 }
3270 
3271 int
Substring_nmismatches_refdiff(T this)3272 Substring_nmismatches_refdiff (T this) {
3273   return this->nmismatches_refdiff;
3274 }
3275 
3276 int
Substring_nmatches_to_trims(T this)3277 Substring_nmatches_to_trims (T this) {
3278 #if 0
3279   if (this->alts_ncoords > 0) {
3280     /* The entire substring is "trimmed" */
3281     return 0;
3282   } else {
3283     return this->queryend - this->querystart - this->nmismatches_bothdiff;
3284   }
3285 #else
3286   return this->nmatches_to_trims;
3287 #endif
3288 }
3289 
3290 int
Substring_ref_nmatches_to_trims(T this)3291 Substring_ref_nmatches_to_trims (T this) {
3292   return this->ref_nmatches_to_trims;
3293 }
3294 
3295 /* nmatches_to_trims plus amb_length */
3296 int
Substring_nmatches(T this)3297 Substring_nmatches (T this) {
3298 #if 0
3299   int amb_length;
3300 
3301   if (this->alts_ncoords > 0) {
3302     return this->start_amb_length + this->end_amb_length;
3303   } else {
3304     amb_length = 0;
3305     if (this->trim_querystart_splicep == true) {
3306       amb_length += this->querystart;
3307     }
3308     if (this->trim_queryend_splicep == true) {
3309       amb_length += this->querylength - this->queryend;
3310     }
3311     return amb_length + /*nmatches_to_trims*/ (this->queryend - this->querystart - this->nmismatches_bothdiff);
3312   }
3313 #else
3314   return this->nmatches_plus_spliced_trims;
3315 #endif
3316 }
3317 
3318 int
Substring_ref_nmatches(T this)3319 Substring_ref_nmatches (T this) {
3320 #if 0
3321   int amb_length;
3322 
3323   if (this->alts_ncoords > 0) {
3324     return this->start_amb_length + this->end_amb_length;
3325   } else {
3326     amb_length = 0;
3327     if (this->trim_querystart_splicep == true) {
3328       amb_length += this->querystart;
3329     }
3330     if (this->trim_queryend_splicep == true) {
3331       amb_length += this->querylength - this->queryend;
3332     }
3333     return amb_length + /*nmatches_to_trims*/ (this->queryend - this->querystart - this->nmismatches_bothdiff);
3334   }
3335 #else
3336   return this->ref_nmatches_plus_spliced_trims;
3337 #endif
3338 }
3339 
3340 
3341 Endtype_T
Substring_start_endtype(T this)3342 Substring_start_endtype (T this) {
3343   return this->start_endtype;
3344 }
3345 
3346 Endtype_T
Substring_end_endtype(T this)3347 Substring_end_endtype (T this) {
3348   return this->end_endtype;
3349 }
3350 
3351 float
Substring_mapq_loglik(T this)3352 Substring_mapq_loglik (T this) {
3353   return this->mapq_loglik;
3354 }
3355 
3356 int
Substring_trim_querystart(T this)3357 Substring_trim_querystart (T this) {
3358   return this->querystart;
3359 }
3360 
3361 int
Substring_trim_queryend(T this)3362 Substring_trim_queryend (T this) {
3363   return this->querylength - this->queryend;
3364 }
3365 
3366 bool
Substring_trim_querystart_splicep(T this)3367 Substring_trim_querystart_splicep (T this) {
3368   return this->trim_querystart_splicep;
3369 }
3370 
3371 bool
Substring_trim_queryend_splicep(T this)3372 Substring_trim_queryend_splicep (T this) {
3373   return this->trim_queryend_splicep;
3374 }
3375 
3376 int
Substring_mandatory_trim_querystart(T this)3377 Substring_mandatory_trim_querystart (T this) {
3378   return this->mandatory_trim_querystart;
3379 }
3380 
3381 int
Substring_mandatory_trim_queryend(T this)3382 Substring_mandatory_trim_queryend (T this) {
3383   return this->mandatory_trim_queryend;
3384 }
3385 
3386 
3387 int
Substring_querystart(T this)3388 Substring_querystart (T this) {
3389   return this->querystart;
3390 }
3391 
3392 int
Substring_querystart_pretrim(T this)3393 Substring_querystart_pretrim (T this) {
3394   return this->querystart_pretrim;
3395 }
3396 
3397 int
Substring_queryend(T this)3398 Substring_queryend (T this) {
3399   return this->queryend;
3400 }
3401 
3402 int
Substring_querylength(T this)3403 Substring_querylength (T this) {
3404   return this->querylength;
3405 }
3406 
3407 int
Substring_querystart_chrbound(T this)3408 Substring_querystart_chrbound (T this) {
3409   return this->querystart_chrbound;
3410 }
3411 
3412 int
Substring_queryend_chrbound(T this)3413 Substring_queryend_chrbound (T this) {
3414   return this->queryend_chrbound;
3415 }
3416 
3417 
3418 int
Substring_match_length(T this)3419 Substring_match_length (T this) {
3420   if (this == NULL) {
3421     return 0;
3422   } else {
3423     return this->queryend - this->querystart;
3424   }
3425 }
3426 
3427 int
Substring_match_length_pretrim(T this)3428 Substring_match_length_pretrim (T this) {
3429   if (this == NULL) {
3430     return 0;
3431   } else {
3432     return this->queryend_pretrim - this->querystart_pretrim;
3433   }
3434 }
3435 
3436 
3437 /* Mapped and unmapped */
3438 int
Substring_amb_length(T this)3439 Substring_amb_length (T this) {
3440   return this->start_amb_length + this->end_amb_length;
3441 }
3442 
3443 
3444 /* Mapped and unmapped */
3445 int
Substring_start_amb_length(T this)3446 Substring_start_amb_length (T this) {
3447   return this->start_amb_length;
3448 }
3449 
3450 
3451 /* Mapped and unmapped */
3452 int
Substring_end_amb_length(T this)3453 Substring_end_amb_length (T this) {
3454   return this->end_amb_length;
3455 }
3456 
3457 
3458 bool
Substring_equal(T substring1,T substring2)3459 Substring_equal (T substring1, T substring2) {
3460   if (substring1->genomicstart != substring2->genomicstart) {
3461     return false;
3462   } else if (substring1->genomicend != substring2->genomicend) {
3463     return false;
3464   } else if (substring1->alignstart_trim != substring2->alignstart_trim) {
3465     return false;
3466   } else if (substring1->alignend_trim != substring2->alignend_trim) {
3467     return false;
3468   } else if (substring1->start_amb_length != substring2->start_amb_length) {
3469     return false;
3470   } else if (substring1->end_amb_length != substring2->end_amb_length) {
3471     return false;
3472   } else {
3473     return true;
3474   }
3475 }
3476 
3477 
3478 Chrpos_T
Substring_genomic_alignment_length(T this)3479 Substring_genomic_alignment_length (T this) {
3480   /* Don't check for alts, as long as we return 0 */
3481   /* assert(this->alts_ncoords == 0); */
3482 
3483   if (this == NULL) {
3484     return (Chrpos_T) 0;
3485   } else if (this->alts_ncoords > 0) {
3486     return (Chrpos_T) 0;
3487   } else if (this->alignend_trim > this->alignstart_trim) {
3488     return this->alignend_trim - this->alignstart_trim;
3489   } else {
3490     return this->alignstart_trim - this->alignend_trim;
3491   }
3492 }
3493 
3494 
3495 Chrnum_T
Substring_chrnum(T this)3496 Substring_chrnum (T this) {
3497   return this->chrnum;
3498 }
3499 
3500 Univcoord_T
Substring_chroffset(T this)3501 Substring_chroffset (T this) {
3502   return this->chroffset;
3503 }
3504 
3505 Univcoord_T
Substring_chrhigh(T this)3506 Substring_chrhigh (T this) {
3507   return this->chrhigh;
3508 }
3509 
3510 Chrpos_T
Substring_chrlength(T this)3511 Substring_chrlength (T this) {
3512   return this->chrlength;
3513 }
3514 
3515 Chrpos_T
Substring_chrpos_low(T this)3516 Substring_chrpos_low (T this) {
3517   /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3518   assert(this->alts_ncoords == 0);
3519 
3520   if (this->plusp == true) {
3521     return this->alignstart_trim - this->chroffset;
3522   } else {
3523     return this->alignend_trim - this->chroffset;
3524   }
3525 }
3526 
3527 Chrpos_T
Substring_chrpos_high(T this)3528 Substring_chrpos_high (T this) {
3529   /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3530   assert(this->alts_ncoords == 0);
3531 
3532   if (this->plusp == true) {
3533     return this->alignend_trim - this->chroffset;
3534   } else {
3535     return this->alignstart_trim - this->chroffset;
3536   }
3537 }
3538 
3539 
3540 Chrpos_T
Substring_alignstart_trim_chr(T this)3541 Substring_alignstart_trim_chr (T this) {
3542   /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3543   assert(this->alts_ncoords == 0);
3544 
3545   /* Previously had provision for alts_coords */
3546   return this->alignstart_trim - this->chroffset;
3547 }
3548 
3549 Chrpos_T
Substring_alignend_trim_chr(T this)3550 Substring_alignend_trim_chr (T this) {
3551   /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3552   assert(this->alts_ncoords == 0);
3553 
3554   /* Previously had provision for alts_coords */
3555   return this->alignend_trim - this->chroffset;
3556 }
3557 
3558 Univcoord_T
Substring_alignstart_trim(T this)3559 Substring_alignstart_trim (T this) {
3560   return this->alignstart_trim;
3561 }
3562 
3563 Univcoord_T
Substring_alignend_trim(T this)3564 Substring_alignend_trim (T this) {
3565   return this->alignend_trim;
3566 }
3567 
3568 
3569 Univcoord_T
Substring_left_genomicseg(T this)3570 Substring_left_genomicseg (T this) {
3571   return this->left;
3572 }
3573 
3574 Univcoord_T
Substring_genomicstart(T this)3575 Substring_genomicstart (T this) {
3576   return this->genomicstart;
3577 }
3578 
3579 Univcoord_T
Substring_genomicend(T this)3580 Substring_genomicend (T this) {
3581   return this->genomicend;
3582 }
3583 
3584 double
Substring_amb_prob(T this)3585 Substring_amb_prob (T this) {
3586   if (this->amb_type == DON) {
3587     return this->siteD_prob;
3588   } else if (this->amb_type == ACC) {
3589     return this->siteA_prob;
3590   } else {
3591     return 0.0;
3592   }
3593 }
3594 
3595 
3596 double
Substring_amb_donor_prob(T this)3597 Substring_amb_donor_prob (T this) {
3598   double max;
3599   int i;
3600 
3601   if (this->alts_ncoords == 0) {
3602     return this->siteD_prob;
3603   } else if (this->amb_type == DON) {
3604     max = this->alts_probs[0];
3605     for (i = 1; i < this->alts_ncoords; i++) {
3606       if (this->alts_probs[i] > max) {
3607 	max = this->alts_probs[i];
3608       }
3609     }
3610     return max;
3611   } else {
3612     return this->siteD_prob;
3613   }
3614 }
3615 
3616 double
Substring_amb_acceptor_prob(T this)3617 Substring_amb_acceptor_prob (T this) {
3618   double max;
3619   int i;
3620 
3621   if (this->alts_ncoords == 0) {
3622     return this->siteA_prob;
3623   } else if (this->amb_type == ACC) {
3624     max = this->alts_probs[0];
3625     for (i = 1; i < this->alts_ncoords; i++) {
3626       if (this->alts_probs[i] > max) {
3627 	max = this->alts_probs[i];
3628       }
3629     }
3630     return max;
3631   } else {
3632     return this->siteA_prob;
3633   }
3634 }
3635 
3636 
3637 
3638 double
Substring_siteD_prob(T this)3639 Substring_siteD_prob (T this) {
3640   return this->siteD_prob;
3641 }
3642 
3643 double
Substring_siteA_prob(T this)3644 Substring_siteA_prob (T this) {
3645   return this->siteA_prob;
3646 }
3647 
3648 int
Substring_siteD_pos(T this)3649 Substring_siteD_pos (T this) {
3650   return this->siteD_pos;
3651 }
3652 
3653 int
Substring_siteA_pos(T this)3654 Substring_siteA_pos (T this) {
3655   return this->siteA_pos;
3656 }
3657 
3658 int
Substring_siteN_pos(T this)3659 Substring_siteN_pos (T this) {
3660   return this->siteN_pos;
3661 }
3662 
3663 
3664 bool
Substring_ambiguous_p(T this)3665 Substring_ambiguous_p (T this) {
3666   if (this->alts_ncoords > 0) {
3667     return false;
3668   } else if (this->amb_type == END) {
3669     return false;
3670   } else {
3671     return true;
3672   }
3673 }
3674 
3675 bool
Substring_has_alts_p(T this)3676 Substring_has_alts_p (T this) {
3677   if (this->alts_ncoords > 0) {
3678     return true;
3679   } else {
3680     return false;
3681   }
3682 }
3683 
3684 int
Substring_alts_ncoords(T this)3685 Substring_alts_ncoords (T this) {
3686   return this->alts_ncoords;
3687 }
3688 
3689 Univcoord_T *
Substring_alts_coords(T this)3690 Substring_alts_coords (T this) {
3691   return this->alts_coords;
3692 }
3693 
3694 double
Substring_alts_common_prob(T this)3695 Substring_alts_common_prob (T this) {
3696   if (this->amb_type == DON) {
3697     return this->siteA_prob;
3698   } else if (this->amb_type == ACC) {
3699     return this->siteD_prob;
3700   } else {
3701     return 0.0;
3702   }
3703 }
3704 
3705 /* Debugging procedure */
3706 void
Substring_print_alts_coords(T this)3707 Substring_print_alts_coords (T this) {
3708   int i;
3709 
3710 #ifdef LARGE_GENOMES
3711   for (i = 0; i < this->alts_ncoords; i++) {
3712     printf(" %llu",this->alts_coords[i] - this->chroffset);
3713   }
3714 #else
3715   for (i = 0; i < this->alts_ncoords; i++) {
3716     printf(" %u",this->alts_coords[i] - this->chroffset);
3717   }
3718 #endif
3719 
3720   return;
3721 }
3722 
3723 
3724 int *
Substring_alts_nmismatches(T this)3725 Substring_alts_nmismatches (T this) {
3726   return this->alts_nmismatches;
3727 }
3728 
3729 int *
Substring_alts_ref_nmismatches(T this)3730 Substring_alts_ref_nmismatches (T this) {
3731   return this->alts_ref_nmismatches;
3732 }
3733 
3734 /* circularpos measures query distance from SAM chrlow to origin */
3735 int
Substring_circularpos(T this)3736 Substring_circularpos (T this) {
3737   if (this == NULL) {
3738     return -1;
3739 
3740   } else if (this->plusp == true) {
3741     debug12(printf("Substring circularpos plus: %u..%u vs %u+%u\n",
3742 		   this->alignstart_trim,this->alignend_trim,this->chroffset,this->chrlength));
3743     if (this->alignstart_trim > this->chroffset + this->chrlength) {
3744       debug12(printf("all of substring is in the upper part, so previous substring was in lower part\n"));
3745       debug12(printf("returning querystart %d\n",this->querystart));
3746       return this->querystart;
3747 
3748     } else if (this->alignend_trim > this->chroffset + this->chrlength) {
3749       debug12(printf("substring straddles the circular origin\n"));
3750       debug12(printf("returning querystart %d + chroffset %u + chrlength %u - alignstart_trim %d\n",
3751 		     this->querystart,this->chroffset,this->chrlength,this->alignstart_trim));
3752       /* return (this->querystart - this->trim_querystart) + (this->chroffset + this->chrlength) - this->alignstart; */
3753       return this->querystart + (this->chroffset + this->chrlength) - this->alignstart_trim;
3754 
3755     } else {
3756       return -1;
3757     }
3758 
3759   } else {
3760     debug12(printf("Substring circularpos minus: %u..%u vs %u+%u\n",
3761 		   this->alignstart_trim,this->alignend_trim,this->chroffset,this->chrlength));
3762     if (this->alignend_trim > this->chroffset + this->chrlength) {
3763       debug12(printf("all of substring is in the upper part, so previous substring was in lower part\n"));
3764       debug12(printf("returning querylength %d - queryend %d\n",this->querylength,this->queryend));
3765       return this->querylength - this->queryend;
3766 
3767     } else if (this->alignstart_trim > this->chroffset + this->chrlength) {
3768       debug12(printf("returning querylength %d - queryend %d + chroffset %u + chrlength %u - alignend_trim %d\n",
3769 		     this->querylength,this->queryend,this->chroffset,this->chrlength,this->alignend_trim));
3770       /* return ((this->querylength - this->trim_queryend) - this->queryend) + (this->chroffset + this->chrlength) - this->alignend; */
3771       return (this->querylength - this->queryend) + (this->chroffset + this->chrlength) - this->alignend_trim;
3772 
3773     } else {
3774       return -1;
3775     }
3776   }
3777 }
3778 
3779 
3780 T
Substring_copy(T old)3781 Substring_copy (T old) {
3782   T new;
3783 
3784   if (old == NULL) {
3785     return (T) NULL;
3786   } else {
3787     new = (T) MALLOC_OUT(sizeof(*new));
3788     debug2(printf("substring %p is a copy of %p\n",new,old));
3789 
3790     new->nmismatches_bothdiff = old->nmismatches_bothdiff;
3791     new->nmismatches_refdiff = old->nmismatches_refdiff;
3792 
3793     new->nmatches_plus_spliced_trims = old->nmatches_plus_spliced_trims;
3794     new->nmatches_to_trims = old->nmatches_to_trims;
3795 
3796     new->ref_nmatches_plus_spliced_trims = old->ref_nmatches_plus_spliced_trims;
3797     new->ref_nmatches_to_trims = old->ref_nmatches_to_trims;
3798 
3799     new->mandatory_trim_querystart = old->mandatory_trim_querystart;
3800     new->mandatory_trim_queryend = old->mandatory_trim_queryend;
3801     new->start_amb_length = old->start_amb_length;
3802     new->end_amb_length = old->end_amb_length;
3803 
3804     new->trim_querystart_splicep = old->trim_querystart_splicep;
3805     new->trim_queryend_splicep = old->trim_queryend_splicep;
3806 
3807     new->chrnum = old->chrnum;
3808     new->chroffset = old->chroffset;
3809     new->chrhigh = old->chrhigh;
3810     new->chrlength = old->chrlength;
3811 
3812     new->left = old->left;
3813     new->genomicstart = old->genomicstart;
3814     new->genomicend = old->genomicend;
3815 
3816     new->start_endtype = old->start_endtype;
3817     new->end_endtype = old->end_endtype;
3818 
3819     new->querystart_pretrim = old->querystart_pretrim;
3820     new->queryend_pretrim = old->queryend_pretrim;
3821     new->querystart = old->querystart;
3822     new->queryend = old->queryend;
3823     new->querystart_chrbound = old->querystart_chrbound;
3824     new->queryend_chrbound = old->queryend_chrbound;
3825 
3826     new->amb_splice_pos = old->amb_splice_pos;
3827     new->querylength = old->querylength;
3828 
3829     new->alignstart_trim = old->alignstart_trim;
3830     new->alignend_trim = old->alignend_trim;
3831 
3832     new->plusp = old->plusp;
3833     new->genestrand = old->genestrand;
3834     new->query_compress = old->query_compress;
3835 
3836     if (old->genomic_bothdiff == NULL) {
3837       new->genomic_bothdiff = (char *) NULL;
3838       new->genomic_refdiff = (char *) NULL;
3839 
3840     } else if (old->genomic_refdiff == old->genomic_bothdiff) {
3841       assert((int) strlen(old->genomic_bothdiff) == old->querylength);
3842       new->genomic_bothdiff = (char *) CALLOC_OUT(strlen(old->genomic_bothdiff)+1,sizeof(char));
3843       strcpy(new->genomic_bothdiff,old->genomic_bothdiff);
3844       new->genomic_refdiff = new->genomic_bothdiff;
3845 
3846     } else {
3847       assert((int) strlen(old->genomic_bothdiff) == old->querylength);
3848       assert((int) strlen(old->genomic_refdiff) == old->querylength);
3849       new->genomic_bothdiff = (char *) CALLOC_OUT(strlen(old->genomic_bothdiff)+1,sizeof(char));
3850       strcpy(new->genomic_bothdiff,old->genomic_bothdiff);
3851       new->genomic_refdiff = (char *) CALLOC_OUT(strlen(old->genomic_refdiff)+1,sizeof(char));
3852       strcpy(new->genomic_refdiff,old->genomic_refdiff);
3853     }
3854 
3855     new->mapq_loglik = old->mapq_loglik;
3856 
3857     new->sensedir = old->sensedir;
3858 
3859     new->splicecoord_D = old->splicecoord_D;
3860     new->splicesitesD_knowni = old->splicesitesD_knowni;
3861     new->siteD_pos = old->siteD_pos;
3862     new->siteD_prob = old->siteD_prob;
3863 
3864     new->splicecoord_A = old->splicecoord_A;
3865     new->splicesitesA_knowni = old->splicesitesA_knowni;
3866     new->siteA_pos = old->siteA_pos;
3867     new->siteA_prob = old->siteA_prob;
3868 
3869     new->siteN_pos = old->siteN_pos;
3870 
3871     if (old->alts_ncoords == 0) {
3872       new->alts_ncoords = 0;
3873       new->alts_coords = (Univcoord_T *) NULL;
3874       new->alts_knowni = (int *) NULL;
3875       new->alts_nmismatches = (int *) NULL;
3876       new->alts_ref_nmismatches = (int *) NULL;
3877       new->alts_probs = (double *) NULL;
3878     } else {
3879       new->alts_ncoords = old->alts_ncoords;
3880       new->alts_coords = (Univcoord_T *) MALLOC_OUT(old->alts_ncoords * sizeof(Univcoord_T));
3881       new->alts_knowni = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3882       new->alts_nmismatches = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3883       new->alts_ref_nmismatches = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3884       new->alts_probs = (double *) MALLOC_OUT(old->alts_ncoords * sizeof(double));
3885 
3886       memcpy(new->alts_coords,old->alts_coords,old->alts_ncoords * sizeof(Univcoord_T));
3887       memcpy(new->alts_knowni,old->alts_knowni,old->alts_ncoords * sizeof(int));
3888       memcpy(new->alts_nmismatches,old->alts_nmismatches,old->alts_ncoords * sizeof(int));
3889       memcpy(new->alts_ref_nmismatches,old->alts_ref_nmismatches,old->alts_ncoords * sizeof(int));
3890       memcpy(new->alts_probs,old->alts_probs,old->alts_ncoords * sizeof(double));
3891     }
3892     new->amb_type = old->amb_type;
3893 
3894     return new;
3895   }
3896 }
3897 
3898 
3899 /* nmismatches is between donor_pos and endpos */
3900 T
Substring_new_donor(int nmismatches,int ref_nmismatches,Univcoord_T donor_coord,int donor_knowni,int querystart,int queryend,int sitepos,double donor_prob,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)3901 Substring_new_donor (int nmismatches, int ref_nmismatches, Univcoord_T donor_coord, int donor_knowni,
3902 		     int querystart, int queryend, int sitepos, double donor_prob,
3903 		     bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
3904 		     bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
3905 
3906 		     Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand, int sensedir,
3907 		     Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
3908   T new;
3909 
3910   /* Previously checked if left >= chroffset + chrlength to exclude
3911      the duplicate length, but now excluding all translocations to
3912      circular chromosomes */
3913 
3914   if (chroffset + chrlength < chrhigh) {
3915     /* Don't splice to circular chromosomes */
3916     return (T) NULL;
3917 
3918   } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
3919 				  plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
3920 				  splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
3921 				  splice_queryend_p,splicetype_queryend,ambig_prob_queryend,sensedir)) == NULL) {
3922     return (T) NULL;
3923   }
3924 
3925   debug2(printf("Making new donor with splicesites_i %d, coord %u and left %u, plusp %d, sensedir %d, query %d..%d, trim %d..%d\n",
3926 		donor_knowni,donor_coord,left,plusp,sensedir,new->querystart,new->queryend,
3927 		new->querystart,querylength - new->queryend));
3928   debug2(printf("Setting siteD_prob to be %f\n",donor_prob));
3929 
3930   new->splicecoord_D = donor_coord;
3931   new->splicesitesD_knowni = donor_knowni;
3932 
3933   new->sensedir = sensedir;
3934 
3935   new->siteD_pos = sitepos;	/* not a qpos */
3936   new->siteD_prob = donor_prob;
3937 
3938   return new;
3939 }
3940 
3941 
3942 /* nmismatches is between acceptor_pos and endpos */
3943 T
Substring_new_acceptor(int nmismatches,int ref_nmismatches,Univcoord_T acceptor_coord,int acceptor_knowni,int querystart,int queryend,int sitepos,double acceptor_prob,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)3944 Substring_new_acceptor (int nmismatches, int ref_nmismatches, Univcoord_T acceptor_coord, int acceptor_knowni,
3945 			int querystart, int queryend, int sitepos, double acceptor_prob,
3946 			bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
3947 			bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
3948 
3949 			Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand, int sensedir,
3950 			Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
3951   T new;
3952 
3953   /* Previously checked if left >= chroffset + chrlength to exclude
3954      the duplicate length, but now excluding all translocations to
3955      circular chromosomes */
3956 
3957   if (chroffset + chrlength < chrhigh) {
3958     /* Don't splice to circular chromosomes */
3959     return (T) NULL;
3960 
3961   } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
3962 				  plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
3963 				  splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
3964 				  splice_queryend_p,splicetype_queryend,ambig_prob_queryend,sensedir)) == NULL) {
3965     return (T) NULL;
3966   }
3967 
3968   debug2(printf("Making new acceptor with splicesites_i %d, coord %u and left %u, plusp %d, sensedir %d, query %d..%d, trim %d..%d\n",
3969 		acceptor_knowni,acceptor_coord,left,plusp,sensedir,new->querystart,new->queryend,
3970 		new->querystart,querylength - new->queryend));
3971   debug2(printf("Setting siteA_prob to be %f\n",acceptor_prob));
3972 
3973   new->splicecoord_A = acceptor_coord;
3974   new->splicesitesA_knowni = acceptor_knowni;
3975 
3976   new->sensedir = sensedir;
3977 
3978   new->siteA_pos = sitepos;	/* not a qpos */
3979   new->siteA_prob = acceptor_prob;
3980 
3981   return new;
3982 }
3983 
3984 
3985 void
Substring_label_donor(T this,int splice_pos,double donor_prob,int sensedir_distant_guess)3986 Substring_label_donor (T this, int splice_pos, double donor_prob, int sensedir_distant_guess) {
3987   if (this->plusp == true) {
3988     this->splicecoord_D = this->left + splice_pos;
3989   } else {
3990     this->splicecoord_D = this->left + this->querylength - splice_pos;
3991   }
3992   /* new->splicesitesD_knowni = donor_knowni; */
3993   /* new->sensedir = sensedir; */
3994   this->sensedir = sensedir_distant_guess;
3995 
3996   this->siteD_pos = splice_pos;	/* not a qpos */
3997   this->siteD_prob = donor_prob;
3998 
3999   /* printf("Labeling donor with splicecoord %llu and prob %f\n",this->splicecoord_D,donor_prob); */
4000 
4001   return;
4002 }
4003 
4004 
4005 void
Substring_label_acceptor(T this,int splice_pos,double acceptor_prob,int sensedir_distant_guess)4006 Substring_label_acceptor (T this, int splice_pos, double acceptor_prob, int sensedir_distant_guess) {
4007   if (this->plusp == true) {
4008     this->splicecoord_A = this->left + splice_pos;
4009   } else {
4010     this->splicecoord_A = this->left + this->querylength - splice_pos;
4011   }
4012   /* new->splicesitesA_knowni = acceptor_knowni; */
4013   /* new->sensedir = sensedir; */
4014   this->sensedir = sensedir_distant_guess;
4015 
4016   this->siteA_pos = splice_pos;	/* not a qpos */
4017   this->siteA_prob = acceptor_prob;
4018 
4019   /* printf("Labeling acceptor with splicecoord %llu and prob %f\n",this->splicecoord_A,acceptor_prob); */
4020 
4021   return;
4022 }
4023 
4024 
4025 
4026 /* Used for distant DNA splicing.  Uses querystart and queryend, not qstart and qend.  middlepos is queryend */
4027 T
Substring_new_startfrag(int nmismatches,int querystart,int queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)4028 Substring_new_startfrag (int nmismatches, int querystart, int queryend,
4029 			 Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand,
4030 			 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
4031   T new;
4032   int ref_nmismatches = nmismatches;
4033 
4034   /* Previously checked if left >= chroffset + chrlength to exclude
4035      the duplicate length, but now excluding all translocations to
4036      circular chromosomes */
4037 
4038   if (chroffset + chrlength < chrhigh) {
4039     /* Don't splice to circular chromosomes */
4040     return (T) NULL;
4041 
4042   } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
4043 				  plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4044 				  /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4045 				  /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4046 				  /*sensedir*/SENSE_NULL)) == NULL) {
4047     return (T) NULL;
4048 
4049   } else {
4050     debug2(printf("Making new startfrag with left %u, plusp %d, query %d..%d, genome %u..%u\n",
4051 		  left,plusp,querystart,queryend,
4052 		  Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4053     new->siteN_pos = queryend;
4054     return new;
4055   }
4056 }
4057 
4058 
4059 /* Used for distant DNA splicing.  Uses querystart and queryend, not qstart and qend.  middlepos is querystart */
4060 T
Substring_new_endfrag(int nmismatches,int querystart,int queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)4061 Substring_new_endfrag (int nmismatches, int querystart, int queryend,
4062 		       Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand,
4063 		       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
4064   T new;
4065   int ref_nmismatches = nmismatches;
4066 
4067   /* Previously checked if left >= chroffset + chrlength to exclude
4068      the duplicate length, but now excluding all translocations to
4069      circular chromosomes */
4070 
4071   if (chroffset + chrlength < chrhigh) {
4072     /* Don't splice to circular chromosomes */
4073     return (T) NULL;
4074 
4075   } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
4076 				  plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4077 				  /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4078 				  /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4079 				  /*sensedir*/SENSE_NULL)) == NULL) {
4080     return (T) NULL;
4081 
4082   } else {
4083     debug2(printf("Making new endfrag with left %u, plusp %d, %d..%d, genome %u..%u\n",
4084 		  left,plusp,querystart,queryend,
4085 		  Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4086     new->siteN_pos = querystart;
4087     return new;
4088   }
4089 }
4090 
4091 
4092 T
Substring_trim_startfrag(int nmismatches,T old,int new_queryend)4093 Substring_trim_startfrag (int nmismatches, T old, int new_queryend) {
4094   T new;
4095   int ref_nmismatches = nmismatches;
4096 
4097   debug2(printf("Entered Substring_trim_startfrag with %d..%d -> %d..%d\n",
4098 		old->querystart,old->queryend,old->querystart,new_queryend));
4099 
4100   if ((new = Substring_new(nmismatches,ref_nmismatches,old->left,old->querystart,new_queryend,old->querylength,
4101 			   old->plusp,old->genestrand,old->query_compress,
4102 			   old->chrnum,old->chroffset,old->chrhigh,old->chrlength,
4103 			   /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4104 			   /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4105 			   /*sensedir*/SENSE_NULL)) == NULL) {
4106     debug2(printf("Returning NULL\n"));
4107     return (T) NULL;
4108 
4109   } else {
4110     debug2(printf("Making new startfrag with left %u, plusp %d, query %d..%d, genome %u..%u\n",
4111 		  old->left,old->plusp,old->querystart,new_queryend,
4112 		  Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4113     new->siteN_pos = new_queryend;
4114     return new;
4115   }
4116 }
4117 
4118 T
Substring_trim_endfrag(int nmismatches,T old,int new_querystart)4119 Substring_trim_endfrag (int nmismatches, T old, int new_querystart) {
4120   T new;
4121   int ref_nmismatches = nmismatches;
4122 
4123   debug2(printf("Entered Substring_trim_endfrag with %d..%d -> %d..%d\n",
4124 		old->querystart,old->queryend,new_querystart,old->queryend));
4125 
4126   if ((new = Substring_new(nmismatches,ref_nmismatches,old->left,new_querystart,old->queryend,old->querylength,
4127 			   old->plusp,old->genestrand,old->query_compress,
4128 			   old->chrnum,old->chroffset,old->chrhigh,old->chrlength,
4129 			   /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4130 			   /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4131 			   /*sensedir*/SENSE_NULL)) == NULL) {
4132     debug2(printf("Returning NULL\n"));
4133     return (T) NULL;
4134 
4135   } else {
4136     debug2(printf("Making new endfrag with left %u, plusp %d, %d..%d, genome %u..%u\n",
4137 		  old->left,old->plusp,new_querystart,old->queryend,
4138 		  Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4139     new->siteN_pos = new_querystart;
4140     return new;
4141   }
4142 }
4143 
4144 
4145 
4146 static int
ascending_siteD_pos_cmp(const void * a,const void * b)4147 ascending_siteD_pos_cmp (const void *a, const void *b) {
4148   T x = * (T *) a;
4149   T y = * (T *) b;
4150 
4151   if (x->siteD_pos < y->siteD_pos) {
4152     return -1;
4153   } else if (x->siteD_pos > y->siteD_pos) {
4154     return +1;
4155   } else if (x->genomicstart < y->genomicstart) {
4156     return -1;
4157   } else if (x->genomicstart > y->genomicstart) {
4158     return +1;
4159   } else {
4160     return 0;
4161   }
4162 }
4163 
4164 static int
ascending_siteA_pos_cmp(const void * a,const void * b)4165 ascending_siteA_pos_cmp (const void *a, const void *b) {
4166   T x = * (T *) a;
4167   T y = * (T *) b;
4168 
4169   if (x->siteA_pos < y->siteA_pos) {
4170     return -1;
4171   } else if (x->siteA_pos > y->siteA_pos) {
4172     return +1;
4173   } else if (x->genomicstart < y->genomicstart) {
4174     return -1;
4175   } else if (x->genomicstart > y->genomicstart) {
4176     return +1;
4177   } else {
4178     return 0;
4179   }
4180 }
4181 
4182 static int
ascending_siteN_pos_cmp(const void * a,const void * b)4183 ascending_siteN_pos_cmp (const void *a, const void *b) {
4184   T x = * (T *) a;
4185   T y = * (T *) b;
4186 
4187   if (x->siteN_pos < y->siteN_pos) {
4188     return -1;
4189   } else if (x->siteN_pos > y->siteN_pos) {
4190     return +1;
4191   } else if (x->genomicstart < y->genomicstart) {
4192     return -1;
4193   } else if (x->genomicstart > y->genomicstart) {
4194     return +1;
4195   } else {
4196     return 0;
4197   }
4198 }
4199 
4200 static int
descending_siteD_pos_cmp(const void * a,const void * b)4201 descending_siteD_pos_cmp (const void *a, const void *b) {
4202   T x = * (T *) a;
4203   T y = * (T *) b;
4204 
4205   if (x->siteD_pos < y->siteD_pos) {
4206     return -1;
4207   } else if (x->siteD_pos > y->siteD_pos) {
4208     return +1;
4209   } else if (x->genomicstart > y->genomicstart) {
4210     return -1;
4211   } else if (x->genomicstart < y->genomicstart) {
4212     return +1;
4213   } else {
4214     return 0;
4215   }
4216 }
4217 
4218 static int
descending_siteA_pos_cmp(const void * a,const void * b)4219 descending_siteA_pos_cmp (const void *a, const void *b) {
4220   T x = * (T *) a;
4221   T y = * (T *) b;
4222 
4223   if (x->siteA_pos < y->siteA_pos) {
4224     return -1;
4225   } else if (x->siteA_pos > y->siteA_pos) {
4226     return +1;
4227   } else if (x->genomicstart > y->genomicstart) {
4228     return -1;
4229   } else if (x->genomicstart < y->genomicstart) {
4230     return +1;
4231   } else {
4232     return 0;
4233   }
4234 }
4235 
4236 static int
descending_siteN_pos_cmp(const void * a,const void * b)4237 descending_siteN_pos_cmp (const void *a, const void *b) {
4238   T x = * (T *) a;
4239   T y = * (T *) b;
4240 
4241   if (x->siteN_pos < y->siteN_pos) {
4242     return -1;
4243   } else if (x->siteN_pos > y->siteN_pos) {
4244     return +1;
4245   } else if (x->genomicstart > y->genomicstart) {
4246     return -1;
4247   } else if (x->genomicstart < y->genomicstart) {
4248     return +1;
4249   } else {
4250     return 0;
4251   }
4252 }
4253 
4254 List_T
Substring_sort_siteD_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4255 Substring_sort_siteD_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4256   List_T sorted = NULL;
4257   T x, *hits;
4258   int n, i, j;
4259   bool *eliminate;
4260 
4261   n = List_length(hitlist);
4262   debug(printf("Checking %d spliceends for duplicates...",n));
4263   if (n == 0) {
4264     debug(printf("\n"));
4265     return NULL;
4266   }
4267 
4268   hits = (T *) MALLOCA(n * sizeof(T));
4269   List_fill_array((void **) hits,hitlist);
4270   /* List_free(&hitlist); -- allocated by Listpool_push */
4271 
4272   if (ascendingp == true) {
4273     qsort(hits,n,sizeof(T),ascending_siteD_pos_cmp);
4274   } else {
4275     qsort(hits,n,sizeof(T),descending_siteD_pos_cmp);
4276   }
4277 
4278   /* Check for duplicates */
4279   eliminate = (bool *) CALLOCA(n,sizeof(bool));
4280   for (i = 0; i < n; i++) {
4281     x = hits[i];
4282     j = i+1;
4283     while (j < n && hits[j]->siteD_pos == x->siteD_pos && hits[j]->genomicstart == x->genomicstart) {
4284       eliminate[j] = true;
4285       j++;
4286     }
4287   }
4288 
4289   debug(j = 0);
4290   for (i = n-1; i >= 0; i--) {
4291     x = hits[i];
4292     if (eliminate[i] == false) {
4293       sorted = Listpool_push(sorted,listpool,(void *) x);
4294     } else {
4295       Substring_free(&x);
4296       debug(j++);
4297     }
4298   }
4299   debug(printf("%d eliminated\n",j));
4300 
4301   FREEA(hits);
4302   FREEA(eliminate);
4303 
4304   return sorted;
4305 }
4306 
4307 List_T
Substring_sort_siteA_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4308 Substring_sort_siteA_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4309   List_T sorted = NULL;
4310   T x, *hits;
4311   int n, i, j;
4312   bool *eliminate;
4313 
4314   n = List_length(hitlist);
4315   debug(printf("Checking %d spliceends for duplicates...",n));
4316   if (n == 0) {
4317     debug(printf("\n"));
4318     return NULL;
4319   }
4320 
4321   hits = (T *) MALLOCA(n * sizeof(T));
4322   List_fill_array((void **) hits,hitlist);
4323   /* List_free(&hitlist); -- allocated by Listpool_push */
4324 
4325   if (ascendingp == true) {
4326     qsort(hits,n,sizeof(T),ascending_siteA_pos_cmp);
4327   } else {
4328     qsort(hits,n,sizeof(T),descending_siteA_pos_cmp);
4329   }
4330 
4331   /* Check for duplicates */
4332   eliminate = (bool *) CALLOCA(n,sizeof(bool));
4333   for (i = 0; i < n; i++) {
4334     x = hits[i];
4335     j = i+1;
4336     while (j < n && hits[j]->siteA_pos == x->siteA_pos && hits[j]->genomicstart == x->genomicstart) {
4337       eliminate[j] = true;
4338       j++;
4339     }
4340   }
4341 
4342   debug(j = 0);
4343   for (i = n-1; i >= 0; i--) {
4344     x = hits[i];
4345     if (eliminate[i] == false) {
4346       sorted = Listpool_push(sorted,listpool,(void *) x);
4347     } else {
4348       Substring_free(&x);
4349       debug(j++);
4350     }
4351   }
4352   debug(printf("%d eliminated\n",j));
4353 
4354   FREEA(hits);
4355   FREEA(eliminate);
4356 
4357   return sorted;
4358 }
4359 
4360 List_T
Substring_sort_siteN_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4361 Substring_sort_siteN_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4362   List_T sorted = NULL;
4363   T x, *hits;
4364   int n, i, j;
4365   bool *eliminate;
4366 
4367   n = List_length(hitlist);
4368   debug(printf("Checking %d spliceends for duplicates...",n));
4369   if (n == 0) {
4370     debug(printf("\n"));
4371     return NULL;
4372   }
4373 
4374   hits = (T *) MALLOCA(n * sizeof(T));
4375   List_fill_array((void **) hits,hitlist);
4376   /* List_free(&hitlist); -- allocated by Listpool_push */
4377 
4378   if (ascendingp == true) {
4379     qsort(hits,n,sizeof(T),ascending_siteN_pos_cmp);
4380   } else {
4381     qsort(hits,n,sizeof(T),descending_siteN_pos_cmp);
4382   }
4383 
4384   /* Check for duplicates */
4385   eliminate = (bool *) CALLOCA(n,sizeof(bool));
4386   for (i = 0; i < n; i++) {
4387     x = hits[i];
4388     j = i+1;
4389     while (j < n && hits[j]->siteN_pos == x->siteN_pos && hits[j]->genomicstart == x->genomicstart) {
4390       eliminate[j] = true;
4391       j++;
4392     }
4393   }
4394 
4395   debug(j = 0);
4396   for (i = n-1; i >= 0; i--) {
4397     x = hits[i];
4398     if (eliminate[i] == false) {
4399       sorted = Listpool_push(sorted,listpool,(void *) x);
4400     } else {
4401       Substring_free(&x);
4402       debug(j++);
4403     }
4404   }
4405   debug(printf("%d eliminated\n",j));
4406 
4407   FREEA(hits);
4408   FREEA(eliminate);
4409 
4410   return sorted;
4411 }
4412 
4413 
4414 static void
print_snp_labels(Filestring_T fp,T this,Shortread_T queryseq)4415 print_snp_labels (Filestring_T fp, T this, Shortread_T queryseq) {
4416   int *snps, nsnps, querypos, i;
4417   char *label, *seq1, *seq2;
4418   bool allocp, printp = false;
4419   Interval_T interval;
4420   Chrpos_T position;
4421 
4422   if (this->plusp == true) {
4423     snps = IIT_get_with_divno(&nsnps,snps_iit,
4424 			      snps_divint_crosstable[this->chrnum],
4425 			      this->alignstart_trim - this->chroffset + 1,this->alignend_trim - this->chroffset,
4426 			      /*sortp*/false);
4427   } else {
4428     snps = IIT_get_with_divno(&nsnps,snps_iit,
4429 			      snps_divint_crosstable[this->chrnum],
4430 			      this->alignend_trim - this->chroffset + 1,this->alignstart_trim - this->chroffset,
4431 			      /*sortp*/false);
4432   }
4433 
4434   FPRINTF(fp,",snps:");
4435 
4436   seq1 = Shortread_fullpointer_uc(queryseq);
4437   if (this->genomic_bothdiff == NULL) {
4438     seq2 = Shortread_fullpointer(queryseq);
4439   } else {
4440     seq2 = this->genomic_bothdiff;
4441   }
4442 
4443   if (this->plusp) {
4444 
4445 #if 0
4446     for (i = 0; i < nsnps; i++) {
4447       interval = IIT_interval(snps_iit,snps[i]);
4448       position = Interval_low(interval);
4449       querypos = position - (this->genomicstart-this->chroffset) - 1;
4450       printf("%d ",querypos);
4451     }
4452     printf("\n");
4453 #endif
4454 
4455     for (i = 0; i < nsnps; i++) {
4456       interval = IIT_interval(snps_iit,snps[i]);
4457       position = Interval_low(interval);
4458       querypos = position - (this->genomicstart-this->chroffset) - 1;
4459       /* assert(querypos >= 0 && querypos < this->genomiclength); */
4460 
4461 #if 0
4462       alleles = IIT_typestring(snps_iit,Interval_type(interval));
4463       if (c == alleles[0] || c == alleles[1]) ;
4464 #endif
4465 
4466       if (isupper(seq2[querypos]) && seq1[querypos] != seq2[querypos]) {
4467 	label = IIT_label(snps_iit,snps[i],&allocp);
4468 	if (printp) {
4469 	  FPRINTF(fp,"|");
4470 	}
4471 	FPRINTF(fp,"%d@",querypos+1);
4472 	FPRINTF(fp,"%s",label);
4473 	printp = true;
4474 	if (allocp) FREE(label);
4475       }
4476     }
4477 
4478   } else {
4479 
4480     for (i = nsnps-1; i >= 0; i--) {
4481       interval = IIT_interval(snps_iit,snps[i]);
4482       position = Interval_low(interval);
4483       querypos = (this->genomicstart-this->chroffset) - position;
4484       /* assert(querypos >= 0 && querypos < this->genomiclength); */
4485 
4486 #if 0
4487       /* printf("\n%d%c\n",querypos,c); */
4488       alleles = IIT_typestring(snps_iit,Interval_type(interval));
4489       if (c == alleles[0] || c == alleles[1]) ;
4490 #endif
4491 
4492       if (isupper(seq2[querypos]) && seq1[querypos] != seq2[querypos]) {
4493 	label = IIT_label(snps_iit,snps[i],&allocp);
4494 	if (printp) {
4495 	  FPRINTF(fp,"|");
4496 	}
4497 	FPRINTF(fp,"%d@",querypos+1);
4498 	FPRINTF(fp,"%s",label);
4499 	printp = true;
4500 	if (allocp) FREE(label);
4501       }
4502     }
4503 
4504   }
4505 
4506   FREE(snps);
4507 
4508   return;
4509 }
4510 
4511 
4512 Chrpos_T
Substring_compute_chrpos(T this,int hardclip_low,bool hide_soft_clips_p)4513 Substring_compute_chrpos (T this, int hardclip_low, bool hide_soft_clips_p) {
4514   Chrpos_T chrpos_low;
4515 
4516   if (hide_soft_clips_p == true) {
4517     if (this->plusp == true) {
4518       /* Add 1 to report in 1-based coordinates */
4519       chrpos_low = this->genomicstart - this->chroffset + 1;
4520       chrpos_low += hardclip_low;
4521       /* *chrpos_high = this->genomicend - this->chroffset + 1; */
4522       /* *chrpos_high -= hardclip_high; */
4523 
4524     } else {
4525       /* Add 1 to report in 1-based coordinates */
4526       chrpos_low = this->genomicend - this->chroffset + 1;
4527       chrpos_low += hardclip_low;
4528       /* *chrpos_high = this->genomicstart - this->chroffset + 1; */
4529       /* *chrpos_high -= hardclip_high; */
4530     }
4531 
4532   } else {
4533     if (this->plusp == true) {
4534       chrpos_low = this->genomicstart - this->chroffset + 1;
4535       if (this->querystart > hardclip_low) {
4536 	chrpos_low += this->querystart; /* not querystart_orig */
4537       } else {
4538 	chrpos_low += hardclip_low;
4539       }
4540 
4541 #if 0
4542       *chrpos_high = this->genomicend - this->chroffset + 1;
4543       if (this->querylength - this->queryend > hardclip_high) {
4544 	*chrpos_high -= this->querylength - this->queryend;
4545       } else {
4546 	*chrpos_high -= hardclip_high;
4547       }
4548 #endif
4549 
4550     } else {
4551       chrpos_low = this->genomicend - this->chroffset + 1;
4552       if (this->querylength - this->queryend > hardclip_low) {
4553 	chrpos_low += this->querylength - this->queryend; /* not queryend_orig */
4554       } else {
4555 	chrpos_low += hardclip_low;
4556       }
4557 
4558 #if 0
4559       *chrpos_high = this->genomicstart - this->chroffset + 1;
4560       if (this->querystart > hardclip_high) {
4561 	*chrpos_high -= this->querystart;
4562       } else {
4563 	*chrpos_high -= hardclip_high;
4564       }
4565 #endif
4566     }
4567   }
4568 
4569   return chrpos_low;
4570 }
4571 
4572 
4573 
4574 /* Taken from NCBI Blast 2.2.29, algo/blast/core/blast_stat.c */
4575 /* Karlin-Altschul formula: m n exp(-lambda * S + log k) = k m n exp(-lambda * S) */
4576 /* Also in pair.c */
4577 
4578 static double
blast_evalue(int alignlength,int nmismatches)4579 blast_evalue (int alignlength, int nmismatches) {
4580   double k = 0.1;
4581   double lambda = 1.58;		/* For a +1, -1 scoring scheme */
4582   double score;
4583 
4584   score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4585 
4586   return k * (double) alignlength * genomesize * exp(-lambda * score);
4587 }
4588 
4589 static double
blast_bitscore(int alignlength,int nmismatches)4590 blast_bitscore (int alignlength, int nmismatches) {
4591   double k = 0.1;
4592   double lambda = 1.58;		/* For a +1, -1 scoring scheme */
4593   double score;
4594 
4595   score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4596   return (score * lambda - log(k)) / log(2.0);
4597 }
4598 
4599 
4600 double
Substring_evalue(T substring)4601 Substring_evalue (T substring) {
4602   int alignlength_trim;
4603 
4604 #if 0
4605   /* Doesn't work for ambiguous substrings */
4606   if (substring->plusp == true) {
4607     alignlength_trim = (int) (substring->alignend_trim - substring->alignstart_trim);
4608   } else {
4609     alignlength_trim = (int) (substring->alignstart_trim - substring->alignend_trim);
4610   }
4611   assert(alignlength_trim == substring->queryend - substring->querystart);
4612 #else
4613   alignlength_trim = substring->queryend - substring->querystart;
4614 #endif
4615 
4616   return blast_evalue(alignlength_trim,substring->nmismatches_bothdiff);
4617 }
4618 
4619 
4620 void
Substring_print_m8(Filestring_T fp,T substring,Shortread_T headerseq,char * acc_suffix,char * chr,bool invertp)4621 Substring_print_m8 (Filestring_T fp, T substring, Shortread_T headerseq, char *acc_suffix,
4622 		    char *chr, bool invertp) {
4623   double identity;
4624   int alignlength_trim;
4625 
4626   FPRINTF(fp,"%s%s",Shortread_accession(headerseq),acc_suffix); /* field 0: accession */
4627 
4628   FPRINTF(fp,"\t%s",chr);	/* field 1: chr */
4629 
4630   /* field 2: identity */
4631   if (substring->plusp == true) {
4632     alignlength_trim = (int) (substring->alignend_trim - substring->alignstart_trim);
4633   } else {
4634     alignlength_trim = (int) (substring->alignstart_trim - substring->alignend_trim);
4635   }
4636 
4637   identity = (double) (alignlength_trim - substring->nmismatches_bothdiff)/(double) alignlength_trim;
4638   FPRINTF(fp,"\t%.1f",100.0*identity);
4639 
4640 
4641   FPRINTF(fp,"\t%d",alignlength_trim); /* field 3: query length */
4642 
4643   FPRINTF(fp,"\t%d",substring->nmismatches_bothdiff); /* field 4: nmismatches */
4644 
4645   FPRINTF(fp,"\t0");		/* field 5: gap openings */
4646 
4647   FPRINTF(fp,"\t%d",substring->querystart + 1); /* field 6: query start */
4648 
4649   FPRINTF(fp,"\t%d",substring->queryend); /* field 7: query end */
4650 
4651   /* fields 8 and 9: chr start and end */
4652   if (substring->plusp == true) {
4653     if (invertp == false) {
4654       FPRINTF(fp,"\t%u\t%u",substring->alignstart_trim - substring->chroffset + 1,
4655 	      substring->alignend_trim - substring->chroffset);
4656     } else {
4657       FPRINTF(fp,"\t%u\t%u",substring->alignend_trim - substring->chroffset,
4658 	      substring->alignstart_trim - substring->chroffset + 1);
4659     }
4660   } else {
4661     if (invertp == false) {
4662       FPRINTF(fp,"\t%u\t%u",substring->alignstart_trim - substring->chroffset,
4663 	      substring->alignend_trim - substring->chroffset + 1);
4664     } else {
4665       FPRINTF(fp,"\t%u\t%u",substring->alignend_trim - substring->chroffset + 1,
4666 	      substring->alignstart_trim - substring->chroffset);
4667     }
4668   }
4669 
4670   /* field 10: E value */
4671   FPRINTF(fp,"\t%.2g",blast_evalue(alignlength_trim,substring->nmismatches_bothdiff));
4672 
4673  /* field 11: bit score */
4674   FPRINTF(fp,"\t%.1f",blast_bitscore(alignlength_trim,substring->nmismatches_bothdiff));
4675 
4676   FPRINTF(fp,"\n");
4677 
4678   return;
4679 }
4680 
4681 
4682 
4683 static void
print_forward(Filestring_T fp,char * string,int n)4684 print_forward (Filestring_T fp, char *string, int n) {
4685 
4686   FPRINTF(fp,"%.*s",n,string);
4687   return;
4688 }
4689 
4690 static void
print_lc(Filestring_T fp,char * string,int n)4691 print_lc (Filestring_T fp, char *string, int n) {
4692   int i;
4693 
4694   for (i = 0; i < n; i++) {
4695     FPRINTF(fp,"%c",(char) tolower(string[i]));
4696   }
4697   return;
4698 }
4699 
4700 static void
print_revcomp(Filestring_T fp,char * nt,int len)4701 print_revcomp (Filestring_T fp, char *nt, int len) {
4702 
4703   FPRINTF(fp,"%.*R",len,nt);
4704   return;
4705 }
4706 
4707 static void
print_revcomp_lc(Filestring_T fp,char * nt,int len)4708 print_revcomp_lc (Filestring_T fp, char *nt, int len) {
4709   int i;
4710 
4711   for (i = len-1; i >= 0; --i) {
4712     FPRINTF(fp,"%c",(char) tolower(complCode[(int) nt[i]]));
4713   }
4714   return;
4715 }
4716 
4717 
4718 static void
print_genomic(Filestring_T fp,T substring,char * deletion,int deletionlength,bool invertp,Shortread_T queryseq)4719 print_genomic (Filestring_T fp, T substring, char *deletion, int deletionlength, bool invertp,
4720 	       Shortread_T queryseq) {
4721   int i;
4722 
4723   /* Now handled by calculation of queryleft/queryright by Stage3end_display_prep */
4724   deletion = NULL;
4725   deletionlength = 0;
4726 
4727   if (invertp == false) {
4728     if (substring->genomic_bothdiff == NULL) {
4729       /* Exact match */
4730       Shortread_print_subseq_uc(fp,queryseq,substring->querystart,substring->queryend);
4731 
4732     } else if (show_refdiff_p == true) {
4733       print_forward(fp,substring->genomic_refdiff,substring->queryend);
4734       if (deletion != NULL) {
4735 	print_lc(fp,deletion,deletionlength);
4736       }
4737       print_forward(fp,&(substring->genomic_refdiff[substring->queryend]),substring->querylength - deletionlength - substring->queryend);
4738 
4739     } else {
4740       print_forward(fp,substring->genomic_bothdiff,substring->queryend);
4741       if (deletion != NULL) {
4742 	print_lc(fp,deletion,deletionlength);
4743       }
4744       print_forward(fp,&(substring->genomic_bothdiff[substring->queryend]),substring->querylength - deletionlength - substring->queryend);
4745     }
4746 
4747     for (i = 0; i < Shortread_choplength(queryseq); i++) {
4748       FPRINTF(fp,"*");
4749     }
4750     FPRINTF(fp,"\t");
4751     FPRINTF(fp,"%d..%d",1 + substring->querystart,substring->queryend);
4752 
4753   } else {
4754     if (substring->genomic_bothdiff == NULL) {
4755       /* Exact match */
4756       Shortread_print_subseq_revcomp_uc(fp,queryseq,substring->querystart,substring->queryend);
4757 
4758     } else if (show_refdiff_p == true) {
4759       print_revcomp(fp,&(substring->genomic_refdiff[substring->querystart]),substring->querylength - substring->querystart);
4760       if (deletion != NULL) {
4761 	print_revcomp_lc(fp,deletion,deletionlength);
4762       }
4763       print_revcomp(fp,&(substring->genomic_refdiff[deletionlength]),substring->querystart - deletionlength);
4764 
4765     } else {
4766       print_revcomp(fp,&(substring->genomic_bothdiff[substring->querystart]),substring->querylength - substring->querystart);
4767       if (deletion != NULL) {
4768 	print_revcomp_lc(fp,deletion,deletionlength);
4769       }
4770       print_revcomp(fp,&(substring->genomic_bothdiff[deletionlength]),substring->querystart - deletionlength);
4771     }
4772     for (i = 0; i < Shortread_choplength(queryseq); i++) {
4773       FPRINTF(fp,"*");
4774     }
4775     FPRINTF(fp,"\t");
4776     FPRINTF(fp,"%d..%d",1 + substring->querylength - substring->queryend,
4777 	   substring->querylength - substring->querystart);
4778   }
4779   return;
4780 }
4781 
4782 
4783 static void
print_coordinates(Filestring_T fp,T substring,char * chr,bool invertp)4784 print_coordinates (Filestring_T fp, T substring, char *chr, bool invertp) {
4785 
4786   if (substring->plusp == true) {
4787     if (invertp == false) {
4788       FPRINTF(fp,"+%s:%u..%u",chr,substring->alignstart_trim - substring->chroffset + 1,
4789 	     substring->alignend_trim - substring->chroffset);
4790     } else {
4791       FPRINTF(fp,"-%s:%u..%u",chr,substring->alignend_trim - substring->chroffset,
4792 	     substring->alignstart_trim - substring->chroffset + 1);
4793     }
4794   } else {
4795     if (invertp == false) {
4796       FPRINTF(fp,"-%s:%u..%u",chr,substring->alignstart_trim - substring->chroffset,
4797 	     substring->alignend_trim - substring->chroffset + 1);
4798     } else {
4799       FPRINTF(fp,"+%s:%u..%u",chr,substring->alignend_trim - substring->chroffset + 1,
4800 	     substring->alignstart_trim - substring->chroffset);
4801     }
4802   }
4803 
4804   return;
4805 }
4806 
4807 
4808 
4809 void
Substring_print_alignment(Filestring_T fp,Junction_T pre_junction,T substring,Junction_T post_junction,Shortread_T queryseq,Genome_T genome,char * chr,bool invertp)4810 Substring_print_alignment (Filestring_T fp, Junction_T pre_junction, T substring, Junction_T post_junction,
4811 			   Shortread_T queryseq, Genome_T genome, char *chr, bool invertp) {
4812   char *deletion_string;
4813   int deletion_length;
4814   Junctiontype_T type1, type2;
4815   Chrpos_T splice_distance_1, splice_distance_2;
4816 
4817   if (post_junction == NULL) {
4818     deletion_string = (char *) NULL;
4819     deletion_length = 0;
4820   } else if (Junction_type(post_junction) != DEL_JUNCTION) {
4821     deletion_string = (char *) NULL;
4822     deletion_length = 0;
4823   } else {
4824     deletion_string = Junction_deletion_string(post_junction,genome,substring->plusp);
4825     deletion_length = Junction_nindels(post_junction);
4826   }
4827 
4828   print_genomic(fp,substring,deletion_string,deletion_length,invertp,queryseq);
4829   FREE(deletion_string);
4830   FPRINTF(fp,"\t");
4831   print_coordinates(fp,substring,chr,invertp);
4832 
4833   FPRINTF(fp,"\t");
4834   if (pre_junction == NULL) {
4835     type1 = NO_JUNCTION;
4836     /* Handle result of substring_trim_novel_spliceends */
4837     if (invertp == false) {
4838       if (substring->start_endtype == DON) {
4839 	FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4840       } else if (substring->start_endtype == ACC) {
4841 	FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4842       } else {
4843 	FPRINTF(fp,"start:%d",substring->querystart);
4844       }
4845     } else {
4846       if (substring->end_endtype == DON) {
4847 	FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4848       } else if (substring->end_endtype == ACC) {
4849 	FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4850       } else {
4851 	FPRINTF(fp,"start:%d",substring->querylength - substring->queryend);
4852       }
4853     }
4854   } else if ((type1 = Junction_type(pre_junction)) == INS_JUNCTION) {
4855     FPRINTF(fp,"ins:%d",Junction_nindels(pre_junction));
4856   } else if (type1 == DEL_JUNCTION) {
4857     FPRINTF(fp,"del:%d",Junction_nindels(pre_junction));
4858   } else if (type1 == SPLICE_JUNCTION) {
4859     if (invertp == false) {
4860       if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4861 	FPRINTF(fp,"donor:%.2f",Junction_donor_prob(pre_junction));
4862       } else {
4863 	FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(pre_junction));
4864       }
4865     } else {
4866       if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4867 	FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(pre_junction));
4868       } else {
4869 	FPRINTF(fp,"donor:%.2f",Junction_donor_prob(pre_junction));
4870       }
4871     }
4872   } else if (type1 == CHIMERA_JUNCTION) {
4873     FPRINTF(fp,"distant:%u",Junction_splice_distance(pre_junction));
4874   } else {
4875     abort();
4876   }
4877 
4878   FPRINTF(fp,"..");
4879 
4880   if (post_junction == NULL) {
4881     type2 = NO_JUNCTION;
4882     /* Handle result of substring_trim_novel_spliceends */
4883     if (invertp == false) {
4884       if (substring->end_endtype == DON) {
4885 	FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4886       } else if (substring->end_endtype == ACC) {
4887 	FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4888       } else {
4889 	FPRINTF(fp,"end:%d",substring->querylength - substring->queryend);
4890       }
4891     } else {
4892       if (substring->start_endtype == DON) {
4893 	FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4894       } else if (substring->start_endtype == ACC) {
4895 	FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4896       } else {
4897 	FPRINTF(fp,"end:%d",substring->querystart);
4898       }
4899     }
4900   } else if ((type2 = Junction_type(post_junction)) == INS_JUNCTION) {
4901     FPRINTF(fp,"ins:%d",Junction_nindels(post_junction));
4902   } else if (type2 == DEL_JUNCTION) {
4903     FPRINTF(fp,"del:%d",Junction_nindels(post_junction));
4904   } else if (type2 == SPLICE_JUNCTION) {
4905     if (invertp == false) {
4906       if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4907 	FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(post_junction));
4908       } else {
4909 	FPRINTF(fp,"donor:%.2f",Junction_donor_prob(post_junction));
4910       }
4911     } else {
4912       if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4913 	FPRINTF(fp,"donor:%.2f",Junction_donor_prob(post_junction));
4914       } else {
4915 	FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(post_junction));
4916       }
4917     }
4918   } else if (type2 == CHIMERA_JUNCTION) {
4919     FPRINTF(fp,"distant:%u",Junction_splice_distance(post_junction));
4920   } else {
4921     abort();
4922   }
4923 
4924 
4925   FPRINTF(fp,",matches:%d,sub:%d",Substring_nmatches_to_trims(substring),substring->nmismatches_bothdiff);
4926   if (print_nsnpdiffs_p) {
4927     FPRINTF(fp,"+%d=%d",substring->nmismatches_refdiff - substring->nmismatches_bothdiff,substring->nmismatches_refdiff);
4928     if (print_snplabels_p && substring->nmismatches_refdiff > substring->nmismatches_bothdiff) {
4929       print_snp_labels(fp,substring,queryseq);
4930     }
4931   }
4932 
4933   if (type1 == SPLICE_JUNCTION && type2 == SPLICE_JUNCTION) {
4934     if (invertp == false) {
4935       if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4936 	FPRINTF(fp,",dir:sense");
4937       } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4938 	FPRINTF(fp,",dir:antisense");
4939       } else {
4940 	FPRINTF(fp,",dir:unknown");
4941       }
4942     } else {
4943       if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4944 	FPRINTF(fp,",dir:antisense");
4945       } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4946 	FPRINTF(fp,",dir:sense");
4947       } else {
4948 	FPRINTF(fp,",dir:unknown");
4949       }
4950     }
4951     splice_distance_1 = Junction_splice_distance(pre_junction);
4952     splice_distance_2 = Junction_splice_distance(post_junction);
4953     if (splice_distance_1 == 0 && splice_distance_2 == 0) {
4954       /* Skip */
4955     } else if (splice_distance_1 == 0) {
4956       FPRINTF(fp,",splice_type:consistent");
4957       FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
4958     } else if (splice_distance_2 == 0) {
4959       FPRINTF(fp,",splice_type:consistent");
4960       FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4961     } else {
4962       FPRINTF(fp,",splice_type:consistent");
4963       FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4964       FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
4965     }
4966 
4967   } else if (type1 == SPLICE_JUNCTION) {
4968     if (invertp == false) {
4969       if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4970 	FPRINTF(fp,",dir:sense");
4971       } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4972 	FPRINTF(fp,",dir:antisense");
4973       } else {
4974 	FPRINTF(fp,",dir:unknown");
4975       }
4976     } else {
4977       if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4978 	FPRINTF(fp,",dir:antisense");
4979       } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4980 	FPRINTF(fp,",dir:sense");
4981       } else {
4982 	FPRINTF(fp,",dir:unknown");
4983       }
4984     }
4985     if ((splice_distance_1 = Junction_splice_distance(pre_junction)) > 0) {
4986       FPRINTF(fp,",splice_type:consistent");
4987       FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4988     }
4989 
4990   } else if (type2 == SPLICE_JUNCTION) {
4991     if (invertp == false) {
4992       if (Junction_sensedir(post_junction) == SENSE_FORWARD) {
4993 	FPRINTF(fp,",dir:sense");
4994       } else if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4995 	FPRINTF(fp,",dir:antisense");
4996       } else {
4997 	FPRINTF(fp,",dir:unknown");
4998       }
4999     } else {
5000       if (Junction_sensedir(post_junction) == SENSE_FORWARD) {
5001 	FPRINTF(fp,",dir:antisense");
5002       } else if (Junction_sensedir(post_junction) == SENSE_ANTI) {
5003 	FPRINTF(fp,",dir:sense");
5004       } else {
5005 	FPRINTF(fp,",dir:unknown");
5006       }
5007     }
5008     if ((splice_distance_2 = Junction_splice_distance(post_junction)) > 0) {
5009       FPRINTF(fp,",splice_type:consistent");
5010       FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
5011     }
5012   }
5013 
5014   return;
5015 }
5016 
5017 
5018 /************************************************************************
5019  *   Multiclean
5020  ************************************************************************/
5021 
5022 static char *
get_total_tally(long int * tally,char * ptr)5023 get_total_tally (long int *tally, char *ptr) {
5024   int n;
5025   char *end;
5026 
5027   if ((end = index(ptr,'\n')) == NULL) {
5028     fprintf(stderr,"Premature end of line %s\n",ptr);
5029     return 0;
5030   }
5031   /* fprintf(stderr,"Getting tally for %.*s\n",end-ptr,ptr); */
5032 
5033   while (ptr < end) {
5034     while (ptr < end && !isdigit((int) *ptr)) {
5035       ptr++;
5036     }
5037     if (ptr < end) {
5038       sscanf(ptr,"%d",&n);
5039 #if 0
5040       debug(if (n > 0) printf(" %d",n));
5041 #endif
5042       (*tally) += n;
5043       while (ptr < end && !isspace(*ptr)) {
5044 	ptr++;
5045       }
5046       while (ptr < end && isspace(*ptr)) {
5047 	ptr++;
5048       }
5049     }
5050   }
5051 
5052   return ptr;
5053 }
5054 
5055 long int
Substring_tally(T this,IIT_T tally_iit,int * tally_divint_crosstable)5056 Substring_tally (T this, IIT_T tally_iit, int *tally_divint_crosstable) {
5057   long int total = 0U;
5058   Interval_T interval;
5059   char *annotation, *restofheader, *ptr;
5060 #if 0
5061   bool alloc_chr_p;
5062 #endif
5063   bool allocp;
5064   Chrpos_T chrpos, intervalend;
5065 
5066   Chrpos_T coordstart, coordend, pos5, pos3;
5067   int *matches;
5068   int nmatches, i;
5069 
5070   pos5 = this->alignstart_trim - this->chroffset;
5071   pos3 = this->alignend_trim - this->chroffset;
5072 
5073   if (pos5 < pos3) {
5074     coordstart = pos5;
5075     coordend = pos3;
5076   } else {
5077     coordstart = pos3;
5078     coordend = pos5;
5079   }
5080   coordstart += 1;		/* Because tally IIT is 1-based */
5081   debug(printf("coordstart = %u, coordend = %u\n",coordstart,coordend));
5082 
5083 #if 0
5084   chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc_chr_p);
5085 #endif
5086   matches = IIT_get_with_divno(&nmatches,tally_iit,tally_divint_crosstable[this->chrnum],
5087 			       coordstart,coordend,/*sortp*/false);
5088 
5089   for (i = 0; i < nmatches; i++) {
5090     annotation = IIT_annotation(&restofheader,tally_iit,matches[i],&allocp);
5091 
5092     interval = IIT_interval(tally_iit,matches[i]);
5093     chrpos = Interval_low(interval);
5094     intervalend = Interval_high(interval);
5095 
5096     ptr = annotation;
5097 
5098     while (chrpos < coordstart) {
5099       if ((ptr = index(ptr,'\n')) == NULL) {
5100 	fprintf(stderr,"Premature end of tally from %u to %u\n",
5101 		Interval_low(interval),Interval_high(interval));
5102 	return total;
5103       } else {
5104 	ptr++;
5105       }
5106       chrpos++;
5107     }
5108 
5109     while (chrpos <= intervalend && chrpos <= coordend) {
5110       ptr = get_total_tally(&total,ptr);
5111       ptr++;
5112       chrpos++;
5113     }
5114 
5115     if (allocp == true) {
5116       FREE(restofheader);
5117     }
5118   }
5119 
5120   FREE(matches);
5121 
5122 #if 0
5123   if (alloc_chr_p) {
5124     FREE(chr);
5125   }
5126 #endif
5127 
5128   debug(printf("Subtotal = %ld\n",total));
5129   return total;
5130 }
5131 
5132 
5133 bool
Substring_runlength_p(T this,IIT_T runlength_iit,int * runlength_divint_crosstable)5134 Substring_runlength_p (T this, IIT_T runlength_iit, int *runlength_divint_crosstable) {
5135   Chrpos_T coordstart, coordend, pos5, pos3;
5136 
5137   pos5 = this->alignstart_trim - this->chroffset;
5138   pos3 = this->alignend_trim - this->chroffset;
5139 
5140   if (pos5 < pos3) {
5141     coordstart = pos5;
5142     coordend = pos3;
5143   } else {
5144     coordstart = pos3;
5145     coordend = pos5;
5146   }
5147   coordstart += 1;		/* Because runlength IIT is 1-based */
5148   debug(printf("coordstart = %u, coordend = %u\n",coordstart,coordend));
5149 
5150   /* chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc_chr_p); */
5151   return IIT_exists_with_divno(runlength_iit,runlength_divint_crosstable[this->chrnum],
5152 				coordstart,coordend);
5153 }
5154 
5155 
5156 int
Substring_count_mismatches_region(int * ref_nmismatches,T this,int trim_querystart,int trim_queryend)5157 Substring_count_mismatches_region (int *ref_nmismatches, T this, int trim_querystart, int trim_queryend) {
5158   int left_bound, right_bound;
5159 
5160   if (this == NULL) {
5161     return 0;
5162   } else {
5163     left_bound = trim_querystart;
5164     right_bound = this->querylength - trim_queryend;
5165 
5166     if (this->queryend_pretrim < left_bound) {
5167       return 0;
5168     } else if (this->querystart_pretrim > right_bound) {
5169       return 0;
5170     } else {
5171       if (this->querystart_pretrim > left_bound) {
5172 	left_bound = this->querystart_pretrim;
5173       }
5174       if (this->queryend_pretrim < right_bound) {
5175 	right_bound = this->queryend_pretrim;
5176       }
5177 
5178 #if 0
5179       /* Not sure why we have this assertion.  Values set by eventrim procedure in Stage3end_optimal_score_prefinal.  If assertion fails, then just return 0 */
5180       assert(left_bound <= right_bound);
5181 #endif
5182 
5183       if (left_bound > right_bound) {
5184 	return 0;
5185       } else if (this->plusp) {
5186 	return Genome_count_mismatches_substring(&(*ref_nmismatches),genomebits,genomebits_alt,this->query_compress,this->left,
5187 						 /*pos5*/left_bound,/*pos3*/right_bound,/*plusp*/true,this->genestrand);
5188       } else {
5189 	return Genome_count_mismatches_substring(&(*ref_nmismatches),genomebits,genomebits_alt,this->query_compress,this->left,
5190 						 /*pos5*/this->querylength - right_bound,
5191 						 /*pos3*/this->querylength - left_bound,
5192 						 /*plusp*/false,this->genestrand);
5193       }
5194     }
5195   }
5196 }
5197 
5198 
5199 /************************************************************************
5200  *   Conversion to Pair_T format
5201  ************************************************************************/
5202 
5203 List_T
Substring_convert_to_pairs_out(List_T pairs,T substring,int querylength,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)5204 Substring_convert_to_pairs_out (List_T pairs, T substring, int querylength, Shortread_T queryseq,
5205 				int hardclip_low, int hardclip_high, int queryseq_offset) {
5206   int querystart, queryend, querypos, i;
5207   Chrpos_T chrpos;
5208   char *seq1;
5209   char genome;
5210 
5211   if (substring == NULL) {
5212     return pairs;
5213   } else if (substring->alts_ncoords > 0) {
5214     return pairs;
5215   }
5216 
5217   debug6(printf("*** Entered Substring_convert_to_pairs with querylength %d, hardclip_low %d, hardclip_high %d, queryseq_offset %d\n",
5218 		querylength,hardclip_low,hardclip_high,queryseq_offset));
5219 
5220   seq1 = Shortread_fullpointer_uc(queryseq);
5221   if (substring->plusp == true) {
5222     if (hardclip_low > substring->querystart) {
5223       querystart = hardclip_low;
5224     } else {
5225       querystart = substring->querystart;
5226     }
5227 
5228     if (querylength - hardclip_high < substring->queryend) {
5229       queryend = querylength - hardclip_high;
5230     } else {
5231       queryend = substring->queryend;
5232     }
5233     /* Pairs are all zero-based, so do not add 1 */
5234 #if 0
5235     chrpos = substring->genomicstart_adj + querystart - substring->chroffset /*+ 1*/;
5236 #else
5237     chrpos = substring->genomicstart + querystart - substring->chroffset /*+ 1*/;
5238 #endif
5239 
5240     debug6(printf("plus conversion\n"));
5241     debug6(printf("querystart %d, queryend %d, plusp %d\n",querystart,queryend,substring->plusp));
5242     debug6(printf("alignstart %u, alignend %u\n",substring->alignstart_trim - substring->chroffset,
5243 		  substring->alignend_trim - substring->chroffset));
5244     debug6(printf("chrpos %u\n",chrpos));
5245 
5246     if (substring->genomic_bothdiff == NULL) {
5247       /* Exact match */
5248       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5249 	pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5250 								seq1[i],/*comp*/MATCH_COMP,seq1[i]));
5251       }
5252     } else if (show_refdiff_p == true) {
5253       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5254 	if (isupper(genome = substring->genomic_refdiff[i])) {
5255 	  assert(seq1[i] == genome || seq1[i] == 'N');
5256 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5257 								  seq1[i],/*comp*/MATCH_COMP,genome));
5258 	} else {
5259 	  assert(seq1[i] != toupper(genome));
5260 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5261 								  seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5262 	}
5263       }
5264     } else {
5265       /* printf("querystart %d, queryend %d\n",querystart,queryend); */
5266       /* printf("seq1   %s\n",seq1); */
5267       /* printf("genome %s\n",substring->genomic_bothdiff); */
5268       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5269 	if (isupper(genome = substring->genomic_bothdiff[i])) {
5270 	  assert(seq1[i] == genome || seq1[i] == 'N');
5271 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5272 								  seq1[i],/*comp*/MATCH_COMP,genome));
5273 	} else {
5274 	  assert(seq1[i] != toupper(genome));
5275 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5276 								  seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5277 	}
5278       }
5279     }
5280 
5281   } else {
5282     if (hardclip_high > substring->querystart) {
5283       querystart = hardclip_high;
5284     } else {
5285       querystart = substring->querystart;
5286     }
5287 
5288     if (querylength - hardclip_low < substring->queryend) {
5289       queryend = querylength - hardclip_low;
5290     } else {
5291       queryend = substring->queryend;
5292     }
5293     /* For minus, to get 0-based coordinates, subtract 1 */
5294 #if 0
5295     chrpos = substring->genomicstart_adj - querystart - substring->chroffset - 1;
5296 #else
5297     chrpos = substring->genomicstart - querystart - substring->chroffset - 1;
5298 #endif
5299 
5300     debug6(printf("minus conversion\n"));
5301     debug6(printf("querystart %d, queryend %d, plusp %d\n",querystart,queryend,substring->plusp));
5302     debug6(printf("alignstart %u, alignend %u\n",substring->alignstart_trim - substring->chroffset,
5303 		  substring->alignend_trim - substring->chroffset));
5304     debug6(printf("chrpos %u\n",chrpos));
5305 
5306     if (substring->genomic_bothdiff == NULL) {
5307       /* Exact match */
5308       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5309 	pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5310 								seq1[i],/*comp*/MATCH_COMP,seq1[i]));
5311       }
5312     } else if (show_refdiff_p == true) {
5313       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5314 	if (isupper(genome = substring->genomic_refdiff[i])) {
5315 	  assert(seq1[i] == genome || seq1[i] == 'N');
5316 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5317 								  seq1[i],/*comp*/MATCH_COMP,genome));
5318 	} else {
5319 	  assert(seq1[i] != toupper(genome));
5320 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5321 								  seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5322 	}
5323       }
5324     } else {
5325       for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5326 	if (isupper(genome = substring->genomic_bothdiff[i])) {
5327 	  /* assert(seq1[i] == genome || seq1[i] == 'N'); */
5328 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5329 								  seq1[i],/*comp*/MATCH_COMP,genome));
5330 	} else {
5331 	  /* assert(seq1[i] != toupper(genome)); */
5332 	  pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5333 								  seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5334 	}
5335       }
5336     }
5337   }
5338 
5339   debug6(Simplepair_dump_list(pairs,true));
5340   return pairs;
5341 }
5342 
5343 
5344 List_T
Substring_add_insertion_out(List_T pairs,T substringA,T substringB,int querylength,int insertionlength,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)5345 Substring_add_insertion_out (List_T pairs, T substringA, T substringB, int querylength,
5346 			     int insertionlength, Shortread_T queryseq,
5347 			     int hardclip_low, int hardclip_high, int queryseq_offset) {
5348   int querystartA, queryendA, querystartB, queryendB, querypos, i;
5349   Chrpos_T chrendA;
5350   char *seq1;
5351 
5352 
5353   if (substringA->plusp == true) {
5354     if (hardclip_low > substringA->querystart) {
5355       querystartA = hardclip_low;
5356     } else {
5357       querystartA = substringA->querystart;
5358     }
5359 
5360     if (querylength - hardclip_high < substringA->queryend) {
5361       queryendA = querylength - hardclip_high;
5362     } else {
5363       queryendA = substringA->queryend;
5364     }
5365 
5366     if (hardclip_low > substringB->querystart) {
5367       querystartB = hardclip_low;
5368     } else {
5369       querystartB = substringB->querystart;
5370     }
5371 
5372     if (querylength - hardclip_high < substringB->queryend) {
5373       queryendB = querylength - hardclip_high;
5374     } else {
5375       queryendB = substringB->queryend;
5376     }
5377 
5378     /* Pairs are all zero-based, so do not add 1 */
5379 #if 0
5380     chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5381 #else
5382     chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5383 #endif
5384 
5385   } else {
5386     if (hardclip_high > substringA->querystart) {
5387       querystartA = hardclip_high;
5388     } else {
5389       querystartA = substringA->querystart;
5390     }
5391 
5392     if (querylength - hardclip_low < substringA->queryend) {
5393       queryendA = querylength - hardclip_low;
5394     } else {
5395       queryendA = substringA->queryend;
5396     }
5397 
5398     if (hardclip_high > substringB->querystart) {
5399       querystartB = hardclip_high;
5400     } else {
5401       querystartB = substringB->querystart;
5402     }
5403 
5404     if (querylength - hardclip_low < substringB->queryend) {
5405       queryendB = querylength - hardclip_low;
5406     } else {
5407       queryendB = substringB->queryend;
5408     }
5409 
5410     /* Pairs are all zero-based, so subtract 1 */
5411 #if 0
5412     chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5413 #else
5414     chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5415 #endif
5416   }
5417 
5418   if (querystartA <= queryendA && querystartB <= queryendB) {
5419     seq1 = Shortread_fullpointer_uc(queryseq);
5420     querypos = queryendA + queryseq_offset;
5421     i = queryendA;
5422     while (--insertionlength >= 0) {
5423       pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos++,/*genomepos*/chrendA,
5424 							      seq1[i++],/*comp*/INDEL_COMP,' '));
5425     }
5426   }
5427 
5428   return pairs;
5429 }
5430 
5431 
5432 List_T
Substring_add_deletion_out(List_T pairs,T substringA,T substringB,int querylength,char * deletion,int deletionlength,int hardclip_low,int hardclip_high,int queryseq_offset)5433 Substring_add_deletion_out (List_T pairs, T substringA, T substringB, int querylength,
5434 			    char *deletion, int deletionlength,
5435 			    int hardclip_low, int hardclip_high, int queryseq_offset) {
5436   int querystartA, queryendA, querystartB, queryendB, querypos, k;
5437   Chrpos_T chrendA;
5438 
5439   if (substringA->plusp == true) {
5440     if (hardclip_low > substringA->querystart) {
5441       querystartA = hardclip_low;
5442     } else {
5443       querystartA = substringA->querystart;
5444     }
5445 
5446     if (querylength - hardclip_high < substringA->queryend) {
5447       queryendA = querylength - hardclip_high;
5448     } else {
5449       queryendA = substringA->queryend;
5450     }
5451 
5452     if (hardclip_low > substringB->querystart) {
5453       querystartB = hardclip_low;
5454     } else {
5455       querystartB = substringB->querystart;
5456     }
5457 
5458     if (querylength - hardclip_high < substringB->queryend) {
5459       queryendB = querylength - hardclip_high;
5460     } else {
5461       queryendB = substringB->queryend;
5462     }
5463 
5464     /* Pairs are all zero-based, so do not add 1 */
5465 #if 0
5466     chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5467 #else
5468     chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5469 #endif
5470 
5471     if (querystartA < queryendA && querystartB < queryendB) {
5472       querypos = queryendA + queryseq_offset;
5473       for (k = 0; k < deletionlength; k++) {
5474 	pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA++,
5475 								' ',/*comp*/INDEL_COMP,deletion[k]));
5476       }
5477     }
5478 
5479   } else {
5480     if (hardclip_high > substringA->querystart) {
5481       querystartA = hardclip_high;
5482     } else {
5483       querystartA = substringA->querystart;
5484     }
5485 
5486     if (querylength - hardclip_low < substringA->queryend) {
5487       queryendA = querylength - hardclip_low;
5488     } else {
5489       queryendA = substringA->queryend;
5490     }
5491 
5492     if (hardclip_high > substringB->querystart) {
5493       querystartB = hardclip_high;
5494     } else {
5495       querystartB = substringB->querystart;
5496     }
5497 
5498     if (querylength - hardclip_low < substringB->queryend) {
5499       queryendB = querylength - hardclip_low;
5500     } else {
5501       queryendB = substringB->queryend;
5502     }
5503 
5504     /* Pairs are all zero-based, so subtract 1 */
5505 #if 0
5506     chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5507 #else
5508     chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5509 #endif
5510 
5511     if (querystartA <= queryendA && querystartB <= queryendB) {
5512       querypos = queryendA + queryseq_offset;
5513       for (k = 0; k < deletionlength; k++) {
5514 	pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA--,
5515 								' ',/*comp*/INDEL_COMP,deletion[k]));
5516       }
5517     }
5518   }
5519 
5520   return pairs;
5521 }
5522 
5523 
5524 
5525 List_T
Substring_add_intron_out(List_T pairs,T substringA,T substringB,int querylength,int hardclip_low,int hardclip_high,int queryseq_offset)5526 Substring_add_intron_out (List_T pairs, T substringA, T substringB, int querylength,
5527 			  int hardclip_low, int hardclip_high, int queryseq_offset) {
5528   int querystartA, queryendA, querystartB, queryendB, querypos;
5529   Chrpos_T chrendA;
5530 
5531   if (substringA->plusp == true) {
5532     if (hardclip_low > substringA->querystart) {
5533       querystartA = hardclip_low;
5534     } else {
5535       querystartA = substringA->querystart;
5536     }
5537 
5538     if (querylength - hardclip_high < substringA->queryend) {
5539       queryendA = querylength - hardclip_high;
5540     } else {
5541       queryendA = substringA->queryend;
5542     }
5543 
5544     if (hardclip_low > substringB->querystart) {
5545       querystartB = hardclip_low;
5546     } else {
5547       querystartB = substringB->querystart;
5548     }
5549 
5550     if (querylength - hardclip_high < substringB->queryend) {
5551       queryendB = querylength - hardclip_high;
5552     } else {
5553       queryendB = substringB->queryend;
5554     }
5555 
5556     /* Pairs are all zero-based, so do not add 1 */
5557 #if 0
5558     chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5559 #else
5560     chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5561 #endif
5562 
5563   } else {
5564     if (hardclip_high > substringA->querystart) {
5565       querystartA = hardclip_high;
5566     } else {
5567       querystartA = substringA->querystart;
5568     }
5569 
5570     if (querylength - hardclip_low < substringA->queryend) {
5571       queryendA = querylength - hardclip_low;
5572     } else {
5573       queryendA = substringA->queryend;
5574     }
5575 
5576     if (hardclip_high > substringB->querystart) {
5577       querystartB = hardclip_high;
5578     } else {
5579       querystartB = substringB->querystart;
5580     }
5581 
5582     if (querylength - hardclip_low < substringB->queryend) {
5583       queryendB = querylength - hardclip_low;
5584     } else {
5585       queryendB = substringB->queryend;
5586     }
5587 
5588     /* Pairs are all zero-based, so subtract 1 */
5589 #if 0
5590     chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5591 #else
5592     chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5593 #endif
5594   }
5595 
5596   if (querystartA <= queryendA && querystartB <= queryendB) {
5597     /* Add gapholder */
5598     /* All we really need for Pair_print_sam is to set gapp to be true */
5599     querypos = queryendA + queryseq_offset;
5600     pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA,
5601 							    ' ',/*comp*/FWD_CANONICAL_INTRON_COMP,' '));
5602   }
5603 
5604   return pairs;
5605 }
5606 
5607 
5608 
5609 void
Substring_setup(bool print_nsnpdiffs_p_in,bool print_snplabels_p_in,bool show_refdiff_p_in,IIT_T snps_iit_in,int * snps_divint_crosstable_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Univ_IIT_T chromosome_iit_in,Univcoord_T genomelength_in,int nchromosomes_in,int circular_typeint_in,bool novelsplicingp_in,bool knownsplicingp_in,Outputtype_T output_type_in,Mode_T mode_in,bool maskedp_in)5610 Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
5611 		 bool show_refdiff_p_in, IIT_T snps_iit_in, int *snps_divint_crosstable_in,
5612 		 Genome_T genomebits_in, Genome_T genomebits_alt_in,
5613 		 Univ_IIT_T chromosome_iit_in, Univcoord_T genomelength_in,
5614 		 int nchromosomes_in, int circular_typeint_in,
5615 		 bool novelsplicingp_in, bool knownsplicingp_in,
5616 		 Outputtype_T output_type_in, Mode_T mode_in, bool maskedp_in) {
5617   print_nsnpdiffs_p = print_nsnpdiffs_p_in;
5618   print_snplabels_p = print_snplabels_p_in;
5619   show_refdiff_p = show_refdiff_p_in;
5620 
5621   snps_iit = snps_iit_in;
5622   snps_divint_crosstable = snps_divint_crosstable_in;
5623 
5624   genomebits = genomebits_in;
5625   genomebits_alt = genomebits_alt_in;
5626 
5627   chromosome_iit = chromosome_iit_in;
5628   genomelength = genomelength_in;
5629   genomesize = (double) Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias_p*/false);
5630   nchromosomes = nchromosomes_in;
5631   circular_typeint = circular_typeint_in;
5632 
5633   novelsplicingp = novelsplicingp_in;
5634   knownsplicingp = knownsplicingp_in;
5635 
5636   output_type = output_type_in;
5637   mode = mode_in;
5638 
5639   maskedp = maskedp_in;
5640 
5641   return;
5642 }
5643 
5644 
5645 
5646